diff --git a/.ci/dpdk-build.sh b/.ci/dpdk-build.sh new file mode 100755 index 00000000000..e1b8e3ccbb4 --- /dev/null +++ b/.ci/dpdk-build.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +set -o errexit +set -x + +function build_dpdk() +{ + local DPDK_VER=$1 + local DPDK_OPTS="" + local DPDK_INSTALL_DIR="$(pwd)/dpdk-dir" + local VERSION_FILE="$DPDK_INSTALL_DIR/cached-version" + + rm -rf dpdk-src + rm -rf $DPDK_INSTALL_DIR + + if [ "${DPDK_VER##refs/*/}" != "${DPDK_VER}" ]; then + git clone --single-branch $DPDK_GIT dpdk-src -b "${DPDK_VER##refs/*/}" + pushd dpdk-src + git log -1 --oneline + else + wget https://fast.dpdk.org/rel/dpdk-$1.tar.xz + tar xvf dpdk-$1.tar.xz > /dev/null + DIR_NAME=$(tar -tf dpdk-$1.tar.xz | head -1 | cut -f1 -d"/") + mv ${DIR_NAME} dpdk-src + pushd dpdk-src + fi + + # Switching to 'generic' platform to make the dpdk cache usable on + # different CPUs. We can't be sure that all CI machines are exactly same. + DPDK_OPTS="$DPDK_OPTS -Dplatform=generic" + + # Disable building DPDK unit tests. Not needed for OVS build or tests. + DPDK_OPTS="$DPDK_OPTS -Dtests=false" + + # Disable DPDK developer mode, this results in less build checks and less + # meson verbose outputs. + DPDK_OPTS="$DPDK_OPTS -Ddeveloper_mode=disabled" + + # OVS compilation and "normal" unit tests (run in the CI) do not depend on + # any DPDK driver. + # check-dpdk unit tests requires testpmd and some net/ driver. + DPDK_OPTS="$DPDK_OPTS -Denable_apps=test-pmd" + enable_drivers="net/null,net/af_xdp,net/tap,net/virtio,net/pcap" + DPDK_OPTS="$DPDK_OPTS -Denable_drivers=$enable_drivers" + # OVS depends on the vhost library (and its dependencies). + # net/tap depends on the gso library. + DPDK_OPTS="$DPDK_OPTS -Denable_libs=cryptodev,dmadev,gso,vhost" + + # Install DPDK using prefix. + DPDK_OPTS="$DPDK_OPTS --prefix=$DPDK_INSTALL_DIR" + + meson setup $DPDK_OPTS build + ninja -C build + ninja -C build install + popd + + # Remove examples sources. 
+ rm -rf $DPDK_INSTALL_DIR/share/dpdk/examples + + echo "Installed DPDK in $DPDK_INSTALL_DIR" + echo "${DPDK_VER}" > ${VERSION_FILE} +} + +build_dpdk $DPDK_VER diff --git a/.ci/dpdk-prepare.sh b/.ci/dpdk-prepare.sh new file mode 100755 index 00000000000..4424f9eb97f --- /dev/null +++ b/.ci/dpdk-prepare.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +set -ev + +# Installing wheel separately because it may be needed to build some +# of the packages during dependency backtracking and pip >= 22.0 will +# abort backtracking on build failures: +# https://github.com/pypa/pip/issues/10655 +pip3 install --disable-pip-version-check --user wheel +pip3 install --disable-pip-version-check --user pyelftools +pip3 install --user 'meson>=1.4,<1.5' diff --git a/.ci/linux-build.sh b/.ci/linux-build.sh index 23c8bbb7aed..702feeb3bb3 100755 --- a/.ci/linux-build.sh +++ b/.ci/linux-build.sh @@ -6,111 +6,13 @@ set -x CFLAGS_FOR_OVS="-g -O2" SPARSE_FLAGS="" EXTRA_OPTS="--enable-Werror" - -on_exit() { - if [ $? = 0 ]; then - exit - fi - FILES_TO_PRINT="config.log" - FILES_TO_PRINT="$FILES_TO_PRINT */_build/sub/tests/testsuite.log" - - for pr_file in $FILES_TO_PRINT; do - cat "$pr_file" 2>/dev/null - done -} -# We capture the error logs as artifacts in Github Actions, no need to dump -# them via a EXIT handler. -[ -n "$GITHUB_WORKFLOW" ] || trap on_exit EXIT - -function install_kernel() -{ - if [[ "$1" =~ ^5.* ]]; then - PREFIX="v5.x" - elif [[ "$1" =~ ^4.* ]]; then - PREFIX="v4.x" - elif [[ "$1" =~ ^3.* ]]; then - PREFIX="v3.x" - else - PREFIX="v2.6/longterm/v2.6.32" - fi - - base_url="https://cdn.kernel.org/pub/linux/kernel/${PREFIX}" - # Download page with list of all available kernel versions. - wget ${base_url}/ - # Uncompress in case server returned gzipped page. - (file index* | grep ASCII) || (mv index* index.new.gz && gunzip index*) - # Get version of the latest stable release. 
- hi_ver=$(echo ${1} | sed 's/\./\\\./') - lo_ver=$(cat ./index* | grep -P -o "${hi_ver}\.[0-9]+" | \ - sed 's/.*\..*\.\(.*\)/\1/' | sort -h | tail -1) - version="${1}.${lo_ver}" - - rm -rf index* linux-* - - url="${base_url}/linux-${version}.tar.xz" - # Download kernel sources. Try direct link on CDN failure. - wget ${url} || - (rm -f linux-${version}.tar.xz && wget ${url}) || - (rm -f linux-${version}.tar.xz && wget ${url/cdn/www}) - - tar xvf linux-${version}.tar.xz > /dev/null - pushd linux-${version} - make allmodconfig - - # Cannot use CONFIG_KCOV: -fsanitize-coverage=trace-pc is not supported by compiler - sed -i 's/CONFIG_KCOV=y/CONFIG_KCOV=n/' .config - - # stack validation depends on tools/objtool, but objtool does not compile on travis. - # It is giving following error. - # >>> GEN arch/x86/insn/inat-tables.c - # >>> Semantic error at 40: Unknown imm opnd: AL - # So for now disable stack-validation for the build. - - sed -i 's/CONFIG_STACK_VALIDATION=y/CONFIG_STACK_VALIDATION=n/' .config - make oldconfig - - # Older kernels do not include openvswitch - if [ -d "net/openvswitch" ]; then - make net/openvswitch/ - else - make net/bridge/ - fi - - if [ "$AFXDP" ]; then - sudo make headers_install INSTALL_HDR_PATH=/usr - pushd tools/lib/bpf/ - # Bulding with gcc because there are some issues in make files - # that breaks building libbpf with clang on Travis. - CC=gcc sudo make install - CC=gcc sudo make install_headers - sudo ldconfig - popd - # The Linux kernel defines __always_inline in stddef.h (283d7573), and - # sys/cdefs.h tries to re-define it. Older libc-dev package in xenial - # doesn't have a fix for this issue. Applying it manually. 
- sudo sed -i '/^# define __always_inline .*/i # undef __always_inline' \ - /usr/include/x86_64-linux-gnu/sys/cdefs.h || true - EXTRA_OPTS="${EXTRA_OPTS} --enable-afxdp" - fi - popd -} +JOBS=${JOBS:-"-j4"} function install_dpdk() { - local DPDK_VER=$1 - local VERSION_FILE="dpdk-dir/travis-dpdk-cache-version" - local DPDK_OPTS="" - local DPDK_LIB="" - - if [ -z "$TRAVIS_ARCH" ] || - [ "$TRAVIS_ARCH" == "amd64" ]; then - DPDK_LIB=$(pwd)/dpdk-dir/build/lib/x86_64-linux-gnu - elif [ "$TRAVIS_ARCH" == "aarch64" ]; then - DPDK_LIB=$(pwd)/dpdk-dir/build/lib/aarch64-linux-gnu - else - echo "Target is unknown" - exit 1 - fi + local DPDK_INSTALL_DIR="$(pwd)/dpdk-dir" + local VERSION_FILE="${DPDK_INSTALL_DIR}/cached-version" + local DPDK_LIB=${DPDK_INSTALL_DIR}/lib/x86_64-linux-gnu if [ "$DPDK_SHARED" ]; then EXTRA_OPTS="$EXTRA_OPTS --with-dpdk=shared" @@ -122,58 +24,17 @@ function install_dpdk() # Export the following path for pkg-config to find the .pc file. export PKG_CONFIG_PATH=$DPDK_LIB/pkgconfig/:$PKG_CONFIG_PATH - if [ "${DPDK_VER##refs/*/}" != "${DPDK_VER}" ]; then - # Avoid using cache for git tree build. - rm -rf dpdk-dir + # Expose dpdk binaries. + export PATH=$(pwd)/dpdk-dir/bin:$PATH - DPDK_GIT=${DPDK_GIT:-https://dpdk.org/git/dpdk} - git clone --single-branch $DPDK_GIT dpdk-dir -b "${DPDK_VER##refs/*/}" - pushd dpdk-dir - git log -1 --oneline - else - if [ -f "${VERSION_FILE}" ]; then - VER=$(cat ${VERSION_FILE}) - if [ "${VER}" = "${DPDK_VER}" ]; then - # Update the library paths. - sudo ldconfig - echo "Found cached DPDK ${VER} build in $(pwd)/dpdk-dir" - return - fi - fi - # No cache or version mismatch. - rm -rf dpdk-dir - wget https://fast.dpdk.org/rel/dpdk-$1.tar.xz - tar xvf dpdk-$1.tar.xz > /dev/null - DIR_NAME=$(tar -tf dpdk-$1.tar.xz | head -1 | cut -f1 -d"/") - mv ${DIR_NAME} dpdk-dir - pushd dpdk-dir + if [ ! 
-f "${VERSION_FILE}" ]; then + echo "Could not find DPDK in $DPDK_INSTALL_DIR" + return 1 fi - # Switching to 'default' machine to make dpdk-dir cache usable on - # different CPUs. We can't be sure that all CI machines are exactly same. - DPDK_OPTS="$DPDK_OPTS -Dmachine=default" - - # Disable building DPDK unit tests. Not needed for OVS build or tests. - DPDK_OPTS="$DPDK_OPTS -Dtests=false" - - # Disable DPDK developer mode, this results in less build checks and less - # meson verbose outputs. - DPDK_OPTS="$DPDK_OPTS -Ddeveloper_mode=disabled" - - # Install DPDK using prefix. - DPDK_OPTS="$DPDK_OPTS --prefix=$(pwd)/build" - - CC=gcc meson $DPDK_OPTS build - ninja -C build - ninja -C build install - # Update the library paths. sudo ldconfig - - - echo "Installed DPDK source in $(pwd)" - popd - echo "${DPDK_VER}" > ${VERSION_FILE} + echo "Found cached DPDK $(cat ${VERSION_FILE}) build in $DPDK_INSTALL_DIR" } function configure_ovs() @@ -187,7 +48,32 @@ function build_ovs() configure_ovs $OPTS make selinux-policy - make -j4 + make ${JOBS} +} + +function clang_analyze() +{ + [ -d "./base-clang-analyzer-results" ] && cache_build=false \ + || cache_build=true + if [ "$cache_build" = true ]; then + # If this is a cache build, proceed to the base branch's directory. + pushd base_ovs_main + fi; + + configure_ovs $OPTS + + make clean + scan-build -o ./clang-analyzer-results -sarif --use-cc=${CC} make ${JOBS} + + if [ "$cache_build" = true ]; then + # Move results, so it will be picked up by the cache. + mv ./clang-analyzer-results ../base-clang-analyzer-results + popd + else + # Only do the compare on the none cache builds. 
+ sarif --check note diff ./base-clang-analyzer-results \ + ./clang-analyzer-results + fi; } if [ "$DEB_PACKAGE" ]; then @@ -222,15 +108,12 @@ assert ovs.json.from_string('{\"a\": 42}') == {'a': 42}" exit 0 fi -if [ "$KERNEL" ]; then - install_kernel $KERNEL +if [ "$DPDK" ] || [ "$DPDK_SHARED" ]; then + install_dpdk fi -if [ "$DPDK" ] || [ "$DPDK_SHARED" ]; then - if [ -z "$DPDK_VER" ]; then - DPDK_VER="21.11.2" - fi - install_dpdk $DPDK_VER +if [ "$STD" ]; then + CFLAGS_FOR_OVS="${CFLAGS_FOR_OVS} -std=$STD" fi if [ "$CC" = "clang" ]; then @@ -240,40 +123,53 @@ elif [ "$M32" ]; then # Adding m32 flag directly to CC to avoid any posiible issues with API/ABI # difference on 'configure' and 'make' stages. export CC="$CC -m32" -elif [ "$TRAVIS_ARCH" != "aarch64" ]; then - OPTS="--enable-sparse" - if [ "$AFXDP" ]; then - # netdev-afxdp uses memset for 64M for umem initialization. - SPARSE_FLAGS="${SPARSE_FLAGS} -Wno-memcpy-max-count" - fi +else + EXTRA_OPTS="$EXTRA_OPTS --enable-sparse" CFLAGS_FOR_OVS="${CFLAGS_FOR_OVS} ${SPARSE_FLAGS}" fi -if [ "$ASAN" ]; then - # This will override default option configured in tests/atlocal.in. +if [ "$SANITIZERS" ]; then + # This will override default ASAN options configured in tests/atlocal.in. export ASAN_OPTIONS='detect_leaks=1' - CFLAGS_ASAN="-fno-omit-frame-pointer -fno-common -fsanitize=address" - CFLAGS_FOR_OVS="${CFLAGS_FOR_OVS} ${CFLAGS_ASAN}" -fi - -if [ "$UBSAN" ]; then - # Use the default options configured in tests/atlocal.in, in UBSAN_OPTIONS. - CFLAGS_UBSAN="-fno-omit-frame-pointer -fno-common -fsanitize=undefined" - CFLAGS_FOR_OVS="${CFLAGS_FOR_OVS} ${CFLAGS_UBSAN}" + CFLAGS_FOR_SAN="-fno-omit-frame-pointer -fno-common -fsanitize=$SANITIZERS" + CFLAGS_FOR_OVS="${CFLAGS_FOR_OVS} ${CFLAGS_FOR_SAN}" fi OPTS="${EXTRA_OPTS} ${OPTS} $*" -if [ "$TESTSUITE" ]; then +if [ "$CLANG_ANALYZE" ]; then + clang_analyze + exit 0 +fi + +if [ "$TESTSUITE" = 'test' ]; then # 'distcheck' will reconfigure with required options. 
# Now we only need to prepare the Makefile without sparse-wrapped CC. configure_ovs export DISTCHECK_CONFIGURE_FLAGS="$OPTS" - make distcheck -j4 CFLAGS="${CFLAGS_FOR_OVS}" \ - TESTSUITEFLAGS=-j4 RECHECK=yes + make distcheck ${JOBS} CFLAGS="${CFLAGS_FOR_OVS}" \ + TESTSUITEFLAGS=${JOBS} RECHECK=yes else build_ovs + for testsuite in $TESTSUITE; do + run_as_root= + if [ "$testsuite" != "check" ] && \ + [ "$testsuite" != "check-ovsdb-cluster" ] ; then + run_as_root="sudo -E PATH=$PATH GITHUB_ACTIONS=$GITHUB_ACTIONS" + sudo ip netns add ovs-system-test-ns + # Some system tests may rely on traffic loopback. + sudo ip -netns ovs-system-test-ns link set dev lo up + run_as_root="${run_as_root} ip netns exec ovs-system-test-ns" + fi + if [ "${testsuite##*dpdk}" != "$testsuite" ]; then + sudo sh -c 'echo 1024 > /proc/sys/vm/nr_hugepages' || true + [ "$(cat /proc/sys/vm/nr_hugepages)" = '1024' ] + export DPDK_EAL_OPTIONS="--lcores 0@1,1@1,2@1" + fi + $run_as_root make $testsuite TESTSUITEFLAGS="${JOBS} ${TEST_RANGE}" \ + RECHECK=yes + done fi exit 0 diff --git a/.ci/linux-prepare.sh b/.ci/linux-prepare.sh index 16a7aec0b5b..2a191b57fb8 100755 --- a/.ci/linux-prepare.sh +++ b/.ci/linux-prepare.sh @@ -10,14 +10,11 @@ fi # Build and install sparse. # -# Explicitly disable sparse support for llvm because some travis -# environments claim to have LLVM (llvm-config exists and works) but -# linking against it fails. # Disabling sqlite support because sindex build fails and we don't # really need this utility being installed. git clone git://git.kernel.org/pub/scm/devel/sparse/sparse.git cd sparse -make -j4 HAVE_LLVM= HAVE_SQLITE= install +make -j4 HAVE_SQLITE= install cd .. # Installing wheel separately because it may be needed to build some @@ -26,26 +23,10 @@ cd .. 
# https://github.com/pypa/pip/issues/10655 pip3 install --disable-pip-version-check --user wheel pip3 install --disable-pip-version-check --user \ - flake8 'hacking>=3.0' netaddr pyparsing sphinx setuptools pyelftools -pip3 install --user 'meson==0.49.2' - -if [ "$M32" ]; then - # Installing 32-bit libraries. - pkgs="gcc-multilib" - if [ -z "$GITHUB_WORKFLOW" ]; then - # 32-bit and 64-bit libunwind can not be installed at the same time. - # This will remove the 64-bit libunwind and install 32-bit version. - # GitHub Actions doesn't have 32-bit versions of these libs. - pkgs=$pkgs" libunwind-dev:i386 libunbound-dev:i386" - fi - - sudo apt-get install -y $pkgs -fi + flake8 netaddr pyparsing sarif-tools sphinx setuptools # Install python test dependencies pip3 install -r python/test_requirements.txt -# IPv6 is supported by kernel but disabled in TravisCI images: -# https://github.com/travis-ci/travis-ci/issues/8891 -# Enable it to avoid skipping of IPv6 related tests. +# Make sure IPv6 is enabled to avoid skipping of IPv6 related tests. sudo sysctl -w net.ipv6.conf.all.disable_ipv6=0 diff --git a/.ci/osx-build.sh b/.ci/osx-build.sh index f8facebeb02..b81744ec9b9 100755 --- a/.ci/osx-build.sh +++ b/.ci/osx-build.sh @@ -5,27 +5,12 @@ set -o errexit CFLAGS="-Werror $CFLAGS" EXTRA_OPTS="" -on_exit() { - if [ $? = 0 ]; then - exit - fi - FILES_TO_PRINT="config.log" - FILES_TO_PRINT="$FILES_TO_PRINT */_build/sub/tests/testsuite.log" - - for pr_file in $FILES_TO_PRINT; do - cat "$pr_file" 2>/dev/null - done -} -# We capture the error logs as artifacts in Github Actions, no need to dump -# them via a EXIT handler. 
-[ -n "$GITHUB_WORKFLOW" ] || trap on_exit EXIT - function configure_ovs() { ./boot.sh && ./configure $* } -configure_ovs $EXTRA_OPTS $* +configure_ovs $EXTRA_OPTS $OPTS $* if [ "$CC" = "clang" ]; then make CFLAGS="$CFLAGS -Wno-error=unused-command-line-argument" diff --git a/.ci/windows-build.sh b/.ci/windows-build.sh new file mode 100644 index 00000000000..e54fbacf446 --- /dev/null +++ b/.ci/windows-build.sh @@ -0,0 +1,17 @@ +#!/bin/bash +set -ex + +CONFIGURATION=$1 + +./boot.sh +./configure CC=build-aux/cccl LD="$(which link)" \ + LIBS="-lws2_32 -lShlwapi -liphlpapi -lwbemuuid -lole32 -loleaut32" \ + --prefix=C:/openvswitch/usr --localstatedir=C:/openvswitch/var \ + --sysconfdir=C:/openvswitch/etc --with-pthread=c:/PTHREADS-BUILT/ \ + --enable-ssl --with-openssl=C:/OpenSSL-Win64 \ + --with-vstudiotarget="${CONFIGURATION}" || (cat config.log && exit 1) + +make -j4 +make datapath_windows_analyze +make install +make windows_installer diff --git a/.ci/windows-prepare.sh b/.ci/windows-prepare.sh new file mode 100644 index 00000000000..2d76add7150 --- /dev/null +++ b/.ci/windows-prepare.sh @@ -0,0 +1,11 @@ +#!/bin/bash +set -ex + +mkdir -p /var/cache/pacman/pkg/ +pacman -S --noconfirm --needed automake autoconf libtool make patch + +# Use an MSVC linker and a Windows version of Python. +mv $(which link) $(which link)_copy +mv $(which python3) $(which python3)_copy + +cd /c/pthreads4w-code && nmake all install diff --git a/.cirrus.yml b/.cirrus.yml index e3c1cd5811d..d73154a9716 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -2,8 +2,8 @@ freebsd_build_task: freebsd_instance: matrix: - image_family: freebsd-12-3-snap - image_family: freebsd-13-1-snap + image_family: freebsd-13-3-snap + image_family: freebsd-14-1-snap cpu: 4 memory: 4G diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 00000000000..41ba51bf305 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,51 @@ +# See https://editorconfig.org/ for syntax reference. 
+ +root = true + +# No wildcard sections [*] and [**] because properties cannot be +# applied safely to any filetype in general. + +# Property trim_trailing_whitespace should not be defined at all +# because it is interpreted differently by editors. + +[*.{c,h}] +charset = utf-8 +end_of_line = lf +indent_style = space +indent_size = 4 +insert_final_newline = true +max_line_length = 79 + +[include/linux/**.h] +indent_style = tab +indent_size = tab +tab_width = 8 + +[include/sparse/rte_*.h] +indent_style = tab +tab_width = 8 + +[include/windows/getopt.h] +indent_style = tab +indent_size = tab +tab_width = 8 + +[include/windows/netinet/{icmp6,ip6}.h] +indent_style = tab +indent_size = tab +tab_width = 8 + +[lib/getopt_long.c] +indent_style = tab +indent_size = tab +tab_width = 8 + +[lib/sflow*.{c,h}] +indent_style = tab +indent_size = tab +tab_width = 8 + +[lib/strsep.c] +indent_style = tab +indent_size = tab +tab_width = 8 diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 58ab85e5d7e..9d3a13ca1c9 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -2,27 +2,98 @@ name: Build and Test on: [push, pull_request] +env: + python_default: 3.12 + jobs: + build-dpdk: + env: + dependencies: gcc libbpf-dev libnuma-dev libpcap-dev ninja-build pkgconf + CC: gcc + DPDK_GIT: https://dpdk.org/git/dpdk-stable + DPDK_VER: 23.11.1 + name: dpdk gcc + outputs: + dpdk_key: ${{ steps.gen_dpdk_key.outputs.key }} + runs-on: ubuntu-22.04 + timeout-minutes: 30 + + steps: + - name: checkout + uses: actions/checkout@v4 + + - name: update PATH + run: | + echo "$HOME/bin" >> $GITHUB_PATH + echo "$HOME/.local/bin" >> $GITHUB_PATH + + - name: create ci signature file for the dpdk cache key + # This will collect most of DPDK related lines, so hash will be different + # if something changed in a way we're building DPDK including DPDK_VER. 
+ # This also allows us to use cache from any branch as long as version + # and a way we're building DPDK stays the same. + run: | + cat .ci/dpdk-* > dpdk-ci-signature + grep -rwE 'DPDK_GIT|DPDK_VER' .github/ >> dpdk-ci-signature + if [ "${DPDK_VER##refs/*/}" != "${DPDK_VER}" ]; then + git ls-remote --heads $DPDK_GIT $DPDK_VER >> dpdk-ci-signature + fi + cat dpdk-ci-signature + + - name: generate ci DPDK key + id: gen_dpdk_key + env: + ci_key: ${{ hashFiles('dpdk-ci-signature') }} + run: echo 'key=dpdk-${{ env.ci_key }}' >> $GITHUB_OUTPUT + + - name: cache + id: dpdk_cache + uses: actions/cache@v4 + with: + path: dpdk-dir + key: ${{ steps.gen_dpdk_key.outputs.key }} + + - name: set up python + if: steps.dpdk_cache.outputs.cache-hit != 'true' + uses: actions/setup-python@v5 + with: + python-version: ${{ env.python_default }} + + - name: update APT cache + if: steps.dpdk_cache.outputs.cache-hit != 'true' + run: sudo apt update || true + - name: install common dependencies + if: steps.dpdk_cache.outputs.cache-hit != 'true' + run: sudo apt install -y ${{ env.dependencies }} + + - name: prepare + if: steps.dpdk_cache.outputs.cache-hit != 'true' + run: ./.ci/dpdk-prepare.sh + + - name: build + if: steps.dpdk_cache.outputs.cache-hit != 'true' + run: ./.ci/dpdk-build.sh + build-linux: + needs: build-dpdk env: dependencies: | - automake libtool gcc bc libjemalloc2 libjemalloc-dev \ - libssl-dev llvm-dev libelf-dev libnuma-dev libpcap-dev \ - ninja-build selinux-policy-dev - AFXDP: ${{ matrix.afxdp }} - ASAN: ${{ matrix.asan }} - UBSAN: ${{ matrix.ubsan }} + automake libtool gcc bc libjemalloc2 libjemalloc-dev libssl-dev \ + llvm-dev libnuma-dev libpcap-dev selinux-policy-dev libbpf-dev \ + lftp libreswan CC: ${{ matrix.compiler }} DPDK: ${{ matrix.dpdk }} DPDK_SHARED: ${{ matrix.dpdk_shared }} - KERNEL: ${{ matrix.kernel }} LIBS: ${{ matrix.libs }} M32: ${{ matrix.m32 }} OPTS: ${{ matrix.opts }} + SANITIZERS: ${{ matrix.sanitizers }} + STD: ${{ matrix.std }} TESTSUITE: ${{ 
matrix.testsuite }} + TEST_RANGE: ${{ matrix.test_range }} name: linux ${{ join(matrix.*, ' ') }} - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 timeout-minutes: 30 strategy: @@ -35,13 +106,15 @@ jobs: opts: --disable-ssl - compiler: gcc - testsuite: test + std: c99 - compiler: clang + std: c99 + + - compiler: gcc testsuite: test - asan: asan - compiler: clang + sanitizers: address,undefined testsuite: test - ubsan: ubsan - compiler: gcc testsuite: test @@ -51,10 +124,10 @@ jobs: opts: --enable-shared - compiler: gcc - testsuite: test + testsuite: check check-dpdk dpdk: dpdk - compiler: clang - testsuite: test + testsuite: check check-dpdk dpdk: dpdk - compiler: gcc @@ -65,11 +138,9 @@ jobs: libs: -ljemalloc - compiler: gcc - afxdp: afxdp - kernel: 5.3 + opts: --enable-afxdp - compiler: clang - afxdp: afxdp - kernel: 5.3 + opts: --enable-afxdp - compiler: gcc dpdk: dpdk @@ -94,9 +165,52 @@ jobs: m32: m32 opts: --disable-ssl + - compiler: gcc + testsuite: check-ovsdb-cluster + + - compiler: gcc + testsuite: check-kernel + test_range: "-100" + - compiler: gcc + testsuite: check-kernel + test_range: "100-" + + - compiler: clang + sanitizers: address,undefined + testsuite: check-kernel + test_range: "-100" + - compiler: clang + sanitizers: address,undefined + testsuite: check-kernel + test_range: "100-" + + - compiler: gcc + testsuite: check-offloads + test_range: "-100" + - compiler: gcc + testsuite: check-offloads + test_range: "100-" + + - compiler: gcc + dpdk: dpdk + testsuite: check-system-userspace + + - compiler: clang + sanitizers: address,undefined + dpdk: dpdk + testsuite: check-system-userspace + + - compiler: gcc + dpdk: dpdk + testsuite: check-system-tso + + - compiler: gcc + dpdk: dpdk + testsuite: check-afxdp + steps: - name: checkout - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: update PATH run: | @@ -104,37 +218,36 @@ jobs: echo "$HOME/.local/bin" >> $GITHUB_PATH - name: set up python - uses: actions/setup-python@v2 + uses: 
actions/setup-python@v5 with: - python-version: '3.9' - - - name: create ci signature file for the dpdk cache key - if: matrix.dpdk != '' || matrix.dpdk_shared != '' - # This will collect most of DPDK related lines, so hash will be different - # if something changed in a way we're building DPDK including DPDK_VER. - # This also allows us to use cache from any branch as long as version - # and a way we're building DPDK stays the same. - run: | - grep -irE 'RTE_|DPDK|meson|ninja' -r .ci/ > dpdk-ci-signature - cat dpdk-ci-signature + python-version: ${{ env.python_default }} - name: cache if: matrix.dpdk != '' || matrix.dpdk_shared != '' - uses: actions/cache@v2 - env: - matrix_key: ${{ matrix.dpdk }}${{ matrix.dpdk_shared }} - ci_key: ${{ hashFiles('dpdk-ci-signature') }} + uses: actions/cache@v4 with: path: dpdk-dir - key: ${{ env.matrix_key }}-${{ env.ci_key }} + key: ${{ needs.build-dpdk.outputs.dpdk_key }} - name: update APT cache run: sudo apt update || true - name: install common dependencies run: sudo apt install -y ${{ env.dependencies }} - - name: install libunbound libunwind + - name: install libunbound libunwind python3-unbound + # GitHub Actions doesn't have 32-bit versions of these libraries. 
if: matrix.m32 == '' - run: sudo apt install -y libunbound-dev libunwind-dev + run: sudo apt install -y libunbound-dev libunwind-dev python3-unbound + - name: install 32-bit libraries + if: matrix.m32 != '' + run: sudo apt install -y gcc-multilib + + - name: Reduce ASLR entropy + if: matrix.sanitizers != '' + # Asan in llvm 14 provided in ubuntu-22.04 is incompatible with + # high-entropy ASLR configured in much newer kernels that GitHub + # runners are using leading to random crashes: + # https://github.com/actions/runner-images/issues/9491 + run: sudo sysctl -w vm.mmap_rnd_bits=28 - name: prepare run: ./.ci/linux-prepare.sh @@ -145,22 +258,136 @@ jobs: - name: copy logs on failure if: failure() || cancelled() run: | - # upload-artifact@v2 throws exceptions if it tries to upload socket + # upload-artifact throws exceptions if it tries to upload socket # files and we could have some socket files in testsuite.dir. - # Also, upload-artifact@v2 doesn't work well enough with wildcards. + # Also, upload-artifact doesn't work well enough with wildcards. # So, we're just archiving everything here to avoid any issues. 
mkdir logs cp config.log ./logs/ cp -r ./*/_build/sub/tests/testsuite.* ./logs/ || true - tar -czvf logs.tgz logs/ + sudo cp -r ./tests/*testsuite.* ./logs/ || true + sudo tar -czvf logs.tgz logs/ - name: upload logs on failure if: failure() || cancelled() - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: logs-linux-${{ join(matrix.*, '-') }} path: logs.tgz + build-clang-analyze: + needs: build-dpdk + env: + dependencies: | + automake bc clang-tools libbpf-dev libnuma-dev libpcap-dev \ + libunbound-dev libunwind-dev libssl-dev libtool llvm-dev + CC: clang + DPDK: dpdk + CLANG_ANALYZE: true + name: clang-analyze + runs-on: ubuntu-22.04 + timeout-minutes: 30 + + steps: + - name: checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: get base branch sha + id: base_branch + env: + BASE_SHA: ${{ github.event.pull_request.base.sha }} + EVENT_BEFORE: ${{ github.event.before }} + FORCED_PUSH: ${{ github.event.forced }} + run: | + if [ "$GITHUB_EVENT_NAME" = "pull_request" ]; then + echo "sha=$BASE_SHA" >> $GITHUB_OUTPUT + else + if [ "$EVENT_BEFORE" = "0000000000000000000000000000000000000000" ] \ + || [ "$FORCED_PUSH" = true ]; then + BASE_SHA=HEAD~1 + MIN_DISTANCE=1000 + git remote add upstream https://github.com/openvswitch/ovs.git + git fetch upstream + for upstream_head in $(git ls-remote --heads upstream main dpdk-latest branch-2.17 branch-[3456789]* | cut -f 1); do + CURR_BASE=$(git merge-base ${upstream_head} HEAD 2>/dev/null) + if [ ${CURR_BASE} ]; then + DISTANCE=$(git log --oneline ${CURR_BASE}..HEAD | wc -l); + if test ${MIN_DISTANCE} -gt ${DISTANCE}; then + BASE_SHA=${CURR_BASE} + MIN_DISTANCE=${DISTANCE} + fi + fi + done + echo "sha=$BASE_SHA" >> $GITHUB_OUTPUT + else + echo "sha=$EVENT_BEFORE" >> $GITHUB_OUTPUT + fi + fi + + - name: checkout base branch + env: + BASE_SHA: ${{ steps.base_branch.outputs.sha }} + run: | + cp -r $(pwd)/. 
/tmp/base_ovs_main && mv /tmp/base_ovs_main ./ + cd $(pwd)/base_ovs_main + git checkout ${BASE_SHA} + + - name: update PATH + run: | + echo "$HOME/bin" >> $GITHUB_PATH + echo "$HOME/.local/bin" >> $GITHUB_PATH + + - name: generate cache key + id: cache_key + run: | + ver=$(${CC} -v 2>&1 | grep ' version ' | \ + sed 's/.*version \([0-9]*\.[0-9]*\.[0-9]*\).*/\1/g') + echo "key=${CC}-${ver}-analyze-$(git -C base_ovs_main rev-parse HEAD)" \ + >> $GITHUB_OUTPUT + + - name: check for analyzer result cache + id: clang_cache + uses: actions/cache@v4 + with: + path: base-clang-analyzer-results + key: ${{ steps.cache_key.outputs.key }} + + - name: set up python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.python_default }} + + - name: get cached dpdk-dir + uses: actions/cache/restore@v4 + with: + path: dpdk-dir + key: ${{ needs.build-dpdk.outputs.dpdk_key }} + + - name: update APT cache + run: sudo apt update || true + + - name: install common dependencies + run: sudo apt install -y ${{ env.dependencies }} + + - name: prepare + run: ./.ci/linux-prepare.sh + + - name: build base reference + if: steps.clang_cache.outputs.cache-hit != 'true' + run: ./.ci/linux-build.sh + + - name: save cache + uses: actions/cache/save@v4 + if: steps.clang_cache.outputs.cache-hit != 'true' + with: + path: base-clang-analyzer-results + key: ${{ steps.cache_key.outputs.key }} + + - name: build + run: ./.ci/linux-build.sh + build-osx: env: CC: clang @@ -175,15 +402,15 @@ jobs: steps: - name: checkout - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: update PATH run: | echo "$HOME/bin" >> $GITHUB_PATH echo "$HOME/.local/bin" >> $GITHUB_PATH - name: set up python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: - python-version: '3.9' + python-version: ${{ env.python_default }} - name: install dependencies run: brew install automake libtool - name: prepare @@ -192,7 +419,7 @@ jobs: run: ./.ci/osx-build.sh - name: upload logs on failure if: 
failure() - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: logs-osx-clang---disable-ssl path: config.log @@ -213,11 +440,10 @@ jobs: matrix: include: - dpdk: no - - dpdk: shared steps: - name: checkout - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: update PATH run: | @@ -239,7 +465,44 @@ jobs: run: ./.ci/linux-build.sh - name: upload deb packages - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: deb-packages-${{ matrix.dpdk }}-dpdk path: '/home/runner/work/ovs/*.deb' + + build-linux-rpm: + name: linux rpm fedora + runs-on: ubuntu-latest + container: fedora:39 + timeout-minutes: 30 + + strategy: + fail-fast: false + + steps: + - name: checkout + uses: actions/checkout@v4 + - name: install dependencies + run: | + dnf install -y rpm-build dnf-plugins-core + sed -e 's/@VERSION@/0.0.1/' rhel/openvswitch-fedora.spec.in \ + > /tmp/ovs.spec + dnf builddep -y /tmp/ovs.spec + rm -f /tmp/ovs.spec + + - name: configure + run: ./boot.sh && ./configure + + - name: build + run: make rpm-fedora + + - name: install + run: dnf install -y rpm/rpmbuild/RPMS/*/*.rpm + + - name: upload rpm packages + uses: actions/upload-artifact@v4 + with: + name: rpm-packages + path: | + rpm/rpmbuild/SRPMS/*.rpm + rpm/rpmbuild/RPMS/*/*.rpm diff --git a/.mailmap b/.mailmap index 4773c4a2454..da46dc15924 100644 --- a/.mailmap +++ b/.mailmap @@ -79,8 +79,9 @@ Sabyasachi Sengupta Shad Ansari Shih-Hao Li -Simon Horman -Simon Horman +Simon Horman +Simon Horman +Simon Horman Stephen Finucane Thomas F. Herbert Thomas Graf diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 00000000000..7d505150ecd --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,25 @@ +# .readthedocs.yaml +# Read the Docs configuration file. +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details. + +# Required. +version: 2 + +# Set the OS, Python version, etc. 
+build: + os: ubuntu-22.04 + tools: + python: "3.12" + +# Build documentation in the "Documentation/" directory with Sphinx. +sphinx: + configuration: Documentation/conf.py + builder: "dirhtml" + +# Build all formats: HTML, PDF, ePub. +formats: all + +# Declare the Python requirements. +python: + install: + - requirements: Documentation/requirements.txt diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index c7aeede06e6..00000000000 --- a/.travis.yml +++ /dev/null @@ -1,57 +0,0 @@ -language: c - -os: - - linux - -cache: - directories: - - dpdk-dir - -addons: - apt: - packages: - - bc - - libssl-dev - - llvm-dev - - libjemalloc1 - - libjemalloc-dev - - libnuma-dev - - libpcap-dev - - python3-pip - - python3-sphinx - - libelf-dev - - selinux-policy-dev - - libunbound-dev - - libunwind-dev - - python3-setuptools - - python3-wheel - - ninja-build - -before_install: ./.ci/${TRAVIS_OS_NAME}-prepare.sh - -before_script: export PATH=$PATH:$HOME/bin - -matrix: - include: - - arch: arm64 - compiler: gcc - env: TESTSUITE=1 DPDK=1 - - arch: arm64 - compiler: gcc - env: KERNEL_LIST="5.5 4.19" - - arch: arm64 - compiler: gcc - env: KERNEL_LIST="4.9 3.16" - - arch: arm64 - compiler: gcc - env: DPDK_SHARED=1 - - arch: arm64 - compiler: clang - env: OPTS="--disable-ssl" - -script: ./.ci/${TRAVIS_OS_NAME}-build.sh $OPTS - -notifications: - email: - recipients: - - ovs-build@openvswitch.org diff --git a/AUTHORS.rst b/AUTHORS.rst index f4184be8fc4..28dcce4eaf7 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -52,6 +52,7 @@ Alin Serdean aserdean@ovn.org Amber Kumar kumar.amber@intel.com Ambika Arora ambika.arora@tcs.com Amit Bose bose@noironetworks.com +Amit Prakash Shukla amitprakashs@marvell.com Amitabha Biswas azbiswas@gmail.com Anand Kumar kumaranand@vmware.com Andrea Kao eirinikos@gmail.com @@ -92,13 +93,16 @@ Billy O'Mahony billy.o.mahony@intel.com Binbin Xu xu.binbin1@zte.com.cn Bodo Petermann b.petermann@syseleven.de Boleslaw Tokarski 
boleslaw.tokarski@jollamobile.com +Brad Cowie brad@faucet.nz Brian Haley haleyb.dev@gmail.com Brian Kruger bkruger+ovsdev@gmail.com Bruce Davie bdavie@vmware.com Bryan Phillippe bp@toroki.com Carlo Andreotti c.andreotti@m3s.it Casey Barker crbarker@google.com +Chandan Somani csomani@redhat.com Chandra Sekhar Vejendla csvejend@us.ibm.com +Chris Riches chris.riches@nutanix.com Chris Wright chrisw@sous-sol.org Christoph Jaeger cj@linux.com Christophe Fontaine cfontain@redhat.com @@ -107,6 +111,7 @@ Chuck Short zulcss@ubuntu.com Cian Ferriter cian.ferriter@intel.com Ciara Loftus ciara.loftus@intel.com Clint Byrum clint@fewbar.com +Colin Watson cjwatson@ubuntu.com Cong Wang amwang@redhat.com Conner Herriges conner.herriges@ibm.com Damien Millescamps damien.millescamps@6wind.com @@ -117,6 +122,7 @@ Dan Wendlandt Dan Williams dcbw@redhat.com Daniel Alvarez dalvarez@redhat.com Daniel Borkmann dborkman@redhat.com +Daniel Ding zhihui.ding@easystack.cn Daniel Hiltgen daniel@netkine.com Daniel Roman Daniele Di Proietto daniele.di.proietto@gmail.com @@ -132,6 +138,7 @@ David Wilder dwilder@us.ibm.com David Yang davidy@vmware.com Dennis Sam dsam@arista.com Devendra Naga devendra.aaru@gmail.com +Dexia Li dexia.li@jaguarmicro.com Dincer Beken dbeken@blackned.de Dmitry Krivenok krivenok.dmitry@gmail.com Dominic Curran dominic.curran@citrix.com @@ -162,9 +169,14 @@ Ethan J. 
Jackson ejj@eecs.berkeley.edu Ethan Rahn erahn@arista.com Eziz Durdyyev ezizdurdy@gmail.com Fabrizio D'Angelo fdangelo@redhat.com +Faicker Mo faicker.mo@ucloud.cn +Fangrui Song maskray@google.com +Felix Huettner felix.huettner@mail.schwarz +Fengqi Li lifengqi@inspur.com Flavio Fernandes flavio@flaviof.com Flavio Leitner fbl@redhat.com Francesco Fusco ffusco@redhat.com +François Rigault frigo@amadeus.com Frédéric Tobias Christ fchrist@live.de Frode Nordahl frode.nordahl@gmail.com FUJITA Tomonori fujita.tomonori@lab.ntt.co.jp @@ -201,12 +213,15 @@ Ilya Maximets i.maximets@ovn.org Iman Tabrizian tabrizian@outlook.com Isaku Yamahata yamahata@valinux.co.jp Ivan Dyukov i.dyukov@samsung.com +Ivan Malov ivan.malov@arknetworks.am IWASE Yusuke iwase.yusuke@gmail.com Jaime Caamaño Ruiz jcaamano@suse.com +Jakob Meng code@jakobmeng.de Jakub Libosvar libosvar@redhat.com Jakub Sitnicki jsitnicki@gmail.com James P. roampune@gmail.com James Page james.page@ubuntu.com +James Raphael Tiovalen jamestiotio@gmail.com Jamie Lennox jamielennox@gmail.com Jan Scheurich jan.scheurich@ericsson.com Jan Vansteenkiste jan@vstone.eu @@ -231,7 +246,9 @@ Joe Stringer joe@ovn.org Jon Kohler jon@nutanix.com Jonathan Vestin jonavest@kau.se Jorge Arturo Sauma Vargas jorge.sauma@hpe.com +Jun Gu jun.gu@easystack.cn Jun Nakajima jun.nakajima@intel.com +Jun Wang junwang01@cestc.cn JunhanYan juyan@redhat.com JunoZhu zhunatuzi@gmail.com Justin Pettit jpettit@ovn.org @@ -243,6 +260,7 @@ Kenneth Duda kduda@arista.com Kentaro Ebisawa ebiken.g@gmail.com Keshav Gupta keshav.gupta@ericsson.com Kevin Lo kevlo@FreeBSD.org +Kevin Sprague ksprague0711@gmail.com Kevin Traynor kevin.traynor@intel.com Khem Raj raj.khem@gmail.com Kmindg G kmindg@gmail.com @@ -260,6 +278,7 @@ Leif Madsen lmadsen@redhat.com Leo Alterman Li RongQing lirongqing@baidu.com Lian-min Wang liang-min.wang@intel.com +Liang Mancang liangmc1@chinatelecom.cn Lin Huang linhuang@ruijie.com.cn Liu Chang liuchang@cmss.chinamobile.com Lilijun 
jerry.lilijun@huawei.com @@ -275,8 +294,9 @@ Lucas Alvares Gomes lucasagomes@gmail.com Lucian Petrut lpetrut@cloudbasesolutions.com Luigi Rizzo rizzo@iet.unipi.it Luis E. P. l31g@hotmail.com -Lukasz Rzasik lukasz.rzasik@gmail.com +Luca Czesla luca.czesla@mail.schwarz Lukasz Pawlik lukaszx.pawlik@intel.com +Lukasz Rzasik lukasz.rzasik@gmail.com Maciej Józefczyk mjozefcz@redhat.com Madhu Challa challa@noironetworks.com Manohar K C manukc@gmail.com @@ -290,14 +310,16 @@ Mark Michelson mmichels@redhat.com Markos Chandras mchandras@suse.de Martin Casado casado@cs.stanford.edu Martin Fong mwfong@csl.sri.com -Martino Fornasa mf@fornasa.it +Martin Kalcok martin.kalcok@canonical.com Martin Varghese martin.varghese@nokia.com Martin Xu martinxu9.ovs@gmail.com Martin Zhang martinbj2008@gmail.com +Martino Fornasa mf@fornasa.it Maryam Tahhan maryam.tahhan@intel.com Matteo Croce mcroce@redhat.com Matthias May matthias.may@neratec.com Mauricio Vásquez mauricio.vasquezbernal@studenti.polito.it +Max Lamprecht max.lamprecht@mail.schwarz Maxime Coquelin maxime.coquelin@redhat.com Mehak Mahajan Michael Arnaldi arnaldimichael@gmail.com @@ -308,6 +330,7 @@ Michal Weglicki michalx.weglicki@intel.com Michele Baldessari michele@acksyn.org Mickey Spiegel mickeys.dev@gmail.com Miguel Angel Ajo majopela@redhat.com +Miika Petäjäniemi miika.petajaniemi@solita.fi Mijo Safradin mijo@linux.vnet.ibm.com Mika Vaisanen mika.vaisanen@gmail.com Mike Pattrick mkp@redhat.com @@ -342,14 +365,16 @@ Paul Ingram Paul-Emmanuel Raoul skyper@skyplabs.net Pavithra Ramesh paramesh@vmware.com Peng He hepeng.0320@bytedance.com +Pengfei Sun sunpengfei16@huawei.com Peter Downs padowns@gmail.com Philippe Jung phil.jung@free.fr Pim van den Berg pim@nethuis.nl pritesh pritesh.kothari@cisco.com Pravin B Shelar pshelar@ovn.org Przemyslaw Szczerbik przemyslawx.szczerbik@intel.com -Quentin Monnet quentin.monnet@6wind.com +Qian Chen cq674350529@163.com Qiuyu Xiao qiuyu.xiao.qyx@gmail.com +Quentin Monnet 
quentin.monnet@6wind.com Raju Subramanian Rami Rosen ramirose@gmail.com Ramu Ramamurthy ramu.ramamurthy@us.ibm.com @@ -364,9 +389,11 @@ Rich Lane rlane@bigswitch.com Richard Oliver richard@richard-oliver.co.uk Rishi Bamba rishi.bamba@tcs.com Rob Adams readams@readams.net -Robert Åkerblom-Andersson Robert.nr1@gmail.com -Robert Wojciechowicz robertx.wojciechowicz@intel.com Rob Hoes rob.hoes@citrix.com +Robert Wojciechowicz robertx.wojciechowicz@intel.com +Robert Åkerblom-Andersson Robert.nr1@gmail.com +Roberto Bartzen Acosta roberto.acosta@luizalabs.com +Robin Jarry rjarry@redhat.com Rohith Basavaraja rohith.basavaraja@gmail.com Roi Dayan roid@nvidia.com Róbert Mulik robert.mulik@ericsson.com @@ -386,6 +413,7 @@ Sanjay Sane Saurabh Mohan saurabh@cplanenetworks.com Saurabh Shah Saurabh Shrivastava saurabh.shrivastava@nuagenetworks.net +Sayali Naval sanaval@cisco.com Scott Cheloha scottcheloha@gmail.com Scott Lowe scott.lowe@scottlowe.org Scott Mann sdmnix@gmail.com @@ -401,11 +429,14 @@ Shashank Ram rams@vmware.com Shashwat Srivastava shashwat.srivastava@tcs.com Shih-Hao Li shihli@vmware.com Shu Shen shu.shen@radisys.com -Simon Horman simon.horman@corigine.com +Simon Horman horms@ovn.org +Simon Jones batmanustc@gmail.com Sivaprasad Tummala sivaprasad.tummala@intel.com Somnath Chatterjee somnath.b.chatterjee@ericsson.com +Songtao Zhan zhanst1@chinatelecom.cn Sorin Vinturis svinturis@cloudbasesolutions.com Sriharsha Basavapatna sriharsha.basavapatna@broadcom.com +Stefan Hoffmann stefan.hoffmann@cloudandheat.com Steffen Gebert steffen.gebert@informatik.uni-wuerzburg.de Sten Spans sten@blinkenlights.nl Stephane A. 
Sezer sas@cd80.net @@ -443,9 +474,12 @@ Usman Ansari ua1422@gmail.com Valient Gough vgough@pobox.com Vasu Dasari vdasari@gmail.com Venkata Anil Kommaddi vkommadi@redhat.com +Viacheslav Galaktionov viacheslav.galaktionov@arknetworks.am +Ville Skyttä ville.skytta@upcloud.com Vishal Deep Ajmera vishal.deep.ajmera@ericsson.com Vivien Bernet-Rollande vbr@soprive.net Vlad Buslov vladbu@nvidia.com +Vladislav Odintsov odivlad@gmail.com Volkan Atlı volkan.atli@b-ulltech.com Wan Junjie wanjunjie@bytedance.com Wang Li wangli39@baidu.com @@ -459,8 +493,10 @@ Wei Yongjun yjwei@cn.fujitsu.com Wenyu Zhang wenyuz@vmware.com William Fulton William Tu u9012063@gmail.com +Wilson Peng pweisong@vmware.com Xavier Simonart xsimonar@redhat.com Xiao Liang shaw.leon@gmail.com +Xiaojie Chen jackchanx@163.com xu rong xu.rong@zte.com.cn YAMAMOTO Takashi yamamoto@midokura.com Yalei Li liyl43@chinatelecom.cn @@ -476,23 +512,28 @@ Yuanhan Liu yuanhan.liu@linux.intel.com Yunjian Wang wangyunjian@huawei.com Yousong Zhou yszhou4tech@gmail.com Zak Whittington zwhitt.vmware@gmail.com -ZhengLingyun konghuarukhr@163.com -Zoltán Balogh zoltan.balogh.eth@gmail.com -Zoltan Kiss zoltan.kiss@citrix.com -Zongkai LI zealokii@gmail.com -Zhi Yong Wu zwu.kernel@gmail.com Zang MingJie zealot0630@gmail.com +Zengyuan Wang wangzengyuan@huawei.com +ZhengLingyun konghuarukhr@163.com Zhenyu Gao sysugaozhenyu@gmail.com +Zhi Yong Wu zwu.kernel@gmail.com ZhiPeng Lu luzhipeng@uniudc.com +Zhiqi Chen chenzhiqi.123@bytedance.com Zhou Yangchao 1028519445@qq.com +Zoltan Kiss zoltan.kiss@citrix.com +Zoltán Balogh zoltan.balogh.eth@gmail.com +Zongkai LI zealokii@gmail.com aginwala amginwal@gmail.com +gordonwwang gordonwwang@tencent.com lic121 lic121@chinatelecom.cn lzhecheng lzhecheng@vmware.com parameswaran krishnamurthy parkrish@gmail.com solomon liwei.solomon@gmail.com +wangchuanlei wangchuanlei@inspur.com wenxu wenxu@ucloud.cn wisd0me ak47izatool@gmail.com xushengping shengping.xu@huawei.com +yangchang 
yangchang@chinatelecom.cn yinpeijun yinpeijun@huawei.com zangchuanqiang zangchuanqiang@huawei.com zhaojingjing zhao.jingjing1@zte.com.cn @@ -556,6 +597,7 @@ David Evans davidjoshuaevans@gmail.com David Palma palma@onesource.pt David van Moolenbroek dvmoolenbroek@aimvalley.nl Derek Cormier derek.cormier@lab.ntt.co.jp +Derrick Lim derrick.lim@rakuten.com Dhaval Badiani dbadiani@vmware.com DK Moon Ding Zhi zhi.ding@6wind.com @@ -631,6 +673,7 @@ Len Gao leng@vmware.com Linhaifeng haifeng.lin@huawei.com Logan Rosen logatronico@gmail.com Luca Falavigna dktrkranz@debian.org +Lucas Nussbaum lucas@debian.org Luiz Henrique Ozaki luiz.ozaki@gmail.com Madhu Venugopal mavenugo@gmail.com Malvika Gupta malvika.gupta@arm.com @@ -717,6 +760,7 @@ Tytus Kurek Tytus.Kurek@pega.com Valentin Bud valentin@hackaserver.com Vasiliy Tolstov v.tolstov@selfip.ru Vinllen Chen cvinllen@gmail.com +Vipul Ashri vipul.ashri@ericsson.com Vishal Swarankar vishal.swarnkar@gmail.com Vjekoslav Brajkovic balkan@cs.washington.edu Voravit T. 
voravit@kth.se diff --git a/Documentation/automake.mk b/Documentation/automake.mk index cdf3c992660..47d2e336a0b 100644 --- a/Documentation/automake.mk +++ b/Documentation/automake.mk @@ -57,6 +57,7 @@ DOC_SOURCE = \ Documentation/topics/record-replay.rst \ Documentation/topics/tracing.rst \ Documentation/topics/usdt-probes.rst \ + Documentation/topics/userspace-checksum-offloading.rst \ Documentation/topics/userspace-tso.rst \ Documentation/topics/userspace-tx-steering.rst \ Documentation/topics/windows.rst \ @@ -108,6 +109,7 @@ DOC_SOURCE = \ Documentation/internals/security.rst \ Documentation/internals/contributing/index.rst \ Documentation/internals/contributing/backporting-patches.rst \ + Documentation/internals/contributing/inclusive-language.rst \ Documentation/internals/contributing/coding-style.rst \ Documentation/internals/contributing/coding-style-windows.rst \ Documentation/internals/contributing/documentation-style.rst \ diff --git a/Documentation/conf.py b/Documentation/conf.py index 085ca2cd67c..2364405ade8 100644 --- a/Documentation/conf.py +++ b/Documentation/conf.py @@ -12,6 +12,7 @@ # All configuration values have a default; values that are commented out # serve to show the default. +import os import string import sys @@ -48,7 +49,7 @@ # General information about the project. project = u'Open vSwitch' -copyright = u'2016-2021, The Open vSwitch Development Community' +copyright = u'2016-2024, The Open vSwitch Development Community' author = u'The Open vSwitch Development Community' # The version info for the project you're documenting, acts as replacement for @@ -108,6 +109,13 @@ # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] +# Define the canonical URL for our domain configured on Read the Docs. +html_baseurl = os.environ.get("READTHEDOCS_CANONICAL_URL", "") + +# Tell Jinja2 templates the build is running on Read the Docs. 
+html_context = {} +if os.environ.get("READTHEDOCS", "") == "True": + html_context["READTHEDOCS"] = True # -- Options for manual page output --------------------------------------- diff --git a/Documentation/faq/configuration.rst b/Documentation/faq/configuration.rst index dc6c92446f9..4df390dc2d9 100644 --- a/Documentation/faq/configuration.rst +++ b/Documentation/faq/configuration.rst @@ -238,6 +238,27 @@ Q: Does Open vSwitch support GTP-U? set int gtpu0 type=gtpu options:key= \ options:remote_ip=172.31.1.1 +Q: Does Open vSwitch support SRv6? + + A: Yes. Starting with version 3.2, the Open vSwitch userspace + datapath supports SRv6 (Segment Routing over IPv6). The following + example shows tunneling to fc00:300::1 via fc00:100::1 and fc00:200::1. + In the current implementation, if "IPv6 in IPv6" or "IPv4 in IPv6" packets + are routed to this interface, and these packets are not SRv6 packets, they + may be dropped, so be careful in workloads with a mix of these tunnels. + Also note the following restrictions: + + * Segment list length is limited to 6. + * SRv6 packets with other than segments_left = 0 are simply dropped. + + :: + + $ ovs-vsctl add-br br0 + $ ovs-vsctl add-port br0 srv6_0 -- \ + set int srv6_0 type=srv6 \ + options:remote_ip=fc00:100::1 \ + options:srv6_segs="fc00:100::1,fc00:200::1,fc00:300::1" + Q: How do I connect two bridges? A: First, why do you want to do this? Two connected bridges are not much diff --git a/Documentation/faq/releases.rst b/Documentation/faq/releases.rst index ac0001cd576..9fbee90edc1 100644 --- a/Documentation/faq/releases.rst +++ b/Documentation/faq/releases.rst @@ -88,11 +88,10 @@ Q: What Linux kernel versions does each Open vSwitch release work with? RHEL and CentOS 7 3.10 based kernels since they have diverged from the Linux kernel.org 3.10 kernels. - Starting with Open vSwitch 2.15, building the Linux kernel module from - the Open vSwitch source tree is deprecated. 
It will not be updated to - support Linux versions later than 5.8. We will remove the kernel module - source code from the Open vSwitch source tree for the Open vSwitch 3.0 - release. + Building the Linux kernel module from the Open vSwitch source tree was + deprecated starting with Open vSwitch 2.15. And the kernel module + source code was completely removed from the Open vSwitch source tree in + 3.0 release. Q: Are all features available with all datapaths? @@ -111,7 +110,7 @@ Q: Are all features available with all datapaths? Linux OVS tree The datapath implemented by the Linux kernel module distributed with the OVS source tree. This datapath is deprecated starting with OVS - 2.15.x and support capped at Linux kernel version 5.8. As of OVS 3.0.x + 2.15 and support capped at Linux kernel version 5.8. As of OVS 3.0 the Linux OVS tree is no longer supported. Userspace @@ -141,6 +140,7 @@ Q: Are all features available with all datapaths? Conntrack Zone Limit 4.18 2.10 2.13 YES Conntrack NAT 4.6 2.6 2.8 YES Conntrack NAT6 4.6 2.6 2.8 3.0 + Conntrack Helper Persist. YES YES 3.3 NO Tunnel - LISP NO 2.11 NO NO Tunnel - STT NO 2.4 NO YES Tunnel - GRE 3.11 1.0 2.4 YES @@ -152,6 +152,7 @@ Q: Are all features available with all datapaths? Tunnel - ERSPAN 4.18 2.10 2.10 NO Tunnel - ERSPAN-IPv6 4.18 2.10 2.10 NO Tunnel - GTP-U NO NO 2.14 NO + Tunnel - SRv6 NO NO 3.2 NO Tunnel - Bareudp 5.7 NO NO NO QoS - Policing YES 1.1 2.6 NO QoS - Shaping YES 1.1 NO NO @@ -215,8 +216,12 @@ Q: What DPDK version does each Open vSwitch release work with? 2.14.x 19.11.13 2.15.x 20.11.6 2.16.x 20.11.6 - 2.17.x 21.11.2 - 3.0.x 21.11.2 + 2.17.x 21.11.7 + 3.0.x 21.11.7 + 3.1.x 22.11.5 + 3.2.x 22.11.5 + 3.3.x 23.11.1 + 3.4.x 23.11.1 ============ ======== Q: Are all the DPDK releases that OVS versions work with maintained? @@ -233,7 +238,7 @@ Q: Are all the DPDK releases that OVS versions work with maintained? The latest information about DPDK stable and LTS releases can be found at `DPDK stable`_. 
-.. _DPDK stable: http://doc.dpdk.org/guides-21.11/contributing/stable.html +.. _DPDK stable: http://doc.dpdk.org/guides-23.11/contributing/stable.html Q: I get an error like this when I configure Open vSwitch: @@ -252,8 +257,11 @@ Q: I get an error like this when I configure Open vSwitch: that one, because it may support the kernel that you are building against. (To find out, consult the table in the previous FAQ.) - - The Open vSwitch "master" branch may support the kernel that you are - using, so consider building the kernel module from "master". + - For Open vSwitch releases prior to 3.0, the corresponding Open + vSwitch branch may support the kernel that you are using, so consider + building the kernel module from that branch. For Open vSwitch 2.17, + the only non EOL release to which this applies, the branch is + "branch-2.17". All versions of Open vSwitch userspace are compatible with all versions of the Open vSwitch kernel module, so you do not have to use the kernel module @@ -273,6 +281,9 @@ ships as part of the upstream Linux kernel? supported, consider upgrading to a newer upstream Linux release or using the kernel module paired with the userspace distribution. + Please note that as of Open vSwitch 3.0 the kernel module is no longer + part of the Open vSwitch distribution. + Q: Why do tunnels not work when using a kernel module other than the one packaged with Open vSwitch? @@ -299,6 +310,9 @@ packaged with Open vSwitch? doing this, check to make sure that the module that is loaded is the one you expect. + Please note that as of Open vSwitch 3.0 the kernel module is no longer + part of the Open vSwitch distribution. + Q: Why are UDP tunnel checksums not computed for VXLAN or Geneve? A: Generating outer UDP checksums requires kernel support that was not part @@ -307,6 +321,9 @@ Q: Why are UDP tunnel checksums not computed for VXLAN or Geneve? out-of-tree modules from Open vSwitch release 2.4 and later support UDP checksums. 
+ Please note that as of Open vSwitch 3.0 the kernel module is no longer + part of the Open vSwitch distribution. + Q: What features are not available when using the userspace datapath? A: Tunnel virtual ports are not supported, as described in the previous diff --git a/Documentation/howto/qos.rst b/Documentation/howto/qos.rst index 376ec2514bd..7d625e00197 100644 --- a/Documentation/howto/qos.rst +++ b/Documentation/howto/qos.rst @@ -59,10 +59,10 @@ is participating in an OVS bridge, no IP address can be assigned on `eth0`. The second host, named Measurement Host, can be any host capable of measuring throughput from a VM. For this guide, we use `netperf -`__, a free tool for testing the rate at which one host -can send to another. The Measurement Host has only a single NIC, `eth0`, which -is connected to the Data Network. `eth0` has an IP address that can reach any -VM on `host1`. +`__, a free tool for testing the rate +at which one host can send to another. The Measurement Host has only a single +NIC, `eth0`, which is connected to the Data Network. `eth0` has an IP address +that can reach any VM on `host1`. Two VMs ~~~~~~~ diff --git a/Documentation/howto/sflow.rst b/Documentation/howto/sflow.rst index 74d8b8e175f..0b378c93d44 100644 --- a/Documentation/howto/sflow.rst +++ b/Documentation/howto/sflow.rst @@ -68,7 +68,7 @@ cookbook entry, we use `sFlowTrend `__, a free sFlow collector that is a simple cross-platform Java download. Other sFlow collectors should work equally well. `hostMon` has a single NIC, `eth0`, that is connected to the -Management Network. `eth0` has an IP adress that can reach `eth1` on `host1`. +Management Network. `eth0` has an IP address that can reach `eth1` on `host1`. 
Two Virtual Machines ~~~~~~~~~~~~~~~~~~~~ diff --git a/Documentation/howto/tc-offload.rst b/Documentation/howto/tc-offload.rst index f6482c8aff0..ee7f73f8a0c 100644 --- a/Documentation/howto/tc-offload.rst +++ b/Documentation/howto/tc-offload.rst @@ -49,7 +49,7 @@ tc-police action, see ``man tc-police``. Configuration ~~~~~~~~~~~~~ -There is no parameter change in ovs-ofctl command, to configue a meter and use +There is no parameter change in ovs-ofctl command, to configure a meter and use it for a flow in the offload way. Usually the commands are like:: $ ovs-ofctl -O OpenFlow13 add-meter br0 "meter=1 pktps bands=type=drop rate=1" @@ -58,10 +58,10 @@ it for a flow in the offload way. Usually the commands are like:: For more details, see ``man ovs-ofctl``. .. note:: - Each meter is mapped to one TC police action. To avovid the conflicton, the - police action index of 0x10000000-0x1fffffff are resevered for the mapping. - You can check the police actions by the command ``tc action ls action police`` - in Linux system. + Each meter is mapped to one TC police action. To avoid conflicts, the + police action indexes 0x10000000-0x1fffffff are reserved for this mapping. + You can check the police actions using the command ``tc action ls action + police`` on Linux systems. Known TC flow offload limitations @@ -112,3 +112,14 @@ First flow packet not processed by meter Packets that are received by ovs-vswitchd through an upcall before the actual meter flow is installed, are not passing TC police action and therefore are not considered for policing. + +Conntrack Application Layer Gateways (ALG) +++++++++++++++++++++++++++++++++++++++++++ + +TC does not support conntrack helpers, i.e., ALGs. TC will not offload flows if +the ALG keyword is present within the ct() action. However, this will not allow +ALGs to work within the datapath, as the return traffic without the ALG keyword +might run through a TC rule, which internally will not call the conntrack +helper required. 
+ +So if ALG support is required, tc offload must be disabled. diff --git a/Documentation/index.rst b/Documentation/index.rst index 3cdc87c6984..7041384733d 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -43,6 +43,7 @@ Contributing - :doc:`internals/contributing/submitting-patches` - :doc:`internals/contributing/backporting-patches` + - :doc:`internals/contributing/inclusive-language` - :doc:`internals/contributing/coding-style` - :doc:`internals/contributing/coding-style-windows` diff --git a/Documentation/internals/committer-grant-revocation.rst b/Documentation/internals/committer-grant-revocation.rst index c011df4aec0..7231762d8f1 100644 --- a/Documentation/internals/committer-grant-revocation.rst +++ b/Documentation/internals/committer-grant-revocation.rst @@ -26,7 +26,7 @@ OVS Committer Grant/Revocation Policy ===================================== An OVS committer is a participant in the project with the ability to commit -code directly to the master repository. Commit access grants a broad ability to +code directly to the main repository. Commit access grants a broad ability to affect the progress of the project as presented by its most important artifact, the code and related resources that produce working binaries of Open vSwitch. As such it represents a significant level of trust in an individual's diff --git a/Documentation/internals/committer-responsibilities.rst b/Documentation/internals/committer-responsibilities.rst index c35fd708913..eed2e017678 100644 --- a/Documentation/internals/committer-responsibilities.rst +++ b/Documentation/internals/committer-responsibilities.rst @@ -73,14 +73,14 @@ If it is someone else's change, then you can ask the original submitter to address it. Regardless, you need to ensure that the problem is fixed in a timely way. The definition of "timely" depends on the severity of the problem. 
-If a bug is present on master and other branches, fix it on master first, then +If a bug is present on main and other branches, fix it on main first, then backport the fix to other branches. Straightforward backports do not require -additional review (beyond that for the fix on master). +additional review (beyond that for the fix on main). -Feature development should be done only on master. Occasionally it makes sense +Feature development should be done only on main. Occasionally it makes sense to add a feature to the most recent release branch, before the first actual release of that branch. These should be handled in the same way as bug fixes, -that is, first implemented on master and then backported. +that is, first implemented on main and then backported. Keep the authorship of a commit clear by maintaining a correct list of "Signed-off-by:"s. If a confusing situation comes up, as it occasionally does, @@ -99,7 +99,7 @@ Pre-Push Hook ------------- The following script can be helpful because it provides an extra -chance to check for mistakes while pushing to the master branch of OVS +chance to check for mistakes while pushing to the main branch of OVS or OVN. If you would like to use it, install it as ``hooks/pre-push`` in your ``.git`` directory and make sure to mark it as executable with ``chmod +x``. 
For maximum utility, make sure ``checkpatch.py`` is in @@ -118,7 +118,7 @@ in your ``.git`` directory and make sure to mark it as executable with while read local_ref local_sha1 remote_ref remote_sha1; do case $remote_ref in - refs/heads/master) + refs/heads/main) n=0 while read sha do diff --git a/Documentation/internals/contributing/backporting-patches.rst b/Documentation/internals/contributing/backporting-patches.rst index fae416eb3cd..2007a429c7b 100644 --- a/Documentation/internals/contributing/backporting-patches.rst +++ b/Documentation/internals/contributing/backporting-patches.rst @@ -43,36 +43,39 @@ within Open vSwitch, but is broadly applied in the following fashion: - Maintainers backport changes from a development branch to release branches. With regards to Open vSwitch user space code and code that does not comprise -the Linux datapath and compat code, the development branch is `master` in the +the Linux datapath and compat code, the development branch is `main` in the Open vSwitch repository. Patches are applied first to this branch, then to the most recent `branch-X.Y`, then earlier `branch-X.Z`, and so on. The most common -kind of patch in this category is a bugfix which affects master and other +kind of patch in this category is a bugfix which affects main and other branches. For Linux datapath code, the primary development branch is in the `net-next`_ tree as described in the section below, and patch discussion occurs on the `netdev`__ mailing list. Patches are first applied to the upstream branch by the -networking maintainer, then the contributor backports the patch to the Open -vSwitch `master` development branch. Patches in this category may include -features which have been applied upstream, or bugfixes to the Open vSwitch -datapath code. For bugfixes, the patches subsequently follow the regular Open -vSwitch process as described above to reach older branches. 
+networking maintainers, then the contributor backports the patch to an Open +vSwitch branch. Patches in this category may include features which have +been applied upstream, or bugfixes to the Open vSwitch datapath code. -__ http://vger.kernel.org/vger-lists.html#netdev +The practice for Linux datapath code described above is currently only +applicable to bugfixes for Open vSwitch 2.17. This is because all earlier +versions are EOL and all subsequent versions do not include the Linux +datapath as it is now maintained as part of the upstream Linux kernel. + +__ https://lore.kernel.org/netdev/ Changes to userspace components ------------------------------- Patches which are fixing bugs should be considered for backporting from -`master` to release branches. Open vSwitch contributors submit their patches -targeted to the `master` branch, using the ``Fixes`` tag described in -:doc:`submitting-patches`. The maintainer first applies the patch to `master`, +`main` to release branches. Open vSwitch contributors submit their patches +targeted to the `main` branch, using the ``Fixes`` tag described in +:doc:`submitting-patches`. The maintainer first applies the patch to `main`, then backports the patch to each older affected tree, as far back as it goes or at least to all currently supported branches. This is usually each branch back to the oldest maintained LTS release branch or the last 4 release branches if the oldest LTS is newer. -If the fix only affects a particular branch and not `master`, contributors +If the fix only affects a particular branch and not `main`, contributors should submit the change with the target branch listed in the subject line of the patch. Contributors should list all versions that the bug affects. 
The ``git format-patch`` argument ``--subject-prefix`` may be used when posting the @@ -93,8 +96,8 @@ Changes to Linux kernel components The Linux kernel components in Open vSwitch go through initial review in the upstream Linux netdev community before they go into the Open vSwitch tree. As such, backports from upstream to the Open vSwitch tree may include bugfixes or -new features. The `netdev-FAQ`_ describes the general process for merging -patches to the upstream Linux tree. +new features. The `Netdev Maintainer Handbook`_ describes the general +process for merging patches to the upstream Linux tree. To keep track of the changes which are made upstream against the changes which have been backported to the Open vSwitch tree, backports should be done in the @@ -113,8 +116,8 @@ interests of keeping the Open vSwitch tree in sync with upstream `net-next`, contributors may send Open vSwitch kernel module changes independently of userspace changes. -.. _netdev-faq: https://www.kernel.org/doc/Documentation/networking/netdev-FAQ.txt -.. _net-next: http://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git +.. _Netdev Maintainer Handbook: https://docs.kernel.org/process/maintainer-netdev.html +.. _net-next: https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git How to backport kernel patches ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/Documentation/internals/contributing/documentation-style.rst b/Documentation/internals/contributing/documentation-style.rst index 045cdf69672..2eec4c4d269 100644 --- a/Documentation/internals/contributing/documentation-style.rst +++ b/Documentation/internals/contributing/documentation-style.rst @@ -423,10 +423,6 @@ Helpful Tools There are a number of tools, online and offline, which can be used to preview documents are you edit them: -- `rst.ninjs.org `__ - - An online rST editor/previewer - - `ReText `__ A simple but powerful editor for Markdown and reStructuredText. 
ReText is diff --git a/Documentation/internals/contributing/inclusive-language.rst b/Documentation/internals/contributing/inclusive-language.rst new file mode 100644 index 00000000000..e8ee0958b51 --- /dev/null +++ b/Documentation/internals/contributing/inclusive-language.rst @@ -0,0 +1,58 @@ +.. + Licensed under the Apache License, Version 2.0 (the "License"); you may + not use this file except in compliance with the License. You may obtain + a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + License for the specific language governing permissions and limitations + under the License. + + Convention for heading levels in Open vSwitch documentation: + + ======= Heading 0 (reserved for the title in a document) + ------- Heading 1 + ~~~~~~~ Heading 2 + +++++++ Heading 3 + ''''''' Heading 4 + + Avoid deeper levels because they do not render well. + +================== +Inclusive Language +================== + +In order to help facilitate an inclusive environment in the Open vSwitch +community we recognise the role of language in framing our +communication with each other. It is important that terms that +may exclude people through racial, cultural or other bias, are avoided +as they may make people feel excluded. + +We recognise that this is subjective, and to some extent is a journey. +But we also recognise that we cannot begin that journey without taking +positive action. To this end Open vSwitch is adopting the practice +of an inclusive word list, which helps to guide the use of language within +the project. + +.. _word list: + +Word List +--------- + +The intent of this document is to formally document the acceptance of an +inclusive word list by Open vSwitch. 
Accordingly, this document specifies +use of the `Inclusive Naming Word List +`__ v1.0 (the word list) for Open +vSwitch. + +The adoption of the word list is intended to act as a guide for +developers creating patches to the Open vSwitch repository, including both +source code and documentation, and to aid maintainers in their role of +shepherding changes into the repository. + +Further steps to align usage of language in Open vSwitch, including +clarification of application of the word list, to new and existing work, +may follow. diff --git a/Documentation/internals/contributing/index.rst b/Documentation/internals/contributing/index.rst index a46cb046a0f..91304e60bdc 100644 --- a/Documentation/internals/contributing/index.rst +++ b/Documentation/internals/contributing/index.rst @@ -35,4 +35,5 @@ The below guides provide information on contributing to Open vSwitch itself. coding-style coding-style-windows documentation-style + inclusive-language libopenvswitch-abi diff --git a/Documentation/internals/contributing/submitting-patches.rst b/Documentation/internals/contributing/submitting-patches.rst index 9d718982712..8a8bc11b0a9 100644 --- a/Documentation/internals/contributing/submitting-patches.rst +++ b/Documentation/internals/contributing/submitting-patches.rst @@ -68,10 +68,9 @@ Testing is also important: feature. A bug fix patch should preferably add a test that would fail if the bug recurs. -If you are using GitHub, then you may utilize the travis-ci.org and the GitHub -Actions CI build systems. They will run some of the above tests automatically -when you push changes to your repository. See the "Continuous Integration with -Travis-CI" in :doc:`/topics/testing` for details on how to set it up. +If you are using GitHub, then you may utilize the GitHub Actions CI build +systems. They will run some of the above tests automatically +when you push changes to your repository.
Email Subject ------------- diff --git a/Documentation/internals/maintainers.rst b/Documentation/internals/maintainers.rst index 172d684df97..0203bbe9554 100644 --- a/Documentation/internals/maintainers.rst +++ b/Documentation/internals/maintainers.rst @@ -22,3 +22,8 @@ Avoid deeper levels because they do not render well. .. include:: ../../MAINTAINERS.rst + :end-before: Cut here for the Documentation/internals/maintainers.rst + +.. |responsibilities| replace:: :doc:`committer-responsibilities` +.. |grant-revocation| replace:: :doc:`committer-grant-revocation` +.. |emeritus-status| replace:: :doc:`committer-emeritus-status` diff --git a/Documentation/internals/release-process.rst b/Documentation/internals/release-process.rst index 0eb8e192a0e..f0c745dc6de 100644 --- a/Documentation/internals/release-process.rst +++ b/Documentation/internals/release-process.rst @@ -34,33 +34,33 @@ or the #openvswitch IRC channel. Release Strategy ---------------- -Open vSwitch feature development takes place on the "master" branch. -Ordinarily, new features are rebased against master and applied directly. For +Open vSwitch feature development takes place on the "main" branch. +Ordinarily, new features are rebased against main and applied directly. For features that take significant development, sometimes it is more appropriate to -merge a separate branch into master; please discuss this on ovs-dev in advance. +merge a separate branch into main; please discuss this on ovs-dev in advance. The process of making a release has the following stages. See `Release Scheduling`_ for the timing of each stage: -1. "Soft freeze" of the master branch. +1. "Soft freeze" of the main branch. During the freeze, we ask committers to refrain from applying patches that add new features unless those patches were already being publicly discussed and reviewed before the freeze began. Bug fixes are welcome at any time. Please propose and discuss exceptions on ovs-dev. -2. 
Fork a release branch from master, named for the expected release number, +2. Fork a release branch from main, named for the expected release number, e.g. "branch-2.3" for the branch that will yield Open vSwitch 2.3.x. Release branches are intended for testing and stabilization. At this stage and in later stages, they should receive only bug fixes, not new features. Bug fixes applied to release branches should be backports of corresponding - bug fixes to the master branch, except for bugs present only on release + bug fixes to the main branch, except for bugs present only on release branches (which are rare in practice). At this stage, sometimes there can be exceptions to the rule that a release branch receives only bug fixes. Like bug fixes, new features on release - branches should be backports of the corresponding commits on the master + branches should be backports of the corresponding commits on the main branch. Features to be added to release branches should be limited in scope and risk and discussed on ovs-dev before creating the branch. 
@@ -96,18 +96,22 @@ LTS designation schedule example (depends on current state of development): +---------+--------------+--------------------------------------------------+ | Version | Release Date | Actions | +---------+--------------+--------------------------------------------------+ -| 2.14 | Aug 2020 | 2.14 - new latest stable, 2.13 stable ⟶ new LTS | -+---------+--------------+--------------------------------------------------+ -| 2.15 | Feb 2021 | 2.12 - new latest stable, 2.5 LTS ⟶ EOL | -+---------+--------------+--------------------------------------------------+ -| 2.16 | Aug 2021 | 2.16 - new latest stable | -+---------+--------------+--------------------------------------------------+ | 2.17 | Feb 2022 | 2.17 - new latest stable | +---------+--------------+--------------------------------------------------+ | 3.0 | Aug 2022 | 3.0 - new latest stable, 2.17 stable ⟶ new LTS | +---------+--------------+--------------------------------------------------+ | 3.1 | Feb 2023 | 3.1 - new latest stable, 2.13 LTS ⟶ EOL | +---------+--------------+--------------------------------------------------+ +| 3.2 | Aug 2023 | 3.2 - new latest stable | ++---------+--------------+--------------------------------------------------+ +| 3.3 | Feb 2024 | 3.3 - new latest stable | ++---------+--------------+--------------------------------------------------+ +| 3.4 | Aug 2024 | 3.4 - new latest stable, 3.3 stable ⟶ new LTS | ++---------+--------------+--------------------------------------------------+ +| 3.5 | Feb 2025 | 3.5 - new latest stable, 2.17 LTS ⟶ EOL | ++---------+--------------+--------------------------------------------------+ +| 3.6 | Aug 2025 | 3.6 - new latest stable | ++---------+--------------+--------------------------------------------------+ While branches other than LTS and the latest release are not formally maintained, the OVS project usually provides stable releases for these branches @@ -121,10 +125,10 @@ intermediate branches). 
Release Numbering ----------------- -The version number on master should normally end in .90. This indicates that +The version number on main should normally end in .90. This indicates that the Open vSwitch version is "almost" the next version to branch. -Forking master into branch-x.y requires two commits to master. The first is +Forking main into branch-x.y requires two commits to main. The first is titled "Prepare for x.y.0" and increments the version number to x.y. This is the initial commit on branch-x.y. The second is titled "Prepare for post-x.y.0 (x.y.90)" and increments the version number to x.y.90. @@ -142,23 +146,23 @@ Release Scheduling Open vSwitch makes releases at the following six-month cadence. All dates are approximate: -+---------------+----------------+--------------------------------------+ -| Time (months) | Dates | Stage | -+---------------+----------------+--------------------------------------+ -| T | Mar 1, Sep 1 | Begin x.y release cycle | -+---------------+----------------+--------------------------------------+ -| T + 4 | Jul 1, Jan 1 | "Soft freeze" master for x.y release | -+---------------+----------------+--------------------------------------+ -| T + 4.5 | Jul 15, Jan 15 | Fork branch-x.y from master | -+---------------+----------------+--------------------------------------+ -| T + 5.5 | Aug 15, Feb 15 | Release version x.y.0 | -+---------------+----------------+--------------------------------------+ ++---------------+----------------+------------------------------------+ +| Time (months) | Dates | Stage | ++---------------+----------------+------------------------------------+ +| T | Mar 1, Sep 1 | Begin x.y release cycle | ++---------------+----------------+------------------------------------+ +| T + 4 | Jul 1, Jan 1 | "Soft freeze" main for x.y release | ++---------------+----------------+------------------------------------+ +| T + 4.5 | Jul 15, Jan 15 | Fork branch-x.y from main | 
++---------------+----------------+------------------------------------+ +| T + 5.5 | Aug 15, Feb 15 | Release version x.y.0 | ++---------------+----------------+------------------------------------+ How to Branch ------------- -To branch "master" for the eventual release of OVS version x.y.0, -prepare two patches against master: +To branch "main" for the eventual release of OVS version x.y.0, +prepare two patches against main: 1. "Prepare for x.y.0." following the model of commit 836d1973c56e ("Prepare for 2.11.0."). @@ -168,12 +172,12 @@ prepare two patches against master: Post both patches to ovs-dev. Get them reviewed in the usual way. -Apply both patches to master, and create branch-x.y by pushing only +Apply both patches to main, and create branch-x.y by pushing only the first patch. The following command illustrates how to do both of these at once assuming the local repository HEAD points to the "Prepare for post-x.y.0" commit: - git push origin HEAD:master HEAD^:refs/heads/branch-x.y + git push origin HEAD:main HEAD^:refs/heads/branch-x.y Branching should be announced on ovs-dev. @@ -196,7 +200,7 @@ Follow these steps to release version x.y.z of OVS from branch-x.y. 4. Apply the patches to branch-x.y. -5. If z = 0, apply the first patch (only) to master. +5. If z = 0, apply the first patch (only) to main. 6. Sign a tag vx.y.z "Open vSwitch version x.y.z" and push it to the repo. diff --git a/Documentation/internals/security.rst b/Documentation/internals/security.rst index 444d07c3563..e211c16a431 100644 --- a/Documentation/internals/security.rst +++ b/Documentation/internals/security.rst @@ -90,11 +90,11 @@ Reporters may ask for a GPG key while initiating contact with the security team to deliver more sensitive reports. The Linux kernel has `its own vulnerability management process -`__. Handling -of vulnerabilities that affect both the Open vSwitch tree and the upstream -Linux kernel should be reported through both processes. 
Send your report as a -single email to both the kernel and OVS security teams to allow those teams to -most easily coordinate among themselves. +`__. +Handling of vulnerabilities that affect both the Open vSwitch tree and the +upstream Linux kernel should be reported through both processes. Send your +report as a single email to both the kernel and OVS security teams to allow +those teams to most easily coordinate among themselves. Step 2: Assessment ------------------ diff --git a/Documentation/intro/install/afxdp.rst b/Documentation/intro/install/afxdp.rst index bfef4986015..7fa8088c6ee 100644 --- a/Documentation/intro/install/afxdp.rst +++ b/Documentation/intro/install/afxdp.rst @@ -30,8 +30,7 @@ This document describes how to build and install Open vSwitch using AF_XDP netdev. .. warning:: - The AF_XDP support of Open vSwitch is considered 'experimental', - and it is not compiled in by default. + The AF_XDP support of Open vSwitch is considered 'experimental'. Introduction @@ -88,7 +87,7 @@ Build requirements In addition to the requirements described in :doc:`general`, building Open vSwitch with AF_XDP will require the following: -- libbpf from kernel source tree (kernel 5.0.0 or later) +- ``libbpf`` and ``libxdp`` (if version of ``libbpf`` is higher than ``0.6``). - Linux kernel XDP support, with the following options (required) @@ -104,7 +103,7 @@ vSwitch with AF_XDP will require the following: * CONFIG_BPF_JIT=y (Performance) - * CONFIG_HAVE_BPF_JIT=y (Performance) + * CONFIG_HAVE_EBPF_JIT=y (Performance) * CONFIG_XDP_SOCKETS_DIAG=y (Debugging) @@ -125,40 +124,20 @@ vSwitch with AF_XDP will require the following: Installing ---------- For OVS to use AF_XDP netdev, it has to be configured with LIBBPF support. -First, clone a recent version of Linux bpf-next tree:: - git clone git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git +First, install ``libbpf`` and ``libxdp``.
For example, on Fedora these +libraries along with development headers can be obtained by installing +``libbpf-devel`` and ``libxdp-devel`` packages. For Ubuntu that will be +``libbpf-dev`` package with additional ``libxdp-dev`` on Ubuntu 22.10 +or later. -Second, go into the Linux source directory and build libbpf in the tools -directory:: - - cd bpf-next/ - cd tools/lib/bpf/ - make && make install - make install_headers - -.. note:: - Make sure xsk.h and bpf.h are installed in system's library path, - e.g. /usr/local/include/bpf/ or /usr/include/bpf/ - -Make sure the libbpf.so is installed correctly:: - - ldconfig - ldconfig -p | grep libbpf - -.. note:: - Check /etc/ld.so.conf if libbpf is installed but can not be found by - ldconfig. - -Third, ensure the standard OVS requirements are installed and +Next, ensure the standard OVS requirements are installed and bootstrap/configure the package:: ./boot.sh && ./configure --enable-afxdp -.. note:: - If you encounter "WARNING: bpf/libbpf.h: present but cannot be compiled", - check the Linux headers are in line with libbpf. For example, in Ubuntu, - check the installed linux-headers* and linux-libc-dev* dpkg. +``--enable-afxdp`` here is optional, but it will ensure that all dependencies +are available at the build time. Finally, build and install OVS:: @@ -171,7 +150,7 @@ To kick start end-to-end autotesting:: make check-afxdp TESTSUITEFLAGS='1' .. note:: - Not all test cases pass at this time. Currenly all cvlan tests are skipped + Not all test cases pass at this time. Currently all cvlan tests are skipped due to kernel issues. 
If a test case fails, check the log at:: @@ -182,7 +161,7 @@ If a test case fails, check the log at:: Setup AF_XDP netdev ------------------- -Before running OVS with AF_XDP, make sure the libbpf, libelf, and libnuma are +Before running OVS with AF_XDP, make sure the libbpf and libnuma are set-up right:: ldd vswitchd/ovs-vswitchd @@ -240,14 +219,10 @@ Otherwise, enable debugging by:: ovs-appctl vlog/set netdev_afxdp::dbg To check which XDP mode was chosen by ``best-effort``, you can look for -``xdp-mode-in-use`` in the output of ``ovs-appctl dpctl/show``:: - - # ovs-appctl dpctl/show - netdev@ovs-netdev: - <...> - port 2: ens802f0 (afxdp: n_rxq=1, use-need-wakeup=true, - xdp-mode=best-effort, - xdp-mode-in-use=native-with-zerocopy) +``xdp-mode`` in the output of ``ovs-vsctl get interface INT status:xdp-mode``:: + + # ovs-vsctl get interface ens802f0 status:xdp-mode + "native-with-zerocopy" References ---------- diff --git a/Documentation/intro/install/documentation.rst b/Documentation/intro/install/documentation.rst index acf5b3a3ff3..049ca3d33be 100644 --- a/Documentation/intro/install/documentation.rst +++ b/Documentation/intro/install/documentation.rst @@ -79,9 +79,9 @@ Makefile targets:: .. important:: The ``docs-check`` target will fail if there are any syntax errors. - However, it won't catch more succint issues such as style or grammar issues. - As a result, you should always inspect changes visually to ensure the result - is as intended. + However, it won't catch more succinct issues such as style or grammar + issues. As a result, you should always inspect changes visually to ensure + the result is as intended. Once built, documentation is available in the ``/Documentation/_build`` folder. Open the root ``index.html`` to browse the documentation. 
diff --git a/Documentation/intro/install/dpdk.rst b/Documentation/intro/install/dpdk.rst index a284e68514c..63a978f0e81 100644 --- a/Documentation/intro/install/dpdk.rst +++ b/Documentation/intro/install/dpdk.rst @@ -33,7 +33,7 @@ userspace. The :doc:`releases FAQ ` lists support for the required versions of DPDK for each version of Open vSwitch. If building OVS and - DPDK outside of the master build tree users should consult this list + DPDK outside of the main build tree users should consult this list first. Build requirements @@ -42,7 +42,7 @@ Build requirements In addition to the requirements described in :doc:`general`, building Open vSwitch with DPDK will require the following: -- DPDK 21.11.2 +- DPDK 23.11.1 - A `DPDK supported NIC`_ @@ -59,8 +59,8 @@ vSwitch with DPDK will require the following: Detailed system requirements can be found at `DPDK requirements`_. -.. _DPDK supported NIC: https://doc.dpdk.org/guides-21.11/nics/index.html -.. _DPDK requirements: https://doc.dpdk.org/guides-21.11/linux_gsg/sys_reqs.html +.. _DPDK supported NIC: https://doc.dpdk.org/guides-23.11/nics/index.html +.. _DPDK requirements: https://doc.dpdk.org/guides-23.11/linux_gsg/sys_reqs.html .. _dpdk-install: @@ -73,9 +73,9 @@ Install DPDK #. Download the `DPDK sources`_, extract the file and set ``DPDK_DIR``:: $ cd /usr/src/ - $ wget https://fast.dpdk.org/rel/dpdk-21.11.2.tar.xz - $ tar xf dpdk-21.11.2.tar.xz - $ export DPDK_DIR=/usr/src/dpdk-stable-21.11.2 + $ wget https://fast.dpdk.org/rel/dpdk-23.11.1.tar.xz + $ tar xf dpdk-23.11.1.tar.xz + $ export DPDK_DIR=/usr/src/dpdk-stable-23.11.1 $ cd $DPDK_DIR #. Configure and install DPDK using Meson @@ -121,7 +121,7 @@ Install DPDK .. _DPDK sources: http://dpdk.org/rel .. _DPDK documentation: - https://doc.dpdk.org/guides-21.11/linux_gsg/build_dpdk.html + https://doc.dpdk.org/guides-23.11/linux_gsg/build_dpdk.html Install OVS ~~~~~~~~~~~ @@ -174,7 +174,7 @@ Additional information can be found in :doc:`general`. 
daemon will run as a non-root user. This implies that you must have a working IOMMU. Visit the `RHEL README`__ for additional information. -__ https://github.com/openvswitch/ovs/blob/master/rhel/README.RHEL.rst +__ https://github.com/openvswitch/ovs/blob/main/rhel/README.RHEL.rst Possible issues when enabling AVX512 @@ -232,7 +232,7 @@ Mount the hugepages, if not already mounted by default:: Setup DPDK devices using VFIO ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -VFIO is prefered to the UIO driver when using recent versions of DPDK. VFIO +VFIO is preferred to the UIO driver when using recent versions of DPDK. VFIO support required support from both the kernel and BIOS. For the former, kernel version > 3.6 must be used. For the latter, you must enable VT-d in the BIOS and ensure this is configured via grub. To ensure VT-d is enabled via the BIOS, @@ -506,17 +506,17 @@ options. Affinity ~~~~~~~~ -For superior performance, DPDK pmd threads and Qemu vCPU threads needs to be -affinitized accordingly. +For superior performance, DPDK pmd threads and Qemu vCPU threads need to +have affinity set accordingly. - PMD thread Affinity A poll mode driver (pmd) thread handles the I/O of all DPDK interfaces assigned to it. A pmd thread shall poll the ports for incoming packets, switch the packets and send to tx port. A pmd thread is CPU bound, and needs - to be affinitized to isolated cores for optimum performance. Even though a - PMD thread may exist, the thread only starts consuming CPU cycles if there is - at least one receive queue assigned to the pmd. + to have affinity set to isolated cores for optimum performance. Even + though a PMD thread may exist, the thread only starts consuming CPU cycles if + there is at least one receive queue assigned to the pmd. .. note:: On NUMA systems, PCI devices are also local to a NUMA node. Unbound rx @@ -722,7 +722,7 @@ Limitations release notes`_. ..
_DPDK release notes: - https://doc.dpdk.org/guides-21.11/rel_notes/release_21_11.html + https://doc.dpdk.org/guides-23.11/rel_notes/release_23_11.html - Upper bound MTU: DPDK device drivers differ in how the L2 frame for a given MTU value is calculated e.g. i40e driver includes 2 x vlan headers in diff --git a/Documentation/intro/install/fedora.rst b/Documentation/intro/install/fedora.rst index 02481597ffe..f8a6bb6b603 100644 --- a/Documentation/intro/install/fedora.rst +++ b/Documentation/intro/install/fedora.rst @@ -84,8 +84,8 @@ YUM:: Once that is completed, remove the file ``/tmp/ovs.spec``. -Bootstraping ------------- +Bootstrapping +------------- Refer to :ref:`general-bootstrapping`. @@ -146,7 +146,7 @@ purpose. Refer to the `RHEL README`__ for additional usage and configuration information. -__ https://github.com/openvswitch/ovs/blob/master/rhel/README.RHEL.rst +__ https://github.com/openvswitch/ovs/blob/main/rhel/README.RHEL.rst Reporting Bugs -------------- diff --git a/Documentation/intro/install/general.rst b/Documentation/intro/install/general.rst index 42b5682fd87..eb0813b97f9 100644 --- a/Documentation/intro/install/general.rst +++ b/Documentation/intro/install/general.rst @@ -37,7 +37,7 @@ repository, which you can clone into a directory named "ovs" with:: $ git clone https://github.com/openvswitch/ovs.git -Cloning the repository leaves the "master" branch initially checked +Cloning the repository leaves the "main" branch initially checked out. This is the right branch for general development. If, on the other hand, if you want to build a particular released version, you can check it out by running a command such as the following from the @@ -90,7 +90,7 @@ need the following software: If libcap-ng is installed, then Open vSwitch will automatically build with support for it. -- Python 3.4 or later. +- Python 3.6 or later. 
- Unbound library, from http://www.unbound.net, is optional but recommended if you want to enable ovs-vswitchd and other utilities to use DNS names when @@ -176,10 +176,7 @@ following to obtain better warnings: - clang, version 3.4 or later -- flake8 along with the hacking flake8 plugin (for Python code). The automatic - flake8 check that runs against Python code has some warnings enabled that - come from the "hacking" flake8 plugin. If it's not installed, the warnings - just won't occur until it's run on a system with "hacking" installed. +- flake8 (for Python code) - the python packages listed in "python/test_requirements.txt" (compatible with pip). If they are installed, the pytest-based Python unit tests will @@ -208,7 +205,7 @@ simply install and run Open vSwitch you require the following software: from iproute2 (part of all major distributions and available at https://wiki.linuxfoundation.org/networking/iproute2). -- Python 3.4 or later. +- Python 3.6 or later. On Linux you should ensure that ``/dev/urandom`` exists. To support TAP devices, you must also ensure that ``/dev/net/tun`` exists. @@ -344,6 +341,22 @@ you wish to link with jemalloc add it to LIBS:: $ ./configure LIBS=-ljemalloc +.. note:: + Linking Open vSwitch with the jemalloc shared library may not work as + expected in certain operating system development environments. You can + override the automatic compiler decision to avoid possible linker issues by + passing ``-fno-lto`` or ``-fno-builtin`` flag since the jemalloc override + standard built-in memory allocation functions such as malloc, calloc, etc. + Both options can solve possible jemalloc linker issues with pros and cons for + each case, feel free to choose the path that appears best to you. Disabling + LTO flag example:: + + $ ./configure LIBS=-ljemalloc CFLAGS="-g -O2 -fno-lto" + + Disabling built-in flag example:: + + $ ./configure LIBS=-ljemalloc CFLAGS="-g -O2 -fno-builtin" + .. 
_general-building: Building @@ -479,7 +492,7 @@ Start ovsdb-server using below command:: $ docker run -itd --net=host --name=ovsdb-server \ : ovsdb-server -Start ovs-vswitchd with priviledged mode as it needs to load kernel module in +Start ovs-vswitchd with privileged mode as it needs to load kernel module in host using below command:: $ docker run -itd --net=host --name=ovs-vswitchd \ diff --git a/Documentation/intro/install/rhel.rst b/Documentation/intro/install/rhel.rst index d1fc42021a6..e442fca0c0e 100644 --- a/Documentation/intro/install/rhel.rst +++ b/Documentation/intro/install/rhel.rst @@ -92,7 +92,7 @@ Once that is completed, remove the file ``/tmp/ovs.spec``. If python3-sphinx package is not available in your version of RHEL, you can install it via pip with 'pip install sphinx'. -Open vSwitch requires python 3.4 or newer which is not available in older +Open vSwitch requires python 3.6 or newer which is not available in older distributions. In the case of RHEL 6.x and its derivatives, one option is to install python34 from `EPEL`_. @@ -211,7 +211,7 @@ implemented. Refer to `README.RHEL.rst`__ in the source tree or /usr/share/doc/openvswitch/README.RHEL.rst in the installed openvswitch package for details. -__ https://github.com/openvswitch/ovs/blob/master/rhel/README.RHEL.rst +__ https://github.com/openvswitch/ovs/blob/main/rhel/README.RHEL.rst Reporting Bugs -------------- diff --git a/Documentation/intro/install/windows.rst b/Documentation/intro/install/windows.rst index 44fc6ae3795..efdb8aebcea 100644 --- a/Documentation/intro/install/windows.rst +++ b/Documentation/intro/install/windows.rst @@ -56,7 +56,7 @@ The following explains the steps in some detail. 'C:/MinGW /mingw'. -- Python 3.4 or later. +- Python 3.6 or later. Install the latest Python 3.x from python.org and verify that its path is part of Windows' PATH environment variable. @@ -112,7 +112,7 @@ The following explains the steps in some detail. 
`OpenSSL for Windows `__ Note down the directory where OpenSSL is installed (e.g.: - ``C:/OpenSSL-Win32``) for later use. + ``C:/OpenSSL-Win64``) for later use. .. note:: @@ -182,7 +182,7 @@ To configure with SSL support, add the requisite additional options: --localstatedir="C:/openvswitch/var" --sysconfdir="C:/openvswitch/etc" \ --with-pthread="C:/pthread" \ - --enable-ssl --with-openssl="C:/OpenSSL-Win32" + --enable-ssl --with-openssl="C:/OpenSSL-Win64" Finally, to the kernel module also: @@ -194,7 +194,7 @@ Finally, to the kernel module also: --localstatedir="C:/openvswitch/var" \ --sysconfdir="C:/openvswitch/etc" \ --with-pthread="C:/pthread" \ - --enable-ssl --with-openssl="C:/OpenSSL-Win32" \ + --enable-ssl --with-openssl="C:/OpenSSL-Win64" \ --with-vstudiotarget="" \ --with-vstudiotargetver="" @@ -1090,9 +1090,9 @@ To stop and delete the services, run: Windows CI Service ------------------ -`AppVeyor `__ provides a free Windows autobuild service for -open source projects. Open vSwitch has integration with AppVeyor for -continuous build. A developer can build test his changes for Windows by +`AppVeyor `__ provides a free Windows autobuild +service for open source projects. Open vSwitch has integration with AppVeyor +for continuous build. A developer can build test his changes for Windows by logging into appveyor.com using a github account, creating a new project by linking it to his development repository in github and triggering a new build. diff --git a/Documentation/intro/why-ovs.rst b/Documentation/intro/why-ovs.rst index e73066a7665..80a3f2f22f2 100644 --- a/Documentation/intro/why-ovs.rst +++ b/Documentation/intro/why-ovs.rst @@ -125,7 +125,7 @@ previous hypervisor networking stacks, focusing on the need for automated and dynamic network control in large-scale Linux-based virtualization environments. 
The goal with Open vSwitch is to keep the in-kernel code as small as possible -(as is necessary for performance) and to re-use existing subsystems when +(as is necessary for performance) and to reuse existing subsystems when applicable (for example Open vSwitch uses the existing QoS stack). As of Linux 3.3, Open vSwitch is included as a part of the kernel and packaging for the userspace utilities are available on most popular distributions. diff --git a/Documentation/ref/ovs-actions.7.rst b/Documentation/ref/ovs-actions.7.rst index b59b7634fa0..30d5b98ef4c 100644 --- a/Documentation/ref/ovs-actions.7.rst +++ b/Documentation/ref/ovs-actions.7.rst @@ -694,7 +694,8 @@ encapsulated in an OpenFlow ``packet-in`` message. The supported options are: Limit to *max_len* the number of bytes of the packet to send in the ``packet-in.`` A *max_len* of 0 prevents any of the packet from being sent (thus, only metadata is included). By default, the entire packet is - sent, equivalent to a *max_len* of 65535. + sent, equivalent to a *max_len* of 65535. This option has no effect in + Open vSwitch 2.7 and later: the entire packet will always be sent. ``reason=``\ *reason* Specify *reason* as the reason for sending the message in the @@ -733,6 +734,12 @@ encapsulated in an OpenFlow ``packet-in`` message. The supported options are: options require the Open vSwitch ``NXAST_CONTROLLER`` extension action added in Open vSwitch 1.6. + Open vSwitch 2.7 and later is configured to not buffer packets for the + packet-in event. As a result, the full packet is always sent to + controllers. This means that the ``max_len`` option has no effect on the + ``controller`` action, and all values (even 0) are equivalent to the default + value of 65535. + The ``enqueue`` action ---------------------- @@ -1380,7 +1387,7 @@ The ``delete_field`` action | ``delete_field:``\ *field* The ``delete_field`` action deletes a *field* in the syntax described under -`Field Specifications`_ above.
Currently, only the ``tun_metadta`` fields are +`Field Specifications`_ above. Currently, only the ``tun_metadata`` fields are supported. This action was added in Open vSwitch 2.14. @@ -1544,8 +1551,7 @@ following arguments: should be selected. When a port range is specified, fallback to ephemeral ports does not happen, else, it will. The port number selection can be informed by the optional ``random`` and ``hash`` flags - described below. The userspace datapath only supports the ``hash`` - behavior. + described below. The optional *flags* are: @@ -2195,13 +2201,17 @@ The following *argument* forms are accepted: The unsigned 32-bit integer identifier of the set of sample collectors to send sampled packets to. Defaults to 0. - ``obs_domain_id=``\ *id* + ``obs_domain_id=``\ *value* When sending samples to IPFIX collectors, the unsigned 32-bit integer - Observation Domain ID sent in every IPFIX flow record. Defaults to 0. + Observation Domain ID sent in every IPFIX flow record. The *value* may + be specified as a 32-bit integer or a field or subfield in the syntax + described under `Field Specifications`_ above. Defaults to 0. - ``obs_point_id=``\ *id* + ``obs_point_id=``\ *value* When sending samples to IPFIX collectors, the unsigned 32-bit integer - Observation Point ID sent in every IPFIX flow record. Defaults to 0. + Observation Point ID sent in every IPFIX flow record. The *value* may + be specified as a 32-bit integer or a field or subfield in the syntax + described under `Field Specifications`_ above. Defaults to 0. ``sampling_port=``\ *port* Sample packets on *port*, which should be the ingress or egress port. This @@ -2226,6 +2236,9 @@ collector sets. **Conformance** This action is an OpenFlow extension added in Open vSwitch 2.4. + Support for subfields in ``obs_domain_id`` and ``obs_point_id`` was added in + Open vSwitch 3.4.
+ Instructions ============ diff --git a/Documentation/ref/ovs-appctl.8.rst b/Documentation/ref/ovs-appctl.8.rst index 3ce02e9848f..7054cf559e5 100644 --- a/Documentation/ref/ovs-appctl.8.rst +++ b/Documentation/ref/ovs-appctl.8.rst @@ -8,6 +8,8 @@ Synopsis ``ovs-appctl`` [``--target=`` | ``-t`` ] [``--timeout=`` | ``-T`` ] +[``--format=`` | ``-f`` ] +[``--pretty``] [...] ``ovs-appctl --help`` @@ -67,6 +69,24 @@ In normal use only a single option is accepted: runtime to approximately seconds. If the timeout expires, ``ovs-appctl`` exits with a ``SIGALRM`` signal. +* ``-f `` or ``--format=`` + + Tells ``ovs-appctl`` which output format to use. By default, or with a + of ``text``, ``ovs-appctl`` will print plain-text for humans. + When is ``json``, ``ovs-appctl`` will return a JSON document. + When ``json`` is requested, but a command has not implemented JSON + output, the plain-text output will be wrapped in a provisional JSON + document with the following structure:: + + {"reply-format":"plain","reply":"$PLAIN_TEXT_HERE"} + +* ``--pretty`` + + By default, JSON output is printed as compactly as possible. This option + causes JSON in output to be printed in a more readable fashion. For + example, members of objects and elements of arrays are printed one + per line, with indentation. Requires ``--format=json``. + Common Commands =============== diff --git a/Documentation/ref/ovs-ctl.8.rst b/Documentation/ref/ovs-ctl.8.rst index 9f077a122c2..cdbaac4dc0b 100644 --- a/Documentation/ref/ovs-ctl.8.rst +++ b/Documentation/ref/ovs-ctl.8.rst @@ -170,8 +170,9 @@ The following options are less important: * ``--no-mlockall`` By default ``ovs-ctl`` passes ``--mlockall`` to ``ovs-vswitchd``, - requesting that it lock all of its virtual memory, preventing it - from being paged to disk. This option suppresses that behavior. 
+ requesting that it lock all of its virtual memory on page fault (on + allocation, when running on Linux kernel 4.4 and older), preventing + it from being paged to disk. This option suppresses that behavior. * ``--no-self-confinement`` diff --git a/Documentation/ref/ovs-tcpdump.8.rst b/Documentation/ref/ovs-tcpdump.8.rst index b9f8cdf6f78..e7bd5e9e4fb 100644 --- a/Documentation/ref/ovs-tcpdump.8.rst +++ b/Documentation/ref/ovs-tcpdump.8.rst @@ -61,8 +61,14 @@ Options If specified, mirror all ports (optional). +* ``--filter `` + + If specified, only mirror packets that match the provided OpenFlow filter. + The available fields are documented in ``ovs-fields(7)``. + See Also ======== ``ovs-appctl(8)``, ``ovs-vswitchd(8)``, ``ovs-pcap(1)``, -``ovs-tcpundump(1)``, ``tcpdump(8)``, ``wireshark(8)``. +``ovs-fields(7)``, ``ovs-tcpundump(1)``, ``tcpdump(8)``, +``wireshark(8)``. diff --git a/Documentation/ref/ovsdb.7.rst b/Documentation/ref/ovsdb.7.rst index 980ba29e760..a45c4ce38b6 100644 --- a/Documentation/ref/ovsdb.7.rst +++ b/Documentation/ref/ovsdb.7.rst @@ -155,6 +155,22 @@ standalone database, configure the server to listen on a "connection method" that the client can reach, then point the client to that connection method. See `Connection Methods`_ below for information about connection methods. +Open vSwitch 3.3 introduced support for configuration files via +``--config-file`` command line option. The configuration file for a server +with a **standalone** database may look like this:: + + { + "remotes": { "": {} }, + "databases": { "": {} } + } + +``ovsdb-server`` will infer the service model from the database file itself. +However, if additional verification is desired, an optional +``"service-model": "standalone"`` can be provided for the database file inside +the inner curly braces. If the specified ``service-model`` will not match the +content of the database file, ``ovsdb-server`` will refuse to open this +database. 
+ Active-Backup Database Service Model ------------------------------------ @@ -177,10 +193,36 @@ database file from the active server. Then use connects to the active server. At that point, the backup server will fetch a copy of the active database and keep it up-to-date until it is killed. +Open vSwitch 3.3 introduced support for configuration files via +``--config-file`` command line option. The configuration file for a backup +server in this case may look like this:: + + { + "remotes": { "": {} }, + "databases": { + "": { + "service-model": "active-backup", + "backup": true, + "source": { + "": { + "inactivity-probe": , + "max-backoff": + } + } + } + } + } + +All the fields in the ``""`` description above are required. +Options for the ``""`` connection method (``"inactivity-probe"``, etc.) +can be omitted. + When the active server in an active-backup server pair fails, an administrator can switch the backup server to an active role with the ``ovs-appctl`` command ``ovsdb-server/disconnect-active-ovsdb-server``. Clients then have read/write -access to the now-active server. Of course, administrators are slow to respond +access to the now-active server. When the ``--config-file`` is in use, the +same can be achieved by changing the ``"backup"`` value in the file and running +``ovsdb-server/reload`` command. Of course, administrators are slow to respond compared to software, so in practice external management software detects the active server's failure and changes the backup server's role. For example, the "Integration Guide for Centralized Control" in the OVN documentation describes @@ -213,6 +255,12 @@ Open vSwitch 2.6 introduced support for the active-backup service model. `Upgrading from version 2.14 and earlier to 2.15 and later`_ and `Downgrading from version 2.15 and later to 2.14 and earlier`_. + Another change happened in version 3.2. 
To upgrade/downgrade the + ``ovsdb-server`` processes across this version follow the instructions + described under + `Upgrading from version 3.1 and earlier to 3.2 and later`_ and + `Downgrading from version 3.2 and later to 3.1 and earlier`_. + Clustered Database Service Model -------------------------------- @@ -230,6 +278,22 @@ To set up a clustered database, first initialize it on a single node by running arguments, the ``create-cluster`` command can create an empty database or copy a standalone database's contents into the new database. +Open vSwitch 3.3 introduced support for configuration files via +``--config-file`` command line option. The configuration file for a server +with a **clustered** database may look like this:: + + { + "remotes": { "": {} }, + "databases": { "": {} } + } + +``ovsdb-server`` will infer the service model from the database file itself. +However, if additional verification is desired, an optional +``"service-model": "clustered"`` can be provided for the database file inside +the inner curly braces. If the specified ``service-model`` will not match the +content of the database file, ``ovsdb-server`` will refuse to open this +database. + To configure a client to use a clustered database, first configure all of the servers to listen on a connection method that the client can reach, then point the client to all of the servers' connection methods, comma-separated. See @@ -251,16 +315,11 @@ The above methods for adding and removing servers only work for healthy clusters, that is, for clusters with no more failures than their maximum tolerance. For example, in a 3-server cluster, the failure of 2 servers prevents servers joining or leaving the cluster (as well as database access). + To prevent data loss or inconsistency, the preferred solution to this problem is to bring up enough of the failed servers to make the cluster healthy again, then if necessary remove any remaining failed servers and add new ones. 
If -this cannot be done, though, use ``ovs-appctl`` to invoke ``cluster/leave ---force`` on a running server. This command forces the server to which it is -directed to leave its cluster and form a new single-node cluster that contains -only itself. The data in the new cluster may be inconsistent with the former -cluster: transactions not yet replicated to the server will be lost, and -transactions not yet applied to the cluster may be committed. Afterward, any -servers in its former cluster will regard the server to have failed. +this is not an option, see the next section for `Manual cluster recovery`_. Once a server leaves a cluster, it may never rejoin it. Instead, create a new server and join it to the cluster. @@ -287,11 +346,51 @@ schema, which is covered later under `Upgrading or Downgrading a Database`_.) `Upgrading from version 2.14 and earlier to 2.15 and later`_ and `Downgrading from version 2.15 and later to 2.14 and earlier`_. + Another change happened in version 3.2. To upgrade/downgrade the + ``ovsdb-server`` processes across this version follow the instructions + described under + `Upgrading from version 3.1 and earlier to 3.2 and later`_ and + `Downgrading from version 3.2 and later to 3.1 and earlier`_. + Clustered OVSDB does not support the OVSDB "ephemeral columns" feature. ``ovsdb-tool`` and ``ovsdb-client`` change ephemeral columns into persistent ones when they work with schemas for clustered databases. Future versions of OVSDB might add support for this feature. +Manual cluster recovery +~~~~~~~~~~~~~~~~~~~~~~~ + +.. important:: + + The procedure below will result in ``cid`` and ``sid`` change. A *new* + cluster will be initialized. + +To recover a clustered database after a failure: + +1. Stop *all* old cluster ``ovsdb-server`` instances before proceeding. + +2. Pick one of the old members which will serve as a bootstrap member of the + to-be-recovered cluster. + +3. 
Convert its database file to the standalone format using ``ovsdb-tool + cluster-to-standalone``. + +4. Back up the standalone database file. + +5. Create a new single-node cluster with ``ovsdb-tool create-cluster`` + using the previously saved standalone database file, then start + ``ovsdb-server``. + +6. Once the single-node cluster is up and running and serves the restored data, + new members should be created and added to the cluster, as usual, with + ``ovsdb-tool join-cluster``. + +.. note:: + + The data in the new cluster may be inconsistent with the former cluster: + transactions not yet replicated to the server chosen in step 2 will be lost, + and transactions not yet applied to the cluster may be committed. + Upgrading from version 2.14 and earlier to 2.15 and later ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -341,6 +440,57 @@ For all service models it's required to: 3. Downgrade and restart ``ovsdb-server`` processes. +Upgrading from version 3.1 and earlier to 3.2 and later +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +There is another change of a database file format in version 3.2 that doesn't +allow older versions of ``ovsdb-server`` to read the database file modified by +the ``ovsdb-server`` version 3.2 or later. This also affects runtime +communications between servers in **cluster** service models. To upgrade the +``ovsdb-server`` processes from one version of Open vSwitch (3.1 or earlier) to +another (3.2 or higher) instructions below should be followed. (This is +different from upgrading a database schema, which is covered later under +`Upgrading or Downgrading a Database`_.) + +In case of **standalone** or **active-backup** service model no special +handling during upgrade is required. + +For the **cluster** service model, the recommended upgrade strategy is: + +1. Upgrade processes one at a time. 
Each ``ovsdb-server`` process after + upgrade should be started with ``--disable-file-no-data-conversion`` command + line argument. + +2. After upgrading all ``ovsdb-server`` processes, use ``ovs-appctl`` to invoke + ``ovsdb/file/no-data-conversion-enable`` command on each of them or restart + all ``ovsdb-server`` processes one at a time without + ``--disable-file-no-data-conversion`` command line option. + +Downgrading from version 3.2 and later to 3.1 and earlier +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Similar to upgrading covered under `Upgrading from version 3.1 and earlier to +3.2 and later`_, downgrading from the ``ovsdb-server`` version 3.2 and later +to 3.1 and earlier requires additional steps. (This is different from +upgrading a database schema, which is covered later under +`Upgrading or Downgrading a Database`_.) + +For all service models it's required to: + +1. Compact all database files via ``ovsdb-server/compact`` command with + ``ovs-appctl`` utility. This should be done for each involved + ``ovsdb-server`` process separately (single process for **standalone** + service model, all involved processes for **active-backup** and **cluster** + service models). + +2. Stop all ``ovsdb-server`` processes. Make sure that no database schema + conversion operations were performed between steps 1 and 2. For + **standalone** and **active-backup** service models, the database compaction + can be performed after stopping all the processes instead with the + ``ovsdb-tool compact`` command. + +3. Downgrade and restart ``ovsdb-server`` processes. + Understanding Cluster Consistency ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -442,6 +592,29 @@ server. ```` could contain a comma-separated list of connection methods, e.g. to connect to any server of the clustered database. Multiple relay servers could be started for the same relay source. +Open vSwitch 3.3 introduced support for configuration files via +``--config-file`` command line option. 
The configuration file for a relay +database server in this case may look like this:: + + { + "remotes": { "": {} }, + "databases": { + "": { + "service-model": "relay", + "source": { + "": { + "inactivity-probe": , + "max-backoff": + } + } + } + } + } + +Both the ``"service-model"`` and the ``"source"`` are required. Options for +the ``""`` connection method (``"inactivity-probe"``, etc.) +can be omitted. + Since the way relays handle read and write transactions is very similar to the clustered model where "cluster" means "set of relay servers connected to the same relay source", "follower" means "relay server" and the "leader" @@ -566,7 +739,8 @@ Creating a Database Creating and starting up the service for a new database was covered separately for each database service model in the `Service -Models`_ section, above. +Models`_ section, above. A single ``ovsdb-server`` process may serve +any number of databases with different service models at the same time. Backing Up and Restoring a Database ----------------------------------- diff --git a/Documentation/requirements.txt b/Documentation/requirements.txt index 77130c6e01b..77f44bd7654 100644 --- a/Documentation/requirements.txt +++ b/Documentation/requirements.txt @@ -1,2 +1,2 @@ -sphinx>=1.1,<2.0 +sphinx>=1.1 ovs_sphinx_theme>=1.0,<1.1 diff --git a/Documentation/topics/dpdk/bridge.rst b/Documentation/topics/dpdk/bridge.rst index 354f1ced143..a077385e9b3 100644 --- a/Documentation/topics/dpdk/bridge.rst +++ b/Documentation/topics/dpdk/bridge.rst @@ -52,7 +52,7 @@ DPDK physical ports and contain all "dropped", "error" and "management" counters from ``XSTATS``. A list of all ``XSTATS`` counters can be found `here`__. -__ https://wiki.opnfv.org/display/fastpath/Collectd+Metrics+and+Events +__ https://wiki.anuket.io/display/HOME/Collectd+Metrics+and+Events .. note:: @@ -98,7 +98,7 @@ datapath flows with very simple match criteria. 
In theory, for very simple forwarding, OVS doesn't need to parse packets at all in order to follow these rules. In practice, due to various implementation constraints, userspace datapath has to match at least on a small set of packet -fileds. Some matching criteria (for example, ingress port) are not related to +fields. Some matching criteria (for example, ingress port) are not related to the packet itself and others (for example, VLAN tag or Ethernet type) can be extracted without fully parsing the packet. This allows OVS to significantly speed up packet forwarding for these flows with simple match criteria. @@ -202,10 +202,15 @@ get command, note the updated priority of the ``avx512_gather`` function:: avx512_gather (Use count: 0, Priority: 3) If two lookup functions have the same priority, the first one in the list is -chosen, and the 2nd occurance of that priority is not used. Put in logical +chosen, and the 2nd occurrence of that priority is not used. Put in logical terms, a subtable is chosen if its priority is greater than the previous best candidate. +Note that the ``avx512_gather`` implementation uses instructions which may be +affected by the Gather Data Sampling (GDS) vulnerability, aka Downfall, +mitigation (see documentation for CVE-2022-40982 for details). This could +result in lower performance when these mitigations are enabled. + Optimizing Specific Subtable Search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -275,7 +280,7 @@ composed of bits and blocks where the bits signify which blocks are set or have values where as the blocks hold the metadata, ip, udp, vlan, etc. These values are used by the datapath for switching decisions later. -Most modern CPUs have some SIMD (single instruction, mutiple data) +Most modern CPUs have some SIMD (single instruction, multiple data) capabilities. These SIMD instructions are able to process a vector rather than act on one variable. OVS provides multiple implementations of packet parsing functions. 
This allows the user to take advantage of SIMD instructions like diff --git a/Documentation/topics/dpdk/phy.rst b/Documentation/topics/dpdk/phy.rst index 937f4c40e5a..eefc25613d2 100644 --- a/Documentation/topics/dpdk/phy.rst +++ b/Documentation/topics/dpdk/phy.rst @@ -76,8 +76,8 @@ persist across reboots. In addition, there are two options available for this kernel space driver - VFIO (Virtual Function I/O) and UIO (Userspace I/O) - along with a number of drivers for each option. We will demonstrate examples of both tools and will use the ``vfio-pci`` driver, which is the more secure, -robust driver of those available. More information can be found in the `DPDK -documentation `__. +robust driver of those available. More information can be found in the +`DPDK drivers documentation`_. To list devices using :command:`driverctl`, run:: @@ -115,9 +115,9 @@ tool:: Open vSwitch 2.6.0 added support for DPDK 16.07, which in turn renamed the former ``dpdk_nic_bind`` tool to ``dpdk-devbind``. -For more information, refer to the `DPDK documentation `__. +For more information, refer to the `DPDK drivers documentation`_. -.. _dpdk-drivers: https://doc.dpdk.org/guides-21.11/linux_gsg/linux_drivers.html +.. _DPDK drivers documentation: https://doc.dpdk.org/guides-23.11/linux_gsg/linux_drivers.html .. _dpdk-phy-multiqueue: @@ -131,6 +131,93 @@ possible with DPDK acceleration. It is possible to configure multiple Rx queues for ``dpdk`` ports, thus ensuring this is not a bottleneck for performance. For information on configuring PMD threads, refer to :doc:`pmd`. +Traffic Rx Steering +------------------- + +.. warning:: This feature is experimental. + +Some control protocols are used to maintain link status between forwarding +engines. In SDN environments, these packets share the same physical network +with the user data traffic. + +When the system is not sized properly, the PMD threads may not be able to +process all incoming traffic from the configured Rx queues. 
When a signaling +packet of such protocols is dropped, it can cause link flapping, worsening the +situation. + +Some physical NICs can be programmed to put these protocols in a dedicated +hardware Rx queue using the rte_flow__ API. + +__ https://doc.dpdk.org/guides-23.11/prog_guide/rte_flow.html + +.. warning:: + + This feature is not compatible with all NICs. Refer to the DPDK + `compatibility matrix`__ and vendor documentation for more details. + + __ https://doc.dpdk.org/guides-23.11/nics/overview.html + +Rx steering must be enabled for specific protocols per port. The +``rx-steering`` option takes one of the following values: + +``rss`` + Do regular RSS on all configured Rx queues. This is the default behaviour. + +``rss+lacp`` + Do regular RSS on all configured Rx queues. An extra Rx queue is configured + for LACP__ packets (ether type ``0x8809``). + + __ https://www.ieee802.org/3/ad/public/mar99/seaman_1_0399.pdf + +Example:: + + $ ovs-vsctl add-port br0 dpdk-p0 -- set Interface dpdk-p0 type=dpdk \ + options:dpdk-devargs=0000:01:00.0 options:n_rxq=2 \ + options:rx-steering=rss+lacp + +.. note:: + + If multiple Rx queues are already configured, regular hash-based RSS + (Receive Side Scaling) queue balancing is done on all but the extra Rx + queue. + +.. 
tip:: + + You can check if Rx steering is supported on a port with the following + command:: + + $ ovs-vsctl get interface dpdk-p0 status + {..., rss_queues="0-1", rx_steering_queue="2"} + + This will also show in ``ovs-vswitchd.log``:: + + INFO|dpdk-p0: rx-steering: redirecting lacp traffic to queue 2 + INFO|dpdk-p0: rx-steering: applying rss on queues 0-1 + + If the hardware does not support redirecting the specified protocols to + a dedicated queue, it will be explicit:: + + $ ovs-vsctl get interface dpdk-p0 status + {..., rx-steering=unsupported} + + More details can often be found in ``ovs-vswitchd.log``:: + + WARN|dpdk-p0: rx-steering: failed to add lacp flow: Unsupported pattern + +To disable Rx steering on a port, use the following command:: + + $ ovs-vsctl remove Interface dpdk-p0 options rx-steering + +You can see that it has been disabled in ``ovs-vswitchd.log``:: + + INFO|dpdk-p0: rx-steering: default rss + +.. warning:: + + This feature is mutually exclusive with ``other-config:hw-offload`` as it + may conflict with the offloaded flows. If both are enabled, ``rx-steering`` + will fall back to default ``rss`` mode. + .. _dpdk-phy-flow-control: Flow Control @@ -235,7 +322,7 @@ To hotplug a port with igb_uio in this case, DPDK must be configured to use physical addressing for IOVA mode. For more information regarding IOVA modes in DPDK please refer to the `DPDK IOVA Mode Detection`__. -__ https://doc.dpdk.org/guides-21.11/prog_guide/env_abstraction_layer.html#iova-mode-detection +__ https://doc.dpdk.org/guides-23.11/prog_guide/env_abstraction_layer.html#iova-mode-detection To configure OVS DPDK to use physical addressing for IOVA:: @@ -267,7 +354,7 @@ Representors are multi devices created on top of one PF. For more information, refer to the `DPDK documentation`__. 
-__ https://doc.dpdk.org/guides-21.11/prog_guide/switch_representation.html +__ https://doc.dpdk.org/guides-23.11/prog_guide/switch_representation.html#port-representors Prior to port representors there was a one-to-one relationship between the PF and the eth device. With port representors the relationship becomes one PF to @@ -287,18 +374,18 @@ address in devargs. For an existing bridge called ``br0`` and PCI address When configuring a VF-based port, DPDK uses an extended devargs syntax which has the following format:: - BDBF,representor=[] + BDBF,representor= This syntax shows that a representor is an enumerated eth device (with -a representor ID) which uses the PF PCI address. -The following commands add representors 3 and 5 using PCI device address +a representor identifier) which uses the PF PCI address. +The following commands add representors of VF 3 and 5 using PCI device address ``0000:08:00.0``:: $ ovs-vsctl add-port br0 dpdk-rep3 -- set Interface dpdk-rep3 type=dpdk \ - options:dpdk-devargs=0000:08:00.0,representor=[3] + options:dpdk-devargs=0000:08:00.0,representor=vf3 $ ovs-vsctl add-port br0 dpdk-rep5 -- set Interface dpdk-rep5 type=dpdk \ - options:dpdk-devargs=0000:08:00.0,representor=[5] + options:dpdk-devargs=0000:08:00.0,representor=vf5 .. important:: @@ -394,14 +481,14 @@ in the ``options`` column of the ``Interface`` table. .. important:: - Some DPDK port use `bifurcated drivers `__, - which means that a kernel netdevice remains when Open vSwitch is stopped. + Some DPDK port use `bifurcated drivers`_, which means that a kernel + netdevice remains when Open vSwitch is stopped. In such case, any configuration applied to a VF would remain set on the kernel netdevice, and be inherited from it when Open vSwitch is restarted, even if the options described in this section are unset from Open vSwitch. -.. _bifurcated-drivers: https://doc.dpdk.org/guides-21.11/linux_gsg/linux_drivers.html#bifurcated-driver +.. 
_bifurcated drivers: https://doc.dpdk.org/guides-23.11/linux_gsg/linux_drivers.html#bifurcated-driver - Configure the VF MAC address:: @@ -412,7 +499,7 @@ its options:: $ ovs-appctl dpctl/show [...] - port 3: dpdk-rep0 (dpdk: configured_rx_queues=1, ..., dpdk-vf-mac=00:11:22:33:44:55, ...) + port 3: dpdk-rep0 (dpdk: ..., dpdk-vf-mac=00:11:22:33:44:55, ...) $ ovs-vsctl show [...] @@ -459,8 +546,8 @@ the firmware every time to fulfil this request. Note that not all PMD drivers support LSC interrupts. -The default configuration is polling mode. To set interrupt mode, option -``dpdk-lsc-interrupt`` has to be set to ``true``. +The default configuration is interrupt mode. To set polling mode, option +``dpdk-lsc-interrupt`` has to be set to ``false``. Command to set interrupt mode for a specific interface:: $ ovs-vsctl set interface options:dpdk-lsc-interrupt=true diff --git a/Documentation/topics/dpdk/pmd.rst b/Documentation/topics/dpdk/pmd.rst index b259cc8b32d..2e8cf5edb87 100644 --- a/Documentation/topics/dpdk/pmd.rst +++ b/Documentation/topics/dpdk/pmd.rst @@ -101,12 +101,20 @@ core cycles for each Rx queue:: .. note:: - A history of one minute is recorded and shown for each Rx queue to allow for - traffic pattern spikes. Any changes in the Rx queue's PMD core cycles usage, - due to traffic pattern or reconfig changes, will take one minute to be fully - reflected in the stats. + By default a history of one minute is recorded and shown for each Rx queue + to allow for traffic pattern spikes. Any changes in the Rx queue's PMD core + cycles usage, due to traffic pattern or reconfig changes, will take one + minute to be fully reflected in the stats by default. - .. versionchanged:: 2.6.0 +PMD thread usage of an Rx queue can be displayed for a shorter period of time, +from the last 5 seconds up to the default 60 seconds in 5 second steps. 
+ +To see the port/Rx queue assignment and the last 5 secs of measured usage +history of PMD core cycles for each Rx queue:: + + $ ovs-appctl dpif-netdev/pmd-rxq-show -secs 5 + +.. versionchanged:: 2.6.0 The ``pmd-rxq-show`` command was added in OVS 2.6.0. @@ -115,6 +123,11 @@ core cycles for each Rx queue:: A ``overhead`` statistics is shown per PMD: it represents the number of cycles inherently consumed by the OVS PMD processing loop. +.. versionchanged:: 3.1.0 + + The ``-secs`` parameter was added to the dpif-netdev/pmd-rxq-show + command. + Rx queue to PMD assignment takes place whenever there are configuration changes or can be triggered by using:: @@ -278,10 +291,10 @@ If a PMD core is detected to be above the load threshold and the minimum pre-requisites are met, a dry-run using the current PMD assignment algorithm is performed. -The current variance of load between the PMD cores and estimated variance from -the dry-run are both calculated. If the estimated dry-run variance is improved -from the current one by the variance threshold, a new Rx queue to PMD -assignment will be performed. +For each numa node, the current variance of load between the PMD cores and +estimated variance from the dry-run are both calculated. If any numa's +estimated dry-run variance is improved from the current one by the variance +threshold, a new Rx queue to PMD assignment will be performed. For example, to set the variance improvement threshold to 40%:: @@ -311,5 +324,86 @@ A user can use this option to set a minimum frequency of Rx queue to PMD reassignment due to PMD Auto Load Balance. For example, this could be set (in min) such that a reassignment is triggered at most every few hours. +PMD load based sleeping +----------------------- + +PMD threads constantly poll Rx queues which are assigned to them. In order to +reduce the CPU cycles they use, they can sleep for small periods of time +when there is no load or very-low load on all the Rx queues they poll. 
+ +This can be enabled by setting the max requested sleep time (in microseconds) +for a PMD thread:: + + $ ovs-vsctl set open_vswitch . other_config:pmd-sleep-max=50 + +.. note:: + + Previous config name 'pmd-maxsleep' is deprecated and will be removed in a + future release. + +With a non-zero max value a PMD may request to sleep by an incrementing amount +of time up to the maximum time. If at any point the threshold of at least half +a batch of packets (i.e. 16) received from an Rx queue that the PMD is +polling is met, the requested sleep time will be reset to 0. At that point no +sleeps will occur until the no/low load conditions return. + +Sleeping in a PMD thread will mean there is a period of time when the PMD +thread will not process packets. Sleep times requested are not guaranteed +and can differ significantly depending on system configuration. The actual +time not processing packets will be determined by the sleep and processor +wake-up times and should be tested with each system configuration. + +Sleep time statistics for 10 secs can be seen with:: + + $ ovs-appctl dpif-netdev/pmd-stats-clear \ + && sleep 10 && ovs-appctl dpif-netdev/pmd-perf-show + +Example output, showing that during the last 10 seconds, 74.5% of iterations +had a sleep of some length. The total amount of sleep time was 9.06 seconds +and the average sleep time where a sleep was requested was 9 microseconds:: + + - sleep iterations: 977037 ( 74.5 % of iterations) + Sleep time (us): 9068841 ( 9 us/iteration avg.) + +Any potential power saving from PMD load based sleeping is dependent on the +system configuration (e.g. enabling processor C-states) and workloads. + +.. note:: + + If there is a sudden spike of packets while the PMD thread is sleeping and + the processor is in a low-power state it may result in some lost packets or + extra latency before the PMD thread returns to processing packets at full + rate. 
+ +Maximum sleep values can also be set for individual PMD threads using +key:value pairs in the form of core:max_sleep. Any PMD thread that has been +assigned a specified value will use that. Any PMD thread that does not have +a specified value will use the current global value. + +Specified values for individual PMD threads can be added or removed at +any time. + +For example, to set PMD threads on cores 8 and 9 to never request a load based +sleep and all other PMD threads to be able to request a max sleep of +50 microseconds (us):: + + $ ovs-vsctl set open_vswitch . other_config:pmd-sleep-max=50,8:0,9:0 + +The max sleep value for each PMD thread can be checked in the logs or with:: + + $ ovs-appctl dpif-netdev/pmd-sleep-show + pmd thread numa_id 0 core_id 8: + max sleep: 0 us + pmd thread numa_id 1 core_id 9: + max sleep: 0 us + pmd thread numa_id 0 core_id 10: + max sleep: 50 us + pmd thread numa_id 1 core_id 11: + max sleep: 50 us + pmd thread numa_id 0 core_id 12: + max sleep: 50 us + pmd thread numa_id 1 core_id 13: + max sleep: 50 us + .. _ovs-vswitchd(8): http://openvswitch.org/support/dist-docs/ovs-vswitchd.8.html diff --git a/Documentation/topics/dpdk/vdev.rst b/Documentation/topics/dpdk/vdev.rst index 97ac6d9a52a..f1f59af5d95 100644 --- a/Documentation/topics/dpdk/vdev.rst +++ b/Documentation/topics/dpdk/vdev.rst @@ -63,4 +63,4 @@ run:: More information on the different types of virtual DPDK PMDs can be found in the `DPDK documentation`__. 
-__ https://doc.dpdk.org/guides-21.11/nics/overview.html +__ https://doc.dpdk.org/guides-23.11/nics/overview.html diff --git a/Documentation/topics/dpdk/vhost-user.rst b/Documentation/topics/dpdk/vhost-user.rst index 8c233c1d305..7bba08ac216 100644 --- a/Documentation/topics/dpdk/vhost-user.rst +++ b/Documentation/topics/dpdk/vhost-user.rst @@ -269,7 +269,7 @@ similar to the following:: QEMU waiting for connection on: disconnected:unix:/path/to/socket,server -QEMU will wait until the port is created sucessfully in OVS to boot the VM. +QEMU will wait until the port is created successfully in OVS to boot the VM. One benefit of using this mode is the ability for vHost ports to 'reconnect' in event of the switch crashing or being brought down. Once it is brought back up, the vHost ports will reconnect automatically and normal service will resume. @@ -312,7 +312,7 @@ predictable migration time. Mostly used as a second phase after the normal More information can be found in QEMU `docs`_. -.. _`docs`: https://git.qemu.org/?p=qemu.git;a=blob;f=docs/devel/migration.rst +.. _`docs`: https://www.qemu.org/docs/master/devel/migration/postcopy.html Post-copy support may be enabled via a global config value ``vhost-postcopy-support``. Setting this to ``true`` enables Post-copy support @@ -340,8 +340,10 @@ The default value is ``false``. fixes (like userfaulfd leak) was released in 3.0.1. DPDK Post-copy feature requires avoiding to populate the guest memory - (application must not call mlock* syscall). So enabling mlockall is - incompatible with post-copy feature. + (application must not call mlock* syscall without MCL_ONFAULT). + So enabling mlockall is incompatible with post-copy feature in OVS 3.3 and + older. Newer versions of OVS only lock memory pages that are faulted in, + so both features can be used at the same time. Note that during migration of vhost-user device, PMD threads hang for the time of faulted pages download from source host. 
Transferring 1GB hugepage @@ -485,7 +487,7 @@ Sample XML -.. _QEMU documentation: http://git.qemu-project.org/?p=qemu.git;a=blob;f=docs/specs/vhost-user.txt;h=7890d7169;hb=HEAD +.. _QEMU documentation: https://www.qemu.org/docs/master/interop/vhost-user.html Jumbo Frames ------------ @@ -539,4 +541,4 @@ shown with:: Further information can be found in the `DPDK documentation -`__ +`__ diff --git a/Documentation/topics/index.rst b/Documentation/topics/index.rst index 90d4c66e625..f239fcf83f8 100644 --- a/Documentation/topics/index.rst +++ b/Documentation/topics/index.rst @@ -55,5 +55,6 @@ OVS userspace-tso idl-compound-indexes ovs-extensions + userspace-checksum-offloading userspace-tx-steering usdt-probes diff --git a/Documentation/topics/integration.rst b/Documentation/topics/integration.rst index 58c4389abef..0f40baae741 100644 --- a/Documentation/topics/integration.rst +++ b/Documentation/topics/integration.rst @@ -191,11 +191,11 @@ contents. At all times, the data can be transacted only from the active server. When the active server dies for some reason, entire OVN operations will be stalled. -`Pacemaker `__ is a cluster resource +`Pacemaker `__ is a cluster resource manager which can manage a defined set of resource across a set of clustered nodes. Pacemaker manages the resource with the help of the resource agents. One among the resource agent is `OCF -`__ +`__ OCF is nothing but a shell script which accepts a set of actions and returns an appropriate status code. @@ -250,7 +250,7 @@ with the active server:: 2. Using load balancer vip ip as a master_ip. In order to use this feature, one needs to use listen_on_master_ip_only to no. Current code for load balancer have been tested to work with tcp protocol and needs to be -tested/enchanced for ssl. Using load balancer, standby nodes will not listen on +tested/enhanced for ssl. 
Using load balancer, standby nodes will not listen on nb and sb db ports so that load balancer will always communicate to the active node and all the traffic will be sent to active node only. Standby will continue to sync using LB VIP IP in this case. diff --git a/Documentation/topics/language-bindings.rst b/Documentation/topics/language-bindings.rst index 414f7c73fa3..15958d76da9 100644 --- a/Documentation/topics/language-bindings.rst +++ b/Documentation/topics/language-bindings.rst @@ -49,7 +49,7 @@ required dependencies, run: or install `python3-netaddr` and `python3-pyparsing`. -__ https://github.com/openvswitch/ovs/tree/master/python/ovs +__ https://github.com/openvswitch/ovs/tree/main/python/ovs Third-Party Bindings -------------------- diff --git a/Documentation/topics/ovsdb-relay.rst b/Documentation/topics/ovsdb-relay.rst index 50a3c6d07b9..75f0c6577d6 100644 --- a/Documentation/topics/ovsdb-relay.rst +++ b/Documentation/topics/ovsdb-relay.rst @@ -105,6 +105,25 @@ started like this:: $ ... $ ovsdb-server --remote=ptcp:6642:172.16.0.K relay:OVN_Southbound:$REMOTES +Open vSwitch 3.3 introduced support for configuration files via +``--config-file`` command line option. The configuration file for relay +database servers in this case may look like this:: + + { + "remotes": { "ptcp:6642:172.16.0.X": {} }, + "databases": { + "OVN_Southbound": { + "service-model": "relay", + "source": { + "$REMOTES": {} + } + } + } + } + +See ``ovsdb-server(1)`` and ``Relay Service Model`` in ``ovsdb(7)`` for more +configuration options. + Every relay server could connect to any of the cluster members of their choice, fairness of load distribution is achieved by shuffling remotes. 
diff --git a/Documentation/topics/porting.rst b/Documentation/topics/porting.rst index 839b04d52ee..b627fde1260 100644 --- a/Documentation/topics/porting.rst +++ b/Documentation/topics/porting.rst @@ -210,7 +210,7 @@ vSwitch architecture: :: - Architecure + Architecture _ | +-------------------+ diff --git a/Documentation/topics/record-replay.rst b/Documentation/topics/record-replay.rst index 14a568c2120..f723e05dd7a 100644 --- a/Documentation/topics/record-replay.rst +++ b/Documentation/topics/record-replay.rst @@ -44,7 +44,7 @@ measure performance with ``perf``, and so on. .. note:: The current version of record/replay engine does not work correctly with - internal time-based events that leats to communications with other + internal time-based events that lead to communications with other processes. For this reason it can not be used with clustered databases (RAFT implementation is heavily time dependent). In addition, recording automatically disables inactivity probes on diff --git a/Documentation/topics/testing.rst b/Documentation/topics/testing.rst index 871ce5637d1..dcf10a4db2d 100644 --- a/Documentation/topics/testing.rst +++ b/Documentation/topics/testing.rst @@ -343,17 +343,22 @@ To see a list of all the available tests, run:: These tests support a `DPDK supported NIC`_. The tests operate on a wider set of environments, for instance, when a virtual port is used. -They do require proper DPDK variables (``DPDK_DIR`` and ``DPDK_BUILD``). Moreover you need to have root privileges to load the required modules and to bind -the NIC to the DPDK-compatible driver. +a PCI device to the DPDK-compatible driver. .. _DPDK supported NIC: https://core.dpdk.org/supported/#nics +The phy test will skip if no suitable PCI device is found. 
+It is possible to select which PCI device is used for this test by setting the +DPDK_PCI_ADDR environment variable, which is especially useful when testing +with a mlx5 device:: + + # DPDK_PCI_ADDR=0000:82:00.0 make check-dpdk + All tests are skipped if no hugepages are configured. User must look into the DPDK manual to figure out how to `Configure hugepages`_. -The phy test will skip if no compatible physical device is available. -.. _Configure hugepages: https://doc.dpdk.org/guides-21.11/linux_gsg/sys_reqs.html +.. _Configure hugepages: https://doc.dpdk.org/guides-23.11/linux_gsg/sys_reqs.html All the features documented under `Unit Tests`_ are available for the DPDK testsuite. @@ -404,7 +409,7 @@ options are used:: checking whether actions Autovalidator is default implementation... yes Compile OVS in debug mode to have `ovs_assert` statements error out if -there is a mis-match in the datapath classifier lookup or packet parser +there is a mismatch in the datapath classifier lookup or packet parser implementations. Since the AVX512 implementation of the datapath interface is disabled by @@ -448,7 +453,7 @@ datapath testsuite. an updated iproute2 utilities package. The package is available from the Linux kernel organization open source git repositories. - https://git.kernel.org/pub/scm/linux/kernel/git/shemminger/iproute2.git + https://git.kernel.org/pub/scm/network/iproute2/iproute2.git .. _testing-static-analysis: @@ -474,60 +479,21 @@ You should invoke scan-view to view analysis results. The last line of output from ``clang-analyze`` will list the command (containing results directory) that you should invoke to view the results on a browser. -Continuous Integration with Travis CI -------------------------------------- - -A .travis.yml file is provided to automatically build Open vSwitch with various -build configurations and run the testsuite using Travis CI. 
Builds will be -performed with gcc, sparse and clang with the -Werror compiler flag included, -therefore the build will fail if a new warning has been introduced. - -The CI build is triggered via git push (regardless of the specific branch) or -pull request against any Open vSwitch GitHub repository that is linked to -travis-ci. - -Instructions to setup travis-ci for your GitHub repository: - -1. Go to https://travis-ci.org/ and sign in using your GitHub ID. -2. Go to the "Repositories" tab and enable the ovs repository. You may disable - builds for pushes or pull requests. -3. In order to avoid forks sending build failures to the upstream mailing list, - the notification email recipient is encrypted. If you want to receive email - notification for build failures, replace the encrypted string: - - 1. Install the travis-ci CLI (Requires ruby >=2.0): gem install travis - 2. In your Open vSwitch repository: travis encrypt mylist@mydomain.org - 3. Add/replace the notifications section in .travis.yml and fill in the - secure string as returned by travis encrypt:: - - notifications: - email: - recipients: - - secure: "....." - - .. note:: - You may remove/omit the notifications section to fall back to default - notification behaviour which is to send an email directly to the author and - committer of the failing commit. Note that the email is only sent if the - author/committer have commit rights for the particular GitHub repository. - -4. Pushing a commit to the repository which breaks the build or the - testsuite will now trigger a email sent to mylist@mydomain.org - -vsperf ------- +ViNePerf +-------- -The vsperf project aims to develop a vSwitch test framework that can be used to -validate the suitability of different vSwitch implementations in a telco -deployment environment. More information can be found on the `OPNFV wiki`_. 
+The ViNePerf project, formerly known as VswitchPerf or vsperf, aims to +develop a vSwitch test framework that can be used to validate the +suitability of different vSwitch implementations in a telco deployment +environment. More information can be found on the `Anuket project wiki`_. -.. _OPNFV wiki: https://wiki.opnfv.org/display/vsperf/VSperf+Home +.. _Anuket project wiki: https://wiki.anuket.io/display/HOME/ViNePERF Proof of Concepts ~~~~~~~~~~~~~~~~~ Proof of Concepts are documentation materialized into Ansible recipes -executed in VirtualBox or Libvirt environments orchastrated by Vagrant. +executed in VirtualBox or Libvirt environments orchestrated by Vagrant. Proof of Concepts allow developers to create small virtualized setups that demonstrate how certain Open vSwitch features are intended to work avoiding user introduced errors by overlooking instructions. Proof of Concepts diff --git a/Documentation/topics/usdt-probes.rst b/Documentation/topics/usdt-probes.rst index 7ce19aaedea..b9a6c54b29f 100644 --- a/Documentation/topics/usdt-probes.rst +++ b/Documentation/topics/usdt-probes.rst @@ -214,6 +214,12 @@ Available probes in ``ovs_vswitchd``: - dpif_recv:recv_upcall - main:poll_block - main:run_start +- revalidate:flow_result +- revalidate_ukey\_\_:entry +- revalidate_ukey\_\_:exit +- revalidator_sweep\_\_:flow_result +- udpif_revalidator:start_dump +- udpif_revalidator:sweep_done dpif_netlink_operate\_\_:op_flow_del @@ -254,6 +260,7 @@ DPIF_OP_FLOW_EXECUTE operation as part of the dpif ``operate()`` callback. **Script references**: +- ``utilities/usdt-scripts/dpif_nl_exec_monitor.py`` - ``utilities/usdt-scripts/upcall_cost.py`` @@ -327,6 +334,7 @@ probe main:run_start ~~~~~~~~~~~~~~~~~~~~ **Description**: + The ovs-vswitchd's main process contains a loop that runs every time some work needs to be done. This probe gets triggered every time the loop starts from the beginning. See also the ``main:poll_block`` probe below. 
@@ -344,6 +352,7 @@ probe main:poll_block ~~~~~~~~~~~~~~~~~~~~~ **Description**: + The ovs-vswitchd's main process contains a loop that runs every time some work needs to be done. This probe gets triggered every time the loop is done, and it's about to wait for being re-started by a poll_block() call returning. @@ -358,6 +367,125 @@ See also the ``main:run_start`` probe above. - ``utilities/usdt-scripts/bridge_loop.bt`` +revalidate_ukey\_\_:entry +~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Description**: + +This probe gets triggered on entry of the revalidate_ukey__() function. + +**Arguments**: + +- *arg0*: ``(struct udpif *) udpif`` +- *arg1*: ``(struct udpif_key *) ukey`` +- *arg2*: ``(uint16_t) tcp_flags`` +- *arg3*: ``(struct ofpbuf *) odp_actions`` +- *arg4*: ``(struct recirc_refs *) recircs`` +- *arg5*: ``(struct xlate_cache *) xcache`` + +**Script references**: + +- ``utilities/usdt-scripts/reval_monitor.py`` + + +revalidate_ukey\_\_:exit +~~~~~~~~~~~~~~~~~~~~~~~~ + +**Description**: + +This probe gets triggered right before the revalidate_ukey__() function exits. + +**Arguments**: + +- *arg0*: ``(struct udpif *) udpif`` +- *arg1*: ``(struct udpif_key *) ukey`` +- *arg2*: ``(enum reval_result) result`` + +**Script references**: + +*None* + + +udpif_revalidator:start_dump +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Description**: + +The ovs-vswitchd's revalidator process contains a loop that runs every time +revalidation work is needed. This probe gets triggered every time the +dump phase has started. + +**Arguments**: + +- *arg0*: ``(struct udpif *) udpif`` +- *arg1*: ``(size_t) n_flows`` + +**Script references**: + +- ``utilities/usdt-scripts/reval_monitor.py`` + + +udpif_revalidator:sweep_done +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Description**: + +The ovs-vswitchd's revalidator process contains a loop that runs every time +revalidation work is needed. This probe gets triggered every time the +sweep phase was completed. 
+ +**Arguments**: + +- *arg0*: ``(struct udpif *) udpif`` +- *arg1*: ``(size_t) n_flows`` +- *arg2*: ``(unsigned) MIN(ofproto_max_idle, ofproto_max_revalidator)`` + +**Script references**: + +- ``utilities/usdt-scripts/reval_monitor.py`` + + +probe revalidate:flow_result +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Description**: +This probe is triggered when the revalidator has executed on a particular +flow key to make a determination whether to evict a flow, and the cause +for eviction. The revalidator runs periodically, and this probe will only +be triggered when a flow is flagged for revalidation. + +**Arguments**: + +- *arg0*: ``(struct udpif *) udpif`` +- *arg1*: ``(struct udpif_key *) ukey`` +- *arg2*: ``(enum reval_result) result`` +- *arg3*: ``(enum flow_del_reason) del_reason`` + +**Script references**: + +- ``utilities/usdt-scripts/flow_reval_monitor.py`` + + +probe revalidator_sweep\_\_:flow_result +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Description**: +This probe is placed in the path of the revalidator sweep, and is executed +under the condition that a flow entry is in an unexpected state, or the +flows were asked to be purged due to a user action. + +**Arguments**: + +- *arg0*: ``(struct udpif *) udpif`` +- *arg1*: ``(struct udpif_key *) ukey`` +- *arg2*: ``(enum reval_result) result`` +- *arg3*: ``(enum flow_del_reason) del_reason`` + +**Script references**: + +- ``utilities/usdt-scripts/flow_reval_monitor.py`` + + Adding your own probes ---------------------- diff --git a/Documentation/topics/userspace-checksum-offloading.rst b/Documentation/topics/userspace-checksum-offloading.rst new file mode 100644 index 00000000000..036d3965faa --- /dev/null +++ b/Documentation/topics/userspace-checksum-offloading.rst @@ -0,0 +1,96 @@ +.. + Licensed under the Apache License, Version 2.0 (the "License"); you may + not use this file except in compliance with the License. 
You may obtain + a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + License for the specific language governing permissions and limitations + under the License. + + Convention for heading levels in Open vSwitch documentation: + + ======= Heading 0 (reserved for the title in a document) + ------- Heading 1 + ~~~~~~~ Heading 2 + +++++++ Heading 3 + ''''''' Heading 4 + + Avoid deeper levels because they do not render well. + +======================================== +Userspace Datapath - Checksum Offloading +======================================== + +This document explains the internals of Open vSwitch support for checksum +offloading in the userspace datapath. + +Design +------ + +Open vSwitch strives to forward packets as they arrive regardless of whether +the checksum is correct or not. OVS is not responsible for fixing external +checksum issues. + +The interface (internally referred to as a netdev) can set flags indicating +whether each packet's checksum is good or bad upon receipt. If this flag is not +set, OVS will consider the validity of the packet's checksum to be unknown. + +OVS will not re-calculate or update the packet's checksum if the checksum is +already known to be correct, known to be explicitly incorrect, or destined for +an egress interface that will recalculate the checksum anyway. + +If OVS does invalidate the checksum, and the packet ingresses the datapath with +a checksum that is not known to be incorrect, OVS postpones checksum updates +until the packet egresses the datapath. This recalculation can either be +performed by OVS or be offloaded onto the NIC if the egress NIC supports +checksum offloading. 
+ +When a packet egresses the datapath, the packet flags and the egress interface +flags are verified to make sure all required offload features to send out the +packet are available on the egress interface. If not, the datapath will fall +back to an equivalent software implementation. + + +Interface (a.k.a. Netdev) +------------------------- + +When the interface initializes, it should set the flags to tell the datapath +which offload features are supported. For example, if the driver supports IP +checksum offloading, then ``netdev->ol_flags`` should set the flag +``NETDEV_TX_OFFLOAD_IPV4_CKSUM``. + + +Rules +----- + +1) OVS should strive to forward all packets regardless of checksum. + +2) OVS must not correct a known bad packet checksum. + +3) Packet with flag ``DP_PACKET_OL_RX_IP_CKSUM_GOOD`` means that the IP + checksum is present in the packet and it is good. + +4) Packet with flag ``DP_PACKET_OL_RX_IP_CKSUM_BAD`` means that the IP + checksum is present in the packet and it is bad. Extra care should be taken + to not fix the packet during datapath processing. + +5) The ingress packet parser can only set ``DP_PACKET_OL_TX_IP_CKSUM`` if the + packet has ``DP_PACKET_OL_RX_IP_CKSUM_GOOD`` to not violate rule #2. + +6) Packet with flag ``DP_PACKET_OL_TX_IPV4`` is an IPv4 packet. + +7) Packet with flag ``DP_PACKET_OL_TX_IPV6`` is an IPv6 packet. + +8) Packet with flag ``DP_PACKET_OL_TX_IP_CKSUM`` tells the datapath to skip + updating the IP checksum if the packet is modified. The IP checksum will be + calculated by the egress interface if that supports IP checksum offload, + otherwise the IP checksum will be performed in software before handing over + the packet to the interface. + +9) When there are modifications to the packet that require a checksum update, + the datapath needs to remove the ``DP_PACKET_OL_RX_IP_CKSUM_GOOD`` flag, + otherwise the checksum is assumed to be good in the packet. 
diff --git a/Documentation/topics/userspace-tso.rst b/Documentation/topics/userspace-tso.rst index 33a85965c19..a21bb2b5dee 100644 --- a/Documentation/topics/userspace-tso.rst +++ b/Documentation/topics/userspace-tso.rst @@ -46,7 +46,7 @@ datasheet for compatibility. Secondly, the NIC must have an associated DPDK Poll Mode Driver (PMD) which supports `TSO`. For a list of features per PMD, refer to the `DPDK documentation`__. -__ https://doc.dpdk.org/guides-21.11/nics/overview.html +__ https://doc.dpdk.org/guides-23.11/nics/overview.html Enabling TSO ~~~~~~~~~~~~ @@ -68,7 +68,7 @@ as follows. connection is established, `TSO` is thus advertised to the guest as an available feature: -QEMU Command Line Parameter:: +1. QEMU Command Line Parameter:: $ sudo $QEMU_DIR/x86_64-softmmu/qemu-system-x86_64 \ ... @@ -77,12 +77,34 @@ QEMU Command Line Parameter:: ... 2. Ethtool. Assuming that the guest's OS also supports `TSO`, ethtool can be -used to enable same:: + used to enable same:: $ ethtool -K eth0 sg on # scatter-gather is a prerequisite for TSO $ ethtool -K eth0 tso on $ ethtool -k eth0 +**Note:** Enabling this feature impacts the virtio features exposed by the DPDK +vHost User backend to a guest. If a guest was already connected to OvS before +enabling TSO and restarting OvS, this guest's ports won't have TSO available:: + + $ ovs-vsctl get interface vhost0 status:tx_tcp_seg_offload + "false" + +To help diagnose the issue, those ports have some additional information in +their status field in ovsdb:: + + $ ovs-vsctl get interface vhost0 status:userspace-tso + disabled + +To restore TSO for this guest's ports, the guest QEMU process must be stopped, +then started again. 
OvS will then report:: + + $ ovs-vsctl get interface vhost0 status:tx_tcp_seg_offload + "true" + + $ ovs-vsctl get interface vhost0 status:userspace-tso + ovs-vsctl: no key "userspace-tso" in Interface record "vhost0" column status + ~~~~~~~~~~~ Limitations ~~~~~~~~~~~ diff --git a/Documentation/topics/windows.rst b/Documentation/topics/windows.rst index c5b34c85fb8..1f1b513e4a9 100644 --- a/Documentation/topics/windows.rst +++ b/Documentation/topics/windows.rst @@ -66,14 +66,14 @@ ingress path. In the egress path, it is the other way round. In addition, there is a object identifier (OID) interface for control operations Eg. addition of a port. The workflow for the calls is similar in nature to the packets, where higher level layers call into the lower level layers. A good representational -diagram of this architecture is in [4]_. +diagram of this architecture is in [3]_. -Windows Filtering Platform (WFP) [5]_ is a platform implemented on Hyper-V that +Windows Filtering Platform (WFP) [4]_ is a platform implemented on Hyper-V that provides APIs and services for filtering packets. WFP has been utilized to filter on some of the packets that OVS is not equipped to handle directly. More details in later sections. -IP Helper [6]_ is a set of API available on Hyper-V to retrieve information +IP Helper [5]_ is a set of API available on Hyper-V to retrieve information related to the network configuration information on the host machine. IP Helper has been used to retrieve some of the configuration information that OVS needs. @@ -188,10 +188,10 @@ The userspace portion of the OVS solution is mostly POSIX code, and not very Linux specific. Majority of the userspace code does not interface directly with the kernel datapath and was ported independently of the kernel datapath effort. 
-As explained in the OVS porting design document [7]_, DPIF is the portion of +As explained in the OVS porting design document [6]_, DPIF is the portion of userspace that interfaces with the kernel portion of the OVS. The interface -that each DPIF provider has to implement is defined in ``dpif-provider.h`` -[3]_. Though each platform is allowed to have its own implementation of the +that each DPIF provider has to implement is defined in ``dpif-provider.h``. +Though each platform is allowed to have its own implementation of the DPIF provider, it was found, via community feedback, that it is desired to share code whenever possible. Thus, the DPIF provider for OVS on Hyper-V shares code with the DPIF provider on Linux. This interface is implemented in @@ -253,7 +253,7 @@ Netlink Message Parser ~~~~~~~~~~~~~~~~~~~~~~ The communication between OVS userspace and OVS kernel datapath is in the form -of Netlink messages [1]_, [8]_. More details about this are provided below. In +of Netlink messages [1]_, [7]_. More details about this are provided below. In the kernel, a full fledged netlink message parser has been implemented along the lines of the netlink message parser in OVS userspace. In fact, a lot of the code is ported code. @@ -407,7 +407,7 @@ As has been mentioned in earlier sections, the netlink socket and netlink message based DPIF provider on Linux has been ported to Windows. Most of the code is common. Some divergence is in the code to receive packets. -The Linux implementation uses epoll() [9]_ which is not natively supported on +The Linux implementation uses epoll() [8]_ which is not natively supported on Windows. netdev-windows @@ -501,10 +501,9 @@ References .. [1] Hyper-V Extensible Switch https://msdn.microsoft.com/windows/hardware/drivers/network/hyper-v-extensible-switch .. [2] Hyper-V Extensible Switch Extensions https://msdn.microsoft.com/windows/hardware/drivers/network/hyper-v-extensible-switch-extensions -.. 
[3] DPIF Provider http://openvswitch.sourcearchive.com/documentation/1.1.0-1/dpif-provider_8h_source.html -.. [4] Hyper-V Extensible Switch Components https://msdn.microsoft.com/windows/hardware/drivers/network/hyper-v-extensible-switch-components -.. [5] Windows Filtering Platform https://msdn.microsoft.com/en-us/library/windows/desktop/aa366510(v=vs.85).aspx -.. [6] IP Helper https://msdn.microsoft.com/windows/hardware/drivers/network/ip-helper -.. [7] How to Port Open vSwitch to New Software or Hardware :doc:`porting` -.. [8] Netlink https://en.wikipedia.org/wiki/Netlink -.. [9] epoll https://en.wikipedia.org/wiki/Epoll +.. [3] Hyper-V Extensible Switch Components https://msdn.microsoft.com/windows/hardware/drivers/network/hyper-v-extensible-switch-components +.. [4] Windows Filtering Platform https://msdn.microsoft.com/en-us/library/windows/desktop/aa366510(v=vs.85).aspx +.. [5] IP Helper https://msdn.microsoft.com/windows/hardware/drivers/network/ip-helper +.. [6] How to Port Open vSwitch to New Software or Hardware :doc:`porting` +.. [7] Netlink https://en.wikipedia.org/wiki/Netlink +.. [8] epoll https://en.wikipedia.org/wiki/Epoll diff --git a/Documentation/tutorials/faucet.rst b/Documentation/tutorials/faucet.rst index 6aa4d39aa8a..33e4543e402 100644 --- a/Documentation/tutorials/faucet.rst +++ b/Documentation/tutorials/faucet.rst @@ -27,7 +27,7 @@ OVS Faucet Tutorial This tutorial demonstrates how Open vSwitch works with a general-purpose OpenFlow controller, using the Faucet controller as a simple way to get -started. It was tested with the "master" branch of Open vSwitch and version +started. It was tested with the "main" branch of Open vSwitch and version 1.6.15 of Faucet. It does not use advanced or recently added features in OVS or Faucet, so other versions of both pieces of software are likely to work equally well. @@ -68,7 +68,7 @@ approaches: $ git clone https://github.com/openvswitch/ovs.git $ cd ovs - The default checkout is the master branch. 
You will need to use the master + The default checkout is the main branch. You will need to use the main branch for this tutorial as it includes some functionality required for this tutorial. @@ -84,7 +84,7 @@ approaches: The default behaviour for some of the commands used in this tutorial changed in Open vSwitch versions 2.9.x and 2.10.x which breaks the - tutorial. We recommend following step 3 and building master from + tutorial. We recommend following step 3 and building main from source or using a system Open vSwitch that is version 2.8.x or older. If it is successful, you will find yourself in a subshell environment, which diff --git a/Documentation/tutorials/ovs-conntrack.rst b/Documentation/tutorials/ovs-conntrack.rst index e8a58c4eb29..909daf3bd79 100644 --- a/Documentation/tutorials/ovs-conntrack.rst +++ b/Documentation/tutorials/ovs-conntrack.rst @@ -35,7 +35,6 @@ to match on the TCP segments from connection setup to connection tear down. It will use OVS with the Linux kernel module as the datapath for this tutorial. (The datapath that utilizes the openvswitch kernel module to do the packet processing in the Linux kernel) -It was tested with the "master" branch of Open vSwitch. Definitions ----------- diff --git a/MAINTAINERS.rst b/MAINTAINERS.rst index 27be4aa4129..99a0bd405b2 100644 --- a/MAINTAINERS.rst +++ b/MAINTAINERS.rst @@ -28,11 +28,11 @@ Committers Open vSwitch committers are the people who have been granted access to push changes to the Open vSwitch git repository. -The responsibilities of an Open vSwitch committer are documented -`here `__. +The responsibilities of an Open vSwitch committer are documented here: +|responsibilities|. -The process for adding or removing committers is documented -`here `__. +The process for adding or removing committers is documented here: +|grant-revocation|. 
This is the current list of active Open vSwitch committers: @@ -41,53 +41,68 @@ This is the current list of active Open vSwitch committers: * - Name - Email - * - Alex Wang - - ee07b291@gmail.com + * - Aaron Conole + - aconole@redhat.com * - Alin Serdean - aserdean@ovn.org - * - Andy Zhou - - azhou@ovn.org * - Ansis Atteka - - aatteka@nicira.com - * - Daniele Di Proietto - - daniele.di.proietto@gmail.com - * - Gurucharan Shetty - - guru@ovn.org + - ansisatteka@gmail.com + * - Eelco Chaudron + - echaudro@redhat.com * - Ian Stokes - istokes@ovn.org * - Ilya Maximets - i.maximets@ovn.org - * - Jarno Rajahalme - - jarno@ovn.org - * - Jesse Gross - - jesse@kernel.org - * - Justin Pettit - - jpettit@ovn.org - * - Pravin B Shelar - - pshelar@ovn.org + * - Kevin Traynor + - ktraynor@redhat.com * - Russell Bryant - russell@ovn.org * - Simon Horman - horms@ovn.org - * - Thomas Graf - - tgraf@noironetworks.com * - William Tu - u9012063@gmail.com - * - YAMAMOTO Takashi - - yamamoto@midokura.com The project also maintains a list of Emeritus Committers (or Maintainers). -More information about Emeritus Committers can be found -`here `__. +More information about Emeritus Committers can be found here: +|emeritus-status|. .. list-table:: OVS Emeritus Maintainers :header-rows: 1 * - Name - Email + * - Alex Wang + - ee07b291@gmail.com + * - Andy Zhou + - azhou@ovn.org * - Ben Pfaff - blp@ovn.org + * - Daniele Di Proietto + - daniele.di.proietto@gmail.com * - Ethan J. Jackson - ejj@eecs.berkeley.edu + * - Gurucharan Shetty + - guru@ovn.org + * - Jarno Rajahalme + - jarno@ovn.org + * - Jesse Gross + - jesse@kernel.org * - Joe Stringer - joe@ovn.org + * - Justin Pettit + - jpettit@ovn.org + * - Pravin B Shelar + - pshelar@ovn.org + * - Thomas Graf + - tgraf@tgraf.ch + * - YAMAMOTO Takashi + - yamamoto@midokura.com + +.. Cut here for the Documentation/internals/maintainers.rst + +.. |responsibilities| replace:: `Expectations for Developers with Open vSwitch + Repo Access `__ +.. 
|grant-revocation| replace:: `OVS Committer Grant/Revocation Policy + `__ +.. |emeritus-status| replace:: `Emeritus Status for OVS Committers + `__ diff --git a/Makefile.am b/Makefile.am index 54754967e9f..f47217e0bc3 100644 --- a/Makefile.am +++ b/Makefile.am @@ -9,6 +9,8 @@ AUTOMAKE_OPTIONS = foreign subdir-objects ACLOCAL_AMFLAGS = -I m4 +AM_DISTCHECK_CONFIGURE_FLAGS = --with-version-suffix="$(VERSION_SUFFIX)" + AM_CPPFLAGS = $(SSL_CFLAGS) AM_LDFLAGS = $(SSL_LDFLAGS) AM_LDFLAGS += $(OVS_LDFLAGS) @@ -76,13 +78,18 @@ EXTRA_DIST = \ MAINTAINERS.rst \ README.rst \ NOTICE \ + .ci/dpdk-build.sh \ + .ci/dpdk-prepare.sh \ .ci/linux-build.sh \ .ci/linux-prepare.sh \ .ci/osx-build.sh \ .ci/osx-prepare.sh \ + .ci/windows-build.sh \ + .ci/windows-prepare.sh \ .cirrus.yml \ + .editorconfig \ .github/workflows/build-and-test.yml \ - .travis.yml \ + .readthedocs.yaml \ appveyor.yml \ boot.sh \ poc/builders/Vagrantfile \ @@ -121,6 +128,7 @@ OVSIDL_BUILT = pkgdata_DATA = sbin_SCRIPTS = scripts_SCRIPTS = +usdt_SCRIPTS = completion_SCRIPTS = scripts_DATA = SUFFIXES = @@ -134,6 +142,7 @@ C ?= 1 endif scriptsdir = $(pkgdatadir)/scripts +usdtdir = $(pkgdatadir)/scripts/usdt completiondir = $(sysconfdir)/bash_completion.d pkgconfigdir = $(libdir)/pkgconfig @@ -157,6 +166,7 @@ SUFFIXES += .in -e 's,[@]PYTHON3[@],$(PYTHON3),g' \ -e 's,[@]RUNDIR[@],$(RUNDIR),g' \ -e 's,[@]VERSION[@],$(VERSION),g' \ + -e 's,[@]VERSION_SUFFIX[@],$(VERSION_SUFFIX),g' \ -e 's,[@]localstatedir[@],$(localstatedir),g' \ -e 's,[@]pkgdatadir[@],$(pkgdatadir),g' \ -e 's,[@]sysconfdir[@],$(sysconfdir),g' \ @@ -366,7 +376,7 @@ ALL_LOCAL += manpage-check manpage-check: $(man_MANS) $(dist_man_MANS) $(noinst_man_MANS) @error=false; \ for manpage in $?; do \ - LANG=en_US.UTF-8 groff -w mac -w delim -w escape -w input -w missing -w tab -T utf8 -man -p -z $$manpage >$@.tmp 2>&1; \ + LANG=en_US.UTF-8 groff -t -w mac -w delim -w escape -w input -w missing -w tab -T utf8 -man -p -z $$manpage >$@.tmp 2>&1; \ if grep warning: 
$@.tmp; then error=:; fi; \ rm -f $@.tmp; \ done; \ @@ -397,23 +407,17 @@ ALL_LOCAL += flake8-check # F811 redefinition of unused from line (only from flake8 v2.0) # D*** -- warnings from flake8-docstrings plugin # H*** -- warnings from flake8 hacking plugin (custom style checks beyond PEP8) -# H231 Python 3.x incompatible 'except x,y:' construct -# H232 Python 3.x incompatible octal 077 should be written as 0o77 -# H233 Python 3.x incompatible use of print operator -# H238 old style class declaration, use new style (inherit from `object`) -FLAKE8_SELECT = H231,H232,H233,H238 FLAKE8_IGNORE = E121,E123,E125,E126,E127,E128,E129,E131,E203,E722,W503,W504,F811,D,H,I flake8-check: $(FLAKE8_PYFILES) $(FLAKE8_WERROR)$(AM_V_GEN) \ src='$^' && \ - flake8 $$src --select=$(FLAKE8_SELECT) $(FLAKE8_FLAGS) && \ flake8 $$src --ignore=$(FLAKE8_IGNORE) $(FLAKE8_FLAGS) && \ touch $@ endif CLEANFILES += flake8-check -include manpages.mk -manpages.mk: $(MAN_ROOTS) build-aux/sodepends.py python/build/soutil.py +manpages.mk: $(MAN_ROOTS) build-aux/sodepends.py python/ovs_build_helpers/soutil.py @PYTHONPATH=$$PYTHONPATH$(psep)$(srcdir)/python $(PYTHON3) $(srcdir)/build-aux/sodepends.py -I. -I$(srcdir) $(MAN_ROOTS) >$(@F).tmp @if cmp -s $(@F).tmp $@; then \ touch $@; \ diff --git a/NEWS b/NEWS index ff77ee404f3..a6fc436c88d 100644 --- a/NEWS +++ b/NEWS @@ -1,28 +1,230 @@ -Post-v3.0.0 +Post-v3.4.0 -------------------- + + +v3.4.0 - xx xxx xxxx +-------------------- + - Option '--mlockall' now only locks memory pages on fault, if possible. + This also makes it compatible with vHost Post-copy Live Migration. + - ovs-appctl: + * Added new option [-f|--format] to choose the output format, e.g. 'json' + or 'text' (by default). + * Added new option [--pretty] to print JSON output in a readable fashion. + * 'dpif/show' and 'list-commands' now support output in JSON format. 
+ * Added 'ofproto/detrace' command that outputs the set of OpenFlow rules + and groups that contributed to the creation of a specific datapath flow. + - ovs-vsctl: + * Added a new filter column in the Mirror table which can be used to + apply filters to mirror ports. + - ovs-tcpdump: + * Added command line parameter --filter to enable filtering the packets + that are captured by tcpdump. + - Userspace datapath: + * Conntrack now supports 'random' flag for selecting ports in a range + while natting and 'persistent' flag for selection of the IP address + from a range. + * IPv6 UDP tunnel encapsulation including Geneve and VXLAN will now + honour the csum option. Configuring the interface with + "options:csum=false" now has the same effect as the udp6zerocsumtx + option has with Linux kernel UDP tunnels. + - The primary development branch has been renamed from 'master' to 'main'. + The OVS tree remains hosted on GitHub. + https://github.com/openvswitch/ovs.git + - DPDK: + * OVS validated with DPDK 23.11.1. + * Link status changes are now handled via interrupt mode if the DPDK + driver supports it. It is possible to revert to polling mode by setting + per interface 'options:dpdk-lsc-interrupt' to 'false'. + - Python: + * Added custom transaction support to the Idl via add_op(). + * Added support for different output formats like 'json' to Python's + unixctl classes. + - Tunnels: + * Previously the kernel datapath did not enable UDP checksums by default + in IPv6 tunnels. This behaviour is non-standard, differs from the + Linux kernel, and is also different than the userspace datapath. Now + these tunnels will calculate checksums by default and that behaviour can + be changed with "options:csum=false" just as with the userspace + datapath. + - Local sampling is introduced. It reuses the OpenFlow sample action and + allows samples to be emitted locally (instead of via IPFIX) in a + datapath-specific manner.
The Linux kernel datapath is the first to + support this feature by using the new datapath 'psample' action. See + 'local-group-id' column in the Flow_Sample_Collector_Set table. + - A new configuration knob 'other-config:explicit-sampled-drops' in the + Open_vSwitch table controls whether an explicit drop action shall be + added at the end of datapath flows whose last action is an + observability-driven sample action. + - OpenFlow: + * A new version of the 'sample' action (NXAST_SAMPLE4) is introduced + that allows use of subfields in 'obs_point_id' and 'obs_domain_id'. + + +v3.3.0 - 16 Feb 2024 +-------------------- + - OVSDB: + * Support pre-vote mechanism in RAFT that protects the cluster against + disruptive servers (section 9.6 of the original RAFT paper). Upgrading + from older version is supported but it may trigger more leader elections + during the process, and error logs complaining unrecognized fields may + be observed on old nodes. + * New command line option --config-file that allows a fine control over + remotes and database configuration, including setting options for + connection methods for relays and active-backup replication. + For more details see ovsdb-server(1) and ovsdb(7). + * Make use of cooperative multitasking to improve maintenance of RAFT + cluster during long running processing such as online schema conversion. + - OpenFlow: + * NXT_CT_FLUSH extension is updated to support flushing connections + based on mark and labels. 'ct-flush' command of ovs-ofctl updated + to support these new arguments accordingly. + - ovs-appctl: + * 'ofproto/trace' now reports OpenFlow rules that make up a conjunctive + flow match. + * Output of 'dpctl/show' command no longer shows interface configuration + status, only values of the actual configuration options, a.k.a. + 'requested' configuration. The interface configuration status, + a.k.a. 'configured' values, can be found in the 'status' column of + the Interface table, i.e. 
with 'ovs-vsctl get interface <..> status'. + Reported names adjusted accordingly. + * Added support for removal of default CT zone limit, e.g. + "ovs-appctl dpctl/ct-del-limits default". + * 'dpctl/flush-conntrack' is now capable of flushing connections based + on mark and labels. + * 'mdb/show': support for multicast snooping to show the protocol + responsible for adding/updating the entry. + - ovs-vsctl: + * New commands 'set-zone-limit', 'del-zone-limit' and 'list-zone-limits' + to manage the maximum number of connections in conntrack zones via + a new 'limit' column in the 'CT_Zone' database table and + 'ct_zone_default_limit' column in the 'Datapath' table. + - Userspace datapath: + * Added support for Generic Segmentation Offloading for the cases where + TSO is enabled but not supported by an egress interface (except for + tunnel interfaces). + * 'pmd-sleep-max' is updated to also accept pmd-thread-core:sleep-max. + The existing behaviour is maintained and a non key:value pair value + will be applied to all other PMD thread cores. 'pmd-sleep-show' is + updated to show the maximum sleep for each PMD thread core. + * The userspace conntrack module no longer requires the user to specify + connection helpers in all flow rules. Instead, the helper specified + during connection commit will be used by default. + - DPDK: + * Add support for DPDK 23.11. + + +v3.2.0 - 17 Aug 2023 +-------------------- + - OVSDB: + * Changed format in which ovsdb schema conversion operations are stored in + clustered database files. Such operations are now allowed to contain + the bare schema (without data). This allows to significantly improve + the schema conversion performance. + New ovsdb-server process will be able to read old database format, but + old processes will *fail* to read database created by the new one, if + conversion operation is present.
For the cluster service model follow + upgrade instructions in 'Upgrading from version 3.1 and earlier to 3.2 + and later' section of ovsdb(7). + * When ovsdb-server is running in relay mode, the probe interval is + now configurable via 'ovsdb-server/set-relay-source-probe-interval' + unixctl command. + - IPFIX template and statistics intervals can now be configured through two + new options in the IPFIX table: 'template_interval' and 'stats_interval'. + - Linux kernel datapath: + * OVS now collects per-interface upcall statistics that can be obtained + via 'ovs-appctl dpctl/show -s' or the interface's statistics column + in OVSDB. Available with upstream kernel 6.2+. + - OVS route table in userspace now takes into account preferred source + address from cached kernel routes. + - ovs-appctl: + * Add support for selecting the source address with the + 'ovs-appctl ovs/route/add' command. + * New commands "dpctl/{ct-get-sweep-interval,ct-set-sweep-interval}" that + allow to get and set, for the userspace datapath, the sweep interval + for the conntrack garbage collector. + * New command "dpctl/dump-conntrack-exp" that allows dumping + conntrack's expectations for the userspace datapath. + - ovs-ctl: + * Added new options --[ovsdb-server|ovs-vswitchd]-umask=MODE to set umask + value when starting OVS daemons. E.g., use --ovsdb-server-umask=0002 + in order to create OVSDB sockets with access mode of 0770. + - QoS: + * Added new configuration option 'jitter' for a linux-netem QoS type. + * 'linux-htb' QoS type now supports rates higher than 34 Gbps. + - Ingress Policing: + * Ingress policing byte rates can now be configured higher than 34 Gbps. + - DPDK: + * ovs-vswitchd will keep the CAP_SYS_RAWIO capability when started + with the --hw-rawio-access command line option. This allows the + process extra privileges when mapping physical interconnect memory.
+ * New experimental "rx-steering=rss+" option to redirect + certain protocols (for now, only LACP) to a dedicated hardware queue + using the rte_flow API. + - SRv6 Tunnel Protocol + * Added support for userspace datapath (only). + - Userspace datapath: + * Connection tracking now supports extraction of SCTP L4 information. + * Implementation of OpenFlow meters is now lockless allowing for better + multi-thread scalability. + * IP and L4 checksum offload support is now enabled by default for + interfaces that support it. See the 'status' column in the 'interface' + table to check the status. + * 'pmd-maxsleep' other_config was renamed to 'pmd-sleep-max'. + 'pmd-maxsleep' is deprecated and will be removed in a future release. + * 'ovs-appctl dpif-netdev/pmd-sleep-show' command was added to get the + max sleep configuration of PMD thread cores. + * Removed experimental tag from PMD load based sleeping. + - Linux TC offload: + * Add support for offloading VXLAN tunnels with the GBP extensions. + - Python + * Added async DNS support. + * Dropped support for Python < 3.6. + + +v3.1.0 - 16 Feb 2023 +-------------------- + - ovs-vswitchd now detects changes in CPU affinity and adjusts the number + of handler and revalidator threads if necessary. + - AF_XDP: + * Added support for building with libxdp and libbpf >= 0.7. + * Support for AF_XDP is now enabled by default if all dependencies are + available at the build time. Use --disable-afxdp to disable. + Use --enable-afxdp to fail the build if dependencies are not present. - ovs-appctl: * "ovs-appctl ofproto/trace" command can now display port names with the "--names" option. + - OVSDB-IDL: + * Add the support to specify the persistent uuid for row insert in both + C and Python IDLs. - Windows: * Conntrack IPv6 fragment support. - DPDK: - * OVS validated with DPDK 21.11.2. 
- DPDK 21.11.2 contains fixes for the following CVEs: - CVE-2022-28199 cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-28199 - CVE-2022-2132 cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2022-2132 - A bug was introduced in DPDK 21.11.1 by the commit - 01e3dee29c02 ("vhost: fix unsafe vring addresses modifications"). - This bug can cause a deadlock when vIOMMU is enabled and NUMA - reallocation of the virtqueues happen. - A fix has been posted and pushed to the DPDK 21.11 branch. - It can be found here: - https://patches.dpdk.org/project/dpdk/patch/20220725203206.427083-2-david.marchand@redhat.com/. - If a user wishes to avoid the issue then it is recommended to use - DPDK 21.11.0 until the release of DPDK 21.11.3. - It should be noted that DPDK 21.11.0 does not benefit from the numerous - bug and CVE fixes addressed since its release. - If a user wishes to benefit from these fixes it is recommended to use - DPDK 21.11.2. + * Add support for DPDK 22.11.1. + - For the QoS max-rate and STP/RSTP path-cost configuration OVS now assumes + 10 Gbps link speed by default in case the actual link speed cannot be + determined. Previously it was 10 Mbps. Values can still be overridden + by specifying 'max-rate' or '[r]stp-path-cost' accordingly. + - OpenFlow: + * New OpenFlow extension NXT_CT_FLUSH to flush connections matching + the specified fields. + - ovs-ctl: + * New option '--dump-hugepages' to include hugepages in core dumps. This + can assist with postmortem analysis involving DPDK, but may also produce + significantly larger core dump files. + - ovs-dpctl and 'ovs-appctl dpctl/' commands: + * 'flush-conntrack' is now capable of handling partial 5-tuple, + with additional optional parameter to specify the reply direction. + - ovs-ofctl: + * New command 'flush-conntrack' that accepts zone and 5-tuple (or partial + 5-tuple) for both directions. + - Support for travis-ci.org based continuous integration builds has been + dropped. 
+ - Userspace datapath: + * Add '-secs' argument to appctl 'dpif-netdev/pmd-rxq-show' to show + the pmd usage of an Rx queue over a configurable time period. + * Add new experimental PMD load based sleeping feature. PMD threads can + request to sleep up to a user configured 'pmd-maxsleep' value under + low load conditions. v3.0.0 - 15 Aug 2022 diff --git a/README.rst b/README.rst index 8fe01f4cf23..ca9e386c206 100644 --- a/README.rst +++ b/README.rst @@ -8,12 +8,12 @@ Open vSwitch .. image:: https://github.com/openvswitch/ovs/workflows/Build%20and%20Test/badge.svg :target: https://github.com/openvswitch/ovs/actions -.. image:: https://travis-ci.org/openvswitch/ovs.png - :target: https://travis-ci.org/openvswitch/ovs -.. image:: https://ci.appveyor.com/api/projects/status/github/openvswitch/ovs?branch=master&svg=true&retina=true +.. image:: https://ci.appveyor.com/api/projects/status/github/openvswitch/ovs?branch=main&svg=true&retina=true :target: https://ci.appveyor.com/project/blp/ovs/history .. image:: https://api.cirrus-ci.com/github/openvswitch/ovs.svg :target: https://cirrus-ci.com/github/openvswitch/ovs +.. image:: https://readthedocs.org/projects/openvswitch/badge/?version=latest + :target: https://docs.openvswitch.org/en/latest/ What is Open vSwitch? --------------------- @@ -37,7 +37,7 @@ following features: - NIC bonding with or without LACP on upstream switch - NetFlow, sFlow(R), and mirroring for increased visibility - QoS (Quality of Service) configuration, plus policing -- Geneve, GRE, VXLAN, STT, and LISP tunneling +- Geneve, GRE, VXLAN, STT, ERSPAN, GTP-U, SRv6, Bareudp, and LISP tunneling - 802.1ag connectivity fault management - OpenFlow 1.0 plus numerous extensions - Transactional configuration database with C and Python bindings diff --git a/acinclude.m4 b/acinclude.m4 index ad07989ac29..1ace70c92a7 100644 --- a/acinclude.m4 +++ b/acinclude.m4 @@ -163,10 +163,10 @@ dnl Configure Linux tc compat. 
AC_DEFUN([OVS_CHECK_LINUX_TC], [ AC_COMPILE_IFELSE([ AC_LANG_PROGRAM([#include ], [ - int x = TCA_POLICE_PKTRATE64; + int x = TCA_ACT_FLAGS_SKIP_HW; ])], - [AC_DEFINE([HAVE_TCA_POLICE_PKTRATE64], [1], - [Define to 1 if TCA_POLICE_PKTRATE64 is available.])]) + [AC_DEFINE([HAVE_TCA_ACT_FLAGS_SKIP_HW], [1], + [Define to 1 if TCA_ACT_FLAGS_SKIP_HW is available.])]) AC_CHECK_MEMBERS([struct tcf_t.firstuse], [], [], [#include ]) @@ -191,6 +191,13 @@ AC_DEFUN([OVS_CHECK_LINUX_TC], [ [AC_DEFINE([HAVE_TCA_TUNNEL_KEY_ENC_TTL], [1], [Define to 1 if TCA_TUNNEL_KEY_ENC_TTL is available.])]) + AC_COMPILE_IFELSE([ + AC_LANG_PROGRAM([#include ], [ + int x = TCA_TUNNEL_KEY_ENC_OPTS_VXLAN; + ])], + [AC_DEFINE([HAVE_TCA_TUNNEL_KEY_ENC_OPTS_VXLAN], [1], + [Define to 1 if TCA_TUNNEL_KEY_ENC_OPTS_VXLAN is available.])]) + AC_COMPILE_IFELSE([ AC_LANG_PROGRAM([#include ], [ int x = TCA_PEDIT_KEY_EX_HDR_TYPE_UDP; @@ -211,6 +218,26 @@ AC_DEFUN([OVS_CHECK_LINUX_TC], [ ])], [AC_DEFINE([HAVE_TCA_STATS_PKT64], [1], [Define to 1 if TCA_STATS_PKT64 is available.])]) + + AC_COMPILE_IFELSE([ + AC_LANG_PROGRAM([#include ], [ + int x = TCA_HTB_RATE64; + ])], + [AC_SUBST(HAVE_TCA_HTB_RATE64,yes) + AC_DEFINE([HAVE_TCA_HTB_RATE64], [1], + [Define to 1 if TCA_HTB_RATE64 is available.])], + [AC_SUBST(HAVE_TCA_HTB_RATE64,no)] + ) + + AC_COMPILE_IFELSE([ + AC_LANG_PROGRAM([#include ], [ + int x = TCA_POLICE_PKTRATE64; + ])], + [AC_SUBST(HAVE_TCA_POLICE_PKTRATE64,yes) + AC_DEFINE([HAVE_TCA_POLICE_PKTRATE64], [1], + [Define to 1 if TCA_POLICE_PKTRATE64 is available.])], + [AC_SUBST(HAVE_TCA_POLICE_PKTRATE64,no)] + ) ]) dnl OVS_CHECK_LINUX_SCTP_CT @@ -251,42 +278,72 @@ AC_DEFUN([OVS_FIND_DEPENDENCY], [ dnl OVS_CHECK_LINUX_AF_XDP dnl -dnl Check both Linux kernel AF_XDP and libbpf support +dnl Check both Linux kernel AF_XDP and libbpf/libxdp support AC_DEFUN([OVS_CHECK_LINUX_AF_XDP], [ - AC_ARG_ENABLE([afxdp], - [AS_HELP_STRING([--enable-afxdp], [Enable AF-XDP support])], - [], [enable_afxdp=no]) + 
AC_ARG_ENABLE( + [afxdp], + [AS_HELP_STRING([--disable-afxdp], [Disable AF-XDP support])], + [case "${enableval}" in + (yes | no | auto) ;; + (*) AC_MSG_ERROR([bad value ${enableval} for --enable-afxdp]) ;; + esac], + [enable_afxdp=auto]) + AC_MSG_CHECKING([whether AF_XDP is enabled]) - if test "$enable_afxdp" != yes; then + if test "$enable_afxdp" = no; then AC_MSG_RESULT([no]) AF_XDP_ENABLE=false else - AC_MSG_RESULT([yes]) + AC_MSG_RESULT([$enable_afxdp]) AF_XDP_ENABLE=true - - AC_CHECK_HEADER([bpf/libbpf.h], [], - [AC_MSG_ERROR([unable to find bpf/libbpf.h for AF_XDP support])]) - - AC_CHECK_HEADER([linux/if_xdp.h], [], - [AC_MSG_ERROR([unable to find linux/if_xdp.h for AF_XDP support])]) - - AC_CHECK_HEADER([bpf/xsk.h], [], - [AC_MSG_ERROR([unable to find bpf/xsk.h for AF_XDP support])]) - - AC_CHECK_FUNCS([pthread_spin_lock], [], - [AC_MSG_ERROR([unable to find pthread_spin_lock for AF_XDP support])]) - - OVS_FIND_DEPENDENCY([numa_alloc_onnode], [numa], [libnuma]) - - AC_DEFINE([HAVE_AF_XDP], [1], - [Define to 1 if AF_XDP support is available and enabled.]) - LIBBPF_LDADD=" -lbpf -lelf" - AC_SUBST([LIBBPF_LDADD]) - - AC_CHECK_DECL([xsk_ring_prod__needs_wakeup], [ - AC_DEFINE([HAVE_XDP_NEED_WAKEUP], [1], - [XDP need wakeup support detected in xsk.h.]) - ], [], [[#include ]]) + failed_dep=none + dnl Saving libs to restore in case we will end up not building with AF_XDP. + save_LIBS=$LIBS + + AC_CHECK_HEADER([bpf/libbpf.h], [], [failed_dep="bpf/libbpf.h"]) + + if test "$failed_dep" = none; then + AC_CHECK_HEADER([linux/if_xdp.h], [], [failed_dep="linux/if_xdp.h"]) + fi + + if test "$failed_dep" = none; then + AC_SEARCH_LIBS([libbpf_strerror], [bpf], [], [failed_dep="libbpf"]) + AC_CHECK_FUNCS([bpf_xdp_query_id bpf_xdp_detach]) + fi + + if test "$failed_dep" = none -a "x$ac_cv_func_bpf_xdp_detach" = xyes; then + dnl We have libbpf >= 0.7. Look for libxdp as xsk functions + dnl were moved into this library. 
+ AC_SEARCH_LIBS([libxdp_strerror], [xdp], + AC_CHECK_HEADER([xdp/xsk.h], + AC_DEFINE([HAVE_LIBXDP], [1], [xsk.h is supplied with libxdp]), + [failed_dep="xdp/xsk.h"]), + [failed_dep="libxdp"]) + elif test "$failed_dep" = none; then + dnl libbpf < 0.7 contains all the necessary functionality. + AC_CHECK_HEADER([bpf/xsk.h], [], [failed_dep="bpf/xsk.h"]) + fi + + if test "$failed_dep" = none; then + AC_CHECK_FUNCS([pthread_spin_lock], [], [failed_dep="pthread_spin_lock"]) + fi + + if test "$failed_dep" = none; then + AC_SEARCH_LIBS([numa_alloc_onnode], [numa], [], [failed_dep="libnuma"]) + fi + + if test "$failed_dep" = none; then + AC_DEFINE([HAVE_AF_XDP], [1], + [Define to 1 if AF_XDP support is available and enabled.]) + elif test "$enable_afxdp" = yes; then + AC_MSG_ERROR([Missing $failed_dep dependency for AF_XDP support]) + else + AC_MSG_WARN(m4_normalize( + [Cannot find $failed_dep, netdev-afxdp will not be supported + (use --disable-afxdp to suppress this warning).])) + AF_XDP_ENABLE=false + LIBS=$save_LIBS + fi fi AM_CONDITIONAL([HAVE_AF_XDP], test "$AF_XDP_ENABLE" = true) ]) @@ -357,7 +414,7 @@ AC_DEFUN([OVS_CHECK_DPDK], [ ], [], [[#include ]]) AC_CHECK_DECL([RTE_NET_AF_XDP], [ - LIBBPF_LDADD="-lbpf" + OVS_FIND_DEPENDENCY([libbpf_strerror], [bpf], [libbpf]) ], [], [[#include ]]) AC_CHECK_DECL([RTE_LIBRTE_VHOST_NUMA], [ @@ -440,6 +497,19 @@ AC_DEFUN([OVS_CHECK_DPDK], [ AM_CONDITIONAL([DPDK_NETDEV], test "$DPDKLIB_FOUND" = true) ]) +dnl Append a version suffix. + +AC_DEFUN([OVS_CHECK_VERSION_SUFFIX], [ + AC_ARG_WITH([version-suffix], + [AS_HELP_STRING([--with-version-suffix=ver_suffix], + [Specify a string that will be appended + to OVS version])]) + AC_DEFINE_UNQUOTED([VERSION_SUFFIX], ["$with_version_suffix"], + [Package version suffix]) + AC_SUBST([VERSION_SUFFIX], [$with_version_suffix]) + ]) +]) + dnl Checks for net/if_dl.h. 
dnl dnl (We use this as a proxy for checking whether we're building on FreeBSD diff --git a/appveyor.yml b/appveyor.yml index 25c3f69fb48..d0293b2118c 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -2,59 +2,78 @@ version: 1.0.{build} image: Visual Studio 2019 branches: only: - - master + - main configuration: - Debug - Release clone_folder: C:\openvswitch_compile -init: -- ps: $env:PATH ="C:\Python37;"+$env:PATH -- ps: New-Item -Type HardLink -Path "C:\Python37\python3.exe" -Value "C:\Python37\python.exe" -- ps: >- - mkdir C:\ovs-build-downloads - - mkdir C:\openvswitch\driver - - $source = "https://slproweb.com/download/Win64OpenSSL-1_0_2u.exe" +shallow_clone: true - $destination = "C:\ovs-build-downloads\Win64OpenSSL-1_0_2u.exe" +init: +- ps: $env:PATH ="C:\Python312-x64;"+$env:PATH +- ps: New-Item -Type HardLink -Path "C:\Python312-x64\python3.exe" + -Value "C:\Python312-x64\python.exe" - Invoke-WebRequest $source -OutFile $destination +cache: +- C:\ovs-build-downloads - cd C:\ovs-build-downloads +install: +- ps: | + Remove-Item -Recurse -Force -Path C:/OpenSSL-Win64 + New-Item -ItemType Directory -Force -Path C:\ovs-build-downloads - .\Win64OpenSSL-1_0_2u.exe /silent /verysilent /sp- /suppressmsgboxes + # Find and download the latest stable OpenSSl 3.0. 
+ $URL = "https://raw.githubusercontent.com/slproweb/opensslhashes/master/win32_openssl_hashes.json" + $webData = (Invoke-WebRequest -Uri $URL).content | ConvertFrom-Json + $source = ($webData.files.PSObject.Properties | Where-Object { + $_.Value.basever -match "^3\.0\.[0-9]+" -and + $_.Value.bits -eq "64" -and + $_.Value.arch -eq "INTEL" -and + $_.Value.installer -eq "exe" -and + -not $_.Value.light + } | Select-Object Value | Select -First 1).PSObject.Properties.Value - Start-Sleep -s 30 + Write-Host "Latest OpenSSL 3.0:" ($source | Format-List | Out-String) - cd C:\openvswitch + $destination = "C:\ovs-build-downloads\Win64OpenSSL.exe" + if (Test-Path $destination) { + $fileHash = (Get-FileHash $destination -Algorithm SHA256).Hash.ToLower() + if ($fileHash -ne $source.sha256) { + Write-Host "Cache miss:" $fileHash "!=" $source.sha256 + Remove-Item -Path $destination + } + } - git clone https://git.code.sf.net/p/pthreads4w/code c:\pthreads4w-code + if (Test-Path $destination) { + Write-Host "Using cached:" $destination + } else { + Write-Host "Downloading:" $source.url + Invoke-WebRequest $source.url -OutFile $destination + } - python3 -m pip install pypiwin32 --disable-pip-version-check + Write-Host "Installing:" $destination + Start-Process -FilePath $destination ` + -ArgumentList "/silent /verysilent /sp- /suppressmsgboxes" -Wait - cd C:\openvswitch_compile +- ps: git clone -q https://git.code.sf.net/p/pthreads4w/code c:\pthreads4w-code +- ps: python3 -m pip install pypiwin32 --disable-pip-version-check +- '"C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvars64.bat"' +- ps: C:\msys64\msys2_shell.cmd -here -defterm -no-start -use-full-path -c + ".ci/windows-prepare.sh 2>&1" build_script: -- '"C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvars64.bat"' -- C:\MinGW\msys\1.0\bin\bash -lc "echo \"C:/MinGW /mingw\" > /etc/fstab" -- C:\MinGW\msys\1.0\bin\bash -lc "mv /bin/link.exe 
/bin/link_copy.exe" -# Build pthreads -- C:\MinGW\msys\1.0\bin\bash -lc "cd /c/pthreads4w-code && nmake all install" -- C:\MinGW\msys\1.0\bin\bash -lc "cd /c/openvswitch_compile && ./boot.sh" -- C:\MinGW\msys\1.0\bin\bash -lc "cd /c/openvswitch_compile && ./configure CC=build-aux/cccl LD=\"`which link`\" LIBS=\"-lws2_32 -lShlwapi -liphlpapi -lwbemuuid -lole32 -loleaut32\" --prefix=C:/openvswitch/usr --localstatedir=C:/openvswitch/var --sysconfdir=C:/openvswitch/etc --with-pthread=c:/PTHREADS-BUILT/ --enable-ssl --with-openssl=C:/OpenSSL-Win64 --with-vstudiotarget=\"%CONFIGURATION%\"" -- C:\MinGW\msys\1.0\bin\bash -lc "cd /c/openvswitch_compile && make -j 4" -- C:\MinGW\msys\1.0\bin\bash -lc "cd /c/openvswitch_compile && make datapath_windows_analyze" -- C:\MinGW\msys\1.0\bin\bash -lc "cd /c/openvswitch_compile && make install" -- C:\MinGW\msys\1.0\bin\bash -lc "cd /c/openvswitch_compile && make windows_installer" -- cp C:\PTHREADS-BUILT\bin\pthreadVC3.dll C:\openvswitch\usr\bin -- cp C:\PTHREADS-BUILT\bin\pthreadVC3.dll C:\openvswitch\usr\sbin -- ps: cp C:\openvswitch_compile\datapath-windows\x64\Win10$env:CONFIGURATION\package\* C:\openvswitch\driver -- ps: cp C:\openvswitch_compile\datapath-windows\x64\Win10$env:CONFIGURATION\package.cer C:\openvswitch\driver -- ps: cp C:\openvswitch_compile\datapath-windows\misc\* C:\openvswitch\driver -- cp c:\openvswitch_compile\windows\ovs-windows-installer\bin\x64\Release\OpenvSwitch.msi c:\OpenvSwitch-%CONFIGURATION%.msi +- ps: C:\msys64\msys2_shell.cmd -here -defterm -no-start -use-full-path -c + ".ci/windows-build.sh $env:CONFIGURATION 2>&1" +- ps: cp C:\PTHREADS-BUILT\bin\pthreadVC3.dll C:\openvswitch\usr\bin +- ps: cp C:\PTHREADS-BUILT\bin\pthreadVC3.dll C:\openvswitch\usr\sbin +- ps: mkdir C:\openvswitch\driver +- ps: cp datapath-windows\x64\Win10$env:CONFIGURATION\package\* C:\openvswitch\driver +- ps: cp datapath-windows\x64\Win10$env:CONFIGURATION\package.cer C:\openvswitch\driver +- ps: cp datapath-windows\misc\* 
C:\openvswitch\driver +- ps: cp windows\ovs-windows-installer\bin\x64\Release\OpenvSwitch.msi + c:\OpenvSwitch-$env:CONFIGURATION.msi after_build: - - ps: 7z a C:\ovs-master-$env:CONFIGURATION.zip C:\openvswitch - - ps: Push-AppveyorArtifact C:\ovs-master-$env:CONFIGURATION.zip - - ps: Push-AppveyorArtifact C:\OpenvSwitch-$env:CONFIGURATION.msi +- ps: 7z a C:\ovs-main-$env:CONFIGURATION.zip C:\openvswitch +- ps: Push-AppveyorArtifact C:\ovs-main-$env:CONFIGURATION.zip +- ps: Push-AppveyorArtifact C:\OpenvSwitch-$env:CONFIGURATION.msi diff --git a/build-aux/automake.mk b/build-aux/automake.mk index b9a77a51cfe..d65b6da6c5a 100644 --- a/build-aux/automake.mk +++ b/build-aux/automake.mk @@ -1,11 +1,19 @@ EXTRA_DIST += \ build-aux/calculate-schema-cksum \ build-aux/cccl \ + build-aux/check-structs \ build-aux/cksum-schema-check \ build-aux/dist-docs \ build-aux/dpdkstrip.py \ - build-aux/generate-dhparams-c \ + build-aux/extract-odp-netlink-h \ + build-aux/extract-odp-netlink-macros-h \ + build-aux/extract-odp-netlink-windows-dp-h \ + build-aux/extract-ofp-actions \ + build-aux/extract-ofp-errors \ + build-aux/extract-ofp-fields \ + build-aux/extract-ofp-msgs \ build-aux/gen_ofp_field_decoders \ + build-aux/generate-dhparams-c \ build-aux/initial-tab-allowed-files \ build-aux/sodepends.py \ build-aux/soexpand.py \ @@ -13,8 +21,12 @@ EXTRA_DIST += \ build-aux/xml2nroff FLAKE8_PYFILES += \ - build-aux/dpdkstrip.py \ - build-aux/gen_ofp_field_decoders \ - build-aux/sodepends.py \ - build-aux/soexpand.py \ - build-aux/xml2nroff + build-aux/dpdkstrip.py \ + build-aux/extract-ofp-actions \ + build-aux/extract-ofp-errors \ + build-aux/extract-ofp-fields \ + build-aux/extract-ofp-msgs \ + build-aux/gen_ofp_field_decoders \ + build-aux/sodepends.py \ + build-aux/soexpand.py \ + build-aux/xml2nroff diff --git a/build-aux/extract-ofp-actions b/build-aux/extract-ofp-actions index 0aa6c65f316..cc5c1dbb062 100755 --- a/build-aux/extract-ofp-actions +++ 
b/build-aux/extract-ofp-actions @@ -17,27 +17,30 @@ version_map = {"1.0": 0x01, version_reverse_map = dict((v, k) for (k, v) in version_map.items()) # Map from vendor name to the length of the action header. -vendor_map = {"OF": (0x00000000, 4), +vendor_map = {"OF": (0x00000000, 4), "ONF": (0x4f4e4600, 10), "NX": (0x00002320, 10)} # Basic types used in action arguments. -types = {} -types['uint8_t'] = {"size": 1, "align": 1, "ntoh": None, "hton": None} -types['ovs_be16'] = {"size": 2, "align": 2, "ntoh": "ntohs", "hton": "htons"} -types['ovs_be32'] = {"size": 4, "align": 4, "ntoh": "ntohl", "hton": "htonl"} -types['ovs_be64'] = {"size": 8, "align": 8, "ntoh": "ntohll", "hton": "htonll"} -types['uint16_t'] = {"size": 2, "align": 2, "ntoh": None, "hton": None} -types['uint32_t'] = {"size": 4, "align": 4, "ntoh": None, "hton": None} -types['uint64_t'] = {"size": 8, "align": 8, "ntoh": None, "hton": None} +types = { + "uint8_t" : {"size": 1, "align": 1, "ntoh": None, "hton": None}, + "ovs_be16": {"size": 2, "align": 2, "ntoh": "ntohs", "hton": "htons"}, + "ovs_be32": {"size": 4, "align": 4, "ntoh": "ntohl", "hton": "htonl"}, + "ovs_be64": {"size": 8, "align": 8, "ntoh": "ntohll", "hton": "htonll"}, + "uint16_t": {"size": 2, "align": 2, "ntoh": None, "hton": None}, + "uint32_t": {"size": 4, "align": 4, "ntoh": None, "hton": None}, + "uint64_t": {"size": 8, "align": 8, "ntoh": None, "hton": None}, +} line = "" - +n_errors = 0 arg_structs = set() + def round_up(x, y): return int((x + (y - 1)) / y) * y + def open_file(fn): global file_name global input_file @@ -46,6 +49,7 @@ def open_file(fn): input_file = open(file_name) line_number = 0 + def get_line(): global input_file global line @@ -56,16 +60,18 @@ def get_line(): fatal("unexpected end of input") return line -n_errors = 0 + def error(msg): global n_errors sys.stderr.write("%s:%d: %s\n" % (file_name, line_number, msg)) n_errors += 1 + def fatal(msg): error(msg) sys.exit(1) + def usage(): argv0 = 
os.path.basename(sys.argv[0]) print('''\ @@ -84,10 +90,8 @@ Commands: ''' % {"argv0": argv0}) sys.exit(0) -def extract_ofp_actions(fn, definitions): - error_types = {} - comments = [] +def extract_ofp_actions(fn, definitions): names = [] domain = {} for code, size in vendor_map.values(): @@ -100,14 +104,14 @@ def extract_ofp_actions(fn, definitions): while True: get_line() - if re.match('enum ofp_raw_action_type {', line): + if re.match(r'enum ofp_raw_action_type {', line): break while True: get_line() if line.startswith('/*') or not line or line.isspace(): continue - elif re.match('}', line): + elif re.match(r'}', line): break if not line.lstrip().startswith('/*'): @@ -119,10 +123,10 @@ def extract_ofp_actions(fn, definitions): if line.startswith('/*') or not line or line.isspace(): fatal("unexpected syntax within action") comment += ' %s' % line.lstrip('* \t').rstrip(' \t\r\n') - comment = re.sub('\[[^]]*\]', '', comment) + comment = re.sub(r'\[[^]]*\]', '', comment) comment = comment[:-2].rstrip() - m = re.match('([^:]+):\s+(.*)$', comment) + m = re.match(r'([^:]+):\s+(.*)$', comment) if not m: fatal("unexpected syntax between actions") @@ -147,7 +151,9 @@ def extract_ofp_actions(fn, definitions): names.append(enum) for dst in dsts.split(', '): - m = re.match(r'([A-Z]+)([0-9.]+)(\+|-[0-9.]+)?(?:\((\d+)\))(?: is deprecated \(([^)]+)\))?$', dst) + m = re.match( + r'([A-Z]+)([0-9.]+)(\+|-[0-9.]+)?(?:\((\d+)\))(?:' + r' is deprecated \(([^)]+)\))?$', dst) if not m: fatal("%r: syntax error in destination" % dst) vendor_name = m.group(1) @@ -220,18 +226,18 @@ def extract_ofp_actions(fn, definitions): else: max_length = min_length - info = {"enum": enum, # 0 - "deprecation": deprecation, # 1 - "file_name": file_name, # 2 - "line_number": line_number, # 3 - "min_length": min_length, # 4 - "max_length": max_length, # 5 - "arg_ofs": arg_ofs, # 6 - "arg_len": arg_len, # 7 - "base_argtype": base_argtype, # 8 - "arg_vl_mff_map": arg_vl_mff_map, # 9 - "version": version, # 10 
- "type": type_} # 11 + info = {"enum": enum, # 0 + "deprecation": deprecation, # 1 + "file_name": file_name, # 2 + "line_number": line_number, # 3 + "min_length": min_length, # 4 + "max_length": max_length, # 5 + "arg_ofs": arg_ofs, # 6 + "arg_len": arg_len, # 7 + "base_argtype": base_argtype, # 8 + "arg_vl_mff_map": arg_vl_mff_map, # 9 + "version": version, # 10 + "type": type_} # 11 domain[vendor][type_][version] = info enums.setdefault(enum, []) @@ -247,9 +253,13 @@ def extract_ofp_actions(fn, definitions): """) if definitions: - print("/* Verify that structs used as actions are reasonable sizes. */") + print( + "/* Verify that structs used as actions are reasonable sizes. */" + ) for s in sorted(arg_structs): - print("BUILD_ASSERT_DECL(sizeof(%s) %% OFP_ACTION_ALIGN == 0);" % s) + print( + "BUILD_ASSERT_DECL(sizeof(%s) %% OFP_ACTION_ALIGN == 0);" % s + ) print("\nstatic struct ofpact_raw_instance all_raw_instances[] = {") for vendor in domain: @@ -265,9 +275,11 @@ def extract_ofp_actions(fn, definitions): print(" %s," % d["max_length"]) print(" %s," % d["arg_ofs"]) print(" %s," % d["arg_len"]) - print(" \"%s\"," % re.sub('_RAW[0-9]*', '', d["enum"], 1)) + print(" \"%s\"," + % re.sub(r'_RAW[0-9]*', '', d["enum"], 1)) if d["deprecation"]: - print(" \"%s\"," % re.sub(r'(["\\])', r'\\\1', d["deprecation"])) + print(" \"%s\"," + % re.sub(r'(["\\])', r'\\\1', d["deprecation"])) else: print(" NULL,") print(" },") @@ -286,10 +298,11 @@ def extract_ofp_actions(fn, definitions): decl = "static inline " if base_argtype.startswith('struct'): - decl += "%s *" %base_argtype + decl += "%s *" % base_argtype else: decl += "void" - decl += "\nput_%s(struct ofpbuf *openflow" % versions[0]["enum"].replace('_RAW', '', 1) + decl += "\nput_%s(struct ofpbuf *openflow" \ + % versions[0]["enum"].replace('_RAW', '', 1) if need_ofp_version: decl += ", enum ofp_version version" if base_argtype != 'void' and not base_argtype.startswith('struct'): @@ -348,9 +361,13 @@ ofpact_decode(const 
struct ofp_action_header *a, enum ofp_raw_action_type raw, else: arg = "arg" if arg_vl_mff_map: - print(" return decode_%s(%s, version, vl_mff_map, tlv_bitmap, out);" % (enum, arg)) + print( + " return decode_%s(%s," % (enum, arg), + "version, vl_mff_map, tlv_bitmap, out);" + ) else: - print(" return decode_%s(%s, version, out);" % (enum, arg)) + print(" return decode_%s(%s, version, out);" + % (enum, arg)) print("") print("""\ default: @@ -366,7 +383,8 @@ ofpact_decode(const struct ofp_action_header *a, enum ofp_raw_action_type raw, arg_vl_mff_map = versions[0]["arg_vl_mff_map"] if base_argtype != 'void': if base_argtype.startswith('struct'): - prototype += "const %s *, enum ofp_version, " % base_argtype + prototype += "const %s *, " % base_argtype + prototype += "enum ofp_version, " else: prototype += "%s, enum ofp_version, " % base_argtype if arg_vl_mff_map: @@ -378,13 +396,15 @@ ofpact_decode(const struct ofp_action_header *a, enum ofp_raw_action_type raw, static enum ofperr ofpact_decode(const struct ofp_action_header *, enum ofp_raw_action_type raw, enum ofp_version version, - uint64_t arg, const struct vl_mff_map *vl_mff_map, + uint64_t arg, + const struct vl_mff_map *vl_mff_map, uint64_t *tlv_bitmap, struct ofpbuf *out); """) + -## ------------ ## -## Main Program ## -## ------------ ## +# ------------ # +# Main Program # +# ------------ # if __name__ == '__main__': argv0 = sys.argv[0] diff --git a/build-aux/extract-ofp-errors b/build-aux/extract-ofp-errors index 2c3fbfc881b..eeefccbee05 100755 --- a/build-aux/extract-ofp-errors +++ b/build-aux/extract-ofp-errors @@ -22,6 +22,9 @@ tokenRe = "#?" + idRe + "|[0-9]+|." 
inComment = False inDirective = False +n_errors = 0 + + def open_file(fn): global fileName global inputFile @@ -30,6 +33,7 @@ def open_file(fn): inputFile = open(fileName) lineNumber = 0 + def tryGetLine(): global inputFile global line @@ -38,10 +42,12 @@ def tryGetLine(): lineNumber += 1 return line != "" + def getLine(): if not tryGetLine(): fatal("unexpected end of input") + def getToken(): global token global line @@ -82,37 +88,43 @@ def getToken(): line = line[:-2] + inputFile.readline() lineNumber += 1 if line == "": - if token == None: + if token is None: fatal("unexpected end of input") token = None return False -n_errors = 0 + def error(msg): global n_errors sys.stderr.write("%s:%d: %s\n" % (fileName, lineNumber, msg)) n_errors += 1 + def fatal(msg): error(msg) sys.exit(1) + def skipDirective(): getToken() while token != '$': getToken() + def isId(s): - return re.match(idRe + "$", s) != None + return re.match(idRe + "$", s) is not None + def forceId(): if not isId(token): fatal("identifier expected") + def forceInteger(): - if not re.match('[0-9]+$', token): + if not re.match(r'[0-9]+$', token): fatal("integer expected") + def match(t): if token == t: getToken() @@ -120,10 +132,12 @@ def match(t): else: return False + def forceMatch(t): if not match(t): fatal("%s expected" % t) + def parseTaggedName(): assert token in ('struct', 'union') name = token @@ -133,26 +147,26 @@ def parseTaggedName(): getToken() return name + def print_enum(tag, constants, storage_class): - print (""" + print(""" %(storage_class)sconst char * %(tag)s_to_string(uint16_t value) { switch (value) {\ """ % {"tag": tag, - "bufferlen": len(tag) + 32, "storage_class": storage_class}) for constant in constants: - print (" case %s: return \"%s\";" % (constant, constant)) - print ("""\ + print(" case %s: return \"%s\";" % (constant, constant)) + print("""\ } return NULL; -}\ -""" % {"tag": tag}) +}""") + def usage(): argv0 = os.path.basename(sys.argv[0]) - print ('''\ + print('''\ 
%(argv0)s, for extracting OpenFlow error codes from header files usage: %(argv0)s ERROR_HEADER VENDOR_HEADER @@ -167,6 +181,7 @@ The output is suitable for use as lib/ofp-errors.inc.\ ''' % {"argv0": argv0}) sys.exit(0) + def extract_vendor_ids(fn): global vendor_map vendor_map = {} @@ -174,7 +189,10 @@ def extract_vendor_ids(fn): open_file(fn) while tryGetLine(): - m = re.match(r'#define\s+([A-Z0-9_]+)_VENDOR_ID\s+(0x[0-9a-fA-F]+|[0-9]+)', line) + m = re.match( + r'#define\s+([A-Z0-9_]+)_VENDOR_ID\s+(0x[0-9a-fA-F]+|[0-9]+)', + line + ) if not m: continue @@ -202,9 +220,8 @@ def extract_vendor_ids(fn): % (id_, vendor_reverse_map[id_], name)) vendor_reverse_map[id_] = name -def extract_ofp_errors(fn): - error_types = {} +def extract_ofp_errors(fn): comments = [] names = [] domain = {} @@ -220,14 +237,14 @@ def extract_ofp_errors(fn): while True: getLine() - if re.match('enum ofperr', line): + if re.match(r'enum ofperr', line): break while True: getLine() if line.startswith('/*') or not line or line.isspace(): continue - elif re.match('}', line): + elif re.match(r'}', line): break if not line.lstrip().startswith('/*'): @@ -241,19 +258,19 @@ def extract_ofp_errors(fn): comment += ' %s' % line.lstrip('* \t').rstrip(' \t\r\n') comment = comment[:-2].rstrip() - m = re.match('Expected: (.*)\.$', comment) + m = re.match(r'Expected: (.*)\.$', comment) if m: expected_errors[m.group(1)] = (fileName, lineNumber) continue - m = re.match('((?:.(?!\. ))+.)\.\s+(.*)$', comment) + m = re.match(r'((?:.(?!\. 
))+.)\.\s+(.*)$', comment) if not m: fatal("unexpected syntax between errors") dsts, comment = m.groups() getLine() - m = re.match('\s+(?:OFPERR_([A-Z0-9_]+))(\s*=\s*OFPERR_OFS)?,', + m = re.match(r'\s+(?:OFPERR_([A-Z0-9_]+))(\s*=\s*OFPERR_OFS)?,', line) if not m: fatal("syntax error expecting OFPERR_ enum value") @@ -262,11 +279,14 @@ def extract_ofp_errors(fn): if enum in names: fatal("%s specified twice" % enum) - comments.append(re.sub('\[[^]]*\]', '', comment)) + comments.append(re.sub(r'\[[^]]*\]', '', comment)) names.append(enum) for dst in dsts.split(', '): - m = re.match(r'([A-Z]+)([0-9.]+)(\+|-[0-9.]+)?\((\d+)(?:,(\d+))?\)$', dst) + m = re.match( + r'([A-Z]+)([0-9.]+)(\+|-[0-9.]+)?\((\d+)(?:,(\d+))?\)$', + dst + ) if not m: fatal("%r: syntax error in destination" % dst) vendor_name = m.group(1) @@ -313,8 +333,7 @@ def extract_ofp_errors(fn): # mechanism that includes a type but not a code. if v1 < version_map['1.2'] or v2 < version_map['1.2']: if code is None: - fatal("%s: NX1.0 and NX1.1 domains require code" - % (dst, vendor_name)) + fatal("%s: NX1.0 and NX1.1 domains require code" % dst) if v1 >= version_map['1.2'] or v2 >= version_map['1.2']: if code is not None: fatal("%s: NX1.2+ domains do not have codes" % dst) @@ -340,11 +359,13 @@ def extract_ofp_errors(fn): del expected_errors[msg] else: error("%s: %s." % (dst, msg)) - sys.stderr.write("%s:%d: %s: Here is the location " - "of the previous definition.\n" - % (domain[version][vendor][type_][code][1], - domain[version][vendor][type_][code][2], - dst)) + sys.stderr.write( + "%s:%d: %s: Here is the location " + "of the previous definition.\n" + % (domain[version][vendor][type_][code][1], + domain[version][vendor][type_][code][2], + dst) + ) else: domain[version][vendor][type_][code] = (enum, fileName, lineNumber) @@ -361,7 +382,7 @@ def extract_ofp_errors(fn): if n_errors: sys.exit(1) - print ("""\ + print("""\ /* Generated automatically; do not modify! 
-*- buffer-read-only: t -*- */ #define OFPERR_N_ERRORS %d @@ -386,7 +407,7 @@ static const char *error_comments[OFPERR_N_ERRORS] = { for comment in comments))) def output_domain(map, name, description, version): - print (""" + print(""" static enum ofperr %s_decode(uint32_t vendor, uint16_t type, uint16_t code) { @@ -405,16 +426,16 @@ static enum ofperr vendor_s = "(%#xULL << 32) | " % vendor else: vendor_s = "" - print (" case %s ((uint32_t) %d << 16) | %d:" % (vendor_s, + print(" case %s ((uint32_t) %d << 16) | %d:" % (vendor_s, type_, code)) - print (" return OFPERR_%s;" % enum) - print ("""\ + print(" return OFPERR_%s;" % enum) + print("""\ } return 0; }""") - print (""" + print(""" static const struct ofperr_domain %s = { "%s", %d, @@ -423,20 +444,22 @@ static const struct ofperr_domain %s = { for enum in names: if enum in map: vendor, type_, code = map[enum] - if code == None: + if code is None: code = -1 - print (" { %#8x, %2d, %3d }, /* %s */" % (vendor, type_, code, enum)) + print(" { %#8x, %2d, %3d }, /* %s */" % (vendor, type_, + code, enum)) else: - print (" { -1, -1, -1 }, /* %s */" % enum) - print ("""\ + print(" { -1, -1, -1 }, /* %s */" % enum) + print("""\ }, };""") for version_name, id_ in version_map.items(): - var = 'ofperr_of' + re.sub('[^A-Za-z0-9_]', '', version_name) + var = 'ofperr_of' + re.sub(r'[^A-Za-z0-9_]', '', version_name) description = "OpenFlow %s" % version_name output_domain(reverse[id_], var, description, id_) + if __name__ == '__main__': if '--help' in sys.argv: usage() diff --git a/build-aux/extract-ofp-fields b/build-aux/extract-ofp-fields index efec59c25b3..2657d9249bb 100755 --- a/build-aux/extract-ofp-fields +++ b/build-aux/extract-ofp-fields @@ -4,9 +4,9 @@ import getopt import sys import os.path import xml.dom.minidom -import build.nroff -from build.extract_ofp_fields import ( +from ovs_build_helpers import nroff +from ovs_build_helpers.extract_ofp_fields import ( extract_ofp_fields, PREREQS, OXM_CLASSES, @@ -167,9 
+167,9 @@ def make_nx_match(meta_flow_h): print(oline) -## ------------------------ ## -## Documentation Generation ## -## ------------------------ ## +# ------------------------ # +# Documentation Generation # +# ------------------------ # def field_to_xml(field_node, f, body, summary): @@ -189,9 +189,9 @@ def field_to_xml(field_node, f, body, summary): ovs_version = [int(x) for x in ovs_version_s.split(".")] if min_ovs_version is None or ovs_version < min_ovs_version: min_ovs_version = ovs_version - summary += ["\\fB%s\\fR" % f["name"]] + summary += [r"\fB%s\fR" % f["name"]] if f["extra_name"]: - summary += [" aka \\fB%s\\fR" % f["extra_name"]] + summary += [r" aka \fB%s\fR" % f["extra_name"]] summary += [";%d" % f["n_bytes"]] if f["n_bits"] != 8 * f["n_bytes"]: summary += [" (low %d bits)" % f["n_bits"]] @@ -213,18 +213,18 @@ def field_to_xml(field_node, f, body, summary): title = field_node.attributes["title"].nodeValue body += [ - """.PP -\\fB%s Field\\fR + r""".PP +\fB%s Field\fR .TS -tab(;); +tab(;),nowarn; l lx. """ % title ] - body += ["Name:;\\fB%s\\fR" % f["name"]] + body += [r"Name:;\fB%s\fR" % f["name"]] if f["extra_name"]: - body += [" (aka \\fB%s\\fR)" % f["extra_name"]] + body += [r" (aka \fB%s\fR)" % f["extra_name"]] body += ["\n"] body += ["Width:;"] @@ -297,7 +297,7 @@ l lx. 
body += [".TE\n"] body += [".PP\n"] - body += [build.nroff.block_xml_to_nroff(field_node.childNodes)] + body += [nroff.block_xml_to_nroff(field_node.childNodes)] def group_xml_to_nroff(group_node, fields): @@ -310,17 +310,18 @@ def group_xml_to_nroff(group_node, fields): id_ = node.attributes["id"].nodeValue field_to_xml(node, fields[id_], body, summary) else: - body += [build.nroff.block_xml_to_nroff([node])] + body += [nroff.block_xml_to_nroff([node])] content = [ ".bp\n", - '.SH "%s"\n' % build.nroff.text_to_nroff(title.upper() + " FIELDS"), + '.SH "%s"\n' % nroff.text_to_nroff(title.upper() + " FIELDS"), '.SS "Summary:"\n', ".TS\n", - "tab(;);\n", - "l l l l l l l.\n", + "tab(;),nowarn;\n", + "l l l l l l.\n", "Name;Bytes;Mask;RW?;Prereqs;NXM/OXM Support\n", - "\_;\_;\_;\_;\_;\_\n", + r"\_;\_;\_;\_;\_;\_", + "\n", ] content += summary content += [".TE\n"] @@ -329,7 +330,7 @@ def group_xml_to_nroff(group_node, fields): def make_oxm_classes_xml(document): - s = """tab(;); + s = r"""tab(;),nowarn; l l l. Prefix;Vendor;Class \_;\_;\_ @@ -367,42 +368,41 @@ def make_ovs_fields(meta_flow_h, meta_flow_xml): doc = document.documentElement global version - if version == None: + if version is None: version = "UNKNOWN" print( - """\ -'\\" tp -.\\" -*- mode: troff; coding: utf-8 -*- + r"""'\" tp +.\" -*- mode: troff; coding: utf-8 -*- .TH "ovs\-fields" 7 "%s" "Open vSwitch" "Open vSwitch Manual" -.fp 5 L CR \\" Make fixed-width font available as \\fL. +.fp 5 L CR \" Make fixed-width font available as \fL. .de ST . PP . RS -0.15in -. I "\\\\$1" +. I "\\$1" . RE .. .de SU . PP -. I "\\\\$1" +. I "\\$1" .. .de IQ . br . ns -. IP "\\\\$1" +. IP "\\$1" .. .de TQ . br . ns -. TP "\\\\$1" +. TP "\\$1" .. .de URL -\\\\$2 \\(laURL: \\\\$1 \\(ra\\\\$3 +\\$2 \(laURL: \\$1 \(ra\\$3 .. -.if \\n[.g] .mso www.tmac +.if \n[.g] .mso www.tmac .SH NAME ovs\-fields \- protocol header fields in OpenFlow and Open vSwitch . 
@@ -422,7 +422,7 @@ ovs\-fields \- protocol header fields in OpenFlow and Open vSwitch elif node.nodeType == node.COMMENT_NODE: pass else: - s += build.nroff.block_xml_to_nroff([node]) + s += nroff.block_xml_to_nroff([node]) for f in fields: if "used" not in f: @@ -460,9 +460,9 @@ ovs\-fields \- protocol header fields in OpenFlow and Open vSwitch print(output[i]) -## ------------ ## -## Main Program ## -## ------------ ## +# ------------ # +# Main Program # +# ------------ # if __name__ == "__main__": argv0 = sys.argv[0] diff --git a/build-aux/extract-ofp-msgs b/build-aux/extract-ofp-msgs index 6b3295cf64c..c26ea1d3557 100755 --- a/build-aux/extract-ofp-msgs +++ b/build-aux/extract-ofp-msgs @@ -24,6 +24,9 @@ OFPT11_STATS_REQUEST = 18 OFPT11_STATS_REPLY = 19 OFPST_VENDOR = 0xffff +n_errors = 0 + + def decode_version_range(range): if range in VERSION: return (VERSION[range], VERSION[range]) @@ -35,6 +38,7 @@ def decode_version_range(range): a, b = re.match(r'^([^-]+)-([^-]+)$', range).groups() return (VERSION[a], VERSION[b]) + def get_line(): global line global line_number @@ -43,16 +47,18 @@ def get_line(): if line == "": fatal("unexpected end of input") -n_errors = 0 + def error(msg): global n_errors sys.stderr.write("%s:%d: %s\n" % (file_name, line_number, msg)) n_errors += 1 + def fatal(msg): error(msg) sys.exit(1) + def usage(): argv0 = os.path.basename(sys.argv[0]) print('''\ @@ -65,6 +71,7 @@ only controls #line directives in the output.\ ''' % {"argv0": argv0}) sys.exit(0) + def make_sizeof(s): m = re.match(r'(.*) up to (.*)', s) if m: @@ -73,9 +80,8 @@ def make_sizeof(s): else: return "sizeof(%s)" % s -def extract_ofp_msgs(output_file_name): - raw_types = [] +def extract_ofp_msgs(output_file_name): all_hdrs = {} all_raws = {} all_raws_order = [] @@ -108,15 +114,16 @@ def extract_ofp_msgs(output_file_name): comment += ' %s' % line.lstrip('* \t').rstrip(' \t\r\n') comment = comment[:-2].rstrip() - m = re.match(r'([A-Z]+) ([-.+\d]+|) \((\d+)\): ([^.]+)\.$', 
comment) + m = re.match( + r'([A-Z]+) ([-.+\d]+|) \((\d+)\): ([^.]+)\.$', comment + ) if not m: fatal("unexpected syntax between messages") type_, versions, number, contents = m.groups() number = int(number) get_line() - m = re.match('\s+(?:OFPRAW_%s)(\d*)_([A-Z0-9_]+),?$' % type_, - line) + m = re.match(r'\s+(?:OFPRAW_%s)(\d*)_([A-Z0-9_]+),?$' % type_, line) if not m: fatal("syntax error expecting OFPRAW_ enum") vinfix, name = m.groups() @@ -300,7 +307,7 @@ def extract_ofp_msgs(output_file_name): for hdrs in r['hdrs']: output.append(" { {0, NULL}, {%d, %d, %d, 0x%x, %d}, %s, 0 }," % (hdrs + (raw,))) - + output.append("};") output.append("") @@ -349,8 +356,8 @@ def extract_ofp_msgs(output_file_name): % r["human_name"]) output.append("};") - output.append(""); - output.append("static const char *type_names[] = {"); + output.append("") + output.append("static const char *type_names[] = {") for t in all_types: output.append(" \"%s\"," % t) output.append("};") @@ -378,4 +385,3 @@ if __name__ == '__main__': for line in extract_ofp_msgs(sys.argv[2]): print(line) - diff --git a/build-aux/gen_ofp_field_decoders b/build-aux/gen_ofp_field_decoders index 96f99e860f7..0cb6108c222 100755 --- a/build-aux/gen_ofp_field_decoders +++ b/build-aux/gen_ofp_field_decoders @@ -2,7 +2,7 @@ import argparse -import build.extract_ofp_fields as extract_fields +from ovs_build_helpers.extract_ofp_fields import extract_ofp_fields def main(): @@ -19,15 +19,19 @@ def main(): args = parser.parse_args() - fields = extract_fields.extract_ofp_fields(args.metaflow) + fields = extract_ofp_fields(args.metaflow) field_decoders = {} + aliases = {} for field in fields: decoder = get_decoder(field) field_decoders[field.get("name")] = decoder if field.get("extra_name"): field_decoders[field.get("extra_name")] = decoder + for nxm in field.get("OXM", []): + aliases[nxm[1]] = field.get("name") + code = """ # This file is auto-generated. Do not edit! 
@@ -35,14 +39,25 @@ from ovs.flow import decoders field_decoders = {{ {decoders} +}} + +field_aliases = {{ +{aliases} }}""".format( decoders="\n".join( [ " '{name}': {decoder},".format(name=name, decoder=decoder) for name, decoder in field_decoders.items() ] + ), + aliases="\n".join( + [ + " '{alias}': '{name}',".format(name=name, alias=alias) + for alias, name in aliases.items() + ] ) ) + print(code) diff --git a/build-aux/generate-dhparams-c b/build-aux/generate-dhparams-c index 1884c99e1f0..aca1dbca910 100755 --- a/build-aux/generate-dhparams-c +++ b/build-aux/generate-dhparams-c @@ -1,5 +1,74 @@ #! /bin/sh -e +dhparam_to_c() { + local bits + local get_p=0 + local line + local nl=" +" + local p + local i=0 + while read -r line; do + case "$line" in + *"DH Parameters: "*) + bits=${line#*DH Parameters: (} + bits=${bits% bit)} + continue + ;; + "P:"|"prime:") + get_p=1 + continue + ;; + "G: "*|"generator: "*) + g=${line#*(} + g=${g%)} + g=$(printf "0x%.2X" "$g") + continue + ;; + esac + if [ "$get_p" = 1 ]; then + IFS=":" + for x in $line; do + [ -z "$p" ] && [ "$x" = "00" ] && continue + [ $i -ge 10 ] && i=0 + [ $i -eq 0 ] && p="$p$nl " + x=0x$x + p=$(printf "%s 0x%.2X," "$p" "$x") + i=$((i + 1)) + done + unset IFS + fi + done < $@ -EXTRA_DIST += $(srcdir)/build-aux/extract-odp-netlink-windows-dp-h - CLEANFILES += $(srcdir)/datapath-windows/include/OvsDpInterface.h diff --git a/datapath-windows/ovsext/Actions.c b/datapath-windows/ovsext/Actions.c index 2f44086b469..97029b0f4e1 100644 --- a/datapath-windows/ovsext/Actions.c +++ b/datapath-windows/ovsext/Actions.c @@ -1514,6 +1514,8 @@ OvsUpdateAddressAndPort(OvsForwardingContext *ovsFwdCtx, UINT16 *checkField = NULL; BOOLEAN l4Offload = FALSE; NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo; + UINT16 preNatPseudoChecksum = 0; + BOOLEAN preservePseudoChecksum = FALSE; ASSERT(layers->value != 0); @@ -1549,6 +1551,11 @@ OvsUpdateAddressAndPort(OvsForwardingContext *ovsFwdCtx, * case, we only update the TTL. 
*/ /*Only tx direction the checksum value will be reset to be PseudoChecksum*/ + if (!isTx) { + preNatPseudoChecksum = IPPseudoChecksum(&ipHdr->saddr, &ipHdr->daddr, + tcpHdr ? IPPROTO_TCP : IPPROTO_UDP, + ntohs(ipHdr->tot_len) - ipHdr->ihl * 4); + } if (isSource) { addrField = &ipHdr->saddr; @@ -1565,7 +1572,12 @@ OvsUpdateAddressAndPort(OvsForwardingContext *ovsFwdCtx, ((BOOLEAN)csumInfo.Receive.UdpChecksumSucceeded || (BOOLEAN)csumInfo.Receive.UdpChecksumFailed); } - if (isTx && l4Offload) { + if (!isTx && l4Offload) { + if (*checkField == preNatPseudoChecksum) { + preservePseudoChecksum = TRUE; + } + } + if (isTx && l4Offload || preservePseudoChecksum) { *checkField = IPPseudoChecksum(&newAddr, &ipHdr->daddr, tcpHdr ? IPPROTO_TCP : IPPROTO_UDP, ntohs(ipHdr->tot_len) - ipHdr->ihl * 4); @@ -1585,8 +1597,13 @@ OvsUpdateAddressAndPort(OvsForwardingContext *ovsFwdCtx, ((BOOLEAN)csumInfo.Receive.UdpChecksumSucceeded || (BOOLEAN)csumInfo.Receive.UdpChecksumFailed); } + if (!isTx && l4Offload) { + if (*checkField == preNatPseudoChecksum) { + preservePseudoChecksum = TRUE; + } + } - if (isTx && l4Offload) { + if (isTx && l4Offload || preservePseudoChecksum) { *checkField = IPPseudoChecksum(&ipHdr->saddr, &newAddr, tcpHdr ? IPPROTO_TCP : IPPROTO_UDP, ntohs(ipHdr->tot_len) - ipHdr->ihl * 4); @@ -1595,7 +1612,8 @@ OvsUpdateAddressAndPort(OvsForwardingContext *ovsFwdCtx, if (*addrField != newAddr) { UINT32 oldAddr = *addrField; - if ((checkField && *checkField != 0) && (!l4Offload || !isTx)) { + if ((checkField && *checkField != 0) && + (!l4Offload || (!isTx && !preservePseudoChecksum))) { /* Recompute total checksum. */ *checkField = ChecksumUpdate32(*checkField, oldAddr, newAddr); @@ -1609,7 +1627,8 @@ OvsUpdateAddressAndPort(OvsForwardingContext *ovsFwdCtx, } if (portField && *portField != newPort) { - if ((checkField) && (!l4Offload || !isTx)) { + if ((checkField) && + (!l4Offload || (!isTx && !preservePseudoChecksum))) { /* Recompute total checksum. 
*/ *checkField = ChecksumUpdate16(*checkField, *portField, newPort); diff --git a/debian/changelog b/debian/changelog index 32cea72d9ca..3bc24aa706b 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,8 +1,32 @@ -openvswitch (3.0.90-1) unstable; urgency=low +openvswitch (3.4.90-1) unstable; urgency=low * New upstream version - -- Open vSwitch team Mon, 15 Aug 2022 17:43:59 +0200 + -- Open vSwitch team Mon, 15 Jul 2024 13:00:01 +0100 + +openvswitch (3.4.0-1) unstable; urgency=low + + * New upstream version + + -- Open vSwitch team Mon, 15 Jul 2024 13:00:00 +0100 + +openvswitch (3.3.0-1) unstable; urgency=low + + * New upstream version + + -- Open vSwitch team Fri, 16 Feb 2024 12:25:58 +0100 + +openvswitch (3.2.0-1) unstable; urgency=low + + * New upstream version + + -- Open vSwitch team Thu, 17 Aug 2023 15:20:36 +0200 + +openvswitch (3.1.0-1) unstable; urgency=low + + * New upstream version + + -- Open vSwitch team Thu, 16 Feb 2023 13:52:24 +0100 openvswitch (3.0.0-1) unstable; urgency=low diff --git a/debian/control.in b/debian/control.in index db52c8a99f0..f9eea897ed9 100644 --- a/debian/control.in +++ b/debian/control.in @@ -21,7 +21,7 @@ Build-Depends: iproute2, libcap-ng-dev, libdbus-1-dev [amd64 i386 ppc64el arm64], -# DPDK_NETDEV libdpdk-dev (>= 21.11) [amd64 i386 ppc64el arm64], +# DPDK_NETDEV libdpdk-dev (>= 23.11) [amd64 i386 ppc64el arm64], libnuma-dev [amd64 i386 ppc64el arm64], libpcap-dev [amd64 i386 ppc64el arm64], libssl-dev, @@ -287,6 +287,7 @@ Depends: Suggests: python3-netaddr, python3-pyparsing, + python3-unbound, Description: Python 3 bindings for Open vSwitch Open vSwitch is a production quality, multilayer, software-based, Ethernet virtual switch. 
It is designed to enable massive network diff --git a/debian/openvswitch-switch.init b/debian/openvswitch-switch.init index 7b9fbf61e16..96fe1f7c4c6 100755 --- a/debian/openvswitch-switch.init +++ b/debian/openvswitch-switch.init @@ -47,21 +47,21 @@ load_kmod () { start () { if ovs_ctl load-kmod; then - : + : else - echo "Module has probably not been built for this kernel." - echo "Please install Linux 3.3 or later with openvswitch kernel support." + echo "Module has probably not been built for this kernel." + echo "Please install Linux 3.3 or later with openvswitch kernel support." - if test X"$OVS_MISSING_KMOD_OK" = Xyes; then - # We're being invoked by the package postinst. Do not - # fail package installation just because the kernel module - # is not available. - exit 0 - fi + if test X"$OVS_MISSING_KMOD_OK" = Xyes; then + # We're being invoked by the package postinst. Do not + # fail package installation just because the kernel module + # is not available. + exit 0 + fi fi set ovs_ctl ${1-start} --system-id=random if test X"$FORCE_COREFILES" != X; then - set "$@" --force-corefiles="$FORCE_COREFILES" + set "$@" --force-corefiles="$FORCE_COREFILES" fi set "$@" $OVS_CTL_OPTS "$@" || exit $? @@ -113,7 +113,7 @@ restart () { case $1 in start) start - ;; + ;; stop | force-stop) stop ;; diff --git a/debian/openvswitch-switch.postinst b/debian/openvswitch-switch.postinst index 042e671d514..1a20a944ecc 100755 --- a/debian/openvswitch-switch.postinst +++ b/debian/openvswitch-switch.postinst @@ -30,20 +30,20 @@ case "$1" in mv "${conffile}.dpkg-bak" "${conffile}" fi - # Ensure that /etc/openvswitch/conf.db links to /var/lib/openvswitch, - # moving an existing file if there is one. - # - # Ditto for .conf.db.~lock~. - for base in conf.db .conf.db.~lock~; do - new=/var/lib/openvswitch/$base - old=/etc/openvswitch/$base - if test -f $old && test ! -e $new; then - mv $old $new - fi - if test ! -e $old && test ! 
-h $old; then - ln -s $new $old - fi - done + # Ensure that /etc/openvswitch/conf.db links to /var/lib/openvswitch, + # moving an existing file if there is one. + # + # Ditto for .conf.db.~lock~. + for base in conf.db .conf.db.~lock~; do + new=/var/lib/openvswitch/$base + old=/etc/openvswitch/$base + if test -f $old && test ! -e $new; then + mv $old $new + fi + if test ! -e $old && test ! -h $old; then + ln -s $new $old + fi + done ;; abort-upgrade|abort-remove|abort-deconfigure) diff --git a/debian/openvswitch-test.install b/debian/openvswitch-test.install index b3a80d86ae2..88c82528054 100644 --- a/debian/openvswitch-test.install +++ b/debian/openvswitch-test.install @@ -2,3 +2,4 @@ usr/bin/ovs-l3ping usr/bin/ovs-test usr/share/man/man8/ovs-l3ping.8 usr/share/man/man8/ovs-test.8 +usr/share/openvswitch/scripts/usdt/* diff --git a/debian/rules b/debian/rules index 971bc1775ee..b6f905f3cdd 100755 --- a/debian/rules +++ b/debian/rules @@ -23,21 +23,26 @@ override_dh_auto_configure: test -d _debian || mkdir _debian cd _debian && ( \ test -e Makefile || \ - ../configure --prefix=/usr --localstatedir=/var --enable-ssl \ - --sysconfdir=/etc \ - $(DATAPATH_CONFIGURE_OPTS) \ - $(EXTRA_CONFIGURE_OPTS) \ - ) + ../configure --prefix=/usr --localstatedir=/var \ + --enable-ssl \ + --disable-afxdp \ + --sysconfdir=/etc \ + $(DATAPATH_CONFIGURE_OPTS) \ + $(EXTRA_CONFIGURE_OPTS) \ + ) ifneq (,$(filter i386 amd64 ppc64el arm64, $(DEB_HOST_ARCH))) ifeq (,$(filter nodpdk, $(DEB_BUILD_OPTIONS))) test -d _dpdk || mkdir _dpdk cd _dpdk && ( \ test -e Makefile || \ - ../configure --prefix=/usr --localstatedir=/var --enable-ssl \ - --with-dpdk=shared --sysconfdir=/etc \ - $(DATAPATH_CONFIGURE_OPTS) \ - $(EXTRA_CONFIGURE_OPTS) \ - ) + ../configure --prefix=/usr --localstatedir=/var \ + --enable-ssl \ + --disable-afxdp \ + --with-dpdk=shared \ + --sysconfdir=/etc \ + $(DATAPATH_CONFIGURE_OPTS) \ + $(EXTRA_CONFIGURE_OPTS) \ + ) endif endif @@ -129,8 +134,8 @@ override_dh_python3: # Helper 
target for creating snapshots from upstream git DATE=$(shell date +%Y%m%d) # Upstream branch to track -BRANCH=branch-3.0 -VERSION=3.0.0 +BRANCH=branch-3.4 +VERSION=3.4.0 get-orig-snapshot: rm -Rf openvswitch-upstream diff --git a/include/automake.mk b/include/automake.mk index 6a140c75adf..38ce05a5433 100644 --- a/include/automake.mk +++ b/include/automake.mk @@ -8,7 +8,6 @@ include/odp-netlink-macros.h: include/odp-netlink.h \ build-aux/extract-odp-netlink-macros-h $(AM_V_GEN)sh -f $(srcdir)/build-aux/extract-odp-netlink-macros-h $< > $@ -EXTRA_DIST += build-aux/extract-odp-netlink-h build-aux/extract-odp-netlink-macros-h CLEANFILES += include/odp-netlink.h include/odp-netlink-macros.h include include/openflow/automake.mk diff --git a/include/linux/automake.mk b/include/linux/automake.mk index cdae5eedc48..ac306b53c2c 100644 --- a/include/linux/automake.mk +++ b/include/linux/automake.mk @@ -3,6 +3,7 @@ noinst_HEADERS += \ include/linux/netfilter/nf_conntrack_sctp.h \ include/linux/openvswitch.h \ include/linux/pkt_cls.h \ + include/linux/psample.h \ include/linux/gen_stats.h \ include/linux/tc_act/tc_mpls.h \ include/linux/tc_act/tc_pedit.h \ diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h index 8bb5abdc834..0023b65fbbf 100644 --- a/include/linux/openvswitch.h +++ b/include/linux/openvswitch.h @@ -254,6 +254,7 @@ enum ovs_vport_type { OVS_VPORT_TYPE_IP6GRE = 109, OVS_VPORT_TYPE_GTPU = 110, OVS_VPORT_TYPE_BAREUDP = 111, /* Bareudp tunnel. */ + OVS_VPORT_TYPE_SRV6 = 112, /* SRv6 tunnel. */ __OVS_VPORT_TYPE_MAX }; @@ -301,11 +302,25 @@ enum ovs_vport_attr { OVS_VPORT_ATTR_PAD, OVS_VPORT_ATTR_IFINDEX, OVS_VPORT_ATTR_NETNSID, + OVS_VPORT_ATTR_UPCALL_STATS, __OVS_VPORT_ATTR_MAX }; #define OVS_VPORT_ATTR_MAX (__OVS_VPORT_ATTR_MAX - 1) +/** + * enum ovs_vport_upcall_attr - attributes for %OVS_VPORT_UPCALL* commands + * @OVS_VPORT_UPCALL_ATTR_SUCCESS: 64-bit upcall success packets. + * @OVS_VPORT_UPCALL_ATTR_FAIL: 64-bit upcall fail packets. 
+ */ +enum ovs_vport_upcall_attr { + OVS_VPORT_UPCALL_ATTR_SUCCESS, + OVS_VPORT_UPCALL_ATTR_FAIL, + __OVS_VPORT_UPCALL_ATTR_MAX, +}; + +#define OVS_VPORT_UPCALL_ATTR_MAX (__OVS_VPORT_UPCALL_ATTR_MAX - 1) + enum { OVS_VXLAN_EXT_UNSPEC, OVS_VXLAN_EXT_GBP, @@ -977,6 +992,31 @@ struct check_pkt_len_arg { }; #endif +#define OVS_PSAMPLE_COOKIE_MAX_SIZE 16 +/** + * enum ovs_psample_attr - Attributes for %OVS_ACTION_ATTR_PSAMPLE + * action. + * + * @OVS_PSAMPLE_ATTR_GROUP: 32-bit number to identify the source of the + * sample. + * @OVS_PSAMPLE_ATTR_COOKIE: An optional variable-length binary cookie that + * contains user-defined metadata. The maximum length is + * OVS_PSAMPLE_COOKIE_MAX_SIZE bytes. + * + * Sends the packet to the psample multicast group with the specified group and + * cookie. It is possible to combine this action with the + * %OVS_ACTION_ATTR_TRUNC action to limit the size of the sample. + */ +enum ovs_psample_attr { + OVS_PSAMPLE_ATTR_GROUP = 1, /* u32 number. */ + OVS_PSAMPLE_ATTR_COOKIE, /* Optional, user specified cookie. */ + + /* private: */ + __OVS_PSAMPLE_ATTR_MAX +}; + +#define OVS_PSAMPLE_ATTR_MAX (__OVS_PSAMPLE_ATTR_MAX - 1) + /** * enum ovs_action_attr - Action types. * @@ -1041,6 +1081,8 @@ struct check_pkt_len_arg { * of l3 tunnel flag in the tun_flags field of OVS_ACTION_ATTR_ADD_MPLS * argument. * @OVS_ACTION_ATTR_DROP: Explicit drop action. + * @OVS_ACTION_ATTR_PSAMPLE: Send a sample of the packet to external observers + * via psample. */ enum ovs_action_attr { @@ -1070,11 +1112,13 @@ enum ovs_action_attr { OVS_ACTION_ATTR_CLONE, /* Nested OVS_CLONE_ATTR_*. */ OVS_ACTION_ATTR_CHECK_PKT_LEN, /* Nested OVS_CHECK_PKT_LEN_ATTR_*. */ OVS_ACTION_ATTR_ADD_MPLS, /* struct ovs_action_add_mpls. */ + OVS_ACTION_ATTR_DEC_TTL, /* Nested OVS_DEC_TTL_ATTR_*. */ + OVS_ACTION_ATTR_DROP, /* u32 xlate_error. */ + OVS_ACTION_ATTR_PSAMPLE, /* Nested OVS_PSAMPLE_ATTR_*. 
*/ #ifndef __KERNEL__ OVS_ACTION_ATTR_TUNNEL_PUSH, /* struct ovs_action_push_tnl*/ OVS_ACTION_ATTR_TUNNEL_POP, /* u32 port number. */ - OVS_ACTION_ATTR_DROP, /* u32 xlate_error. */ OVS_ACTION_ATTR_LB_OUTPUT, /* u32 bond-id. */ #endif __OVS_ACTION_ATTR_MAX, /* Nothing past this will be accepted @@ -1168,6 +1212,14 @@ struct ovs_zone_limit { __u32 count; }; +enum ovs_dec_ttl_attr { + OVS_DEC_TTL_ATTR_UNSPEC, + OVS_DEC_TTL_ATTR_ACTION, /* Nested struct nlattr */ + __OVS_DEC_TTL_ATTR_MAX +}; + +#define OVS_DEC_TTL_ATTR_MAX (__OVS_DEC_TTL_ATTR_MAX - 1) + #define OVS_CLONE_ATTR_EXEC 0 /* Specify an u32 value. When nonzero, * actions in clone will not change flow * keys. False otherwise. diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h index ba82e690eba..fb4a7ecea4c 100644 --- a/include/linux/pkt_cls.h +++ b/include/linux/pkt_cls.h @@ -1,7 +1,7 @@ #ifndef __LINUX_PKT_CLS_WRAPPER_H #define __LINUX_PKT_CLS_WRAPPER_H 1 -#if defined(__KERNEL__) || defined(HAVE_TCA_POLICE_PKTRATE64) +#if defined(__KERNEL__) || defined(HAVE_TCA_ACT_FLAGS_SKIP_HW) #include_next #else @@ -21,9 +21,12 @@ enum { __TCA_ACT_MAX }; -#define TCA_ACT_FLAGS_NO_PERCPU_STATS 1 /* Don't use percpu allocator for - * actions stats. - */ +/* See other TCA_ACT_FLAGS_ * flags in include/net/act_api.h. */ +#define TCA_ACT_FLAGS_NO_PERCPU_STATS (1 << 0) /* Don't use percpu allocator for + * actions stats. 
+ */ +#define TCA_ACT_FLAGS_SKIP_HW (1 << 1) /* don't offload action to HW */ +#define TCA_ACT_FLAGS_SKIP_SW (1 << 2) /* don't use action in SW */ #define TCA_ACT_MAX __TCA_ACT_MAX #define TCA_OLD_COMPAT (TCA_ACT_MAX+1) @@ -270,6 +273,10 @@ enum { * TCA_TUNNEL_KEY_ENC_OPTS_GENEVE * attributes */ + TCA_FLOWER_KEY_ENC_OPTS_VXLAN, /* Nested + * TCA_TUNNEL_KEY_ENC_OPTS_VXLAN + * attributes + */ __TCA_FLOWER_KEY_ENC_OPTS_MAX, }; @@ -287,6 +294,15 @@ enum { #define TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX \ (__TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX - 1) +enum { + TCA_FLOWER_KEY_ENC_OPT_VXLAN_UNSPEC, + TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP, /* u32 */ + __TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX, +}; + +#define TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX \ + (__TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX - 1) + enum { TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0), TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1), diff --git a/include/linux/psample.h b/include/linux/psample.h new file mode 100644 index 00000000000..d5761b73072 --- /dev/null +++ b/include/linux/psample.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __LINUX_PSAMPLE_H +#define __LINUX_PSAMPLE_H + +enum { + PSAMPLE_ATTR_IIFINDEX, + PSAMPLE_ATTR_OIFINDEX, + PSAMPLE_ATTR_ORIGSIZE, + PSAMPLE_ATTR_SAMPLE_GROUP, + PSAMPLE_ATTR_GROUP_SEQ, + PSAMPLE_ATTR_SAMPLE_RATE, + PSAMPLE_ATTR_DATA, + PSAMPLE_ATTR_GROUP_REFCOUNT, + PSAMPLE_ATTR_TUNNEL, + + PSAMPLE_ATTR_PAD, + PSAMPLE_ATTR_OUT_TC, /* u16 */ + PSAMPLE_ATTR_OUT_TC_OCC, /* u64, bytes */ + PSAMPLE_ATTR_LATENCY, /* u64, nanoseconds */ + PSAMPLE_ATTR_TIMESTAMP, /* u64, nanoseconds */ + PSAMPLE_ATTR_PROTO, /* u16 */ + PSAMPLE_ATTR_USER_COOKIE, /* binary, user provided data */ + PSAMPLE_ATTR_SAMPLE_PROBABILITY,/* no argument, interpret rate in + * PSAMPLE_ATTR_SAMPLE_RATE as a + * probability scaled 0 - U32_MAX. 
+ */ + + __PSAMPLE_ATTR_MAX +}; + +enum psample_command { + PSAMPLE_CMD_SAMPLE, + PSAMPLE_CMD_GET_GROUP, + PSAMPLE_CMD_NEW_GROUP, + PSAMPLE_CMD_DEL_GROUP, + PSAMPLE_CMD_SAMPLE_FILTER_SET, +}; + +enum psample_tunnel_key_attr { + PSAMPLE_TUNNEL_KEY_ATTR_ID, /* be64 Tunnel ID */ + PSAMPLE_TUNNEL_KEY_ATTR_IPV4_SRC, /* be32 src IP address. */ + PSAMPLE_TUNNEL_KEY_ATTR_IPV4_DST, /* be32 dst IP address. */ + PSAMPLE_TUNNEL_KEY_ATTR_TOS, /* u8 Tunnel IP ToS. */ + PSAMPLE_TUNNEL_KEY_ATTR_TTL, /* u8 Tunnel IP TTL. */ + PSAMPLE_TUNNEL_KEY_ATTR_DONT_FRAGMENT, /* No argument, set DF. */ + PSAMPLE_TUNNEL_KEY_ATTR_CSUM, /* No argument. CSUM packet. */ + PSAMPLE_TUNNEL_KEY_ATTR_OAM, /* No argument. OAM frame. */ + PSAMPLE_TUNNEL_KEY_ATTR_GENEVE_OPTS, /* Array of Geneve options. */ + PSAMPLE_TUNNEL_KEY_ATTR_TP_SRC, /* be16 src Transport Port. */ + PSAMPLE_TUNNEL_KEY_ATTR_TP_DST, /* be16 dst Transport Port. */ + PSAMPLE_TUNNEL_KEY_ATTR_VXLAN_OPTS, /* Nested VXLAN opts* */ + PSAMPLE_TUNNEL_KEY_ATTR_IPV6_SRC, /* struct in6_addr src IPv6 address. */ + PSAMPLE_TUNNEL_KEY_ATTR_IPV6_DST, /* struct in6_addr dst IPv6 address. */ + PSAMPLE_TUNNEL_KEY_ATTR_PAD, + PSAMPLE_TUNNEL_KEY_ATTR_ERSPAN_OPTS, /* struct erspan_metadata */ + PSAMPLE_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE, /* No argument. 
IPV4_INFO_BRIDGE mode.*/ + __PSAMPLE_TUNNEL_KEY_ATTR_MAX +}; + +/* Can be overridden at runtime by module option */ +#define PSAMPLE_ATTR_MAX (__PSAMPLE_ATTR_MAX - 1) + +#define PSAMPLE_NL_MCGRP_CONFIG_NAME "config" +#define PSAMPLE_NL_MCGRP_SAMPLE_NAME "packets" +#define PSAMPLE_GENL_NAME "psample" +#define PSAMPLE_GENL_VERSION 1 + +#endif diff --git a/include/linux/tc_act/tc_tunnel_key.h b/include/linux/tc_act/tc_tunnel_key.h index f13acf17dd7..17291b90bf3 100644 --- a/include/linux/tc_act/tc_tunnel_key.h +++ b/include/linux/tc_act/tc_tunnel_key.h @@ -1,7 +1,7 @@ #ifndef __LINUX_TC_ACT_TC_TUNNEL_KEY_WRAPPER_H #define __LINUX_TC_ACT_TC_TUNNEL_KEY_WRAPPER_H 1 -#if defined(__KERNEL__) || defined(HAVE_TCA_TUNNEL_KEY_ENC_TTL) +#if defined(__KERNEL__) || defined(HAVE_TCA_TUNNEL_KEY_ENC_OPTS_VXLAN) #include_next #else @@ -53,6 +53,10 @@ enum { * TCA_TUNNEL_KEY_ENC_OPTS_GENEVE * attributes */ + TCA_TUNNEL_KEY_ENC_OPTS_VXLAN, /* Nested + * TCA_TUNNEL_KEY_ENC_OPTS_VXLAN + * attributes + */ __TCA_TUNNEL_KEY_ENC_OPTS_MAX, }; @@ -70,6 +74,15 @@ enum { #define TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX \ (__TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX - 1) -#endif /* __KERNEL__ || HAVE_TCA_TUNNEL_KEY_ENC_TTL */ +enum { + TCA_TUNNEL_KEY_ENC_OPT_VXLAN_UNSPEC, + TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP, /* u32 */ + __TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX, +}; + +#define TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX \ + (__TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX - 1) + +#endif /* __KERNEL__ || HAVE_TCA_TUNNEL_KEY_ENC_OPTS_VXLAN */ #endif /* __LINUX_TC_ACT_TC_TUNNEL_KEY_WRAPPER_H */ diff --git a/include/openflow/automake.mk b/include/openflow/automake.mk index a1d75756c9d..820c09f84bd 100644 --- a/include/openflow/automake.mk +++ b/include/openflow/automake.mk @@ -22,6 +22,3 @@ HSTAMP_FILES = $(openflowinclude_HEADERS:.h=.hstamp) CLEANFILES += $(HSTAMP_FILES) ALL_LOCAL += $(HSTAMP_FILES) $(HSTAMP_FILES): build-aux/check-structs $(openflowinclude_HEADERS) - -EXTRA_DIST += build-aux/check-structs - diff --git 
a/include/openflow/nicira-ext.h b/include/openflow/nicira-ext.h index b68804991aa..959845ce6d7 100644 --- a/include/openflow/nicira-ext.h +++ b/include/openflow/nicira-ext.h @@ -1064,4 +1064,45 @@ struct nx_zone_id { }; OFP_ASSERT(sizeof(struct nx_zone_id) == 8); +/* CT flush available TLVs. */ +enum nx_ct_flush_tlv_type { + /* Outer types. */ + NXT_CT_ORIG_TUPLE = 0, /* Outer type for original tuple TLV. + * Nested TLVs are specified + * by 'enum nx_ct_flush_tuple_tlv_type'. */ + NXT_CT_REPLY_TUPLE = 1, /* Outer type for reply tuple TLV. * + * Nested TLVs are specified + * by 'enum nx_ct_flush_tuple_tlv_type'*/ + /* Primitive types. */ + NXT_CT_ZONE_ID = 2, /* be16 zone id. */ + NXT_CT_MARK = 3, /* be32 mark. */ + NXT_CT_MARK_MASK = 4, /* be32 mark mask. */ + NXT_CT_LABELS = 5, /* be128 labels. */ + NXT_CT_LABELS_MASK = 6, /* be128 labels mask. */ +}; + +/* CT flush nested TLVs. */ +enum nx_ct_flush_tuple_tlv_type { + NXT_CT_TUPLE_SRC = 0, /* IPv6 or mapped IPv4 address. */ + NXT_CT_TUPLE_DST = 1, /* IPv6 or mapped IPv4 address. */ + NXT_CT_TUPLE_SRC_PORT = 2, /* be16 source port. */ + NXT_CT_TUPLE_DST_PORT = 3, /* be16 destination port. */ + NXT_CT_TUPLE_ICMP_ID = 4, /* be16 ICMP id. */ + NXT_CT_TUPLE_ICMP_TYPE = 5, /* u8 ICMP type. */ + NXT_CT_TUPLE_ICMP_CODE = 6, /* u8 ICMP code. */ +}; + +/* NXT_CT_FLUSH. + * + * Flushes the connection tracking entries specified by 5-tuple. + * The struct should be followed by TLVs specifying the matching parameters. + * Currently there is a limitation for ICMP, in order to partially match on + * ICMP parameters the tuple should include at least SRC/DST. */ +struct nx_ct_flush { + uint8_t ip_proto; /* IP protocol. */ + uint8_t pad[7]; /* Align to 64 bits (must be zero). */ + /* Followed by optional TLVs of type 'enum nx_ct_flush_tlv_type'. 
*/ +}; +OFP_ASSERT(sizeof(struct nx_ct_flush) == 8); + #endif /* openflow/nicira-ext.h */ diff --git a/include/openvswitch/automake.mk b/include/openvswitch/automake.mk index 84670d80aae..0cc1f569e0a 100644 --- a/include/openvswitch/automake.mk +++ b/include/openvswitch/automake.mk @@ -15,6 +15,7 @@ openvswitchinclude_HEADERS = \ include/openvswitch/ofp-actions.h \ include/openvswitch/ofp-bundle.h \ include/openvswitch/ofp-connection.h \ + include/openvswitch/ofp-ct.h \ include/openvswitch/ofp-ed-props.h \ include/openvswitch/ofp-errors.h \ include/openvswitch/ofp-flow.h \ diff --git a/include/openvswitch/compiler.h b/include/openvswitch/compiler.h index cf009f82644..ecb91801cc1 100644 --- a/include/openvswitch/compiler.h +++ b/include/openvswitch/compiler.h @@ -37,6 +37,22 @@ #define OVS_NO_RETURN #endif +#if __GNUC__ && !__CHECKER__ +#define OVS_RETURNS_NONNULL __attribute__((returns_nonnull)) +#else +#define OVS_RETURNS_NONNULL +#endif + +#ifndef typeof +#define typeof __typeof__ +#endif + +#ifndef __cplusplus +#ifndef asm +#define asm __asm__ +#endif +#endif + #if __GNUC__ && !__CHECKER__ #define OVS_UNUSED __attribute__((__unused__)) #define OVS_PRINTF_FORMAT(FMT, ARG1) __attribute__((__format__(printf, FMT, ARG1))) @@ -53,6 +69,17 @@ #define OVS_UNLIKELY(CONDITION) (!!(CONDITION)) #endif +/* Clang 17's implementation of ubsan enables checking that function pointers + * match the type of the called function. This currently breaks ovs-rcu, which + * calls multiple different types of callbacks via a generic void *(void*) + * function pointer type. This macro enables disabling that check for specific + * functions. */ +#if __clang__ && __has_feature(undefined_behavior_sanitizer) +#define OVS_NO_SANITIZE_FUNCTION __attribute__((no_sanitize("function"))) +#else +#define OVS_NO_SANITIZE_FUNCTION +#endif + #if __has_feature(c_thread_safety_attributes) /* "clang" annotations for thread safety check. 
* diff --git a/include/openvswitch/json.h b/include/openvswitch/json.h index 35b403c29bd..55544076084 100644 --- a/include/openvswitch/json.h +++ b/include/openvswitch/json.h @@ -14,8 +14,8 @@ * limitations under the License. */ -#ifndef JSON_H -#define JSON_H 1 +#ifndef OPENVSWITCH_JSON_H +#define OPENVSWITCH_JSON_H 1 /* This is an implementation of JavaScript Object Notation (JSON) as specified * by RFC 4627. It is intended to fully comply with RFC 4627, with the @@ -91,6 +91,7 @@ struct json *json_array_create(struct json **, size_t n); struct json *json_array_create_1(struct json *); struct json *json_array_create_2(struct json *, struct json *); struct json *json_array_create_3(struct json *, struct json *, struct json *); +bool json_array_contains_string(const struct json *, const char *); struct json *json_object_create(void); void json_object_put(struct json *, const char *name, struct json *value); @@ -158,14 +159,14 @@ json_clone(const struct json *json_) return json; } -void json_destroy__(struct json *json); +void json_destroy__(struct json *json, bool); /* Frees 'json' and everything it points to, recursively. */ static inline void json_destroy(struct json *json) { if (json && !--json->count) { - json_destroy__(json); + json_destroy__(json, false); } } @@ -173,4 +174,4 @@ json_destroy(struct json *json) } #endif -#endif /* json.h */ +#endif /* OPENVSWITCH_JSON_H */ diff --git a/include/openvswitch/meta-flow.h b/include/openvswitch/meta-flow.h index 045dce8f5fa..aff917bcf60 100644 --- a/include/openvswitch/meta-flow.h +++ b/include/openvswitch/meta-flow.h @@ -2233,6 +2233,9 @@ union mf_subvalue { }; BUILD_ASSERT_DECL(sizeof(union mf_value) == sizeof (union mf_subvalue)); +/* A const mf_subvalue with all bits initialized to ones. 
*/ +extern const union mf_subvalue exact_sub_match_mask; + bool mf_subvalue_intersect(const union mf_subvalue *a_value, const union mf_subvalue *a_mask, const union mf_subvalue *b_value, @@ -2366,6 +2369,10 @@ void mf_format_subvalue(const union mf_subvalue *subvalue, struct ds *s); void field_array_set(enum mf_field_id id, const union mf_value *, struct field_array *); +/* Mask the required l3 prerequisites if a 'set' action occurs. */ +void mf_set_mask_l3_prereqs(const struct mf_field *, const struct flow *, + struct flow_wildcards *); + #ifdef __cplusplus } #endif diff --git a/include/openvswitch/netdev.h b/include/openvswitch/netdev.h index 0c10f7b487c..83e8633dda6 100644 --- a/include/openvswitch/netdev.h +++ b/include/openvswitch/netdev.h @@ -87,6 +87,10 @@ struct netdev_stats { uint64_t rx_oversize_errors; uint64_t rx_fragmented_errors; uint64_t rx_jabber_errors; + + /* Datapath upcall statistics. */ + uint64_t upcall_packets; /* Rx packets forwarded to userspace. */ + uint64_t upcall_errors; /* Rx packets failed forwarding to userspace. */ }; /* Structure representation of custom statistics counter */ @@ -121,11 +125,14 @@ enum netdev_features { NETDEV_F_PAUSE_ASYM = 1 << 15, /* Asymmetric pause. */ }; +#define NETDEV_DEFAULT_BPS UINT64_C(10 * 1000 * 1000 * 1000) + int netdev_get_features(const struct netdev *, enum netdev_features *current, enum netdev_features *advertised, enum netdev_features *supported, enum netdev_features *peer); +int netdev_get_speed(const struct netdev *, uint32_t *current, uint32_t *max); uint64_t netdev_features_to_bps(enum netdev_features features, uint64_t default_bps); bool netdev_features_is_full_duplex(enum netdev_features features); diff --git a/include/openvswitch/ofp-actions.h b/include/openvswitch/ofp-actions.h index 7b57e49ad65..56dc2c1476c 100644 --- a/include/openvswitch/ofp-actions.h +++ b/include/openvswitch/ofp-actions.h @@ -1015,14 +1015,16 @@ enum nx_action_sample_direction { /* OFPACT_SAMPLE. 
* - * Used for NXAST_SAMPLE, NXAST_SAMPLE2, and NXAST_SAMPLE3. */ + * Used for NXAST_SAMPLE, NXAST_SAMPLE2, NXAST_SAMPLE3 and NXAST_SAMPLE4. */ struct ofpact_sample { OFPACT_PADDED_MEMBERS( struct ofpact ofpact; uint16_t probability; /* Always positive. */ uint32_t collector_set_id; - uint32_t obs_domain_id; - uint32_t obs_point_id; + uint32_t obs_domain_imm; + struct mf_subfield obs_domain_src; + uint32_t obs_point_imm; + struct mf_subfield obs_point_src; ofp_port_t sampling_port; enum nx_action_sample_direction direction; ); diff --git a/include/openvswitch/ofp-ct.h b/include/openvswitch/ofp-ct.h new file mode 100644 index 00000000000..d57b6267843 --- /dev/null +++ b/include/openvswitch/ofp-ct.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2023, Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef OPENVSWITCH_OFP_CT_H +#define OPENVSWITCH_OFP_CT_H 1 + +#include +#include +#include +#include + +#include "openflow/nicira-ext.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct ofp_ct_tuple { + struct in6_addr src; + struct in6_addr dst; + + union { + ovs_be16 src_port; + ovs_be16 icmp_id; + }; + union { + ovs_be16 dst_port; + struct { + uint8_t icmp_code; + uint8_t icmp_type; + }; + }; +}; + +struct ofp_ct_match { + uint8_t ip_proto; + uint16_t l3_type; + + struct ofp_ct_tuple tuple_orig; + struct ofp_ct_tuple tuple_reply; + + uint32_t mark; + uint32_t mark_mask; + + ovs_u128 labels; + ovs_u128 labels_mask; +}; + +bool ofp_ct_match_is_zero(const struct ofp_ct_match *); +bool ofp_ct_match_is_five_tuple(const struct ofp_ct_match *); + +void ofp_ct_match_format(struct ds *, const struct ofp_ct_match *); +bool ofp_ct_match_parse(const char **, int argc, struct ds *, + struct ofp_ct_match *, bool *with_zone, + uint16_t *zone_id); + +enum ofperr ofp_ct_match_decode(struct ofp_ct_match *, bool *with_zone, + uint16_t *zone_id, const struct ofp_header *); +struct ofpbuf *ofp_ct_match_encode(const struct ofp_ct_match *, + uint16_t *zone_id, + enum ofp_version version); + +#ifdef __cplusplus +} +#endif + +#endif /* ofp-ct.h */ diff --git a/include/openvswitch/ofp-group.h b/include/openvswitch/ofp-group.h index cd7af0ebff9..7cbb2f70f31 100644 --- a/include/openvswitch/ofp-group.h +++ b/include/openvswitch/ofp-group.h @@ -70,6 +70,10 @@ struct ofputil_bucket *ofputil_bucket_find(const struct ovs_list *, bool ofputil_bucket_check_duplicate_id(const struct ovs_list *); struct ofputil_bucket *ofputil_bucket_list_front(const struct ovs_list *); struct ofputil_bucket *ofputil_bucket_list_back(const struct ovs_list *); +void ofputil_bucket_format(struct ds *, const struct ofputil_bucket *, + enum ofp11_group_type, enum ofp_version, + const struct ofputil_port_map *, + const struct ofputil_table_map *); static inline bool ofputil_bucket_has_liveness(const 
struct ofputil_bucket *bucket) @@ -88,6 +92,8 @@ struct ofputil_group_props { void ofputil_group_properties_destroy(struct ofputil_group_props *); void ofputil_group_properties_copy(struct ofputil_group_props *to, const struct ofputil_group_props *from); +void ofputil_group_properties_format(const struct ofputil_group_props *, + struct ds *); /* Protocol-independent group_mod. */ struct ofputil_group_mod { uint16_t command; /* One of OFPGC15_*. */ @@ -199,6 +205,14 @@ enum ofperr ofputil_group_desc_format(struct ds *, const struct ofp_header *, enum ofperr ofputil_group_features_format(struct ds *, const struct ofp_header *); +/* Group formatting. */ +void ofputil_group_format(struct ds *s, uint32_t group_id, uint8_t type, + const struct ofputil_bucket *, + const struct ovs_list *p_buckets, + const struct ofputil_group_props *, + enum ofp_version, bool suppress_type, + const struct ofputil_port_map *, + const struct ofputil_table_map *); #ifdef __cplusplus } #endif diff --git a/include/openvswitch/ofp-msgs.h b/include/openvswitch/ofp-msgs.h index 921a937e5e3..708427fc041 100644 --- a/include/openvswitch/ofp-msgs.h +++ b/include/openvswitch/ofp-msgs.h @@ -515,6 +515,9 @@ enum ofpraw { /* NXT 1.0+ (29): struct nx_zone_id. */ OFPRAW_NXT_CT_FLUSH_ZONE, + /* NXT 1.0+ (32): struct nx_ct_flush, uint8_t[8][]. */ + OFPRAW_NXT_CT_FLUSH, + /* NXST 1.0+ (3): void. */ OFPRAW_NXST_IPFIX_BRIDGE_REQUEST, @@ -772,6 +775,7 @@ enum ofptype { OFPTYPE_IPFIX_FLOW_STATS_REQUEST, /* OFPRAW_NXST_IPFIX_FLOW_REQUEST */ OFPTYPE_IPFIX_FLOW_STATS_REPLY, /* OFPRAW_NXST_IPFIX_FLOW_REPLY */ OFPTYPE_CT_FLUSH_ZONE, /* OFPRAW_NXT_CT_FLUSH_ZONE. */ + OFPTYPE_CT_FLUSH, /* OFPRAW_NXT_CT_FLUSH. */ /* Flow monitor extension. */ OFPTYPE_FLOW_MONITOR_CANCEL, /* OFPRAW_NXT_FLOW_MONITOR_CANCEL. 
diff --git a/include/openvswitch/ofp-prop.h b/include/openvswitch/ofp-prop.h index e676f8dc0f7..afc86a5f701 100644 --- a/include/openvswitch/ofp-prop.h +++ b/include/openvswitch/ofp-prop.h @@ -84,10 +84,12 @@ enum ofperr ofpprop_pull(struct ofpbuf *msg, struct ofpbuf *property, enum ofperr ofpprop_parse_be16(const struct ofpbuf *, ovs_be16 *value); enum ofperr ofpprop_parse_be32(const struct ofpbuf *, ovs_be32 *value); enum ofperr ofpprop_parse_be64(const struct ofpbuf *, ovs_be64 *value); +enum ofperr ofpprop_parse_be128(const struct ofpbuf *, ovs_be128 *value); enum ofperr ofpprop_parse_u8(const struct ofpbuf *, uint8_t *value); enum ofperr ofpprop_parse_u16(const struct ofpbuf *, uint16_t *value); enum ofperr ofpprop_parse_u32(const struct ofpbuf *, uint32_t *value); enum ofperr ofpprop_parse_u64(const struct ofpbuf *, uint64_t *value); +enum ofperr ofpprop_parse_u128(const struct ofpbuf *, ovs_u128 *value); enum ofperr ofpprop_parse_uuid(const struct ofpbuf *, struct uuid *); enum ofperr ofpprop_parse_nested(const struct ofpbuf *, struct ofpbuf *); @@ -98,10 +100,12 @@ void *ofpprop_put_zeros(struct ofpbuf *, uint64_t type, size_t len); void ofpprop_put_be16(struct ofpbuf *, uint64_t type, ovs_be16 value); void ofpprop_put_be32(struct ofpbuf *, uint64_t type, ovs_be32 value); void ofpprop_put_be64(struct ofpbuf *, uint64_t type, ovs_be64 value); +void ofpprop_put_be128(struct ofpbuf *, uint64_t type, ovs_be128 value); void ofpprop_put_u8(struct ofpbuf *, uint64_t type, uint8_t value); void ofpprop_put_u16(struct ofpbuf *, uint64_t type, uint16_t value); void ofpprop_put_u32(struct ofpbuf *, uint64_t type, uint32_t value); void ofpprop_put_u64(struct ofpbuf *, uint64_t type, uint64_t value); +void ofpprop_put_u128(struct ofpbuf *, uint64_t type, ovs_u128 value); void ofpprop_put_bitmap(struct ofpbuf *, uint64_t type, uint64_t bitmap); void ofpprop_put_flag(struct ofpbuf *, uint64_t type); void ofpprop_put_uuid(struct ofpbuf *, uint64_t type, const struct uuid 
*); diff --git a/include/openvswitch/version.h.in b/include/openvswitch/version.h.in index 23d8fde4f18..231f61e30c0 100644 --- a/include/openvswitch/version.h.in +++ b/include/openvswitch/version.h.in @@ -19,7 +19,7 @@ #define OPENVSWITCH_VERSION_H 1 #define OVS_PACKAGE_STRING "@PACKAGE_STRING@" -#define OVS_PACKAGE_VERSION "@PACKAGE_VERSION@" +#define OVS_PACKAGE_VERSION "@PACKAGE_VERSION@@VERSION_SUFFIX@" #define OVS_LIB_VERSION @LT_CURRENT@ #define OVS_LIB_REVISION @LT_REVISION@ diff --git a/include/openvswitch/vlog.h b/include/openvswitch/vlog.h index e53ce6d8145..481e1c0f0a8 100644 --- a/include/openvswitch/vlog.h +++ b/include/openvswitch/vlog.h @@ -148,6 +148,9 @@ void vlog_set_syslog_target(const char *target); /* Write directly to log file. */ void vlog_direct_write_to_log_file_unsafe(const char *s); +/* Return the current log file descriptor. */ +int vlog_get_log_file_fd_unsafe(void); + /* Initialization. */ void vlog_init(void); void vlog_enable_async(void); diff --git a/include/sparse/automake.mk b/include/sparse/automake.mk index e966371192b..45e6202c52e 100644 --- a/include/sparse/automake.mk +++ b/include/sparse/automake.mk @@ -1,9 +1,11 @@ noinst_HEADERS += \ include/sparse/rte_byteorder.h \ + include/sparse/immintrin.h \ include/sparse/xmmintrin.h \ include/sparse/arpa/inet.h \ include/sparse/bits/floatn.h \ include/sparse/assert.h \ + include/sparse/ia32intrin.h \ include/sparse/math.h \ include/sparse/numa.h \ include/sparse/netinet/in.h \ diff --git a/include/sparse/ia32intrin.h b/include/sparse/ia32intrin.h new file mode 100644 index 00000000000..5045bf38d96 --- /dev/null +++ b/include/sparse/ia32intrin.h @@ -0,0 +1,23 @@ +/* Copyright (c) 2023 Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __CHECKER__ +#error "Use this header only with sparse. It is not a correct implementation." +#endif + +#define __builtin_ia32_rdtsc() (unsigned long long) 0 + +/* Get actual definitions for us to annotate and build on. */ +#include_next diff --git a/include/sparse/immintrin.h b/include/sparse/immintrin.h new file mode 100644 index 00000000000..9a23d7f7461 --- /dev/null +++ b/include/sparse/immintrin.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2024 Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __CHECKER__ +#error "Use this header only with sparse. It is not a correct implementation." +#endif + +/* Sparse doesn't know some types used by AVX512 and some other headers. + * Mark those headers as already included to avoid failures. This is fragile, + * so may need adjustments with compiler changes. 
*/ +#define _AVX512BF16INTRIN_H_INCLUDED +#define _AVX512BF16VLINTRIN_H_INCLUDED +#define _AVXNECONVERTINTRIN_H_INCLUDED +#define _KEYLOCKERINTRIN_H_INCLUDED +#define __AVX512FP16INTRIN_H_INCLUDED +#define __AVX512FP16VLINTRIN_H_INCLUDED +/* GCC >=14 changed the '__AVX512FP16INTRIN_H_INCLUDED' to have only a single + * underscore. We need both to keep compatibility between various GCC + * versions. */ +#define _AVX512FP16INTRIN_H_INCLUDED + +#include_next diff --git a/include/sparse/netinet/in.h b/include/sparse/netinet/in.h index 21deceb28d4..00927281643 100644 --- a/include/sparse/netinet/in.h +++ b/include/sparse/netinet/in.h @@ -68,6 +68,7 @@ struct sockaddr_in6 { #define IPPROTO_HOPOPTS 0 #define IPPROTO_ICMP 1 #define IPPROTO_IGMP 2 +#define IPPROTO_IPIP 4 #define IPPROTO_TCP 6 #define IPPROTO_UDP 17 #define IPPROTO_ROUTING 43 diff --git a/include/sparse/netinet/ip6.h b/include/sparse/netinet/ip6.h index bfa637a4604..b2b6f47d9e2 100644 --- a/include/sparse/netinet/ip6.h +++ b/include/sparse/netinet/ip6.h @@ -18,6 +18,10 @@ #error "Use this header only with sparse. It is not a correct implementation." #endif +#ifndef NETINET_IN_H_INCLUDED +#error "Must include before for FreeBSD support" +#endif + #ifndef __NETINET_IP6_SPARSE #define __NETINET_IP6_SPARSE 1 diff --git a/include/sparse/numa.h b/include/sparse/numa.h index 3691a0eaf72..a185972e31a 100644 --- a/include/sparse/numa.h +++ b/include/sparse/numa.h @@ -18,10 +18,21 @@ #error "Use this header only with sparse. It is not a correct implementation." #endif -/* Avoid sparse warning: non-ANSI function declaration of function" */ -#define numa_get_membind_compat() numa_get_membind_compat(void) -#define numa_get_interleave_mask_compat() numa_get_interleave_mask_compat(void) -#define numa_get_run_node_mask_compat() numa_get_run_node_mask_compat(void) +#ifndef __NUMA_H_SPARSE +#define __NUMA_H_SPARSE 1 -/* Get actual definitions for us to annotate and build on.
*/ -#include_next +/* Avoid sparse warning "non-ANSI function declaration of function" with + * libnuma < 2.0.13. */ + +struct bitmask { + unsigned long size; + unsigned long *maskp; +}; + +int numa_available(void); +struct bitmask *numa_allocate_nodemask(void); +void numa_bitmask_free(struct bitmask *); +void numa_set_localalloc(void); +void numa_set_preferred(int node); + +#endif /* for sparse. */ diff --git a/include/sparse/rte_memcpy.h b/include/sparse/rte_memcpy.h index 5cd3f013ea8..ec88500242a 100644 --- a/include/sparse/rte_memcpy.h +++ b/include/sparse/rte_memcpy.h @@ -20,11 +20,8 @@ #error "Use this header only with sparse. It is not a correct implementation." #endif -/* Include the same headers as the real rte_memcpy(). */ -#include +#include #include -#include -#include /* Declare the same functions as the real rte_memcpy.h, without defining them. * This gives sparse the information it needs without provoking sparse's diff --git a/ipsec/ovs-monitor-ipsec.in b/ipsec/ovs-monitor-ipsec.in index 7945162f9f3..37c509ac682 100755 --- a/ipsec/ovs-monitor-ipsec.in +++ b/ipsec/ovs-monitor-ipsec.in @@ -457,14 +457,36 @@ conn prevent_unencrypted_vxlan CERTKEY_PREFIX = "ovs_certkey_" def __init__(self, libreswan_root_prefix, args): + # Collect version infromation + self.IPSEC = libreswan_root_prefix + "/usr/sbin/ipsec" + self.IPSEC_AUTO = [self.IPSEC] + proc = subprocess.Popen([self.IPSEC, "--version"], + stdout=subprocess.PIPE, + encoding="latin1") + pout, perr = proc.communicate() + + v = re.match("^Libreswan v?(.*)$", pout) + try: + version = int(v.group(1).split(".")[0]) + except: + version = 0 + + if version < 5: + # With v5, LibreSWAN removed the auto command, however, it is + # still required for older versions + self.IPSEC_AUTO.append("auto") + + if version >= 4: + ipsec_d = args.ipsec_d if args.ipsec_d else "/var/lib/ipsec/nss" + else: + ipsec_d = args.ipsec_d if args.ipsec_d else "/etc/ipsec.d" + ipsec_conf = args.ipsec_conf if args.ipsec_conf else 
"/etc/ipsec.conf" - ipsec_d = args.ipsec_d if args.ipsec_d else "/etc/ipsec.d" ipsec_secrets = (args.ipsec_secrets if args.ipsec_secrets else "/etc/ipsec.secrets") ipsec_ctl = (args.ipsec_ctl if args.ipsec_ctl else "/run/pluto/pluto.ctl") - self.IPSEC = libreswan_root_prefix + "/usr/sbin/ipsec" self.IPSEC_CONF = libreswan_root_prefix + ipsec_conf self.IPSEC_SECRETS = libreswan_root_prefix + ipsec_secrets self.IPSEC_D = "sql:" + libreswan_root_prefix + ipsec_d @@ -577,7 +599,7 @@ conn prevent_unencrypted_vxlan def refresh(self, monitor): vlog.info("Refreshing LibreSwan configuration") - subprocess.call([self.IPSEC, "auto", "--ctlsocket", self.IPSEC_CTL, + subprocess.call(self.IPSEC_AUTO + ["--ctlsocket", self.IPSEC_CTL, "--config", self.IPSEC_CONF, "--rereadsecrets"]) tunnels = set(monitor.tunnels.keys()) @@ -605,7 +627,7 @@ conn prevent_unencrypted_vxlan if not tunnel or tunnel.version != ver: vlog.info("%s is outdated %u" % (conn, ver)) - subprocess.call([self.IPSEC, "auto", "--ctlsocket", + subprocess.call(self.IPSEC_AUTO + ["--ctlsocket", self.IPSEC_CTL, "--config", self.IPSEC_CONF, "--delete", conn]) elif ifname in tunnels: @@ -627,44 +649,44 @@ conn prevent_unencrypted_vxlan # Update shunt policy if changed if monitor.conf_in_use["skb_mark"] != monitor.conf["skb_mark"]: if monitor.conf["skb_mark"]: - subprocess.call([self.IPSEC, "auto", - "--config", self.IPSEC_CONF, + subprocess.call(self.IPSEC_AUTO + + ["--config", self.IPSEC_CONF, "--ctlsocket", self.IPSEC_CTL, "--add", "--asynchronous", "prevent_unencrypted_gre"]) - subprocess.call([self.IPSEC, "auto", - "--config", self.IPSEC_CONF, + subprocess.call(self.IPSEC_AUTO + + ["--config", self.IPSEC_CONF, "--ctlsocket", self.IPSEC_CTL, "--add", "--asynchronous", "prevent_unencrypted_geneve"]) - subprocess.call([self.IPSEC, "auto", - "--config", self.IPSEC_CONF, + subprocess.call(self.IPSEC_AUTO + + ["--config", self.IPSEC_CONF, "--ctlsocket", self.IPSEC_CTL, "--add", "--asynchronous", "prevent_unencrypted_stt"]) 
- subprocess.call([self.IPSEC, "auto", - "--config", self.IPSEC_CONF, + subprocess.call(self.IPSEC_AUTO + + ["--config", self.IPSEC_CONF, "--ctlsocket", self.IPSEC_CTL, "--add", "--asynchronous", "prevent_unencrypted_vxlan"]) else: - subprocess.call([self.IPSEC, "auto", - "--config", self.IPSEC_CONF, + subprocess.call(self.IPSEC_AUTO + + ["--config", self.IPSEC_CONF, "--ctlsocket", self.IPSEC_CTL, "--delete", "--asynchronous", "prevent_unencrypted_gre"]) - subprocess.call([self.IPSEC, "auto", - "--config", self.IPSEC_CONF, + subprocess.call(self.IPSEC_AUTO + + ["--config", self.IPSEC_CONF, "--ctlsocket", self.IPSEC_CTL, "--delete", "--asynchronous", "prevent_unencrypted_geneve"]) - subprocess.call([self.IPSEC, "auto", - "--config", self.IPSEC_CONF, + subprocess.call(self.IPSEC_AUTO + + ["--config", self.IPSEC_CONF, "--ctlsocket", self.IPSEC_CTL, "--delete", "--asynchronous", "prevent_unencrypted_stt"]) - subprocess.call([self.IPSEC, "auto", - "--config", self.IPSEC_CONF, + subprocess.call(self.IPSEC_AUTO + + ["--config", self.IPSEC_CONF, "--ctlsocket", self.IPSEC_CTL, "--delete", "--asynchronous", "prevent_unencrypted_vxlan"]) @@ -710,8 +732,8 @@ conn prevent_unencrypted_vxlan # the "ipsec auto --start" command is lost. Just retry to make sure # the command is received by LibreSwan. 
while True: - proc = subprocess.Popen([self.IPSEC, "auto", - "--config", self.IPSEC_CONF, + proc = subprocess.Popen(self.IPSEC_AUTO + + ["--config", self.IPSEC_CONF, "--ctlsocket", self.IPSEC_CTL, "--start", "--asynchronous", conn], diff --git a/lib/automake.mk b/lib/automake.mk index 0ce9f08ae3d..8cf2b66413f 100644 --- a/lib/automake.mk +++ b/lib/automake.mk @@ -9,7 +9,6 @@ lib_LTLIBRARIES += lib/libopenvswitch.la lib_libopenvswitch_la_LIBADD = $(SSL_LIBS) lib_libopenvswitch_la_LIBADD += $(CAPNG_LDADD) -lib_libopenvswitch_la_LIBADD += $(LIBBPF_LDADD) if WIN32 @@ -95,6 +94,9 @@ lib_libopenvswitch_la_SOURCES = \ lib/conntrack-other.c \ lib/conntrack.c \ lib/conntrack.h \ + lib/cooperative-multitasking.c \ + lib/cooperative-multitasking.h \ + lib/cooperative-multitasking-private.h \ lib/coverage.c \ lib/coverage.h \ lib/cpu.c \ @@ -119,6 +121,8 @@ lib_libopenvswitch_la_SOURCES = \ lib/dpctl.h \ lib/dp-packet.h \ lib/dp-packet.c \ + lib/dp-packet-gso.c \ + lib/dp-packet-gso.h \ lib/dpdk.h \ lib/dpif-netdev-extract-study.c \ lib/dpif-netdev-lookup.h \ @@ -174,6 +178,7 @@ lib_libopenvswitch_la_SOURCES = \ lib/jhash.c \ lib/jhash.h \ lib/json.c \ + lib/json.h \ lib/jsonrpc.c \ lib/jsonrpc.h \ lib/lacp.c \ @@ -227,6 +232,7 @@ lib_libopenvswitch_la_SOURCES = \ lib/ofp-actions.c \ lib/ofp-bundle.c \ lib/ofp-connection.c \ + lib/ofp-ct.c \ lib/ofp-ed-props.c \ lib/ofp-errors.c \ lib/ofp-flow.c \ @@ -457,7 +463,7 @@ lib_libsflow_la_SOURCES = \ lib/sflow_poller.c \ lib/sflow_receiver.c lib_libsflow_la_CPPFLAGS = $(AM_CPPFLAGS) -lib_libsflow_la_CFLAGS = $(AM_CFLAGS) +lib_libsflow_la_CFLAGS = $(AM_CFLAGS) -D_BSD_SOURCE -D_DEFAULT_SOURCE if HAVE_WNO_UNUSED lib_libsflow_la_CFLAGS += -Wno-unused endif @@ -648,7 +654,6 @@ lib/nx-match.inc: $(srcdir)/build-aux/extract-ofp-fields include/openvswitch/met $(AM_V_at)mv $@.tmp $@ lib/nx-match.lo: lib/nx-match.inc CLEANFILES += lib/meta-flow.inc lib/nx-match.inc -EXTRA_DIST += build-aux/extract-ofp-fields lib/ofp-actions.inc1: 
$(srcdir)/build-aux/extract-ofp-actions lib/ofp-actions.c $(AM_V_GEN)$(run_python) $< prototypes $(srcdir)/lib/ofp-actions.c > $@.tmp && mv $@.tmp $@ @@ -656,7 +661,6 @@ lib/ofp-actions.inc2: $(srcdir)/build-aux/extract-ofp-actions lib/ofp-actions.c $(AM_V_GEN)$(run_python) $< definitions $(srcdir)/lib/ofp-actions.c > $@.tmp && mv $@.tmp $@ lib/ofp-actions.lo: lib/ofp-actions.inc1 lib/ofp-actions.inc2 CLEANFILES += lib/ofp-actions.inc1 lib/ofp-actions.inc2 -EXTRA_DIST += build-aux/extract-ofp-actions lib/ofp-errors.inc: include/openvswitch/ofp-errors.h include/openflow/openflow-common.h \ $(srcdir)/build-aux/extract-ofp-errors @@ -666,14 +670,12 @@ lib/ofp-errors.inc: include/openvswitch/ofp-errors.h include/openflow/openflow-c mv $@.tmp $@ lib/ofp-errors.lo: lib/ofp-errors.inc CLEANFILES += lib/ofp-errors.inc -EXTRA_DIST += build-aux/extract-ofp-errors lib/ofp-msgs.inc: include/openvswitch/ofp-msgs.h $(srcdir)/build-aux/extract-ofp-msgs $(AM_V_GEN)$(run_python) $(srcdir)/build-aux/extract-ofp-msgs \ $(srcdir)/include/openvswitch/ofp-msgs.h $@ > $@.tmp && mv $@.tmp $@ lib/ofp-msgs.lo: lib/ofp-msgs.inc CLEANFILES += lib/ofp-msgs.inc -EXTRA_DIST += build-aux/extract-ofp-msgs # _server IDL OVSIDL_BUILT += lib/ovsdb-server-idl.c lib/ovsdb-server-idl.h lib/ovsdb-server-idl.ovsidl diff --git a/lib/backtrace.c b/lib/backtrace.c index 2853d5ff150..65c92fd723c 100644 --- a/lib/backtrace.c +++ b/lib/backtrace.c @@ -32,12 +32,27 @@ VLOG_DEFINE_THIS_MODULE(backtrace); void backtrace_capture(struct backtrace *b) { - void *frames[BACKTRACE_MAX_FRAMES]; - int i; + b->n_frames = backtrace(b->frames, BACKTRACE_MAX_FRAMES); +} + +void +backtrace_format(struct ds *ds, const struct backtrace *bt, + const char *delimiter) +{ + if (bt->n_frames) { + char **symbols = backtrace_symbols(bt->frames, bt->n_frames); + + if (!symbols) { + return; + } - b->n_frames = backtrace(frames, BACKTRACE_MAX_FRAMES); - for (i = 0; i < b->n_frames; i++) { - b->frames[i] = (uintptr_t) frames[i]; + for (int 
i = 0; i < bt->n_frames - 1; i++) { + ds_put_format(ds, "%s%s", symbols[i], delimiter); + } + + ds_put_format(ds, "%s", symbols[bt->n_frames - 1]); + + free(symbols); } } @@ -47,23 +62,14 @@ backtrace_capture(struct backtrace *backtrace) { backtrace->n_frames = 0; } -#endif -static char * -backtrace_format(const struct backtrace *b, struct ds *ds) +void +backtrace_format(struct ds *ds, const struct backtrace *bt OVS_UNUSED, + const char *delimiter OVS_UNUSED) { - if (b->n_frames) { - int i; - - ds_put_cstr(ds, " (backtrace:"); - for (i = 0; i < b->n_frames; i++) { - ds_put_format(ds, " 0x%08"PRIxPTR, b->frames[i]); - } - ds_put_cstr(ds, ")"); - } - - return ds_cstr(ds); + ds_put_cstr(ds, "backtrace() is not supported!\n"); } +#endif void log_backtrace_at(const char *msg, const char *where) @@ -77,41 +83,85 @@ log_backtrace_at(const char *msg, const char *where) } ds_put_cstr(&ds, where); - VLOG_ERR("%s", backtrace_format(&b, &ds)); + ds_put_cstr(&ds, " backtrace:\n"); + backtrace_format(&ds, &b, "\n"); + VLOG_ERR("%s", ds_cstr_ro(&ds)); ds_destroy(&ds); } +#if defined(HAVE_UNWIND) || defined(HAVE_BACKTRACE) +static bool +read_received_backtrace(int fd, void *dest, size_t len) +{ + VLOG_DBG("%s fd %d", __func__, fd); + fcntl(fd, F_SETFL, O_NONBLOCK); + memset(dest, 0, len); + + int byte_read = read(fd, dest, len); + if (byte_read < 0) { + VLOG_ERR("Read fd %d failed: %s", fd, ovs_strerror(errno)); + } + + return byte_read > 0;; +} +#else +static bool +read_received_backtrace(int fd OVS_UNUSED, void *dest OVS_UNUSED, + size_t len OVS_UNUSED) +{ + return false; +} +#endif + #ifdef HAVE_UNWIND void -log_received_backtrace(int fd) { - int byte_read; +log_received_backtrace(int fd) +{ struct unw_backtrace backtrace[UNW_MAX_DEPTH]; - VLOG_WARN("%s fd %d", __func__, fd); - fcntl(fd, F_SETFL, O_NONBLOCK); - memset(backtrace, 0, UNW_MAX_BUF); + if (read_received_backtrace(fd, backtrace, UNW_MAX_BUF)) { + struct ds ds = DS_EMPTY_INITIALIZER; + + ds_put_cstr(&ds, 
BACKTRACE_DUMP_MSG); - byte_read = read(fd, backtrace, UNW_MAX_BUF); - if (byte_read < 0) { - VLOG_ERR("Read fd %d failed: %s", fd, - ovs_strerror(errno)); - } else if (byte_read > 0) { - VLOG_WARN("SIGSEGV detected, backtrace:"); for (int i = 0; i < UNW_MAX_DEPTH; i++) { if (backtrace[i].func[0] == 0) { break; } - VLOG_WARN("0x%016"PRIxPTR" <%s+0x%"PRIxPTR">\n", - backtrace[i].ip, - backtrace[i].func, - backtrace[i].offset); + ds_put_format(&ds, "0x%016"PRIxPTR" <%s+0x%"PRIxPTR">\n", + backtrace[i].ip, + backtrace[i].func, + backtrace[i].offset); } + + VLOG_WARN("%s", ds_cstr_ro(&ds)); + + ds_destroy(&ds); } } -#else /* !HAVE_UNWIND */ +#elif HAVE_BACKTRACE void -log_received_backtrace(int daemonize_fd OVS_UNUSED) { - VLOG_WARN("Backtrace using libunwind not supported."); +log_received_backtrace(int fd) +{ + struct backtrace bt; + + if (read_received_backtrace(fd, &bt, sizeof bt)) { + struct ds ds = DS_EMPTY_INITIALIZER; + + bt.n_frames = MIN(bt.n_frames, BACKTRACE_MAX_FRAMES); + + ds_put_cstr(&ds, BACKTRACE_DUMP_MSG); + backtrace_format(&ds, &bt, "\n"); + VLOG_WARN("%s", ds_cstr_ro(&ds)); + + ds_destroy(&ds); + } } -#endif /* HAVE_UNWIND */ +#else +void +log_received_backtrace(int daemonize_fd OVS_UNUSED) +{ + VLOG_WARN("Backtrace using libunwind or backtrace() is not supported."); +} +#endif diff --git a/lib/backtrace.h b/lib/backtrace.h index 5708bf9c683..a2506da5fff 100644 --- a/lib/backtrace.h +++ b/lib/backtrace.h @@ -26,7 +26,7 @@ #endif /* log_backtrace() will save the backtrace of a running program - * into the log at the DEBUG level. + * into the log at the ERROR level. * * To use it, insert the following code to where backtrace is * desired: @@ -36,41 +36,53 @@ * log_backtrace_msg("your message"); <-- with a message * * - * A typical log will look like the following. The hex numbers listed after - * "backtrace" are the addresses of the backtrace. 
+ * A typical backtrace will look like the following example: + * /lib64/libopenvswitch-3.1.so.0(backtrace_capture+0x1e) [0x7fc5db298dfe] + * /lib64/libopenvswitch-3.1.so.0(log_backtrace_at+0x57) [0x7fc5db2999e7] + * /lib64/libovsdb-3.1.so.0(ovsdb_txn_complete+0x7b) [0x7fc5db56247b] + * /lib64/libovsdb-3.1.so.0(ovsdb_txn_propose_commit_block+0x8d) + * [0x7fc5db563a8d] + * ovsdb-server(+0xa661) [0x562cfce2e661] + * ovsdb-server(+0x7e39) [0x562cfce2be39] + * /lib64/libc.so.6(+0x27b4a) [0x7fc5db048b4a] + * /lib64/libc.so.6(__libc_start_main+0x8b) [0x7fc5db048c0b] + * ovsdb-server(+0x8c35) [0x562cfce2cc35] * - * 2014-03-13T23:18:11.979Z|00002|backtrace(revalidator_6)|ERR|lib/dpif-netdev.c:1312: (backtrace: 0x00521f57 0x00460365 0x00463ea4 0x0046470b 0x0043b32d 0x0043bac3 0x0043bae2 0x0043943b 0x004c22b3 0x2b5b3ac94e9a 0x2b5b3b4a33fd) + * GDB can be used to view the exact line of code for a particular backtrace + * entry. One thing to keep in mind is that the lines in source files might + * not correspond 100% with the backtrace due to optimizations such as LTO. + * (The effect can be seen in this example). * - * The following bash command can be used to view backtrace in - * a more readable form. - * addr2line -p -e vswitchd/ovs-vswitchd + * Assuming that debuginfo for the library or binary is installed, load it into + * GDB: + * $ gdb ovsdb-server + * (gdb) list *(+0x7e39) + * 0x7e39 is in main (ovsdb/ovsdb-server.c:278). 
+ * (gdb) list *(+0xa661) + * 0xa661 is in commit_txn (ovsdb/ovsdb-server.c:1173) * - * An typical run and output will look like: - * addr2line -p -e vswitchd/ovs-vswitchd 0x00521f57 0x00460365 0x00463ea4 - * 0x0046470b 0x0043b32d 0x0043bac3 0x0043bae2 0x0043943b 0x004c22b3 - * 0x2b5b3ac94e9a 0x2b5b3b4a33fd + * $ gdb /lib64/libovsdb-3.1.so.0 + * (gdb) list *(ovsdb_txn_propose_commit_block+0x8d) + * 0x3aa8d is in ovsdb_txn_propose_commit_block (ovsdb/transaction.c:1328) + * (gdb) list *(ovsdb_txn_complete+0x7b) + * 0x3947b is in ovsdb_txn_complete (./include/openvswitch/list.h:321) * - * openvswitch/lib/backtrace.c:33 - * openvswitch/lib/dpif-netdev.c:1312 - * openvswitch/lib/dpif.c:937 - * openvswitch/lib/dpif.c:1258 - * openvswitch/ofproto/ofproto-dpif-upcall.c:1440 - * openvswitch/ofproto/ofproto-dpif-upcall.c:1595 - * openvswitch/ofproto/ofproto-dpif-upcall.c:160 - * openvswitch/ofproto/ofproto-dpif-upcall.c:717 - * openvswitch/lib/ovs-thread.c:268 - * ??:0 - * ??:0 + * $ gdb /lib64/libopenvswitch-3.1.so.0 + * (gdb) list *(log_backtrace_at+0x57) + * 0x999e7 is in log_backtrace_at (lib/backtrace.c:77) + * (gdb) list *(backtrace_capture+0x1e) + * 0x98dfe is in backtrace_capture (lib/backtrace.c:35) */ #define log_backtrace() log_backtrace_at(NULL, OVS_SOURCE_LOCATOR); #define log_backtrace_msg(msg) log_backtrace_at(msg, OVS_SOURCE_LOCATOR); #define BACKTRACE_MAX_FRAMES 31 +#define BACKTRACE_DUMP_MSG "SIGSEGV detected, backtrace:\n" struct backtrace { int n_frames; - uintptr_t frames[BACKTRACE_MAX_FRAMES]; + void *frames[BACKTRACE_MAX_FRAMES]; }; #ifdef HAVE_UNWIND @@ -88,6 +100,8 @@ struct unw_backtrace { void backtrace_capture(struct backtrace *); void log_backtrace_at(const char *msg, const char *where); +void backtrace_format(struct ds *, const struct backtrace *, + const char *delimiter); void log_received_backtrace(int fd); #endif /* backtrace.h */ diff --git a/lib/bfd.c b/lib/bfd.c index 9698576d071..b8149e78973 100644 --- a/lib/bfd.c +++ b/lib/bfd.c @@ 
-586,7 +586,6 @@ bfd_put_packet(struct bfd *bfd, struct dp_packet *p, { long long int min_tx, min_rx; struct udp_header *udp; - struct eth_header *eth; struct ip_header *ip; struct msg *msg; @@ -605,15 +604,13 @@ bfd_put_packet(struct bfd *bfd, struct dp_packet *p, * set. */ ovs_assert(!(bfd->flags & FLAG_POLL) || !(bfd->flags & FLAG_FINAL)); - dp_packet_reserve(p, 2); /* Properly align after the ethernet header. */ - eth = dp_packet_put_uninit(p, sizeof *eth); - eth->eth_src = eth_addr_is_zero(bfd->local_eth_src) - ? eth_src : bfd->local_eth_src; - eth->eth_dst = eth_addr_is_zero(bfd->local_eth_dst) - ? eth_addr_bfd : bfd->local_eth_dst; - eth->eth_type = htons(ETH_TYPE_IP); + ip = eth_compose(p, + eth_addr_is_zero(bfd->local_eth_dst) + ? eth_addr_bfd : bfd->local_eth_dst, + eth_addr_is_zero(bfd->local_eth_src) + ? eth_src : bfd->local_eth_src, + ETH_TYPE_IP, sizeof *ip + sizeof *udp + sizeof *msg); - ip = dp_packet_put_zeros(p, sizeof *ip); ip->ip_ihl_ver = IP_IHL_VER(5, 4); ip->ip_tot_len = htons(sizeof *ip + sizeof *udp + sizeof *msg); ip->ip_ttl = MAXTTL; @@ -621,15 +618,17 @@ bfd_put_packet(struct bfd *bfd, struct dp_packet *p, ip->ip_proto = IPPROTO_UDP; put_16aligned_be32(&ip->ip_src, bfd->ip_src); put_16aligned_be32(&ip->ip_dst, bfd->ip_dst); - /* Checksum has already been zeroed by put_zeros call. */ + /* Checksum has already been zeroed by eth_compose call. */ ip->ip_csum = csum(ip, sizeof *ip); + dp_packet_set_l4(p, ip + 1); - udp = dp_packet_put_zeros(p, sizeof *udp); + udp = dp_packet_l4(p); udp->udp_src = htons(bfd->udp_src); udp->udp_dst = htons(BFD_DEST_PORT); udp->udp_len = htons(sizeof *udp + sizeof *msg); + /* Checksum already zero from eth_compose. 
*/ - msg = dp_packet_put_uninit(p, sizeof *msg); + msg = (struct msg *)(udp + 1); msg->vers_diag = (BFD_VERSION << 5) | bfd->diag; msg->flags = (bfd->state & STATE_MASK) | bfd->flags; @@ -1131,10 +1130,11 @@ bfd_set_state(struct bfd *bfd, enum state state, enum diag diag) if (!VLOG_DROP_INFO(&rl)) { struct ds ds = DS_EMPTY_INITIALIZER; - ds_put_format(&ds, "%s: BFD state change: %s->%s" - " \"%s\"->\"%s\".\n", + ds_put_format(&ds, "%s: BFD state change: (bfd.SessionState: %s," + " bfd.LocalDiag: \"%s\") -> (bfd.SessionState: %s," + " bfd.LocalDiag: \"%s\")\n", bfd->name, bfd_state_str(bfd->state), - bfd_state_str(state), bfd_diag_str(bfd->diag), + bfd_diag_str(bfd->diag), bfd_state_str(state), bfd_diag_str(diag)); bfd_put_details(&ds, bfd); VLOG_INFO("%s", ds_cstr(&ds)); diff --git a/lib/cfm.c b/lib/cfm.c index c3742f3de20..7eb08015776 100644 --- a/lib/cfm.c +++ b/lib/cfm.c @@ -863,7 +863,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct dp_packet *p) rmp->num_health_ccm++; if (cfm->demand) { timer_set_duration(&cfm->demand_rx_ccm_t, - 100 * cfm->ccm_interval_ms); + 100LL * cfm->ccm_interval_ms); } } rmp->recv = true; diff --git a/lib/classifier.c b/lib/classifier.c index 0a89626cc30..0729bd19024 100644 --- a/lib/classifier.c +++ b/lib/classifier.c @@ -853,6 +853,32 @@ trie_ctx_init(struct trie_ctx *ctx, const struct cls_trie *trie) ctx->lookup_done = false; } +static void +insert_conj_flows(struct hmapx *conj_flows, uint32_t id, int priority, + struct cls_conjunction_set **soft, size_t n_soft) +{ + struct cls_conjunction_set *conj_set; + + if (!conj_flows) { + return; + } + + for (size_t i = 0; i < n_soft; i++) { + conj_set = soft[i]; + + if (conj_set->priority != priority) { + continue; + } + + for (size_t j = 0; j < conj_set->n; j++) { + if (conj_set->conj[j].id == id) { + hmapx_add(conj_flows, (void *) (conj_set->match->cls_rule)); + break; + } + } + } +} + struct conjunctive_match { struct hmap_node hmap_node; uint32_t id; @@ -933,11 +959,15 @@ 
free_conjunctive_matches(struct hmap *matches, * recursion within this function itself. * * 'flow' is non-const to allow for temporary modifications during the lookup. - * Any changes are restored before returning. */ + * Any changes are restored before returning. + * + * 'conj_flows' is an optional parameter. If it is non-null, the matching + * conjunctive flows are inserted. */ static const struct cls_rule * classifier_lookup__(const struct classifier *cls, ovs_version_t version, struct flow *flow, struct flow_wildcards *wc, - bool allow_conjunctive_matches) + bool allow_conjunctive_matches, + struct hmapx *conj_flows) { struct trie_ctx trie_ctx[CLS_MAX_TRIES]; const struct cls_match *match; @@ -1097,10 +1127,15 @@ classifier_lookup__(const struct classifier *cls, ovs_version_t version, const struct cls_rule *rule; flow->conj_id = id; - rule = classifier_lookup__(cls, version, flow, wc, false); + rule = classifier_lookup__(cls, version, flow, wc, false, + NULL); flow->conj_id = saved_conj_id; if (rule) { + if (allow_conjunctive_matches) { + insert_conj_flows(conj_flows, id, soft_pri, soft, + n_soft); + } free_conjunctive_matches(&matches, cm_stubs, ARRAY_SIZE(cm_stubs)); if (soft != soft_stub) { @@ -1161,12 +1196,16 @@ classifier_lookup__(const struct classifier *cls, ovs_version_t version, * flow_wildcards_init_catchall()). * * 'flow' is non-const to allow for temporary modifications during the lookup. - * Any changes are restored before returning. */ + * Any changes are restored before returning. + * + * 'conj_flows' is an optional parameter. If it is non-null, the matching + * conjunctive flows are inserted. 
*/ const struct cls_rule * classifier_lookup(const struct classifier *cls, ovs_version_t version, - struct flow *flow, struct flow_wildcards *wc) + struct flow *flow, struct flow_wildcards *wc, + struct hmapx *conj_flows) { - return classifier_lookup__(cls, version, flow, wc, true); + return classifier_lookup__(cls, version, flow, wc, true, conj_flows); } /* Finds and returns a rule in 'cls' with exactly the same priority and @@ -1695,6 +1734,8 @@ find_match_wc(const struct cls_subtable *subtable, ovs_version_t version, const struct cls_match *rule = NULL; struct flowmap stages_map = FLOWMAP_EMPTY_INITIALIZER; unsigned int mask_offset = 0; + bool adjust_ports_mask = false; + ovs_be32 ports_mask; int i; /* Try to finish early by checking fields in segments. */ @@ -1722,6 +1763,9 @@ find_match_wc(const struct cls_subtable *subtable, ovs_version_t version, subtable->index_maps[i], flow, wc)) { goto no_match; } + /* Accumulate the map used so far. */ + stages_map = flowmap_or(stages_map, subtable->index_maps[i]); + hash = flow_hash_in_minimask_range(flow, &subtable->mask, subtable->index_maps[i], &mask_offset, &basis); @@ -1731,14 +1775,16 @@ find_match_wc(const struct cls_subtable *subtable, ovs_version_t version, * unwildcarding all the ports bits, use the ports trie to figure out a * smaller set of bits to unwildcard. 
*/ unsigned int mbits; - ovs_be32 value, plens, mask; + ovs_be32 value, plens; - mask = miniflow_get_ports(&subtable->mask.masks); - value = ((OVS_FORCE ovs_be32 *)flow)[TP_PORTS_OFS32] & mask; + ports_mask = miniflow_get_ports(&subtable->mask.masks); + value = ((OVS_FORCE ovs_be32 *) flow)[TP_PORTS_OFS32] & ports_mask; mbits = trie_lookup_value(&subtable->ports_trie, &value, &plens, 32); - ((OVS_FORCE ovs_be32 *)&wc->masks)[TP_PORTS_OFS32] |= - mask & be32_prefix_mask(mbits); + ports_mask &= be32_prefix_mask(mbits); + ports_mask |= ((OVS_FORCE ovs_be32 *) &wc->masks)[TP_PORTS_OFS32]; + + adjust_ports_mask = true; goto no_match; } @@ -1751,6 +1797,14 @@ find_match_wc(const struct cls_subtable *subtable, ovs_version_t version, /* Unwildcard the bits in stages so far, as they were used in determining * there is no match. */ flow_wildcards_fold_minimask_in_map(wc, &subtable->mask, stages_map); + if (adjust_ports_mask) { + /* This has to be done after updating flow wildcards to overwrite + * the ports mask back. We can't simply disable the corresponding bit + * in the stages map, because it has 64-bit resolution, i.e. one + * bit covers not only tp_src/dst, but also ct_tp_src/dst, which are + * not covered by the trie. */ + ((OVS_FORCE ovs_be32 *) &wc->masks)[TP_PORTS_OFS32] = ports_mask; + } return NULL; } diff --git a/lib/classifier.h b/lib/classifier.h index f646a8f7429..f55a2cba998 100644 --- a/lib/classifier.h +++ b/lib/classifier.h @@ -299,6 +299,7 @@ * parallel to the rule's removal. */ #include "cmap.h" +#include "hmapx.h" #include "openvswitch/match.h" #include "openvswitch/meta-flow.h" #include "pvector.h" @@ -398,7 +399,8 @@ static inline void classifier_publish(struct classifier *); * and each other. 
*/ const struct cls_rule *classifier_lookup(const struct classifier *, ovs_version_t, struct flow *, - struct flow_wildcards *); + struct flow_wildcards *, + struct hmapx *conj_flows); bool classifier_rule_overlaps(const struct classifier *, const struct cls_rule *, ovs_version_t); const struct cls_rule *classifier_find_rule_exactly(const struct classifier *, diff --git a/lib/cmap.c b/lib/cmap.c index c9eef3f4aea..8ca893b0b25 100644 --- a/lib/cmap.c +++ b/lib/cmap.c @@ -598,7 +598,9 @@ cmap_set_bucket(struct cmap_bucket *b, int i, uint32_t c; atomic_read_explicit(&b->counter, &c, memory_order_acquire); - atomic_store_explicit(&b->counter, c + 1, memory_order_release); + atomic_store_explicit(&b->counter, c + 1, memory_order_relaxed); + /* Need to make sure setting hash is not moved up before counter update. */ + atomic_thread_fence(memory_order_release); ovsrcu_set(&b->nodes[i].next, node); /* Also atomic. */ b->hashes[i] = hash; atomic_store_explicit(&b->counter, c + 2, memory_order_release); diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h index fae8b3a9baf..71367f211c9 100644 --- a/lib/conntrack-private.h +++ b/lib/conntrack-private.h @@ -49,6 +49,12 @@ struct ct_endpoint { * hashing in ct_endpoint_hash_add(). */ BUILD_ASSERT_DECL(sizeof(struct ct_endpoint) == sizeof(union ct_addr) + 4); +enum key_dir { + CT_DIR_FWD = 0, + CT_DIR_REV, + CT_DIRS, +}; + /* Changes to this structure need to be reflected in conn_key_hash() * and conn_key_cmp(). */ struct conn_key { @@ -112,20 +118,18 @@ enum ct_timeout { #define N_EXP_LISTS 100 -enum OVS_PACKED_ENUM ct_conn_type { - CT_CONN_TYPE_DEFAULT, - CT_CONN_TYPE_UN_NAT, +struct conn_key_node { + enum key_dir dir; + struct conn_key key; + struct cmap_node cm_node; }; struct conn { /* Immutable data. */ - struct conn_key key; - struct conn_key rev_key; + struct conn_key_node key_node[CT_DIRS]; struct conn_key parent_key; /* Only used for orig_tuple support. 
*/ - struct cmap_node cm_node; uint16_t nat_action; char *alg; - struct conn *nat_conn; /* The NAT 'conn' context, if there is one. */ atomic_flag reclaimed; /* False during the lifetime of the connection, * True as soon as a thread has started freeing * its memory. */ @@ -150,7 +154,6 @@ struct conn { /* Immutable data. */ bool alg_related; /* True if alg data connection. */ - enum ct_conn_type conn_type; uint32_t tp_id; /* Timeout policy ID. */ }; @@ -197,7 +200,7 @@ enum ct_ephemeral_range { struct conntrack { struct ovs_mutex ct_lock; /* Protects 2 following fields. */ - struct cmap conns OVS_GUARDED; + struct cmap conns[UINT16_MAX + 1] OVS_GUARDED; struct rculist exp_lists[N_EXP_LISTS]; struct cmap zone_limits OVS_GUARDED; struct cmap timeout_policies OVS_GUARDED; @@ -224,6 +227,7 @@ struct conntrack { struct ipf *ipf; /* Fragmentation handling context. */ uint32_t zone_limit_seq; /* Used to disambiguate zone limit counts. */ atomic_bool tcp_seq_chk; /* Check TCP sequence numbers. */ + atomic_uint32_t sweep_ms; /* Next sweep interval. 
*/ }; /* Lock acquisition order: diff --git a/lib/conntrack-tp.c b/lib/conntrack-tp.c index 89cb2704a6c..2149fdc73a7 100644 --- a/lib/conntrack-tp.c +++ b/lib/conntrack-tp.c @@ -253,7 +253,8 @@ conn_update_expiration(struct conntrack *ct, struct conn *conn, } VLOG_DBG_RL(&rl, "Update timeout %s zone=%u with policy id=%d " "val=%u sec.", - ct_timeout_str[tm], conn->key.zone, conn->tp_id, val); + ct_timeout_str[tm], conn->key_node[CT_DIR_FWD].key.zone, + conn->tp_id, val); atomic_store_relaxed(&conn->expiration, now + val * 1000); } @@ -273,7 +274,8 @@ conn_init_expiration(struct conntrack *ct, struct conn *conn, } VLOG_DBG_RL(&rl, "Init timeout %s zone=%u with policy id=%d val=%u sec.", - ct_timeout_str[tm], conn->key.zone, conn->tp_id, val); + ct_timeout_str[tm], conn->key_node[CT_DIR_FWD].key.zone, + conn->tp_id, val); conn->expiration = now + val * 1000; } diff --git a/lib/conntrack-tp.h b/lib/conntrack-tp.h index 4d411d19fd5..7ece2eae2f9 100644 --- a/lib/conntrack-tp.h +++ b/lib/conntrack-tp.h @@ -17,8 +17,15 @@ #ifndef CONNTRACK_TP_H #define CONNTRACK_TP_H 1 +#include + #define CT_DPIF_NETDEV_TP_MIN 30 + enum ct_timeout; +struct conn; +struct conntrack; +struct timeout_policy; + void timeout_policy_init(struct conntrack *ct); int timeout_policy_update(struct conntrack *ct, struct timeout_policy *tp); int timeout_policy_delete(struct conntrack *ct, uint32_t tp_id); diff --git a/lib/conntrack.c b/lib/conntrack.c index 13c5ab6283d..db44f823749 100644 --- a/lib/conntrack.c +++ b/lib/conntrack.c @@ -27,6 +27,7 @@ #include "conntrack-private.h" #include "conntrack-tp.h" #include "coverage.h" +#include "crc32c.h" #include "csum.h" #include "ct-dpif.h" #include "dp-packet.h" @@ -41,6 +42,7 @@ #include "random.h" #include "rculist.h" #include "timeval.h" +#include "unaligned.h" VLOG_DEFINE_THIS_MODULE(conntrack); @@ -101,7 +103,7 @@ static enum ct_update_res conn_update(struct conntrack *ct, struct conn *conn, struct conn_lookup_ctx *ctx, long long now); static long 
long int conn_expiration(const struct conn *); -static bool conn_expired(struct conn *, long long now); +static bool conn_expired(const struct conn *, long long now); static void conn_expire_push_front(struct conntrack *ct, struct conn *conn); static void set_mark(struct dp_packet *, struct conn *, uint32_t val, uint32_t mask); @@ -111,8 +113,7 @@ static void set_label(struct dp_packet *, struct conn *, static void *clean_thread_main(void *f_); static bool -nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn, - struct conn *nat_conn, +nat_get_unique_tuple(struct conntrack *ct, struct conn *conn, const struct nat_action_info_t *nat_info); static uint8_t @@ -206,7 +207,7 @@ static alg_helper alg_helpers[] = { #define ALG_WC_SRC_PORT 0 /* If the total number of connections goes above this value, no new connections - * are accepted; this is for CT_CONN_TYPE_DEFAULT connections. */ + * are accepted. */ #define DEFAULT_N_CONN_LIMIT 3000000 /* Does a member by member comparison of two conn_keys; this @@ -232,61 +233,6 @@ conn_key_cmp(const struct conn_key *key1, const struct conn_key *key2) return 1; } -static void -ct_print_conn_info(const struct conn *c, const char *log_msg, - enum vlog_level vll, bool force, bool rl_on) -{ -#define CT_VLOG(RL_ON, LEVEL, ...) 
\ - do { \ - if (RL_ON) { \ - static struct vlog_rate_limit rl_ = VLOG_RATE_LIMIT_INIT(5, 5); \ - vlog_rate_limit(&this_module, LEVEL, &rl_, __VA_ARGS__); \ - } else { \ - vlog(&this_module, LEVEL, __VA_ARGS__); \ - } \ - } while (0) - - if (OVS_UNLIKELY(force || vlog_is_enabled(&this_module, vll))) { - if (c->key.dl_type == htons(ETH_TYPE_IP)) { - CT_VLOG(rl_on, vll, "%s: src ip "IP_FMT" dst ip "IP_FMT" rev src " - "ip "IP_FMT" rev dst ip "IP_FMT" src/dst ports " - "%"PRIu16"/%"PRIu16" rev src/dst ports " - "%"PRIu16"/%"PRIu16" zone/rev zone " - "%"PRIu16"/%"PRIu16" nw_proto/rev nw_proto " - "%"PRIu8"/%"PRIu8, log_msg, - IP_ARGS(c->key.src.addr.ipv4), - IP_ARGS(c->key.dst.addr.ipv4), - IP_ARGS(c->rev_key.src.addr.ipv4), - IP_ARGS(c->rev_key.dst.addr.ipv4), - ntohs(c->key.src.port), ntohs(c->key.dst.port), - ntohs(c->rev_key.src.port), ntohs(c->rev_key.dst.port), - c->key.zone, c->rev_key.zone, c->key.nw_proto, - c->rev_key.nw_proto); - } else { - char ip6_s[INET6_ADDRSTRLEN]; - inet_ntop(AF_INET6, &c->key.src.addr.ipv6, ip6_s, sizeof ip6_s); - char ip6_d[INET6_ADDRSTRLEN]; - inet_ntop(AF_INET6, &c->key.dst.addr.ipv6, ip6_d, sizeof ip6_d); - char ip6_rs[INET6_ADDRSTRLEN]; - inet_ntop(AF_INET6, &c->rev_key.src.addr.ipv6, ip6_rs, - sizeof ip6_rs); - char ip6_rd[INET6_ADDRSTRLEN]; - inet_ntop(AF_INET6, &c->rev_key.dst.addr.ipv6, ip6_rd, - sizeof ip6_rd); - - CT_VLOG(rl_on, vll, "%s: src ip %s dst ip %s rev src ip %s" - " rev dst ip %s src/dst ports %"PRIu16"/%"PRIu16 - " rev src/dst ports %"PRIu16"/%"PRIu16" zone/rev zone " - "%"PRIu16"/%"PRIu16" nw_proto/rev nw_proto " - "%"PRIu8"/%"PRIu8, log_msg, ip6_s, ip6_d, ip6_rs, - ip6_rd, ntohs(c->key.src.port), ntohs(c->key.dst.port), - ntohs(c->rev_key.src.port), ntohs(c->rev_key.dst.port), - c->key.zone, c->rev_key.zone, c->key.nw_proto, - c->rev_key.nw_proto); - } - } -} - /* Initializes the connection tracker 'ct'. 
The caller is responsible for * calling 'conntrack_destroy()', when the instance is not needed anymore */ struct conntrack * @@ -308,7 +254,9 @@ conntrack_init(void) ovs_mutex_init_adaptive(&ct->ct_lock); ovs_mutex_lock(&ct->ct_lock); - cmap_init(&ct->conns); + for (unsigned i = 0; i < ARRAY_SIZE(ct->conns); i++) { + cmap_init(&ct->conns[i]); + } for (unsigned i = 0; i < ARRAY_SIZE(ct->exp_lists); i++) { rculist_init(&ct->exp_lists[i]); } @@ -320,6 +268,7 @@ conntrack_init(void) atomic_count_init(&ct->n_conn, 0); atomic_init(&ct->n_conn_limit, DEFAULT_N_CONN_LIMIT); atomic_init(&ct->tcp_seq_chk, true); + atomic_init(&ct->sweep_ms, 20000); latch_init(&ct->clean_thread_exit); ct->clean_thread = ovs_thread_create("ct_clean", clean_thread_main, ct); ct->ipf = ipf_init(); @@ -451,19 +400,21 @@ zone_limit_clean(struct conntrack *ct, struct zone_limit *zl) } int -zone_limit_delete(struct conntrack *ct, uint16_t zone) +zone_limit_delete(struct conntrack *ct, int32_t zone) { ovs_mutex_lock(&ct->ct_lock); struct zone_limit *zl = zone_limit_lookup_protected(ct, zone); if (zl) { zone_limit_clean(ct, zl); - ovs_mutex_unlock(&ct->ct_lock); - VLOG_INFO("Deleted zone limit for zone %d", zone); - } else { - ovs_mutex_unlock(&ct->ct_lock); - VLOG_INFO("Attempted delete of non-existent zone limit: zone %d", + } + + if (zone != DEFAULT_ZONE) { + VLOG_INFO(zl ? 
"Deleted zone limit for zone %d" + : "Attempted delete of non-existent zone limit: zone %d", zone); } + + ovs_mutex_unlock(&ct->ct_lock); return 0; } @@ -474,28 +425,29 @@ conn_clean__(struct conntrack *ct, struct conn *conn) uint32_t hash; if (conn->alg) { - expectation_clean(ct, &conn->key); + expectation_clean(ct, &conn->key_node[CT_DIR_FWD].key); } - hash = conn_key_hash(&conn->key, ct->hash_basis); - cmap_remove(&ct->conns, &conn->cm_node, hash); + hash = conn_key_hash(&conn->key_node[CT_DIR_FWD].key, ct->hash_basis); + cmap_remove(&ct->conns[conn->key_node[CT_DIR_FWD].key.zone], + &conn->key_node[CT_DIR_FWD].cm_node, hash); - if (conn->nat_conn) { - hash = conn_key_hash(&conn->nat_conn->key, ct->hash_basis); - cmap_remove(&ct->conns, &conn->nat_conn->cm_node, hash); + if (conn->nat_action) { + hash = conn_key_hash(&conn->key_node[CT_DIR_REV].key, + ct->hash_basis); + cmap_remove(&ct->conns[conn->key_node[CT_DIR_REV].key.zone], + &conn->key_node[CT_DIR_REV].cm_node, hash); } rculist_remove(&conn->node); } -/* Must be called with 'conn' of 'conn_type' CT_CONN_TYPE_DEFAULT. Also - * removes the associated nat 'conn' from the lookup datastructures. */ +/* Also removes the associated nat 'conn' from the lookup + datastructures. 
*/ static void conn_clean(struct conntrack *ct, struct conn *conn) OVS_EXCLUDED(conn->lock, ct->ct_lock) { - ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT); - if (atomic_flag_test_and_set(&conn->reclaimed)) { return; } @@ -555,7 +507,9 @@ conntrack_destroy(struct conntrack *ct) ovs_mutex_lock(&ct->ct_lock); - cmap_destroy(&ct->conns); + for (unsigned i = 0; i < ARRAY_SIZE(ct->conns); i++) { + cmap_destroy(&ct->conns[i]); + } cmap_destroy(&ct->zone_limits); cmap_destroy(&ct->timeout_policies); @@ -582,34 +536,39 @@ conn_key_lookup(struct conntrack *ct, const struct conn_key *key, uint32_t hash, long long now, struct conn **conn_out, bool *reply) { - struct conn *conn; + struct conn_key_node *keyn; + struct conn *conn = NULL; bool found = false; - CMAP_FOR_EACH_WITH_HASH (conn, cm_node, hash, &ct->conns) { + CMAP_FOR_EACH_WITH_HASH (keyn, cm_node, hash, &ct->conns[key->zone]) { + if (keyn->dir == CT_DIR_FWD) { + conn = CONTAINER_OF(keyn, struct conn, key_node[CT_DIR_FWD]); + } else { + conn = CONTAINER_OF(keyn, struct conn, key_node[CT_DIR_REV]); + } + if (conn_expired(conn, now)) { continue; } - if (!conn_key_cmp(&conn->key, key)) { - found = true; - if (reply) { - *reply = false; - } - break; - } - if (!conn_key_cmp(&conn->rev_key, key)) { - found = true; - if (reply) { - *reply = true; + + for (int i = CT_DIR_FWD; i < CT_DIRS; i++) { + if (!conn_key_cmp(&conn->key_node[i].key, key)) { + found = true; + if (reply) { + *reply = (i == CT_DIR_REV); + } + goto out_found; } - break; } } +out_found: if (found && conn_out) { *conn_out = conn; } else if (conn_out) { *conn_out = NULL; } + return found; } @@ -643,7 +602,7 @@ write_ct_md(struct dp_packet *pkt, uint16_t zone, const struct conn *conn, if (conn->alg_related) { key = &conn->parent_key; } else { - key = &conn->key; + key = &conn->key_node[CT_DIR_FWD].key; } } else if (alg_exp) { pkt->md.ct_mark = alg_exp->parent_mark; @@ -704,8 +663,7 @@ is_ftp_ctl(const enum ct_alg_ctl_type ct_alg_ctl) } static enum 
ct_alg_ctl_type -get_alg_ctl_type(const struct dp_packet *pkt, ovs_be16 tp_src, ovs_be16 tp_dst, - const char *helper) +get_alg_ctl_type(const struct dp_packet *pkt, const char *helper) { /* CT_IPPORT_FTP/TFTP is used because IPPORT_FTP/TFTP in not defined * in OSX, at least in in.h. Since these values will never change, remove @@ -715,26 +673,24 @@ get_alg_ctl_type(const struct dp_packet *pkt, ovs_be16 tp_src, ovs_be16 tp_dst, uint8_t ip_proto = get_ip_proto(pkt); struct udp_header *uh = dp_packet_l4(pkt); struct tcp_header *th = dp_packet_l4(pkt); - ovs_be16 ftp_src_port = htons(CT_IPPORT_FTP); - ovs_be16 ftp_dst_port = htons(CT_IPPORT_FTP); - ovs_be16 tftp_dst_port = htons(CT_IPPORT_TFTP); + ovs_be16 ftp_port = htons(CT_IPPORT_FTP); + ovs_be16 tftp_port = htons(CT_IPPORT_TFTP); - if (OVS_UNLIKELY(tp_dst)) { - if (helper && !strncmp(helper, "ftp", strlen("ftp"))) { - ftp_dst_port = tp_dst; - } else if (helper && !strncmp(helper, "tftp", strlen("tftp"))) { - tftp_dst_port = tp_dst; + if (helper) { + if ((ip_proto == IPPROTO_TCP) && + !strncmp(helper, "ftp", strlen("ftp"))) { + return CT_ALG_CTL_FTP; } - } else if (OVS_UNLIKELY(tp_src)) { - if (helper && !strncmp(helper, "ftp", strlen("ftp"))) { - ftp_src_port = tp_src; + if ((ip_proto == IPPROTO_UDP) && + !strncmp(helper, "tftp", strlen("tftp"))) { + return CT_ALG_CTL_TFTP; } } - if (ip_proto == IPPROTO_UDP && uh->udp_dst == tftp_dst_port) { + if (ip_proto == IPPROTO_UDP && uh->udp_dst == tftp_port) { return CT_ALG_CTL_TFTP; } else if (ip_proto == IPPROTO_TCP && - (th->tcp_src == ftp_src_port || th->tcp_dst == ftp_dst_port)) { + (th->tcp_src == ftp_port || th->tcp_dst == ftp_port)) { return CT_ALG_CTL_FTP; } return CT_ALG_CTL_NONE; @@ -764,109 +720,61 @@ handle_alg_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx, } static void -pat_packet(struct dp_packet *pkt, const struct conn *conn) +pat_packet(struct dp_packet *pkt, const struct conn_key *key) { - if (conn->nat_action & NAT_ACTION_SRC) { - if 
(conn->key.nw_proto == IPPROTO_TCP) { - struct tcp_header *th = dp_packet_l4(pkt); - packet_set_tcp_port(pkt, conn->rev_key.dst.port, th->tcp_dst); - } else if (conn->key.nw_proto == IPPROTO_UDP) { - struct udp_header *uh = dp_packet_l4(pkt); - packet_set_udp_port(pkt, conn->rev_key.dst.port, uh->udp_dst); - } - } else if (conn->nat_action & NAT_ACTION_DST) { - if (conn->key.nw_proto == IPPROTO_TCP) { - packet_set_tcp_port(pkt, conn->rev_key.dst.port, - conn->rev_key.src.port); - } else if (conn->key.nw_proto == IPPROTO_UDP) { - packet_set_udp_port(pkt, conn->rev_key.dst.port, - conn->rev_key.src.port); - } + if (key->nw_proto == IPPROTO_TCP) { + packet_set_tcp_port(pkt, key->dst.port, key->src.port); + } else if (key->nw_proto == IPPROTO_UDP) { + packet_set_udp_port(pkt, key->dst.port, key->src.port); + } else if (key->nw_proto == IPPROTO_SCTP) { + packet_set_sctp_port(pkt, key->dst.port, key->src.port); } } -static void -nat_packet(struct dp_packet *pkt, const struct conn *conn, bool related) +static uint16_t +nat_action_reverse(uint16_t nat_action) { - if (conn->nat_action & NAT_ACTION_SRC) { - pkt->md.ct_state |= CS_SRC_NAT; - if (conn->key.dl_type == htons(ETH_TYPE_IP)) { - struct ip_header *nh = dp_packet_l3(pkt); - packet_set_ipv4_addr(pkt, &nh->ip_src, - conn->rev_key.dst.addr.ipv4); - } else { - struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt); - packet_set_ipv6_addr(pkt, conn->key.nw_proto, - nh6->ip6_src.be32, - &conn->rev_key.dst.addr.ipv6, true); - } - if (!related) { - pat_packet(pkt, conn); - } - } else if (conn->nat_action & NAT_ACTION_DST) { - pkt->md.ct_state |= CS_DST_NAT; - if (conn->key.dl_type == htons(ETH_TYPE_IP)) { - struct ip_header *nh = dp_packet_l3(pkt); - packet_set_ipv4_addr(pkt, &nh->ip_dst, - conn->rev_key.src.addr.ipv4); - } else { - struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt); - packet_set_ipv6_addr(pkt, conn->key.nw_proto, - nh6->ip6_dst.be32, - &conn->rev_key.src.addr.ipv6, true); - } - if (!related) { - 
pat_packet(pkt, conn); - } + if (nat_action & NAT_ACTION_SRC) { + nat_action ^= NAT_ACTION_SRC; + nat_action |= NAT_ACTION_DST; + } else if (nat_action & NAT_ACTION_DST) { + nat_action ^= NAT_ACTION_DST; + nat_action |= NAT_ACTION_SRC; } + return nat_action; } static void -un_pat_packet(struct dp_packet *pkt, const struct conn *conn) +nat_packet_ipv4(struct dp_packet *pkt, const struct conn_key *key, + uint16_t nat_action) { - if (conn->nat_action & NAT_ACTION_SRC) { - if (conn->key.nw_proto == IPPROTO_TCP) { - struct tcp_header *th = dp_packet_l4(pkt); - packet_set_tcp_port(pkt, th->tcp_src, conn->key.src.port); - } else if (conn->key.nw_proto == IPPROTO_UDP) { - struct udp_header *uh = dp_packet_l4(pkt); - packet_set_udp_port(pkt, uh->udp_src, conn->key.src.port); - } - } else if (conn->nat_action & NAT_ACTION_DST) { - if (conn->key.nw_proto == IPPROTO_TCP) { - packet_set_tcp_port(pkt, conn->key.dst.port, conn->key.src.port); - } else if (conn->key.nw_proto == IPPROTO_UDP) { - packet_set_udp_port(pkt, conn->key.dst.port, conn->key.src.port); - } + struct ip_header *nh = dp_packet_l3(pkt); + + if (nat_action & NAT_ACTION_SRC) { + packet_set_ipv4_addr(pkt, &nh->ip_src, key->dst.addr.ipv4); + } else if (nat_action & NAT_ACTION_DST) { + packet_set_ipv4_addr(pkt, &nh->ip_dst, key->src.addr.ipv4); } } static void -reverse_pat_packet(struct dp_packet *pkt, const struct conn *conn) +nat_packet_ipv6(struct dp_packet *pkt, const struct conn_key *key, + uint16_t nat_action) { - if (conn->nat_action & NAT_ACTION_SRC) { - if (conn->key.nw_proto == IPPROTO_TCP) { - struct tcp_header *th_in = dp_packet_l4(pkt); - packet_set_tcp_port(pkt, conn->key.src.port, - th_in->tcp_dst); - } else if (conn->key.nw_proto == IPPROTO_UDP) { - struct udp_header *uh_in = dp_packet_l4(pkt); - packet_set_udp_port(pkt, conn->key.src.port, - uh_in->udp_dst); - } - } else if (conn->nat_action & NAT_ACTION_DST) { - if (conn->key.nw_proto == IPPROTO_TCP) { - packet_set_tcp_port(pkt, conn->key.src.port, 
- conn->key.dst.port); - } else if (conn->key.nw_proto == IPPROTO_UDP) { - packet_set_udp_port(pkt, conn->key.src.port, - conn->key.dst.port); - } + struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt); + + if (nat_action & NAT_ACTION_SRC) { + packet_set_ipv6_addr(pkt, key->nw_proto, nh6->ip6_src.be32, + &key->dst.addr.ipv6, true); + } else if (nat_action & NAT_ACTION_DST) { + packet_set_ipv6_addr(pkt, key->nw_proto, nh6->ip6_dst.be32, + &key->src.addr.ipv6, true); } } static void -reverse_nat_packet(struct dp_packet *pkt, const struct conn *conn) +nat_inner_packet(struct dp_packet *pkt, struct conn_key *key, + uint16_t nat_action) { char *tail = dp_packet_tail(pkt); uint16_t pad = dp_packet_l2_pad_size(pkt); @@ -875,98 +783,78 @@ reverse_nat_packet(struct dp_packet *pkt, const struct conn *conn) uint16_t orig_l3_ofs = pkt->l3_ofs; uint16_t orig_l4_ofs = pkt->l4_ofs; - if (conn->key.dl_type == htons(ETH_TYPE_IP)) { - struct ip_header *nh = dp_packet_l3(pkt); - struct icmp_header *icmp = dp_packet_l4(pkt); - struct ip_header *inner_l3 = (struct ip_header *) (icmp + 1); - /* This call is already verified to succeed during the code path from - * 'conn_key_extract()' which calls 'extract_l4_icmp()'. */ - extract_l3_ipv4(&inner_key, inner_l3, tail - ((char *)inner_l3) - pad, + void *l3 = dp_packet_l3(pkt); + void *l4 = dp_packet_l4(pkt); + void *inner_l3; + /* These calls are already verified to succeed during the code path from + * 'conn_key_extract()' which calls + * 'extract_l4_icmp()'/'extract_l4_icmp6()'. 
*/ + if (key->dl_type == htons(ETH_TYPE_IP)) { + inner_l3 = (char *) l4 + sizeof(struct icmp_header); + extract_l3_ipv4(&inner_key, inner_l3, tail - ((char *) inner_l3) - pad, &inner_l4, false); - pkt->l3_ofs += (char *) inner_l3 - (char *) nh; - pkt->l4_ofs += inner_l4 - (char *) icmp; + } else { + inner_l3 = (char *) l4 + sizeof(struct icmp6_data_header); + extract_l3_ipv6(&inner_key, inner_l3, tail - ((char *) inner_l3) - pad, + &inner_l4); + } + pkt->l3_ofs += (char *) inner_l3 - (char *) l3; + pkt->l4_ofs += inner_l4 - (char *) l4; - if (conn->nat_action & NAT_ACTION_SRC) { - packet_set_ipv4_addr(pkt, &inner_l3->ip_src, - conn->key.src.addr.ipv4); - } else if (conn->nat_action & NAT_ACTION_DST) { - packet_set_ipv4_addr(pkt, &inner_l3->ip_dst, - conn->key.dst.addr.ipv4); - } + /* Reverse the key for inner packet. */ + struct conn_key rev_key = *key; + conn_key_reverse(&rev_key); - reverse_pat_packet(pkt, conn); + pat_packet(pkt, &rev_key); + + if (key->dl_type == htons(ETH_TYPE_IP)) { + nat_packet_ipv4(pkt, &rev_key, nat_action); + + struct icmp_header *icmp = (struct icmp_header *) l4; icmp->icmp_csum = 0; icmp->icmp_csum = csum(icmp, tail - (char *) icmp - pad); } else { - struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt); - struct icmp6_data_header *icmp6 = dp_packet_l4(pkt); - struct ovs_16aligned_ip6_hdr *inner_l3_6 = - (struct ovs_16aligned_ip6_hdr *) (icmp6 + 1); - /* This call is already verified to succeed during the code path from - * 'conn_key_extract()' which calls 'extract_l4_icmp6()'. 
*/ - extract_l3_ipv6(&inner_key, inner_l3_6, - tail - ((char *)inner_l3_6) - pad, - &inner_l4); - pkt->l3_ofs += (char *) inner_l3_6 - (char *) nh6; - pkt->l4_ofs += inner_l4 - (char *) icmp6; - - if (conn->nat_action & NAT_ACTION_SRC) { - packet_set_ipv6_addr(pkt, conn->key.nw_proto, - inner_l3_6->ip6_src.be32, - &conn->key.src.addr.ipv6, true); - } else if (conn->nat_action & NAT_ACTION_DST) { - packet_set_ipv6_addr(pkt, conn->key.nw_proto, - inner_l3_6->ip6_dst.be32, - &conn->key.dst.addr.ipv6, true); - } - reverse_pat_packet(pkt, conn); + nat_packet_ipv6(pkt, &rev_key, nat_action); + + struct icmp6_data_header *icmp6 = (struct icmp6_data_header *) l4; icmp6->icmp6_base.icmp6_cksum = 0; - icmp6->icmp6_base.icmp6_cksum = packet_csum_upperlayer6(nh6, icmp6, - IPPROTO_ICMPV6, tail - (char *) icmp6 - pad); + icmp6->icmp6_base.icmp6_cksum = + packet_csum_upperlayer6(l3, icmp6, IPPROTO_ICMPV6, + tail - (char *) icmp6 - pad); } + pkt->l3_ofs = orig_l3_ofs; pkt->l4_ofs = orig_l4_ofs; } static void -un_nat_packet(struct dp_packet *pkt, const struct conn *conn, - bool related) +nat_packet(struct dp_packet *pkt, struct conn *conn, bool reply, bool related) { - if (conn->nat_action & NAT_ACTION_SRC) { - pkt->md.ct_state |= CS_DST_NAT; - if (conn->key.dl_type == htons(ETH_TYPE_IP)) { - struct ip_header *nh = dp_packet_l3(pkt); - packet_set_ipv4_addr(pkt, &nh->ip_dst, - conn->key.src.addr.ipv4); - } else { - struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt); - packet_set_ipv6_addr(pkt, conn->key.nw_proto, - nh6->ip6_dst.be32, - &conn->key.src.addr.ipv6, true); - } + enum key_dir dir = reply ? CT_DIR_FWD : CT_DIR_REV; + struct conn_key *key = &conn->key_node[dir].key; + uint16_t nat_action = reply ? nat_action_reverse(conn->nat_action) + : conn->nat_action; - if (OVS_UNLIKELY(related)) { - reverse_nat_packet(pkt, conn); - } else { - un_pat_packet(pkt, conn); - } - } else if (conn->nat_action & NAT_ACTION_DST) { + /* Update ct_state. 
*/ + if (nat_action & NAT_ACTION_SRC) { pkt->md.ct_state |= CS_SRC_NAT; - if (conn->key.dl_type == htons(ETH_TYPE_IP)) { - struct ip_header *nh = dp_packet_l3(pkt); - packet_set_ipv4_addr(pkt, &nh->ip_src, - conn->key.dst.addr.ipv4); - } else { - struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt); - packet_set_ipv6_addr(pkt, conn->key.nw_proto, - nh6->ip6_src.be32, - &conn->key.dst.addr.ipv6, true); - } + } else if (nat_action & NAT_ACTION_DST) { + pkt->md.ct_state |= CS_DST_NAT; + } + + /* Reverse the key for outer header. */ + if (key->dl_type == htons(ETH_TYPE_IP)) { + nat_packet_ipv4(pkt, key, nat_action); + } else { + nat_packet_ipv6(pkt, key, nat_action); + } + if (nat_action & NAT_ACTION_SRC || nat_action & NAT_ACTION_DST) { if (OVS_UNLIKELY(related)) { - reverse_nat_packet(pkt, conn); + nat_action = nat_action_reverse(nat_action); + nat_inner_packet(pkt, key, nat_action); } else { - un_pat_packet(pkt, conn); + pat_packet(pkt, key); } } } @@ -977,7 +865,7 @@ conn_seq_skew_set(struct conntrack *ct, const struct conn *conn_in, { struct conn *conn; - conn_lookup(ct, &conn_in->key, now, &conn, NULL); + conn_lookup(ct, &conn_in->key_node[CT_DIR_FWD].key, now, &conn, NULL); if (conn && seq_skew) { conn->seq_skew = seq_skew; conn->seq_skew_dir = seq_skew_dir; @@ -1013,7 +901,6 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt, OVS_REQUIRES(ct->ct_lock) { struct conn *nc = NULL; - struct conn *nat_conn = NULL; if (!valid_new(pkt, &ctx->key)) { pkt->md.ct_state = CS_INVALID; @@ -1027,6 +914,7 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt, } if (commit) { + struct conn_key_node *fwd_key_node, *rev_key_node; struct zone_limit *zl = zone_limit_lookup_or_default(ct, ctx->key.zone); if (zl && atomic_count_get(&zl->czl.count) >= zl->czl.limit) { @@ -1041,9 +929,12 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt, } nc = new_conn(ct, pkt, &ctx->key, now, tp_id); - memcpy(&nc->key, &ctx->key, sizeof nc->key); - memcpy(&nc->rev_key, 
&nc->key, sizeof nc->rev_key); - conn_key_reverse(&nc->rev_key); + fwd_key_node = &nc->key_node[CT_DIR_FWD]; + rev_key_node = &nc->key_node[CT_DIR_REV]; + memcpy(&fwd_key_node->key, &ctx->key, sizeof fwd_key_node->key); + memcpy(&rev_key_node->key, &fwd_key_node->key, + sizeof rev_key_node->key); + conn_key_reverse(&rev_key_node->key); if (ct_verify_helper(helper, ct_alg_ctl)) { nc->alg = nullable_xstrdup(helper); @@ -1056,58 +947,53 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt, nc->parent_key = alg_exp->parent_key; } + ovs_mutex_init_adaptive(&nc->lock); + atomic_flag_clear(&nc->reclaimed); + fwd_key_node->dir = CT_DIR_FWD; + rev_key_node->dir = CT_DIR_REV; + + if (zl) { + nc->admit_zone = zl->czl.zone; + nc->zone_limit_seq = zl->czl.zone_limit_seq; + } else { + nc->admit_zone = INVALID_ZONE; + } + if (nat_action_info) { nc->nat_action = nat_action_info->nat_action; - nat_conn = xzalloc(sizeof *nat_conn); if (alg_exp) { if (alg_exp->nat_rpl_dst) { - nc->rev_key.dst.addr = alg_exp->alg_nat_repl_addr; + rev_key_node->key.dst.addr = alg_exp->alg_nat_repl_addr; nc->nat_action = NAT_ACTION_SRC; } else { - nc->rev_key.src.addr = alg_exp->alg_nat_repl_addr; + rev_key_node->key.src.addr = alg_exp->alg_nat_repl_addr; nc->nat_action = NAT_ACTION_DST; } } else { - memcpy(nat_conn, nc, sizeof *nat_conn); - bool nat_res = nat_get_unique_tuple(ct, nc, nat_conn, - nat_action_info); - + bool nat_res = nat_get_unique_tuple(ct, nc, nat_action_info); if (!nat_res) { goto nat_res_exhaustion; } - - /* Update nc with nat adjustments made to nat_conn by - * nat_get_unique_tuple(). 
*/ - memcpy(nc, nat_conn, sizeof *nc); } - nat_packet(pkt, nc, ctx->icmp_related); - memcpy(&nat_conn->key, &nc->rev_key, sizeof nat_conn->key); - memcpy(&nat_conn->rev_key, &nc->key, sizeof nat_conn->rev_key); - nat_conn->conn_type = CT_CONN_TYPE_UN_NAT; - nat_conn->nat_action = 0; - nat_conn->alg = NULL; - nat_conn->nat_conn = NULL; - uint32_t nat_hash = conn_key_hash(&nat_conn->key, ct->hash_basis); - cmap_insert(&ct->conns, &nat_conn->cm_node, nat_hash); + nat_packet(pkt, nc, false, ctx->icmp_related); + uint32_t rev_hash = conn_key_hash(&rev_key_node->key, + ct->hash_basis); + cmap_insert(&ct->conns[ctx->key.zone], + &rev_key_node->cm_node, rev_hash); } - nc->nat_conn = nat_conn; - ovs_mutex_init_adaptive(&nc->lock); - nc->conn_type = CT_CONN_TYPE_DEFAULT; - atomic_flag_clear(&nc->reclaimed); - cmap_insert(&ct->conns, &nc->cm_node, ctx->hash); + cmap_insert(&ct->conns[ctx->key.zone], + &fwd_key_node->cm_node, ctx->hash); conn_expire_push_front(ct, nc); atomic_count_inc(&ct->n_conn); - ctx->conn = nc; /* For completeness. */ + if (zl) { - nc->admit_zone = zl->czl.zone; - nc->zone_limit_seq = zl->czl.zone_limit_seq; atomic_count_inc(&zl->czl.count); - } else { - nc->admit_zone = INVALID_ZONE; } + + ctx->conn = nc; /* For completeness. */ } return nc; @@ -1118,7 +1004,6 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt, * firewall rules or a separate firewall. Also using zone partitioning * can limit DoS impact. 
*/ nat_res_exhaustion: - free(nat_conn); delete_conn__(nc); static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); VLOG_WARN_RL(&rl, "Unable to NAT due to tuple space exhaustion - " @@ -1131,7 +1016,6 @@ conn_update_state(struct conntrack *ct, struct dp_packet *pkt, struct conn_lookup_ctx *ctx, struct conn *conn, long long now) { - ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT); bool create_new_conn = false; if (ctx->icmp_related) { @@ -1158,7 +1042,8 @@ conn_update_state(struct conntrack *ct, struct dp_packet *pkt, pkt->md.ct_state = CS_INVALID; break; case CT_UPDATE_NEW: - if (conn_lookup(ct, &conn->key, now, NULL, NULL)) { + if (conn_lookup(ct, &conn->key_node[CT_DIR_FWD].key, + now, NULL, NULL)) { conn_force_expire(conn); } create_new_conn = true; @@ -1185,11 +1070,8 @@ handle_nat(struct dp_packet *pkt, struct conn *conn, if (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) { pkt->md.ct_state &= ~(CS_SRC_NAT | CS_DST_NAT); } - if (reply) { - un_nat_packet(pkt, conn, related); - } else { - nat_packet(pkt, conn, related); - } + + nat_packet(pkt, conn, reply, related); } } @@ -1337,8 +1219,10 @@ initial_conn_lookup(struct conntrack *ct, struct conn_lookup_ctx *ctx, if (natted) { if (OVS_LIKELY(ctx->conn)) { + enum key_dir dir; ctx->reply = !ctx->reply; - ctx->key = ctx->reply ? ctx->conn->rev_key : ctx->conn->key; + dir = ctx->reply ? CT_DIR_REV : CT_DIR_FWD; + ctx->key = ctx->conn->key_node[dir].key; ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis); } else { /* A lookup failure does not necessarily imply that an @@ -1356,8 +1240,7 @@ process_one(struct conntrack *ct, struct dp_packet *pkt, bool force, bool commit, long long now, const uint32_t *setmark, const struct ovs_key_ct_labels *setlabel, const struct nat_action_info_t *nat_action_info, - ovs_be16 tp_src, ovs_be16 tp_dst, const char *helper, - uint32_t tp_id) + const char *helper, uint32_t tp_id) { /* Reset ct_state whenever entering a new zone. 
*/ if (pkt->md.ct_state && pkt->md.ct_zone != zone) { @@ -1371,33 +1254,18 @@ process_one(struct conntrack *ct, struct dp_packet *pkt, /* Delete found entry if in wrong direction. 'force' implies commit. */ if (OVS_UNLIKELY(force && ctx->reply && conn)) { - if (conn_lookup(ct, &conn->key, now, NULL, NULL)) { + if (conn_lookup(ct, &conn->key_node[CT_DIR_FWD].key, + now, NULL, NULL)) { conn_force_expire(conn); } conn = NULL; } - if (OVS_LIKELY(conn)) { - if (conn->conn_type == CT_CONN_TYPE_UN_NAT) { - - ctx->reply = true; - struct conn *rev_conn = conn; /* Save for debugging. */ - uint32_t hash = conn_key_hash(&conn->rev_key, ct->hash_basis); - conn_key_lookup(ct, &ctx->key, hash, now, &conn, &ctx->reply); - - if (!conn) { - pkt->md.ct_state |= CS_INVALID; - write_ct_md(pkt, zone, NULL, NULL, NULL); - char *log_msg = xasprintf("Missing parent conn %p", rev_conn); - ct_print_conn_info(rev_conn, log_msg, VLL_INFO, true, true); - free(log_msg); - return; - } - } + if (conn && helper == NULL) { + helper = conn->alg; } - enum ct_alg_ctl_type ct_alg_ctl = get_alg_ctl_type(pkt, tp_src, tp_dst, - helper); + enum ct_alg_ctl_type ct_alg_ctl = get_alg_ctl_type(pkt, helper); if (OVS_LIKELY(conn)) { if (OVS_LIKELY(!conn_update_state_alg(ct, pkt, ctx, conn, @@ -1474,7 +1342,7 @@ conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch, ovs_be16 dl_type, bool force, bool commit, uint16_t zone, const uint32_t *setmark, const struct ovs_key_ct_labels *setlabel, - ovs_be16 tp_src, ovs_be16 tp_dst, const char *helper, + const char *helper, const struct nat_action_info_t *nat_action_info, long long now, uint32_t tp_id) { @@ -1486,10 +1354,16 @@ conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch, DP_PACKET_BATCH_FOR_EACH (i, packet, pkt_batch) { struct conn *conn = packet->md.conn; + + if (helper == NULL && conn != NULL) { + helper = conn->alg; + } + if (OVS_UNLIKELY(packet->md.ct_state == CS_INVALID)) { write_ct_md(packet, zone, NULL, NULL, NULL); 
- } else if (conn && conn->key.zone == zone && !force - && !get_alg_ctl_type(packet, tp_src, tp_dst, helper)) { + } else if (conn && + conn->key_node[CT_DIR_FWD].key.zone == zone && !force && + !get_alg_ctl_type(packet, helper)) { process_one_fast(zone, setmark, setlabel, nat_action_info, conn, packet); } else if (OVS_UNLIKELY(!conn_key_extract(ct, packet, dl_type, &ctx, @@ -1498,8 +1372,7 @@ conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch, write_ct_md(packet, zone, NULL, NULL, NULL); } else { process_one(ct, packet, &ctx, zone, force, commit, now, setmark, - setlabel, nat_action_info, tp_src, tp_dst, helper, - tp_id); + setlabel, nat_action_info, helper, tp_id); } } @@ -1554,6 +1427,21 @@ set_label(struct dp_packet *pkt, struct conn *conn, } +int +conntrack_set_sweep_interval(struct conntrack *ct, uint32_t ms) +{ + atomic_store_relaxed(&ct->sweep_ms, ms); + return 0; +} + +uint32_t +conntrack_get_sweep_interval(struct conntrack *ct) +{ + uint32_t ms; + atomic_read_relaxed(&ct->sweep_ms, &ms); + return ms; +} + static size_t ct_sweep(struct conntrack *ct, struct rculist *list, long long now) OVS_NO_THREAD_SAFETY_ANALYSIS @@ -1578,7 +1466,7 @@ ct_sweep(struct conntrack *ct, struct rculist *list, long long now) static long long conntrack_clean(struct conntrack *ct, long long now) { - long long next_wakeup = now + 20 * 1000; + long long next_wakeup = now + conntrack_get_sweep_interval(ct); unsigned int n_conn_limit, i; size_t clean_end, count = 0; @@ -1586,12 +1474,12 @@ conntrack_clean(struct conntrack *ct, long long now) clean_end = n_conn_limit / 64; for (i = ct->next_sweep; i < N_EXP_LISTS; i++) { - count += ct_sweep(ct, &ct->exp_lists[i], now); - if (count > clean_end) { next_wakeup = 0; break; } + + count += ct_sweep(ct, &ct->exp_lists[i], now); } ct->next_sweep = (i < N_EXP_LISTS) ? 
i : 0; @@ -1691,8 +1579,8 @@ extract_l3_ipv6(struct conn_key *key, const void *data, size_t size, uint8_t nw_proto = ip6->ip6_nxt; uint8_t nw_frag = 0; - const struct ovs_16aligned_ip6_frag *frag_hdr; - if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag, &frag_hdr)) { + if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag, + NULL, NULL)) { return false; } @@ -1733,6 +1621,26 @@ checksum_valid(const struct conn_key *key, const void *data, size_t size, return valid; } +static inline bool +sctp_checksum_valid(const void *data, size_t size) +{ + struct sctp_header *sctp = (struct sctp_header *) data; + ovs_be32 rcvd_csum, csum; + bool ret; + + rcvd_csum = get_16aligned_be32(&sctp->sctp_csum); + put_16aligned_be32(&sctp->sctp_csum, 0); + csum = crc32c(data, size); + put_16aligned_be32(&sctp->sctp_csum, rcvd_csum); + + ret = (rcvd_csum == csum); + if (!ret) { + COVERAGE_INC(conntrack_l4csum_err); + } + + return ret; +} + static inline bool check_l4_tcp(const struct conn_key *key, const void *data, size_t size, const void *l3, bool validate_checksum) @@ -1769,6 +1677,47 @@ check_l4_udp(const struct conn_key *key, const void *data, size_t size, || (validate_checksum ? checksum_valid(key, data, size, l3) : true); } +static inline bool +sctp_check_len(const struct sctp_header *sh, size_t size) +{ + const struct sctp_chunk_header *sch; + size_t next; + + if (size < SCTP_HEADER_LEN) { + return false; + } + + /* rfc4960: Chunks (including Type, Length, and Value fields) are padded + * out by the sender with all zero bytes to be a multiple of 4 bytes long. + */ + for (next = sizeof(struct sctp_header), + sch = SCTP_NEXT_CHUNK(sh, next); + next < size; + next += ROUND_UP(ntohs(sch->length), 4), + sch = SCTP_NEXT_CHUNK(sh, next)) { + /* rfc4960: This value represents the size of the chunk in bytes, + * including the Chunk Type, Chunk Flags, Chunk Length, and Chunk Value + * fields. 
+ * Therefore, if the Chunk Value field is zero-length, the Length + * field will be set to 4. */ + if (ntohs(sch->length) < sizeof *sch) { + return false; + } + } + + return (next == size); +} + +static inline bool +check_l4_sctp(const void *data, size_t size, bool validate_checksum) +{ + if (OVS_UNLIKELY(!sctp_check_len(data, size))) { + return false; + } + + return validate_checksum ? sctp_checksum_valid(data, size) : true; +} + static inline bool check_l4_icmp(const void *data, size_t size, bool validate_checksum) { @@ -1819,6 +1768,21 @@ extract_l4_udp(struct conn_key *key, const void *data, size_t size, return key->src.port && key->dst.port; } +static inline bool +extract_l4_sctp(struct conn_key *key, const void *data, size_t size, + size_t *chk_len) +{ + if (OVS_UNLIKELY(size < (chk_len ? *chk_len : SCTP_HEADER_LEN))) { + return false; + } + + const struct sctp_header *sctp = data; + key->src.port = sctp->sctp_src; + key->dst.port = sctp->sctp_dst; + + return key->src.port && key->dst.port; +} + static inline bool extract_l4(struct conn_key *key, const void *data, size_t size, bool *related, const void *l3, bool validate_checksum, size_t *chk_len); @@ -2034,6 +1998,9 @@ extract_l4(struct conn_key *key, const void *data, size_t size, bool *related, return (!related || check_l4_udp(key, data, size, l3, validate_checksum)) && extract_l4_udp(key, data, size, chk_len); + } else if (key->nw_proto == IPPROTO_SCTP) { + return (!related || check_l4_sctp(data, size, validate_checksum)) + && extract_l4_sctp(key, data, size, chk_len); } else if (key->dl_type == htons(ETH_TYPE_IP) && key->nw_proto == IPPROTO_ICMP) { return (!related || check_l4_icmp(data, size, validate_checksum)) @@ -2101,16 +2068,15 @@ conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type, ctx->key.dl_type = dl_type; if (ctx->key.dl_type == htons(ETH_TYPE_IP)) { - bool hwol_bad_l3_csum = dp_packet_ip_checksum_bad(pkt); - if (hwol_bad_l3_csum) { + if 
(dp_packet_ip_checksum_bad(pkt)) { ok = false; COVERAGE_INC(conntrack_l3csum_err); } else { - bool hwol_good_l3_csum = dp_packet_ip_checksum_valid(pkt) - || dp_packet_hwol_is_ipv4(pkt); - /* Validate the checksum only when hwol is not supported. */ + /* Validate the checksum only when hwol is not supported and the + * packet's checksum status is not known. */ ok = extract_l3_ipv4(&ctx->key, l3, dp_packet_l3_size(pkt), NULL, - !hwol_good_l3_csum); + !dp_packet_hwol_is_ipv4(pkt) && + !dp_packet_ip_checksum_good(pkt)); } } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) { ok = extract_l3_ipv6(&ctx->key, l3, dp_packet_l3_size(pkt), NULL); @@ -2119,13 +2085,12 @@ conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type, } if (ok) { - bool hwol_bad_l4_csum = dp_packet_l4_checksum_bad(pkt); - if (!hwol_bad_l4_csum) { - bool hwol_good_l4_csum = dp_packet_l4_checksum_valid(pkt) - || dp_packet_hwol_tx_l4_checksum(pkt); + if (!dp_packet_l4_checksum_bad(pkt)) { /* Validate the checksum only when hwol is not supported. 
*/ if (extract_l4(&ctx->key, l4, dp_packet_l4_size(pkt), - &ctx->icmp_related, l3, !hwol_good_l4_csum, + &ctx->icmp_related, l3, + !dp_packet_l4_checksum_good(pkt) && + !dp_packet_hwol_tx_l4_checksum(pkt), NULL)) { ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis); return true; @@ -2246,22 +2211,26 @@ nat_ipv6_addr_increment(struct in6_addr *ipv6, uint32_t increment) } static uint32_t -nat_range_hash(const struct conn *conn, uint32_t basis, +nat_range_hash(const struct conn_key *key, uint32_t basis, const struct nat_action_info_t *nat_info) { uint32_t hash = basis; + if (!basis) { + hash = ct_addr_hash_add(hash, &key->src.addr); + } else { + hash = ct_endpoint_hash_add(hash, &key->src); + hash = ct_endpoint_hash_add(hash, &key->dst); + } + hash = ct_addr_hash_add(hash, &nat_info->min_addr); hash = ct_addr_hash_add(hash, &nat_info->max_addr); hash = hash_add(hash, ((uint32_t) nat_info->max_port << 16) | nat_info->min_port); - hash = ct_endpoint_hash_add(hash, &conn->key.src); - hash = ct_endpoint_hash_add(hash, &conn->key.dst); - hash = hash_add(hash, (OVS_FORCE uint32_t) conn->key.dl_type); - hash = hash_add(hash, conn->key.nw_proto); - hash = hash_add(hash, conn->key.zone); - + hash = hash_add(hash, (OVS_FORCE uint32_t) key->dl_type); + hash = hash_add(hash, key->nw_proto); + hash = hash_add(hash, key->zone); /* The purpose of the second parameter is to distinguish hashes of data of * different length; our data always has the same length so there is no * value in counting. */ @@ -2271,7 +2240,7 @@ nat_range_hash(const struct conn *conn, uint32_t basis, /* Ports are stored in host byte order for convenience. 
*/ static void set_sport_range(const struct nat_action_info_t *ni, const struct conn_key *k, - uint32_t hash, uint16_t *curr, uint16_t *min, + uint32_t off, uint16_t *curr, uint16_t *min, uint16_t *max) { if (((ni->nat_action & NAT_ACTION_SNAT_ALL) == NAT_ACTION_SRC) || @@ -2290,19 +2259,19 @@ set_sport_range(const struct nat_action_info_t *ni, const struct conn_key *k, } else { *min = ni->min_port; *max = ni->max_port; - *curr = *min + (hash % ((*max - *min) + 1)); + *curr = *min + (off % ((*max - *min) + 1)); } } static void set_dport_range(const struct nat_action_info_t *ni, const struct conn_key *k, - uint32_t hash, uint16_t *curr, uint16_t *min, + uint32_t off, uint16_t *curr, uint16_t *min, uint16_t *max) { if (ni->nat_action & NAT_ACTION_DST_PORT) { *min = ni->min_port; *max = ni->max_port; - *curr = *min + (hash % ((*max - *min) + 1)); + *curr = *min + (off % ((*max - *min) + 1)); } else { *curr = ntohs(k->dst.port); *min = *max = *curr; @@ -2334,19 +2303,21 @@ get_addr_in_range(union ct_addr *min, union ct_addr *max, } static void -find_addr(const struct conn *conn, union ct_addr *min, +find_addr(const struct conn_key *key, union ct_addr *min, union ct_addr *max, union ct_addr *curr, uint32_t hash, bool ipv4, const struct nat_action_info_t *nat_info) { - const union ct_addr zero_ip = {0}; + union ct_addr zero_ip; + + memset(&zero_ip, 0, sizeof zero_ip); /* All-zero case. 
*/ if (!memcmp(min, &zero_ip, sizeof *min)) { if (nat_info->nat_action & NAT_ACTION_SRC) { - *curr = conn->key.src.addr; + *curr = key->src.addr; } else if (nat_info->nat_action & NAT_ACTION_DST) { - *curr = conn->key.dst.addr; + *curr = key->dst.addr; } } else { get_addr_in_range(min, max, curr, hash, ipv4); @@ -2365,7 +2336,7 @@ store_addr_to_key(union ct_addr *addr, struct conn_key *key, } static bool -nat_get_unique_l4(struct conntrack *ct, struct conn *nat_conn, +nat_get_unique_l4(struct conntrack *ct, struct conn_key *rev_key, ovs_be16 *port, uint16_t curr, uint16_t min, uint16_t max) { @@ -2388,8 +2359,7 @@ nat_get_unique_l4(struct conntrack *ct, struct conn *nat_conn, } *port = htons(curr); - if (!conn_lookup(ct, &nat_conn->rev_key, - time_msec(), NULL, NULL)) { + if (!conn_lookup(ct, rev_key, time_msec(), NULL, NULL)) { return true; } } @@ -2427,53 +2397,64 @@ nat_get_unique_l4(struct conntrack *ct, struct conn *nat_conn, * * If none can be found, return exhaustion to the caller. 
*/ static bool -nat_get_unique_tuple(struct conntrack *ct, const struct conn *conn, - struct conn *nat_conn, +nat_get_unique_tuple(struct conntrack *ct, struct conn *conn, const struct nat_action_info_t *nat_info) { - uint32_t hash = nat_range_hash(conn, ct->hash_basis, nat_info); - union ct_addr min_addr = {0}, max_addr = {0}, addr = {0}; - bool pat_proto = conn->key.nw_proto == IPPROTO_TCP || - conn->key.nw_proto == IPPROTO_UDP; + struct conn_key *fwd_key = &conn->key_node[CT_DIR_FWD].key; + struct conn_key *rev_key = &conn->key_node[CT_DIR_REV].key; + bool pat_proto = fwd_key->nw_proto == IPPROTO_TCP || + fwd_key->nw_proto == IPPROTO_UDP || + fwd_key->nw_proto == IPPROTO_SCTP; uint16_t min_dport, max_dport, curr_dport; uint16_t min_sport, max_sport, curr_sport; + union ct_addr min_addr, max_addr, addr; + uint32_t hash, port_off, basis; + + memset(&min_addr, 0, sizeof min_addr); + memset(&max_addr, 0, sizeof max_addr); + memset(&addr, 0, sizeof addr); + + basis = (nat_info->nat_flags & NAT_PERSISTENT) ? 
0 : ct->hash_basis; + hash = nat_range_hash(fwd_key, basis, nat_info); + + if (nat_info->nat_flags & NAT_RANGE_RANDOM) { + port_off = random_uint32(); + } else if (basis) { + port_off = hash; + } else { + port_off = nat_range_hash(fwd_key, ct->hash_basis, nat_info); + } min_addr = nat_info->min_addr; max_addr = nat_info->max_addr; - find_addr(conn, &min_addr, &max_addr, &addr, hash, - (conn->key.dl_type == htons(ETH_TYPE_IP)), nat_info); + find_addr(fwd_key, &min_addr, &max_addr, &addr, hash, + (fwd_key->dl_type == htons(ETH_TYPE_IP)), nat_info); - set_sport_range(nat_info, &conn->key, hash, &curr_sport, + set_sport_range(nat_info, fwd_key, port_off, &curr_sport, &min_sport, &max_sport); - set_dport_range(nat_info, &conn->key, hash, &curr_dport, + set_dport_range(nat_info, fwd_key, port_off, &curr_dport, &min_dport, &max_dport); if (pat_proto) { - nat_conn->rev_key.src.port = htons(curr_dport); - nat_conn->rev_key.dst.port = htons(curr_sport); + rev_key->src.port = htons(curr_dport); + rev_key->dst.port = htons(curr_sport); } - store_addr_to_key(&addr, &nat_conn->rev_key, - nat_info->nat_action); + store_addr_to_key(&addr, rev_key, nat_info->nat_action); if (!pat_proto) { - if (!conn_lookup(ct, &nat_conn->rev_key, - time_msec(), NULL, NULL)) { - return true; - } - - return false; + return !conn_lookup(ct, rev_key, time_msec(), NULL, NULL); } bool found = false; if (nat_info->nat_action & NAT_ACTION_DST_PORT) { - found = nat_get_unique_l4(ct, nat_conn, &nat_conn->rev_key.src.port, + found = nat_get_unique_l4(ct, rev_key, &rev_key->src.port, curr_dport, min_dport, max_dport); } if (!found) { - found = nat_get_unique_l4(ct, nat_conn, &nat_conn->rev_key.dst.port, + found = nat_get_unique_l4(ct, rev_key, &rev_key->dst.port, curr_sport, min_sport, max_sport); } @@ -2489,9 +2470,9 @@ conn_update(struct conntrack *ct, struct conn *conn, struct dp_packet *pkt, struct conn_lookup_ctx *ctx, long long now) { ovs_mutex_lock(&conn->lock); + uint8_t nw_proto = 
conn->key_node[CT_DIR_FWD].key.nw_proto; enum ct_update_res update_res = - l4_protos[conn->key.nw_proto]->conn_update(ct, conn, pkt, ctx->reply, - now); + l4_protos[nw_proto]->conn_update(ct, conn, pkt, ctx->reply, now); ovs_mutex_unlock(&conn->lock); return update_res; } @@ -2517,12 +2498,9 @@ conn_expiration(const struct conn *conn) } static bool -conn_expired(struct conn *conn, long long now) +conn_expired(const struct conn *conn, long long now) { - if (conn->conn_type == CT_CONN_TYPE_DEFAULT) { - return now >= conn_expiration(conn); - } - return false; + return now >= conn_expiration(conn); } static bool @@ -2548,9 +2526,7 @@ delete_conn__(struct conn *conn) static void delete_conn(struct conn *conn) { - ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT); ovs_mutex_destroy(&conn->lock); - free(conn->nat_conn); delete_conn__(conn); } @@ -2630,7 +2606,9 @@ tuple_to_conn_key(const struct ct_dpif_tuple *tuple, uint16_t zone, key->src.icmp_type = tuple->icmp_type; key->src.icmp_code = tuple->icmp_code; key->dst.icmp_id = tuple->icmp_id; - key->dst.icmp_type = reverse_icmp_type(tuple->icmp_type); + key->dst.icmp_type = (tuple->ip_proto == IPPROTO_ICMP) + ? 
reverse_icmp_type(tuple->icmp_type) + : reverse_icmp6_type(tuple->icmp_type); key->dst.icmp_code = tuple->icmp_code; } else { key->src.port = tuple->src_port; @@ -2643,11 +2621,18 @@ static void conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry, long long now) { + const struct conn_key *rev_key = &conn->key_node[CT_DIR_REV].key; + const struct conn_key *key = &conn->key_node[CT_DIR_FWD].key; + memset(entry, 0, sizeof *entry); - conn_key_to_tuple(&conn->key, &entry->tuple_orig); - conn_key_to_tuple(&conn->rev_key, &entry->tuple_reply); + conn_key_to_tuple(key, &entry->tuple_orig); + conn_key_to_tuple(rev_key, &entry->tuple_reply); - entry->zone = conn->key.zone; + if (conn->alg_related) { + conn_key_to_tuple(&conn->parent_key, &entry->tuple_parent); + } + + entry->zone = key->zone; ovs_mutex_lock(&conn->lock); entry->mark = conn->mark; @@ -2655,7 +2640,7 @@ conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry, long long expiration = conn_expiration(conn) - now; - struct ct_l4_proto *class = l4_protos[conn->key.nw_proto]; + struct ct_l4_proto *class = l4_protos[key->nw_proto]; if (class->conn_get_protoinfo) { class->conn_get_protoinfo(conn, &entry->protoinfo); } @@ -2684,37 +2669,43 @@ conntrack_dump_start(struct conntrack *ct, struct conntrack_dump *dump, if (pzone) { dump->zone = *pzone; dump->filter_zone = true; + dump->current_zone = dump->zone; } dump->ct = ct; *ptot_bkts = 1; /* Need to clean up the callers. 
*/ + dump->cursor = cmap_cursor_start(&dump->ct->conns[dump->current_zone]); return 0; } int conntrack_dump_next(struct conntrack_dump *dump, struct ct_dpif_entry *entry) { - struct conntrack *ct = dump->ct; long long now = time_msec(); - for (;;) { - struct cmap_node *cm_node = cmap_next_position(&ct->conns, - &dump->cm_pos); - if (!cm_node) { - break; - } - struct conn *conn; - INIT_CONTAINER(conn, cm_node, cm_node); + struct conn_key_node *keyn; + struct conn *conn; - if (conn_expired(conn, now)) { - continue; - } + while (true) { + CMAP_CURSOR_FOR_EACH_CONTINUE (keyn, cm_node, &dump->cursor) { + if (keyn->dir != CT_DIR_FWD) { + continue; + } + + conn = CONTAINER_OF(keyn, struct conn, key_node[CT_DIR_FWD]); + if (conn_expired(conn, now)) { + continue; + } - if ((!dump->filter_zone || conn->key.zone == dump->zone) && - (conn->conn_type != CT_CONN_TYPE_UN_NAT)) { conn_to_ct_dpif_entry(conn, entry, now); return 0; } + + if (dump->filter_zone || dump->current_zone == UINT16_MAX) { + break; + } + dump->current_zone++; + dump->cursor = cmap_cursor_start(&dump->ct->conns[dump->current_zone]); } return EOF; @@ -2726,19 +2717,98 @@ conntrack_dump_done(struct conntrack_dump *dump OVS_UNUSED) return 0; } +static void +exp_node_to_ct_dpif_exp(const struct alg_exp_node *exp, + struct ct_dpif_exp *entry) +{ + memset(entry, 0, sizeof *entry); + + conn_key_to_tuple(&exp->key, &entry->tuple_orig); + conn_key_to_tuple(&exp->parent_key, &entry->tuple_parent); + entry->zone = exp->key.zone; + entry->mark = exp->parent_mark; + memcpy(&entry->labels, &exp->parent_label, sizeof entry->labels); + entry->protoinfo.proto = exp->key.nw_proto; +} + int -conntrack_flush(struct conntrack *ct, const uint16_t *zone) +conntrack_exp_dump_start(struct conntrack *ct, struct conntrack_dump *dump, + const uint16_t *pzone) +{ + memset(dump, 0, sizeof(*dump)); + + if (pzone) { + dump->zone = *pzone; + dump->filter_zone = true; + } + + dump->ct = ct; + + return 0; +} + +int 
+conntrack_exp_dump_next(struct conntrack_dump *dump, struct ct_dpif_exp *entry) +{ + struct conntrack *ct = dump->ct; + struct alg_exp_node *enode; + int ret = EOF; + + ovs_rwlock_rdlock(&ct->resources_lock); + + for (;;) { + struct hmap_node *node = hmap_at_position(&ct->alg_expectations, + &dump->hmap_pos); + if (!node) { + break; + } + + enode = CONTAINER_OF(node, struct alg_exp_node, node); + + if (!dump->filter_zone || enode->key.zone == dump->zone) { + ret = 0; + exp_node_to_ct_dpif_exp(enode, entry); + break; + } + } + + ovs_rwlock_unlock(&ct->resources_lock); + + return ret; +} + +int +conntrack_exp_dump_done(struct conntrack_dump *dump OVS_UNUSED) { + return 0; +} + +static int +conntrack_flush_zone(struct conntrack *ct, const uint16_t zone) +{ + struct conn_key_node *keyn; struct conn *conn; - CMAP_FOR_EACH (conn, cm_node, &ct->conns) { - if (conn->conn_type != CT_CONN_TYPE_DEFAULT) { + CMAP_FOR_EACH (keyn, cm_node, &ct->conns[zone]) { + if (keyn->dir != CT_DIR_FWD) { continue; } + conn = CONTAINER_OF(keyn, struct conn, key_node[CT_DIR_FWD]); + conn_clean(ct, conn); + } - if (!zone || *zone == conn->key.zone) { - conn_clean(ct, conn); - } + return 0; +} + +int +conntrack_flush(struct conntrack *ct, const uint16_t *zone) +{ + if (zone) { + return conntrack_flush_zone(ct, *zone); + } + + for (unsigned i = 0; i < ARRAY_SIZE(ct->conns); i++) { + conntrack_flush_zone(ct, i); } return 0; @@ -2748,18 +2818,18 @@ int conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple, uint16_t zone) { - int error = 0; struct conn_key key; struct conn *conn; + int error = 0; memset(&key, 0, sizeof(key)); tuple_to_conn_key(tuple, zone, &key); conn_lookup(ct, &key, time_msec(), &conn, NULL); - if (conn && conn->conn_type == CT_CONN_TYPE_DEFAULT) { + if (conn) { conn_clean(ct, conn); } else { - VLOG_WARN("Must flush tuple using the original pre-NATed tuple"); + VLOG_WARN("Tuple not found"); error = ENOENT; } @@ -2902,50 +2972,54 @@ 
expectation_create(struct conntrack *ct, ovs_be16 dst_port, const struct conn *parent_conn, bool reply, bool src_ip_wc, bool skip_nat) { + const struct conn_key *pconn_key, *pconn_rev_key; union ct_addr src_addr; union ct_addr dst_addr; union ct_addr alg_nat_repl_addr; struct alg_exp_node *alg_exp_node = xzalloc(sizeof *alg_exp_node); + pconn_key = &parent_conn->key_node[CT_DIR_FWD].key; + pconn_rev_key = &parent_conn->key_node[CT_DIR_REV].key; + if (reply) { - src_addr = parent_conn->key.src.addr; - dst_addr = parent_conn->key.dst.addr; + src_addr = pconn_key->src.addr; + dst_addr = pconn_key->dst.addr; alg_exp_node->nat_rpl_dst = true; if (skip_nat) { alg_nat_repl_addr = dst_addr; } else if (parent_conn->nat_action & NAT_ACTION_DST) { - alg_nat_repl_addr = parent_conn->rev_key.src.addr; + alg_nat_repl_addr = pconn_rev_key->src.addr; alg_exp_node->nat_rpl_dst = false; } else { - alg_nat_repl_addr = parent_conn->rev_key.dst.addr; + alg_nat_repl_addr = pconn_rev_key->dst.addr; } } else { - src_addr = parent_conn->rev_key.src.addr; - dst_addr = parent_conn->rev_key.dst.addr; + src_addr = pconn_rev_key->src.addr; + dst_addr = pconn_rev_key->dst.addr; alg_exp_node->nat_rpl_dst = false; if (skip_nat) { alg_nat_repl_addr = src_addr; } else if (parent_conn->nat_action & NAT_ACTION_DST) { - alg_nat_repl_addr = parent_conn->key.dst.addr; + alg_nat_repl_addr = pconn_key->dst.addr; alg_exp_node->nat_rpl_dst = true; } else { - alg_nat_repl_addr = parent_conn->key.src.addr; + alg_nat_repl_addr = pconn_key->src.addr; } } if (src_ip_wc) { memset(&src_addr, 0, sizeof src_addr); } - alg_exp_node->key.dl_type = parent_conn->key.dl_type; - alg_exp_node->key.nw_proto = parent_conn->key.nw_proto; - alg_exp_node->key.zone = parent_conn->key.zone; + alg_exp_node->key.dl_type = pconn_key->dl_type; + alg_exp_node->key.nw_proto = pconn_key->nw_proto; + alg_exp_node->key.zone = pconn_key->zone; alg_exp_node->key.src.addr = src_addr; alg_exp_node->key.dst.addr = dst_addr; 
alg_exp_node->key.src.port = ALG_WC_SRC_PORT; alg_exp_node->key.dst.port = dst_port; alg_exp_node->parent_mark = parent_conn->mark; alg_exp_node->parent_label = parent_conn->label; - memcpy(&alg_exp_node->parent_key, &parent_conn->key, + memcpy(&alg_exp_node->parent_key, pconn_key, sizeof alg_exp_node->parent_key); /* Take the write lock here because it is almost 100% * likely that the lookup will fail and @@ -3197,12 +3271,16 @@ process_ftp_ctl_v4(struct conntrack *ct, switch (mode) { case CT_FTP_MODE_ACTIVE: - *v4_addr_rep = conn_for_expectation->rev_key.dst.addr.ipv4; - conn_ipv4_addr = conn_for_expectation->key.src.addr.ipv4; + *v4_addr_rep = + conn_for_expectation->key_node[CT_DIR_REV].key.dst.addr.ipv4; + conn_ipv4_addr = + conn_for_expectation->key_node[CT_DIR_FWD].key.src.addr.ipv4; break; case CT_FTP_MODE_PASSIVE: - *v4_addr_rep = conn_for_expectation->key.dst.addr.ipv4; - conn_ipv4_addr = conn_for_expectation->rev_key.src.addr.ipv4; + *v4_addr_rep = + conn_for_expectation->key_node[CT_DIR_FWD].key.dst.addr.ipv4; + conn_ipv4_addr = + conn_for_expectation->key_node[CT_DIR_REV].key.src.addr.ipv4; break; case CT_TFTP_MODE: default: @@ -3234,7 +3312,7 @@ skip_ipv6_digits(char *str) static enum ftp_ctl_pkt process_ftp_ctl_v6(struct conntrack *ct, struct dp_packet *pkt, - const struct conn *conn_for_expectation, + const struct conn *conn_for_exp, union ct_addr *v6_addr_rep, char **ftp_data_start, size_t *addr_offset_from_ftp_data_start, size_t *addr_size, enum ct_alg_mode *mode) @@ -3302,24 +3380,25 @@ process_ftp_ctl_v6(struct conntrack *ct, switch (*mode) { case CT_FTP_MODE_ACTIVE: - *v6_addr_rep = conn_for_expectation->rev_key.dst.addr; + *v6_addr_rep = conn_for_exp->key_node[CT_DIR_REV].key.dst.addr; /* Although most servers will block this exploit, there may be some * less well managed. 
*/ if (memcmp(&ip6_addr, &v6_addr_rep->ipv6, sizeof ip6_addr) && - memcmp(&ip6_addr, &conn_for_expectation->key.src.addr.ipv6, + memcmp(&ip6_addr, + &conn_for_exp->key_node[CT_DIR_FWD].key.src.addr.ipv6, sizeof ip6_addr)) { return CT_FTP_CTL_INVALID; } break; case CT_FTP_MODE_PASSIVE: - *v6_addr_rep = conn_for_expectation->key.dst.addr; + *v6_addr_rep = conn_for_exp->key_node[CT_DIR_FWD].key.dst.addr; break; case CT_TFTP_MODE: default: OVS_NOT_REACHED(); } - expectation_create(ct, port, conn_for_expectation, + expectation_create(ct, port, conn_for_exp, !!(pkt->md.ct_state & CS_REPLY_DIR), false, false); return CT_FTP_CTL_INTEREST; } @@ -3427,7 +3506,9 @@ handle_ftp_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx, } if (seq_skew) { ip_len = ntohs(l3_hdr->ip_tot_len) + seq_skew; - if (!dp_packet_hwol_is_ipv4(pkt)) { + if (dp_packet_hwol_tx_ip_csum(pkt)) { + dp_packet_ol_reset_ip_csum_good(pkt); + } else { l3_hdr->ip_csum = recalc_csum16(l3_hdr->ip_csum, l3_hdr->ip_tot_len, htons(ip_len)); @@ -3448,8 +3529,10 @@ handle_ftp_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx, adj_seqnum(&th->tcp_seq, ec->seq_skew); } - th->tcp_csum = 0; - if (!dp_packet_hwol_tx_l4_checksum(pkt)) { + if (dp_packet_hwol_tx_l4_checksum(pkt)) { + dp_packet_ol_reset_l4_csum_good(pkt); + } else { + th->tcp_csum = 0; if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) { th->tcp_csum = packet_csum_upperlayer6(nh6, th, ctx->key.nw_proto, dp_packet_l4_size(pkt)); @@ -3473,7 +3556,8 @@ handle_tftp_ctl(struct conntrack *ct, long long now OVS_UNUSED, enum ftp_ctl_pkt ftp_ctl OVS_UNUSED, bool nat OVS_UNUSED) { - expectation_create(ct, conn_for_expectation->key.src.port, + expectation_create(ct, + conn_for_expectation->key_node[CT_DIR_FWD].key.src.port, conn_for_expectation, !!(pkt->md.ct_state & CS_REPLY_DIR), false, false); } diff --git a/lib/conntrack.h b/lib/conntrack.h index b064abc9fa4..13bb02ea934 100644 --- a/lib/conntrack.h +++ b/lib/conntrack.h @@ -77,12 +77,18 @@ enum 
nat_action_e { NAT_ACTION_DST_PORT = 1 << 3, }; +enum nat_flags_e { + NAT_RANGE_RANDOM = 1 << 0, + NAT_PERSISTENT = 1 << 1, +}; + struct nat_action_info_t { union ct_addr min_addr; union ct_addr max_addr; uint16_t min_port; uint16_t max_port; uint16_t nat_action; + uint16_t nat_flags; }; struct conntrack *conntrack_init(void); @@ -92,7 +98,7 @@ int conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch, ovs_be16 dl_type, bool force, bool commit, uint16_t zone, const uint32_t *setmark, const struct ovs_key_ct_labels *setlabel, - ovs_be16 tp_src, ovs_be16 tp_dst, const char *helper, + const char *helper, const struct nat_action_info_t *nat_action_info, long long now, uint32_t tp_id); void conntrack_clear(struct dp_packet *packet); @@ -100,9 +106,13 @@ void conntrack_clear(struct dp_packet *packet); struct conntrack_dump { struct conntrack *ct; unsigned bucket; - struct cmap_position cm_pos; + union { + struct hmap_position hmap_pos; + struct cmap_cursor cursor; + }; bool filter_zone; uint16_t zone; + uint16_t current_zone; }; struct conntrack_zone_limit { @@ -119,11 +129,14 @@ struct timeout_policy { enum { INVALID_ZONE = -2, - DEFAULT_ZONE = -1, /* Default zone for zone limit management. */ + DEFAULT_ZONE = OVS_ZONE_LIMIT_DEFAULT_ZONE, /* Default zone for zone + * limit management. 
*/ MIN_ZONE = 0, MAX_ZONE = 0xFFFF, }; +BUILD_ASSERT_DECL(DEFAULT_ZONE > INVALID_ZONE && DEFAULT_ZONE < MIN_ZONE); + struct ct_dpif_entry; struct ct_dpif_tuple; @@ -132,6 +145,11 @@ int conntrack_dump_start(struct conntrack *, struct conntrack_dump *, int conntrack_dump_next(struct conntrack_dump *, struct ct_dpif_entry *); int conntrack_dump_done(struct conntrack_dump *); +int conntrack_exp_dump_start(struct conntrack *, struct conntrack_dump *, + const uint16_t *); +int conntrack_exp_dump_next(struct conntrack_dump *, struct ct_dpif_exp *); +int conntrack_exp_dump_done(struct conntrack_dump *); + int conntrack_flush(struct conntrack *, const uint16_t *zone); int conntrack_flush_tuple(struct conntrack *, const struct ct_dpif_tuple *, uint16_t zone); @@ -139,11 +157,13 @@ int conntrack_set_maxconns(struct conntrack *ct, uint32_t maxconns); int conntrack_get_maxconns(struct conntrack *ct, uint32_t *maxconns); int conntrack_get_nconns(struct conntrack *ct, uint32_t *nconns); int conntrack_set_tcp_seq_chk(struct conntrack *ct, bool enabled); +int conntrack_set_sweep_interval(struct conntrack *ct, uint32_t ms); +uint32_t conntrack_get_sweep_interval(struct conntrack *ct); bool conntrack_get_tcp_seq_chk(struct conntrack *ct); struct ipf *conntrack_ipf_ctx(struct conntrack *ct); struct conntrack_zone_limit zone_limit_get(struct conntrack *ct, int32_t zone); int zone_limit_update(struct conntrack *ct, int32_t zone, uint32_t limit); -int zone_limit_delete(struct conntrack *ct, uint16_t zone); +int zone_limit_delete(struct conntrack *ct, int32_t zone); #endif /* conntrack.h */ diff --git a/lib/cooperative-multitasking-private.h b/lib/cooperative-multitasking-private.h new file mode 100644 index 00000000000..cb83823779e --- /dev/null +++ b/lib/cooperative-multitasking-private.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2024 Canonical Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef COOPERATIVE_MULTITASKING_PRIVATE_H +#define COOPERATIVE_MULTITASKING_PRIVATE_H 1 + +#include "openvswitch/hmap.h" + +extern struct hmap cooperative_multitasking_callbacks; + +struct cm_entry { + struct hmap_node node; + void (*cb)(void *); + void *arg; + long long int threshold; + long long int last_run; + const char *name; +}; + +#endif /* COOPERATIVE_MULTITASKING_PRIVATE_H */ diff --git a/lib/cooperative-multitasking.c b/lib/cooperative-multitasking.c new file mode 100644 index 00000000000..3a91af26fe1 --- /dev/null +++ b/lib/cooperative-multitasking.c @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2024 Canonical Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include "backtrace.h" +#include "cooperative-multitasking-private.h" +#include "cooperative-multitasking.h" +#include "hash.h" +#include "openvswitch/hmap.h" +#include "openvswitch/vlog.h" +#include "timeval.h" + +VLOG_DEFINE_THIS_MODULE(cooperative_multitasking); + +struct hmap cooperative_multitasking_callbacks = HMAP_INITIALIZER( + &cooperative_multitasking_callbacks); + +/* Free any data allocated by calls to cooperative_multitasking_set(). */ +void +cooperative_multitasking_destroy(void) +{ + struct cm_entry *cm_entry; + HMAP_FOR_EACH_SAFE (cm_entry, node, &cooperative_multitasking_callbacks) { + hmap_remove(&cooperative_multitasking_callbacks, &cm_entry->node); + free(cm_entry); + } +} + +/* Set/update callback as identified by 'cb' and 'arg'. + * + * 'name' is used for logging events related to this callback. + * + * The value for 'last_run' must be updated each time the callback is run. + * + * Updating the value for 'threshold' may be necessary as a consequence of + * change in runtime configuration or requirements of the part of the program + * serviced by the callback. + * + * Providing a value of 0 for 'last_run' or 'threshold' will leave the stored + * value untouched. */ +void +cooperative_multitasking_set(void (*cb)(void *), void *arg, + long long int last_run, long long int threshold, + const char *name) +{ + struct cm_entry *cm_entry; + + HMAP_FOR_EACH_WITH_HASH (cm_entry, node, hash_pointer((void *) cb, 0), + &cooperative_multitasking_callbacks) { + if (cm_entry->cb == cb && cm_entry->arg == arg) { + if (last_run) { + cm_entry->last_run = last_run; + } + + if (threshold) { + cm_entry->threshold = threshold; + } + return; + } + } + + cm_entry = xzalloc(sizeof *cm_entry); + cm_entry->cb = cb; + cm_entry->arg = arg; + cm_entry->threshold = threshold; + cm_entry->last_run = last_run ? 
last_run : time_msec(); + cm_entry->name = name; + + hmap_insert(&cooperative_multitasking_callbacks, + &cm_entry->node, hash_pointer((void *) cm_entry->cb, 0)); +} + +/* Remove callback identified by 'cb' and 'arg'. */ +void +cooperative_multitasking_remove(void (*cb)(void *), void *arg) +{ + struct cm_entry *cm_entry; + + HMAP_FOR_EACH_WITH_HASH (cm_entry, node, hash_pointer((void *) cb, 0), + &cooperative_multitasking_callbacks) { + if (cm_entry->cb == cb && cm_entry->arg == arg) { + hmap_remove(&cooperative_multitasking_callbacks, &cm_entry->node); + free(cm_entry); + return; + } + } +} + +static void +cooperative_multitasking_yield_at__(const char *source_location) +{ + long long int start = time_msec(); + struct cm_entry *cm_entry; + long long int elapsed; + bool warn; + + HMAP_FOR_EACH (cm_entry, node, &cooperative_multitasking_callbacks) { + elapsed = time_msec() - cm_entry->last_run; + + if (elapsed >= cm_entry->threshold) { + warn = elapsed - cm_entry->threshold > cm_entry->threshold / 8; + + VLOG(warn ? VLL_WARN : VLL_DBG, "%s: yield for %s(%p): " + "elapsed(%lld) >= threshold(%lld), overrun: %lld", + source_location, cm_entry->name, cm_entry->arg, elapsed, + cm_entry->threshold, elapsed - cm_entry->threshold); + + if (warn && VLOG_IS_DBG_ENABLED()) { + log_backtrace(); + } + + (*cm_entry->cb)(cm_entry->arg); + } + } + + elapsed = time_msec() - start; + if (elapsed > 1000) { + VLOG_WARN("Unreasonably long %lldms runtime for callbacks.", elapsed); + } +} + +/* Iterate over registered callbacks and execute callbacks as demanded by the + * recorded time threshold. */ +void +cooperative_multitasking_yield_at(const char *source_location) +{ + static bool yield_in_progress = false; + + if (yield_in_progress) { + VLOG_ERR_ONCE("Nested yield avoided, this is a bug! 
" + "Enable debug logging for more details."); + if (VLOG_IS_DBG_ENABLED()) { + VLOG_DBG("%s: nested yield.", source_location); + log_backtrace(); + } + return; + } + yield_in_progress = true; + + cooperative_multitasking_yield_at__(source_location); + + yield_in_progress = false; +} diff --git a/lib/cooperative-multitasking.h b/lib/cooperative-multitasking.h new file mode 100644 index 00000000000..9185c18810e --- /dev/null +++ b/lib/cooperative-multitasking.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2024 Canonical Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef COOPERATIVE_MULTITASKING_H +#define COOPERATIVE_MULTITASKING_H 1 + +/* + * cooperative-multitasking, interleaved execution for Open vSwitch. + * + * Overview + * ======== + * + * One of the goals of Open vSwitch is to be as resource efficient as + * possible. Core parts of the program has been implemented as asynchronous + * state machines, and when absolutely necessary additional threads are used. + * + * Modules with mostly synchronous and single threaded code that are expected + * to have heavy processing, can make use of the cooperative-multitasking + * interface to yield to modules that have registered callbacks at a time + * threshold. 
+ * + * Typical Usage + * ============= + * + * The module that provides the callback typically has a run() function that is + * already part of the main processing loop and can then register like this: + * + * static void my_run_cb(void *arg); + * + * static void + * my_run(struct data *my_data) + * { + * ... + * + * cooperative_multitasking_set(&my_run_cb, (void *) my_data, + * time_msec(), 1000, "my_run"); + * } + * + * static void + * my_run_cb(void *arg) + * { + * struct data *my_data = (struct data *) arg; + * + * my_run(my_data); + * } + * + * static void + * my_destroy(struct data *my_data) + * { + * ... + * + * cooperative_multitasking_remove(&my_run_cb, (void *) my_data); + * } + * + * The module that is expected to have heavy processing can yield like this: + * + * HMAP_FOR_EACH (row, hmap_node, &src_table->rows) { + * cooperative_multitasking_yield(); + * + * ... + * } + * + * Rules for implementation + * ======================== + * + * - The module that registers itself with a callback must not use the yield + * functionality inside nor should it be possible to do so via calls to other + * modules. + * + * - The module that registers the callback should be self-sufficient, i.e. + * the internal state of that module should not matter to the outside world, + * at least it should not matter for the call stack that enters the + * cooperative_multitasking_yield(). + * + * - cooperative_multitasking_yield() must not be called from places that can + * loop indefinitely, only in places that eventually end, otherwise it may + * give a false impression that the server is working fine while it is stuck + * and not actually doing any useful work. + * + * Thread-safety + * ============= + * + * The cooperative-multitasking module and functions therein are not thread + * safe and must only be used by one thread. 
+ */ + +void cooperative_multitasking_destroy(void); + +void cooperative_multitasking_set(void (*cb)(void *), void *arg, + long long int last_run, + long long int threshold, + const char *name); + +void cooperative_multitasking_remove(void (*cb)(void *), void *arg); + +void cooperative_multitasking_yield_at(const char *source_location); +#define cooperative_multitasking_yield() \ + cooperative_multitasking_yield_at(OVS_SOURCE_LOCATOR) + +#endif /* COOPERATIVE_MULTITASKING_H */ diff --git a/lib/cpu.c b/lib/cpu.c index 0292f715ec4..fbbea400535 100644 --- a/lib/cpu.c +++ b/lib/cpu.c @@ -37,7 +37,9 @@ static bool x86_has_isa(uint32_t leaf, enum x86_reg reg, uint32_t bit) { uint32_t regs[4]; - ovs_assert(__get_cpuid_max(leaf & X86_LEAF_MASK, NULL) >= leaf); + if (__get_cpuid_max(leaf & X86_LEAF_MASK, NULL) < leaf) { + return false; + } __cpuid_count(leaf, 0, regs[EAX], regs[EBX], regs[ECX], regs[EDX]); return (regs[reg] & ((uint32_t) 1 << bit)) != 0; diff --git a/lib/ct-dpif.c b/lib/ct-dpif.c index cfc2315e3dc..5a836b6683f 100644 --- a/lib/ct-dpif.c +++ b/lib/ct-dpif.c @@ -20,8 +20,10 @@ #include #include "ct-dpif.h" +#include "openvswitch/ofp-ct.h" #include "openvswitch/ofp-parse.h" #include "openvswitch/vlog.h" +#include "sset.h" VLOG_DEFINE_THIS_MODULE(ct_dpif); @@ -31,24 +33,19 @@ struct flags { const char *name; }; +/* Protection for CT zone limit per datapath. 
*/ +static struct sset ct_limit_protection = + SSET_INITIALIZER(&ct_limit_protection); + static void ct_dpif_format_counters(struct ds *, const struct ct_dpif_counters *); static void ct_dpif_format_timestamp(struct ds *, const struct ct_dpif_timestamp *); -static void ct_dpif_format_flags(struct ds *, const char *title, - uint32_t flags, const struct flags *); static void ct_dpif_format_protoinfo(struct ds *, const char *title, const struct ct_dpif_protoinfo *, bool verbose); static void ct_dpif_format_helper(struct ds *, const char *title, const struct ct_dpif_helper *); - -static const struct flags ct_dpif_status_flags[] = { -#define CT_DPIF_STATUS_FLAG(FLAG) { CT_DPIF_STATUS_##FLAG, #FLAG }, - CT_DPIF_STATUS_FLAGS -#undef CT_DPIF_STATUS_FLAG - { 0, NULL } /* End marker. */ -}; /* Dumping */ @@ -109,25 +106,259 @@ ct_dpif_dump_done(struct ct_dpif_dump_state *dump) ? dpif->dpif_class->ct_dump_done(dpif, dump) : EOPNOTSUPP); } + +/* Start dumping the expectations from the connection tracker. + * + * 'dump' must be the address of a pointer to a struct ct_dpif_dump_state, + * which should be passed (unaltered) to ct_exp_dpif_dump_{next,done}(). + * + * If 'zone' is not NULL, it should point to an integer identifying a + * conntrack zone to which the dump will be limited. If it is NULL, + * conntrack entries from all zones will be dumped. + * + * If there has been a problem the function returns a non-zero value + * that represents the error. Otherwise it returns zero. */ +int +ct_exp_dpif_dump_start(struct dpif *dpif, struct ct_dpif_dump_state **dump, + const uint16_t *zone) +{ + int err; + + err = (dpif->dpif_class->ct_exp_dump_start + ? dpif->dpif_class->ct_exp_dump_start(dpif, dump, zone) + : EOPNOTSUPP); + + if (!err) { + (*dump)->dpif = dpif; + } + + return err; +} + +/* Dump one expectation and put it in 'entry'. + * + * 'dump' should have been initialized by ct_exp_dpif_dump_start(). + * + * The function returns 0, if an entry has been dumped successfully. 
+ * Otherwise it returns a non-zero value which can be: + * - EOF: meaning that there are no more entries to dump. + * - an error value. + * In both cases, the user should call ct_exp_dpif_dump_done(). */ +int +ct_exp_dpif_dump_next(struct ct_dpif_dump_state *dump, + struct ct_dpif_exp *entry) +{ + struct dpif *dpif = dump->dpif; + + return (dpif->dpif_class->ct_exp_dump_next + ? dpif->dpif_class->ct_exp_dump_next(dpif, dump, entry) + : EOPNOTSUPP); +} + +/* Free resources used by 'dump', if any. */ +int +ct_exp_dpif_dump_done(struct ct_dpif_dump_state *dump) +{ + struct dpif *dpif = dump->dpif; + + return (dpif->dpif_class->ct_exp_dump_done + ? dpif->dpif_class->ct_exp_dump_done(dpif, dump) + : EOPNOTSUPP); +} +/* Flushing. */ + +static void +ct_dpif_tuple_from_ofp_ct_tuple(const struct ofp_ct_tuple *ofp_tuple, + struct ct_dpif_tuple *tuple, + uint16_t l3_type, uint8_t ip_proto) +{ + if (l3_type == AF_INET) { + tuple->src.ip = in6_addr_get_mapped_ipv4(&ofp_tuple->src); + tuple->dst.ip = in6_addr_get_mapped_ipv4(&ofp_tuple->dst); + } else { + tuple->src.in6 = ofp_tuple->src; + tuple->dst.in6 = ofp_tuple->dst; + } + + tuple->l3_type = l3_type; + tuple->ip_proto = ip_proto; + tuple->src_port = ofp_tuple->src_port; + + if (ip_proto == IPPROTO_ICMP || ip_proto == IPPROTO_ICMPV6) { + tuple->icmp_code = ofp_tuple->icmp_code; + tuple->icmp_type = ofp_tuple->icmp_type; + } else { + tuple->dst_port = ofp_tuple->dst_port; + } +} + +static inline bool +ct_dpif_inet_addr_cmp_partial(const union ct_dpif_inet_addr *addr, + const struct in6_addr *partial, uint16_t l3_type) +{ + if (ipv6_is_zero(partial)) { + return true; + } + + if (l3_type == AF_INET && in6_addr_get_mapped_ipv4(partial) != addr->ip) { + return false; + } + + if (l3_type == AF_INET6 && !ipv6_addr_equals(partial, &addr->in6)) { + return false; + } + + return true; +} + +static inline bool +ct_dpif_tuple_ip_cmp_partial(const struct ct_dpif_tuple *tuple, + const struct ofp_ct_tuple *partial, + uint16_t l3_type, 
uint8_t ip_proto) +{ + if (!ct_dpif_inet_addr_cmp_partial(&tuple->src, &partial->src, l3_type)) { + return false; + } + + if (!ct_dpif_inet_addr_cmp_partial(&tuple->dst, &partial->dst, l3_type)) { + return false; + } + + if (ip_proto == IPPROTO_ICMP || ip_proto == IPPROTO_ICMPV6) { + if (partial->icmp_id != tuple->icmp_id) { + return false; + } + + if (partial->icmp_type != tuple->icmp_type) { + return false; + } + + if (partial->icmp_code != tuple->icmp_code) { + return false; + } + } else { + if (partial->src_port && partial->src_port != tuple->src_port) { + return false; + } + + if (partial->dst_port && partial->dst_port != tuple->dst_port) { + return false; + } + } + + return true; +} + +/* Returns 'true' if all non-zero members of 'match' equal to corresponding + * members of 'entry'. */ +static bool +ct_dpif_entry_cmp(const struct ct_dpif_entry *entry, + const struct ofp_ct_match *match) +{ + if (match->l3_type && match->l3_type != entry->tuple_orig.l3_type) { + return false; + } + + if (match->ip_proto && match->ip_proto != entry->tuple_orig.ip_proto) { + return false; + } + + if (!ct_dpif_tuple_ip_cmp_partial(&entry->tuple_orig, &match->tuple_orig, + match->l3_type, match->ip_proto)) { + return false; + } + + if (!ct_dpif_tuple_ip_cmp_partial(&entry->tuple_reply, &match->tuple_reply, + match->l3_type, match->ip_proto)) { + return false; + } + + if ((match->mark & match->mark_mask) != (entry->mark & match->mark_mask)) { + return false; + } + + if (!ovs_u128_equals(ovs_u128_and(match->labels, match->labels_mask), + ovs_u128_and(entry->labels, match->labels_mask))) { + return false; + } + + return true; +} + +static int +ct_dpif_flush_tuple(struct dpif *dpif, const uint16_t *zone, + const struct ofp_ct_match *match) +{ + struct ct_dpif_dump_state *dump; + struct ct_dpif_entry cte; + int error; + int tot_bkts; + + if (!dpif->dpif_class->ct_flush) { + return EOPNOTSUPP; + } + + if (VLOG_IS_DBG_ENABLED()) { + struct ds ds = DS_EMPTY_INITIALIZER; + 
ofp_ct_match_format(&ds, match); + VLOG_DBG("%s: ct_flush: zone=%d %s", dpif_name(dpif), zone ? *zone : 0, + ds_cstr(&ds)); + ds_destroy(&ds); + } + + /* If we have full five tuple in original and empty reply tuple just + * do the flush over original tuple directly. */ + if (ofp_ct_match_is_five_tuple(match)) { + struct ct_dpif_tuple tuple; + + ct_dpif_tuple_from_ofp_ct_tuple(&match->tuple_orig, &tuple, + match->l3_type, match->ip_proto); + return dpif->dpif_class->ct_flush(dpif, zone, &tuple); + } + + error = ct_dpif_dump_start(dpif, &dump, zone, &tot_bkts); + if (error) { + return error; + } + + while (!(error = ct_dpif_dump_next(dump, &cte))) { + if (zone && *zone != cte.zone) { + continue; + } + + if (ct_dpif_entry_cmp(&cte, match)) { + error = dpif->dpif_class->ct_flush(dpif, &cte.zone, + &cte.tuple_orig); + if (error) { + break; + } + } + } + if (error == EOF) { + error = 0; + } + + ct_dpif_dump_done(dump); + return error; +} + /* Flush the entries in the connection tracker used by 'dpif'. The * arguments have the following behavior: * - * - If both 'zone' and 'tuple' are NULL, flush all the conntrack entries. - * - If 'zone' is not NULL, and 'tuple' is NULL, flush all the conntrack + * - If both 'zone' is NULL and 'match' is NULL or zero, flush all the + * conntrack entries. + * - If 'zone' is not NULL, and 'match' is NULL, flush all the conntrack * entries in '*zone'. - * - If 'tuple' is not NULL, flush the conntrack entry specified by 'tuple' - * in '*zone'. If 'zone' is NULL, use the default zone (zone 0). */ + * - If 'match' is not NULL or zero, flush the conntrack entry specified + * by 'match' in '*zone'. If 'zone' is NULL, use the default zone + * (zone 0). 
*/ int ct_dpif_flush(struct dpif *dpif, const uint16_t *zone, - const struct ct_dpif_tuple *tuple) + const struct ofp_ct_match *match) { - if (tuple) { - struct ds ds = DS_EMPTY_INITIALIZER; - ct_dpif_format_tuple(&ds, tuple); - VLOG_DBG("%s: ct_flush: %s in zone %d", dpif_name(dpif), ds_cstr(&ds), - zone ? *zone : 0); - ds_destroy(&ds); + if (match && !ofp_ct_match_is_zero(match)) { + return ct_dpif_flush_tuple(dpif, zone, match); } else if (zone) { VLOG_DBG("%s: ct_flush: zone %"PRIu16, dpif_name(dpif), *zone); } else { @@ -135,7 +366,7 @@ ct_dpif_flush(struct dpif *dpif, const uint16_t *zone, } return (dpif->dpif_class->ct_flush - ? dpif->dpif_class->ct_flush(dpif, zone, tuple) + ? dpif->dpif_class->ct_flush(dpif, zone, NULL) : EOPNOTSUPP); } @@ -180,23 +411,19 @@ ct_dpif_get_tcp_seq_chk(struct dpif *dpif, bool *enabled) } int -ct_dpif_set_limits(struct dpif *dpif, const uint32_t *default_limit, - const struct ovs_list *zone_limits) +ct_dpif_set_limits(struct dpif *dpif, const struct ovs_list *zone_limits) { return (dpif->dpif_class->ct_set_limits - ? dpif->dpif_class->ct_set_limits(dpif, default_limit, - zone_limits) + ? dpif->dpif_class->ct_set_limits(dpif, zone_limits) : EOPNOTSUPP); } int -ct_dpif_get_limits(struct dpif *dpif, uint32_t *default_limit, - const struct ovs_list *zone_limits_in, +ct_dpif_get_limits(struct dpif *dpif, const struct ovs_list *zone_limits_in, struct ovs_list *zone_limits_out) { return (dpif->dpif_class->ct_get_limits - ? dpif->dpif_class->ct_get_limits(dpif, default_limit, - zone_limits_in, + ? dpif->dpif_class->ct_get_limits(dpif, zone_limits_in, zone_limits_out) : EOPNOTSUPP); } @@ -209,6 +436,20 @@ ct_dpif_del_limits(struct dpif *dpif, const struct ovs_list *zone_limits) : EOPNOTSUPP); } +int +ct_dpif_sweep(struct dpif *dpif, uint32_t *ms) +{ + if (*ms) { + return (dpif->dpif_class->ct_set_sweep_interval + ? 
dpif->dpif_class->ct_set_sweep_interval(dpif, *ms) + : EOPNOTSUPP); + } else { + return (dpif->dpif_class->ct_get_sweep_interval + ? dpif->dpif_class->ct_get_sweep_interval(dpif, ms) + : EOPNOTSUPP); + } +} + int ct_dpif_ipf_set_enabled(struct dpif *dpif, bool v6, bool enable) { @@ -275,6 +516,48 @@ ct_dpif_entry_uninit(struct ct_dpif_entry *entry) } } +static const char * +ct_dpif_status_flags(uint32_t flags) +{ + switch (flags) { +#define CT_DPIF_STATUS_FLAG(FLAG) \ + case CT_DPIF_STATUS_##FLAG: \ + return #FLAG; + CT_DPIF_STATUS_FLAGS +#undef CT_DPIF_TCP_FLAG + default: + return NULL; + } +} + +void +ct_dpif_format_exp_entry(const struct ct_dpif_exp *entry, struct ds *ds) +{ + ct_dpif_format_ipproto(ds, entry->tuple_orig.ip_proto); + + ds_put_cstr(ds, ",orig=("); + ct_dpif_format_tuple(ds, &entry->tuple_orig); + ds_put_cstr(ds, ")"); + + if (entry->zone) { + ds_put_format(ds, ",zone=%"PRIu16, entry->zone); + } + if (entry->mark) { + ds_put_format(ds, ",mark=%"PRIu32, entry->mark); + } + if (!ovs_u128_is_zero(entry->labels)) { + ovs_be128 value; + + ds_put_cstr(ds, ",labels="); + value = hton128(entry->labels); + ds_put_hex(ds, &value, sizeof value); + } + + ds_put_cstr(ds, ",parent=("); + ct_dpif_format_tuple(ds, &entry->tuple_parent); + ds_put_cstr(ds, ")"); +} + void ct_dpif_format_entry(const struct ct_dpif_entry *entry, struct ds *ds, bool verbose, bool print_stats) @@ -305,8 +588,9 @@ ct_dpif_format_entry(const struct ct_dpif_entry *entry, struct ds *ds, ds_put_format(ds, ",zone=%"PRIu16, entry->zone); } if (verbose) { - ct_dpif_format_flags(ds, ",status=", entry->status, - ct_dpif_status_flags); + format_flags_masked(ds, ",status", ct_dpif_status_flags, + entry->status, CT_DPIF_STATUS_MASK, + CT_DPIF_STATUS_MASK); } if (print_stats) { ds_put_format(ds, ",timeout=%"PRIu32, entry->timeout); @@ -415,28 +699,6 @@ ct_dpif_format_tuple(struct ds *ds, const struct ct_dpif_tuple *tuple) } } -static void -ct_dpif_format_flags(struct ds *ds, const char *title, 
uint32_t flags, - const struct flags *table) -{ - if (title) { - ds_put_cstr(ds, title); - } - for (; table->name; table++) { - if (flags & table->flag) { - ds_put_format(ds, "%s|", table->name); - } - } - ds_chomp(ds, '|'); -} - -static const struct flags tcp_flags[] = { -#define CT_DPIF_TCP_FLAG(FLAG) { CT_DPIF_TCPF_##FLAG, #FLAG }, - CT_DPIF_TCP_FLAGS -#undef CT_DPIF_TCP_FLAG - { 0, NULL } /* End marker. */ -}; - const char *ct_dpif_tcp_state_string[] = { #define CT_DPIF_TCP_STATE(STATE) [CT_DPIF_TCPS_##STATE] = #STATE, CT_DPIF_TCP_STATES @@ -498,6 +760,20 @@ ct_dpif_format_protoinfo_tcp(struct ds *ds, ct_dpif_format_enum(ds, "state=", tcp_state, ct_dpif_tcp_state_string); } +static const char * +ct_dpif_tcp_flags(uint32_t flags) +{ + switch (flags) { +#define CT_DPIF_TCP_FLAG(FLAG) \ + case CT_DPIF_TCPF_##FLAG: \ + return #FLAG; + CT_DPIF_TCP_FLAGS +#undef CT_DPIF_TCP_FLAG + default: + return NULL; + } +} + static void ct_dpif_format_protoinfo_tcp_verbose(struct ds *ds, const struct ct_dpif_protoinfo *protoinfo) @@ -512,10 +788,14 @@ ct_dpif_format_protoinfo_tcp_verbose(struct ds *ds, protoinfo->tcp.wscale_orig, protoinfo->tcp.wscale_reply); } - ct_dpif_format_flags(ds, ",flags_orig=", protoinfo->tcp.flags_orig, - tcp_flags); - ct_dpif_format_flags(ds, ",flags_reply=", protoinfo->tcp.flags_reply, - tcp_flags); + + format_flags_masked(ds, ",flags_orig", ct_dpif_tcp_flags, + protoinfo->tcp.flags_orig, CT_DPIF_TCPF_MASK, + CT_DPIF_TCPF_MASK); + + format_flags_masked(ds, ",flags_reply", ct_dpif_tcp_flags, + protoinfo->tcp.flags_reply, CT_DPIF_TCPF_MASK, + CT_DPIF_TCPF_MASK); } static void @@ -581,115 +861,9 @@ ct_dpif_format_tcp_stat(struct ds * ds, int tcp_state, int conn_per_state) ds_put_format(ds, "=%u", conn_per_state); } -/* Parses a specification of a conntrack 5-tuple from 's' into 'tuple'. - * Returns true on success. Otherwise, returns false and puts the error - * message in 'ds'. 
*/ -bool -ct_dpif_parse_tuple(struct ct_dpif_tuple *tuple, const char *s, struct ds *ds) -{ - char *pos, *key, *value, *copy; - memset(tuple, 0, sizeof *tuple); - - pos = copy = xstrdup(s); - while (ofputil_parse_key_value(&pos, &key, &value)) { - if (!*value) { - ds_put_format(ds, "field %s missing value", key); - goto error; - } - - if (!strcmp(key, "ct_nw_src") || !strcmp(key, "ct_nw_dst")) { - if (tuple->l3_type && tuple->l3_type != AF_INET) { - ds_put_cstr(ds, "L3 type set multiple times"); - goto error; - } else { - tuple->l3_type = AF_INET; - } - if (!ip_parse(value, key[6] == 's' ? &tuple->src.ip : - &tuple->dst.ip)) { - goto error_with_msg; - } - } else if (!strcmp(key, "ct_ipv6_src") || - !strcmp(key, "ct_ipv6_dst")) { - if (tuple->l3_type && tuple->l3_type != AF_INET6) { - ds_put_cstr(ds, "L3 type set multiple times"); - goto error; - } else { - tuple->l3_type = AF_INET6; - } - if (!ipv6_parse(value, key[8] == 's' ? &tuple->src.in6 : - &tuple->dst.in6)) { - goto error_with_msg; - } - } else if (!strcmp(key, "ct_nw_proto")) { - char *err = str_to_u8(value, key, &tuple->ip_proto); - if (err) { - free(err); - goto error_with_msg; - } - } else if (!strcmp(key, "ct_tp_src") || !strcmp(key,"ct_tp_dst")) { - uint16_t port; - char *err = str_to_u16(value, key, &port); - if (err) { - free(err); - goto error_with_msg; - } - if (key[6] == 's') { - tuple->src_port = htons(port); - } else { - tuple->dst_port = htons(port); - } - } else if (!strcmp(key, "icmp_type") || !strcmp(key, "icmp_code") || - !strcmp(key, "icmp_id") ) { - if (tuple->ip_proto != IPPROTO_ICMP && - tuple->ip_proto != IPPROTO_ICMPV6) { - ds_put_cstr(ds, "invalid L4 fields"); - goto error; - } - uint16_t icmp_id; - char *err; - if (key[5] == 't') { - err = str_to_u8(value, key, &tuple->icmp_type); - } else if (key[5] == 'c') { - err = str_to_u8(value, key, &tuple->icmp_code); - } else { - err = str_to_u16(value, key, &icmp_id); - tuple->icmp_id = htons(icmp_id); - } - if (err) { - free(err); - goto 
error_with_msg; - } - } else { - ds_put_format(ds, "invalid conntrack tuple field: %s", key); - goto error; - } - } - - if (ipv6_is_zero(&tuple->src.in6) || ipv6_is_zero(&tuple->dst.in6) || - !tuple->ip_proto) { - /* icmp_type, icmp_code, and icmp_id can be 0. */ - if (tuple->ip_proto != IPPROTO_ICMP && - tuple->ip_proto != IPPROTO_ICMPV6) { - if (!tuple->src_port || !tuple->dst_port) { - ds_put_cstr(ds, "at least one of the conntrack 5-tuple fields " - "is missing."); - goto error; - } - } - } - - free(copy); - return true; - -error_with_msg: - ds_put_format(ds, "failed to parse field %s", key); -error: - free(copy); - return false; -} void -ct_dpif_push_zone_limit(struct ovs_list *zone_limits, uint16_t zone, +ct_dpif_push_zone_limit(struct ovs_list *zone_limits, int32_t zone, uint32_t limit, uint32_t count) { struct ct_dpif_zone_limit *zone_limit = xmalloc(sizeof *zone_limit); @@ -763,15 +937,21 @@ ct_dpif_parse_zone_limit_tuple(const char *s, uint16_t *pzone, } void -ct_dpif_format_zone_limits(uint32_t default_limit, - const struct ovs_list *zone_limits, struct ds *ds) +ct_dpif_format_zone_limits(const struct ovs_list *zone_limits, struct ds *ds) { struct ct_dpif_zone_limit *zone_limit; - ds_put_format(ds, "default limit=%"PRIu32, default_limit); + LIST_FOR_EACH (zone_limit, node, zone_limits) { + if (zone_limit->zone == OVS_ZONE_LIMIT_DEFAULT_ZONE) { + ds_put_format(ds, "default limit=%"PRIu32, zone_limit->limit); + } + } LIST_FOR_EACH (zone_limit, node, zone_limits) { - ds_put_format(ds, "\nzone=%"PRIu16, zone_limit->zone); + if (zone_limit->zone == OVS_ZONE_LIMIT_DEFAULT_ZONE) { + continue; + } + ds_put_format(ds, "\nzone=%"PRIu16, (uint16_t) zone_limit->zone); ds_put_format(ds, ",limit=%"PRIu32, zone_limit->limit); ds_put_format(ds, ",count=%"PRIu32, zone_limit->count); } @@ -897,3 +1077,23 @@ ct_dpif_get_features(struct dpif *dpif, enum ct_features *features) ? 
dpif->dpif_class->ct_get_features(dpif, features) : EOPNOTSUPP); } + +void +ct_dpif_set_zone_limit_protection(struct dpif *dpif, bool protected) +{ + if (sset_contains(&ct_limit_protection, dpif->full_name) == protected) { + return; + } + + if (protected) { + sset_add(&ct_limit_protection, dpif->full_name); + } else { + sset_find_and_delete(&ct_limit_protection, dpif->full_name); + } +} + +bool +ct_dpif_is_zone_limit_protected(struct dpif *dpif) +{ + return sset_contains(&ct_limit_protection, dpif->full_name); +} diff --git a/lib/ct-dpif.h b/lib/ct-dpif.h index b59cba962a7..c3786d5ae54 100644 --- a/lib/ct-dpif.h +++ b/lib/ct-dpif.h @@ -20,6 +20,8 @@ #include "openvswitch/types.h" #include "packets.h" +struct ofp_ct_match; + union ct_dpif_inet_addr { ovs_be32 ip; ovs_be32 ip6[4]; @@ -103,6 +105,8 @@ enum ct_dpif_tcp_flags { #undef CT_DPIF_TCP_FLAG }; +#define CT_DPIF_TCPF_MASK ((CT_DPIF_TCPF_MAXACK_SET << 1) - 1) + extern const char *ct_dpif_sctp_state_string[]; #define CT_DPIF_SCTP_STATES \ @@ -173,6 +177,18 @@ enum ct_dpif_status_flags { #undef CT_DPIF_STATUS_FLAG }; +#define CT_DPIF_STATUS_MASK ((CT_DPIF_STATUS_UNTRACKED << 1) - 1) + +struct ct_dpif_exp { + struct ct_dpif_tuple tuple_orig; + struct ct_dpif_tuple tuple_parent; + uint16_t zone; + struct ct_dpif_protoinfo protoinfo; + ovs_u128 labels; + uint32_t status; + uint32_t mark; +}; + struct ct_dpif_entry { /* Const members. */ struct ct_dpif_tuple tuple_orig; @@ -221,7 +237,7 @@ struct ct_dpif_dump_state { }; struct ct_dpif_zone_limit { - uint16_t zone; + int32_t zone; uint32_t limit; /* Limit on number of entries. */ uint32_t count; /* Current number of entries. 
*/ struct ovs_list node; @@ -280,18 +296,22 @@ int ct_dpif_dump_start(struct dpif *, struct ct_dpif_dump_state **, const uint16_t *zone, int *); int ct_dpif_dump_next(struct ct_dpif_dump_state *, struct ct_dpif_entry *); int ct_dpif_dump_done(struct ct_dpif_dump_state *); +int ct_exp_dpif_dump_start(struct dpif *, struct ct_dpif_dump_state **, + const uint16_t *zone); +int ct_exp_dpif_dump_next(struct ct_dpif_dump_state *, struct ct_dpif_exp *); +int ct_exp_dpif_dump_done(struct ct_dpif_dump_state *); int ct_dpif_flush(struct dpif *, const uint16_t *zone, - const struct ct_dpif_tuple *); + const struct ofp_ct_match *); int ct_dpif_set_maxconns(struct dpif *dpif, uint32_t maxconns); int ct_dpif_get_maxconns(struct dpif *dpif, uint32_t *maxconns); int ct_dpif_get_nconns(struct dpif *dpif, uint32_t *nconns); int ct_dpif_set_tcp_seq_chk(struct dpif *dpif, bool enabled); int ct_dpif_get_tcp_seq_chk(struct dpif *dpif, bool *enabled); -int ct_dpif_set_limits(struct dpif *dpif, const uint32_t *default_limit, - const struct ovs_list *); -int ct_dpif_get_limits(struct dpif *dpif, uint32_t *default_limit, - const struct ovs_list *, struct ovs_list *); +int ct_dpif_set_limits(struct dpif *dpif, const struct ovs_list *); +int ct_dpif_get_limits(struct dpif *dpif, const struct ovs_list *, + struct ovs_list *); int ct_dpif_del_limits(struct dpif *dpif, const struct ovs_list *); +int ct_dpif_sweep(struct dpif *, uint32_t *ms); int ct_dpif_ipf_set_enabled(struct dpif *, bool v6, bool enable); int ct_dpif_ipf_set_min_frag(struct dpif *, bool v6, uint32_t min_frag); int ct_dpif_ipf_set_max_nfrags(struct dpif *, uint32_t max_frags); @@ -303,18 +323,17 @@ int ct_dpif_ipf_dump_done(struct dpif *dpif, void *); void ct_dpif_entry_uninit(struct ct_dpif_entry *); void ct_dpif_format_entry(const struct ct_dpif_entry *, struct ds *, bool verbose, bool print_stats); +void ct_dpif_format_exp_entry(const struct ct_dpif_exp *, struct ds *); void ct_dpif_format_ipproto(struct ds *ds, uint16_t 
ipproto); void ct_dpif_format_tuple(struct ds *, const struct ct_dpif_tuple *); uint8_t ct_dpif_coalesce_tcp_state(uint8_t state); void ct_dpif_format_tcp_stat(struct ds *, int, int); -bool ct_dpif_parse_tuple(struct ct_dpif_tuple *, const char *s, struct ds *); -void ct_dpif_push_zone_limit(struct ovs_list *, uint16_t zone, uint32_t limit, +void ct_dpif_push_zone_limit(struct ovs_list *, int32_t zone, uint32_t limit, uint32_t count); void ct_dpif_free_zone_limits(struct ovs_list *); bool ct_dpif_parse_zone_limit_tuple(const char *s, uint16_t *pzone, uint32_t *plimit, struct ds *); -void ct_dpif_format_zone_limits(uint32_t default_limit, - const struct ovs_list *, struct ds *); +void ct_dpif_format_zone_limits(const struct ovs_list *, struct ds *); bool ct_dpif_set_timeout_policy_attr_by_name(struct ct_dpif_timeout_policy *tp, const char *key, uint32_t value); bool ct_dpif_timeout_policy_support_ipproto(uint8_t ipproto); @@ -331,5 +350,7 @@ int ct_dpif_get_timeout_policy_name(struct dpif *dpif, uint32_t tp_id, uint16_t dl_type, uint8_t nw_proto, char **tp_name, bool *is_generic); int ct_dpif_get_features(struct dpif *dpif, enum ct_features *features); +void ct_dpif_set_zone_limit_protection(struct dpif *, bool protected); +bool ct_dpif_is_zone_limit_protected(struct dpif *); #endif /* CT_DPIF_H */ diff --git a/lib/daemon-unix.c b/lib/daemon-unix.c index 52f3d4bc635..4fdc6e3c496 100644 --- a/lib/daemon-unix.c +++ b/lib/daemon-unix.c @@ -88,7 +88,8 @@ static bool switch_user = false; static uid_t uid; static gid_t gid; static char *user = NULL; -static void daemon_become_new_user__(bool access_datapath); +static void daemon_become_new_user__(bool access_datapath, + bool access_hardware_ports); static void check_already_running(void); static int lock_pidfile(FILE *, int command); @@ -396,6 +397,8 @@ monitor_daemon(pid_t daemon_pid) } log_received_backtrace(daemonize_fd); + close(daemonize_fd); + daemonize_fd = -1; /* Throttle restarts to no more than once every 10 
seconds. */ if (time(NULL) < last_restart + 10) { @@ -441,13 +444,13 @@ monitor_daemon(pid_t daemon_pid) * daemonize_complete()) or that it failed to start up (by exiting with a * nonzero exit code). */ void -daemonize_start(bool access_datapath) +daemonize_start(bool access_datapath, bool access_hardware_ports) { assert_single_threaded(); daemonize_fd = -1; if (switch_user) { - daemon_become_new_user__(access_datapath); + daemon_become_new_user__(access_datapath, access_hardware_ports); switch_user = false; } @@ -805,7 +808,8 @@ daemon_become_new_user_unix(void) /* Linux specific implementation of daemon_become_new_user() * using libcap-ng. */ static void -daemon_become_new_user_linux(bool access_datapath OVS_UNUSED) +daemon_become_new_user_linux(bool access_datapath OVS_UNUSED, + bool access_hardware_ports OVS_UNUSED) { #if defined __linux__ && HAVE_LIBCAPNG int ret; @@ -825,6 +829,20 @@ daemon_become_new_user_linux(bool access_datapath OVS_UNUSED) ret = capng_update(CAPNG_ADD, cap_sets, CAP_NET_ADMIN) || capng_update(CAPNG_ADD, cap_sets, CAP_NET_RAW) || capng_update(CAPNG_ADD, cap_sets, CAP_NET_BROADCAST); +#ifdef DPDK_NETDEV + if (access_hardware_ports && !ret) { + ret = capng_update(CAPNG_ADD, cap_sets, CAP_SYS_RAWIO); + if (!ret) { + VLOG_INFO("The Linux capability CAP_SYS_RAWIO " + "is enabled."); + } + } +#else + if (access_hardware_ports) { + VLOG_WARN("No driver requires Linux capability " + "CAP_SYS_RAWIO, disabling it."); + } +#endif } } else { ret = -1; @@ -852,7 +870,7 @@ daemon_become_new_user_linux(bool access_datapath OVS_UNUSED) } static void -daemon_become_new_user__(bool access_datapath) +daemon_become_new_user__(bool access_datapath, bool access_hardware_ports) { /* If vlog file has been created, change its owner to the non-root user * as specifed by the --user option. 
*/ @@ -860,7 +878,8 @@ daemon_become_new_user__(bool access_datapath) if (LINUX) { if (LIBCAPNG) { - daemon_become_new_user_linux(access_datapath); + daemon_become_new_user_linux(access_datapath, + access_hardware_ports); } else { VLOG_FATAL("%s: fail to downgrade user using libcap-ng. " "(libcap-ng is not configured at compile time), " @@ -875,11 +894,11 @@ daemon_become_new_user__(bool access_datapath) * However, there in case the user switch needs to be done * before daemonize_start(), the following API can be used. */ void -daemon_become_new_user(bool access_datapath) +daemon_become_new_user(bool access_datapath, bool access_hardware_ports) { assert_single_threaded(); if (switch_user) { - daemon_become_new_user__(access_datapath); + daemon_become_new_user__(access_datapath, access_hardware_ports); /* daemonize_start() should not switch user again. */ switch_user = false; } diff --git a/lib/daemon-windows.c b/lib/daemon-windows.c index 7e5f264f5b9..4e6bbe0f040 100644 --- a/lib/daemon-windows.c +++ b/lib/daemon-windows.c @@ -498,7 +498,8 @@ make_pidfile(void) } void -daemonize_start(bool access_datapath OVS_UNUSED) +daemonize_start(bool access_datapath OVS_UNUSED, + bool access_hardware_ports OVS_UNUSED) { if (pidfile) { make_pidfile(); @@ -526,7 +527,8 @@ daemonize_complete(void) } void -daemon_become_new_user(bool access_datapath OVS_UNUSED) +daemon_become_new_user(bool access_datapath OVS_UNUSED, + bool access_hardware_ports OVS_UNUSED) { } diff --git a/lib/daemon.c b/lib/daemon.c index 3249c5ab4b5..1e1c019eb1b 100644 --- a/lib/daemon.c +++ b/lib/daemon.c @@ -48,7 +48,7 @@ get_detach(void) void daemonize(void) { - daemonize_start(false); + daemonize_start(false, false); daemonize_complete(); } diff --git a/lib/daemon.h b/lib/daemon.h index 09415749636..42372d14630 100644 --- a/lib/daemon.h +++ b/lib/daemon.h @@ -167,10 +167,10 @@ void set_detach(void); bool get_detach(void); void daemon_save_fd(int fd); void daemonize(void); -void daemonize_start(bool 
access_datapath); +void daemonize_start(bool access_datapath, bool access_hardware_ports); void daemonize_complete(void); void daemon_set_new_user(const char * user_spec); -void daemon_become_new_user(bool access_datapath); +void daemon_become_new_user(bool access_datapath, bool access_hardware_ports); void daemon_usage(void); void daemon_disable_self_confinement(void); bool daemon_should_self_confine(void); diff --git a/lib/db-ctl-base.c b/lib/db-ctl-base.c index bc85e992173..b3e9b92d197 100644 --- a/lib/db-ctl-base.c +++ b/lib/db-ctl-base.c @@ -75,7 +75,7 @@ static struct shash all_commands = SHASH_INITIALIZER(&all_commands); static char *get_table(const char *, const struct ovsdb_idl_table_class **); static char *set_column(const struct ovsdb_idl_table_class *, const struct ovsdb_idl_row *, const char *, - struct ovsdb_symbol_table *); + struct ovsdb_symbol_table *, bool use_partial_update); static struct option * @@ -820,6 +820,7 @@ check_condition(const struct ovsdb_idl_table_class *table, type.value.type = OVSDB_TYPE_VOID; error = ovsdb_datum_from_string(&b, &type, value_string, symtab); if (error) { + ovsdb_atom_destroy(&want_key, column->type.key.type); goto out; } @@ -1325,11 +1326,17 @@ cmd_find(struct ctl_context *ctx) } /* Sets the column of 'row' in 'table'. Returns NULL on success or a - * malloc()'ed error message on failure. */ + * malloc()'ed error message on failure. + * + * If 'use_partial_update' is true, then this function will try to use + * partial set/map updates, if possible. As a side effect, result will + * not be reflected in the IDL until the transaction is committed. + * The last access to a particular column is a good candidate to use + * this option. 
*/ static char * OVS_WARN_UNUSED_RESULT set_column(const struct ovsdb_idl_table_class *table, const struct ovsdb_idl_row *row, const char *arg, - struct ovsdb_symbol_table *symtab) + struct ovsdb_symbol_table *symtab, bool use_partial_update) { const struct ovsdb_idl_column *column; char *key_string = NULL; @@ -1352,7 +1359,7 @@ set_column(const struct ovsdb_idl_table_class *table, if (key_string) { union ovsdb_atom key, value; - struct ovsdb_datum datum; + struct ovsdb_datum *datum; if (column->type.value.type == OVSDB_TYPE_VOID) { error = xasprintf("cannot specify key to set for non-map column " @@ -1368,19 +1375,26 @@ set_column(const struct ovsdb_idl_table_class *table, error = ovsdb_atom_from_string(&value, NULL, &column->type.value, value_string, symtab); if (error) { + ovsdb_atom_destroy(&key, column->type.key.type); goto out; } - ovsdb_datum_init_empty(&datum); - ovsdb_datum_add_unsafe(&datum, &key, &value, &column->type, NULL); + datum = xmalloc(sizeof *datum); + ovsdb_datum_init_empty(datum); + ovsdb_datum_add_unsafe(datum, &key, &value, &column->type, NULL); ovsdb_atom_destroy(&key, column->type.key.type); ovsdb_atom_destroy(&value, column->type.value.type); - ovsdb_datum_union(&datum, ovsdb_idl_read(row, column), - &column->type); - ovsdb_idl_txn_verify(row, column); - ovsdb_idl_txn_write(row, column, &datum); + if (use_partial_update) { + ovsdb_idl_txn_write_partial_map(row, column, datum); + } else { + ovsdb_datum_union(datum, ovsdb_idl_read(row, column), + &column->type); + ovsdb_idl_txn_verify(row, column); + ovsdb_idl_txn_write(row, column, datum); + free(datum); + } } else { struct ovsdb_datum datum; @@ -1441,7 +1455,8 @@ cmd_set(struct ctl_context *ctx) } for (i = 3; i < ctx->argc; i++) { - ctx->error = set_column(table, row, ctx->argv[i], ctx->symtab); + ctx->error = set_column(table, row, ctx->argv[i], ctx->symtab, + ctx->last_command); if (ctx->error) { return; } @@ -1731,31 +1746,45 @@ cmd_create(struct ctl_context *ctx) const struct 
ovsdb_idl_table_class *table; const struct ovsdb_idl_row *row; const struct uuid *uuid = NULL; + bool persist_uuid = false; + struct uuid uuid_; int i; ctx->error = get_table(table_name, &table); if (ctx->error) { return; } + if (id) { - struct ovsdb_symbol *symbol = NULL; + if (uuid_from_string(&uuid_, id)) { + uuid = &uuid_; + persist_uuid = true; + } else { + struct ovsdb_symbol *symbol = NULL; - ctx->error = create_symbol(ctx->symtab, id, &symbol, NULL); - if (ctx->error) { - return; - } - if (table->is_root) { - /* This table is in the root set, meaning that rows created in it - * won't disappear even if they are unreferenced, so disable - * warnings about that by pretending that there is a reference. */ - symbol->strong_ref = true; + ctx->error = create_symbol(ctx->symtab, id, &symbol, NULL); + if (ctx->error) { + return; + } + if (table->is_root) { + /* This table is in the root set, meaning that rows created in + * it won't disappear even if they are unreferenced, so disable + * warnings about that by pretending that there is a + * reference. */ + symbol->strong_ref = true; + } + uuid = &symbol->uuid; } - uuid = &symbol->uuid; } - row = ovsdb_idl_txn_insert(ctx->txn, table, uuid); + if (persist_uuid) { + row = ovsdb_idl_txn_insert_persist_uuid(ctx->txn, table, uuid); + } else { + row = ovsdb_idl_txn_insert(ctx->txn, table, uuid); + } + for (i = 2; i < ctx->argc; i++) { - ctx->error = set_column(table, row, ctx->argv[i], ctx->symtab); + ctx->error = set_column(table, row, ctx->argv[i], ctx->symtab, false); if (ctx->error) { return; } @@ -2606,7 +2635,8 @@ ctl_list_db_tables_usage(void) /* Initializes 'ctx' from 'command'. 
*/ void ctl_context_init_command(struct ctl_context *ctx, - struct ctl_command *command) + struct ctl_command *command, + bool last) { ctx->argc = command->argc; ctx->argv = command->argv; @@ -2615,6 +2645,7 @@ ctl_context_init_command(struct ctl_context *ctx, ds_swap(&ctx->output, &command->output); ctx->table = command->table; ctx->try_again = false; + ctx->last_command = last; ctx->error = NULL; } @@ -2625,8 +2656,9 @@ ctl_context_init(struct ctl_context *ctx, struct ctl_command *command, struct ovsdb_symbol_table *symtab, void (*invalidate_cache_cb)(struct ctl_context *)) { + ds_init(&ctx->output); if (command) { - ctl_context_init_command(ctx, command); + ctl_context_init_command(ctx, command, false); } ctx->idl = idl; ctx->txn = txn; @@ -2657,6 +2689,7 @@ ctl_context_done(struct ctl_context *ctx, ctl_context_done_command(ctx, command); } invalidate_cache(ctx); + ds_destroy(&ctx->output); } char * OVS_WARN_UNUSED_RESULT @@ -2670,7 +2703,7 @@ ctl_set_column(const char *table_name, const struct ovsdb_idl_row *row, if (error) { return error; } - error = set_column(table, row, arg, symtab); + error = set_column(table, row, arg, symtab, false); if (error) { return error; } diff --git a/lib/db-ctl-base.h b/lib/db-ctl-base.h index 284b573d0bc..ea7e97b7844 100644 --- a/lib/db-ctl-base.h +++ b/lib/db-ctl-base.h @@ -239,9 +239,15 @@ struct ctl_context { /* A command may set this member to true if some prerequisite is not met * and the caller should wait for something to change and then retry. */ bool try_again; + + /* If set during the context initialization, command implementation + * may use optimizations that will leave database changes invisible + * to IDL, e.g. use partial set updates. 
*/ + bool last_command; }; -void ctl_context_init_command(struct ctl_context *, struct ctl_command *); +void ctl_context_init_command(struct ctl_context *, struct ctl_command *, + bool last); void ctl_context_init(struct ctl_context *, struct ctl_command *, struct ovsdb_idl *, struct ovsdb_idl_txn *, struct ovsdb_symbol_table *, diff --git a/lib/db-ctl-base.man b/lib/db-ctl-base.man index a529d8b4d3f..c8111c9efbe 100644 --- a/lib/db-ctl-base.man +++ b/lib/db-ctl-base.man @@ -203,7 +203,7 @@ Without \fB\-\-if-exists\fR, it is an error if \fIrecord\fR does not exist. With \fB\-\-if-exists\fR, this command does nothing if \fIrecord\fR does not exist. . -.IP "[\fB\-\-id=@\fIname\fR] \fBcreate\fR \fItable column\fR[\fB:\fIkey\fR]\fB=\fIvalue\fR..." +.IP "[\fB\-\-id=(@\fIname\fR | \fIuuid\fR)] \fBcreate\fR \fItable column\fR[\fB:\fIkey\fR]\fB=\fIvalue\fR..." Creates a new record in \fItable\fR and sets the initial values of each \fIcolumn\fR. Columns not explicitly set will receive their default values. Outputs the UUID of the new row. @@ -212,6 +212,9 @@ If \fB@\fIname\fR is specified, then the UUID for the new row may be referred to by that name elsewhere in the same \fB\*(PN\fR invocation in contexts where a UUID is expected. Such references may precede or follow the \fBcreate\fR command. +.IP +If a valid \fIuuid\fR is specified, then it is used as the UUID +of the new row. . .RS .IP "Caution (ovs-vsctl as example)" diff --git a/lib/db-ctl-base.xml b/lib/db-ctl-base.xml index f6efe98eaf0..27c999fe71f 100644 --- a/lib/db-ctl-base.xml +++ b/lib/db-ctl-base.xml @@ -310,7 +310,7 @@

-
[--id=@name] create table column[:key]=value...
+
[--id=(@name|uuid)] create table column[:key]=value...

Creates a new record in table and sets the initial values of @@ -323,6 +323,10 @@ invocation in contexts where a UUID is expected. Such references may precede or follow the create command.

+

+ If a valid uuid is specified, then it is used as the + UUID of the new row. +

Caution (ovs-vsctl as example)
diff --git a/lib/dhparams.c b/lib/dhparams.c index 85123863fc5..50209d5d813 100644 --- a/lib/dhparams.c +++ b/lib/dhparams.c @@ -6,6 +6,7 @@ #include "lib/dhparams.h" #include "openvswitch/util.h" +#if OPENSSL_VERSION_NUMBER < 0x3000000fL static int my_DH_set0_pqg(DH *dh, BIGNUM *p, const BIGNUM **q OVS_UNUSED, BIGNUM *g) { @@ -142,3 +143,4 @@ DH *get_dh4096(void) } return dh; } +#endif diff --git a/lib/dp-packet-gso.c b/lib/dp-packet-gso.c new file mode 100644 index 00000000000..847685ad989 --- /dev/null +++ b/lib/dp-packet-gso.c @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2023 Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include "dp-packet.h" +#include "dp-packet-gso.h" +#include "netdev-provider.h" +#include "openvswitch/vlog.h" + +VLOG_DEFINE_THIS_MODULE(dp_packet_gso); + +/* Returns a new packet that is a segment of packet 'p'. + * + * The new packet is initialized with 'hdr_len' bytes from the + * start of packet 'p' and then appended with 'data_len' bytes + * from the 'data' buffer. + * + * Note: The packet headers are not updated. */ +static struct dp_packet * +dp_packet_gso_seg_new(const struct dp_packet *p, size_t hdr_len, + const char *data, size_t data_len) +{ + struct dp_packet *seg = dp_packet_new_with_headroom(hdr_len + data_len, + dp_packet_headroom(p)); + + /* Append the original packet headers and then the payload.
*/ + dp_packet_put(seg, dp_packet_data(p), hdr_len); + dp_packet_put(seg, data, data_len); + + /* The new segment should have the same offsets. */ + seg->l2_5_ofs = p->l2_5_ofs; + seg->l3_ofs = p->l3_ofs; + seg->l4_ofs = p->l4_ofs; + + /* The protocol headers remain the same, so preserve hash and mark. */ + *dp_packet_rss_ptr(seg) = *dp_packet_rss_ptr(p); + *dp_packet_flow_mark_ptr(seg) = *dp_packet_flow_mark_ptr(p); + + /* The segment should inherit all the offloading flags from the + * original packet, except for the TCP segmentation, external + * buffer and indirect buffer flags. */ + *dp_packet_ol_flags_ptr(seg) = *dp_packet_ol_flags_ptr(p) + & DP_PACKET_OL_SUPPORTED_MASK; + + dp_packet_hwol_reset_tcp_seg(seg); + + return seg; +} + +/* Returns the calculated number of TCP segments in packet 'p'. */ +int +dp_packet_gso_nr_segs(struct dp_packet *p) +{ + uint16_t segsz = dp_packet_get_tso_segsz(p); + const char *data_tail; + const char *data_pos; + + data_pos = dp_packet_get_tcp_payload(p); + data_tail = (char *) dp_packet_tail(p) - dp_packet_l2_pad_size(p); + + return DIV_ROUND_UP(data_tail - data_pos, segsz); +} + +/* Perform software segmentation on packet 'p'. + * + * Segments packet 'p' into the array of preallocated batches in 'batches', + * updating the 'batches' pointer as needed and returns true. + * + * Returns false if the packet cannot be segmented. 
*/ +bool +dp_packet_gso(struct dp_packet *p, struct dp_packet_batch **batches) +{ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + struct dp_packet_batch *curr_batch = *batches; + struct tcp_header *tcp_hdr; + struct ip_header *ip_hdr; + struct dp_packet *seg; + uint16_t tcp_offset; + uint16_t tso_segsz; + uint32_t tcp_seq; + uint16_t ip_id; + int hdr_len; + int seg_len; + + tso_segsz = dp_packet_get_tso_segsz(p); + if (!tso_segsz) { + VLOG_WARN_RL(&rl, "GSO packet with len %d with no segment size.", + dp_packet_size(p)); + return false; + } + + tcp_hdr = dp_packet_l4(p); + tcp_offset = TCP_OFFSET(tcp_hdr->tcp_ctl); + tcp_seq = ntohl(get_16aligned_be32(&tcp_hdr->tcp_seq)); + hdr_len = ((char *) dp_packet_l4(p) - (char *) dp_packet_eth(p)) + + tcp_offset * 4; + ip_id = 0; + if (dp_packet_hwol_is_ipv4(p)) { + ip_hdr = dp_packet_l3(p); + ip_id = ntohs(ip_hdr->ip_id); + } + + const char *data_tail = (char *) dp_packet_tail(p) + - dp_packet_l2_pad_size(p); + const char *data_pos = dp_packet_get_tcp_payload(p); + int n_segs = dp_packet_gso_nr_segs(p); + + for (int i = 0; i < n_segs; i++) { + seg_len = data_tail - data_pos; + if (seg_len > tso_segsz) { + seg_len = tso_segsz; + } + + seg = dp_packet_gso_seg_new(p, hdr_len, data_pos, seg_len); + data_pos += seg_len; + + /* Update L3 header. */ + if (dp_packet_hwol_is_ipv4(seg)) { + ip_hdr = dp_packet_l3(seg); + ip_hdr->ip_tot_len = htons(sizeof *ip_hdr + + dp_packet_l4_size(seg)); + ip_hdr->ip_id = htons(ip_id); + ip_hdr->ip_csum = 0; + ip_id++; + } else { + struct ovs_16aligned_ip6_hdr *ip6_hdr = dp_packet_l3(seg); + + ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen + = htons(dp_packet_l3_size(seg) - sizeof *ip6_hdr); + } + + /* Update L4 header. */ + tcp_hdr = dp_packet_l4(seg); + put_16aligned_be32(&tcp_hdr->tcp_seq, htonl(tcp_seq)); + tcp_seq += seg_len; + if (OVS_LIKELY(i < (n_segs - 1))) { + /* Reset flags PUSH and FIN unless it is the last segment. 
*/ + uint16_t tcp_flags = TCP_FLAGS(tcp_hdr->tcp_ctl) + & ~(TCP_PSH | TCP_FIN); + tcp_hdr->tcp_ctl = TCP_CTL(tcp_flags, tcp_offset); + } + + if (dp_packet_batch_is_full(curr_batch)) { + curr_batch++; + } + + dp_packet_batch_add(curr_batch, seg); + } + + *batches = curr_batch; + return true; +} diff --git a/lib/dp-packet-gso.h b/lib/dp-packet-gso.h new file mode 100644 index 00000000000..9c282fb86cc --- /dev/null +++ b/lib/dp-packet-gso.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2023 Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef DP_PACKET_GSO_H +#define DP_PACKET_GSO_H 1 + +bool dp_packet_gso(struct dp_packet *, struct dp_packet_batch **); +int dp_packet_gso_nr_segs(struct dp_packet *); + +#endif /* dp-packet-gso.h */ diff --git a/lib/dp-packet.c b/lib/dp-packet.c index 4538d2a6148..df7bf8e6b3a 100644 --- a/lib/dp-packet.c +++ b/lib/dp-packet.c @@ -21,6 +21,7 @@ #include "dp-packet.h" #include "netdev-afxdp.h" #include "netdev-dpdk.h" +#include "netdev-provider.h" #include "openvswitch/dynamic-string.h" #include "util.h" @@ -33,10 +34,14 @@ dp_packet_init__(struct dp_packet *b, size_t allocated, enum dp_packet_source so pkt_metadata_init(&b->md, 0); dp_packet_reset_cutlen(b); dp_packet_reset_offload(b); + dp_packet_set_tso_segsz(b, 0); /* Initialize implementation-specific fields of dp_packet. */ dp_packet_init_specific(b); /* By default assume the packet type to be Ethernet. 
*/ b->packet_type = htonl(PT_ETH); + /* Reset csum start and offset. */ + b->csum_start = 0; + b->csum_offset = 0; } static void @@ -134,11 +139,7 @@ dp_packet_uninit(struct dp_packet *b) if (b->source == DPBUF_MALLOC) { free(dp_packet_base(b)); } else if (b->source == DPBUF_DPDK) { -#ifdef DPDK_NETDEV - /* If this dp_packet was allocated by DPDK it must have been - * created as a dp_packet */ - free_dpdk_buf((struct dp_packet*) b); -#endif + free_dpdk_buf(b); } else if (b->source == DPBUF_AFXDP) { free_afxdp_buf(b); } @@ -150,7 +151,11 @@ dp_packet_uninit(struct dp_packet *b) struct dp_packet * dp_packet_new(size_t size) { +#ifdef DPDK_NETDEV + struct dp_packet *b = xmalloc_cacheline(sizeof *b); +#else struct dp_packet *b = xmalloc(sizeof *b); +#endif dp_packet_init(b, size); return b; } @@ -171,6 +176,7 @@ dp_packet_new_with_headroom(size_t size, size_t headroom) struct dp_packet * dp_packet_clone(const struct dp_packet *buffer) { + ovs_assert(buffer); return dp_packet_clone_with_headroom(buffer, 0); } @@ -180,12 +186,15 @@ dp_packet_clone(const struct dp_packet *buffer) struct dp_packet * dp_packet_clone_with_headroom(const struct dp_packet *buffer, size_t headroom) { + const void *data_dp = dp_packet_data(buffer); struct dp_packet *new_buffer; uint32_t mark; - new_buffer = dp_packet_clone_data_with_headroom(dp_packet_data(buffer), - dp_packet_size(buffer), - headroom); + ovs_assert(data_dp); + + new_buffer = dp_packet_clone_data_with_headroom(data_dp, + dp_packet_size(buffer), + headroom); /* Copy the following fields into the returned buffer: l2_pad_size, * l2_5_ofs, l3_ofs, l4_ofs, cutlen, packet_type and md. 
*/ memcpy(&new_buffer->l2_pad_size, &buffer->l2_pad_size, @@ -195,6 +204,8 @@ dp_packet_clone_with_headroom(const struct dp_packet *buffer, size_t headroom) *dp_packet_ol_flags_ptr(new_buffer) = *dp_packet_ol_flags_ptr(buffer); *dp_packet_ol_flags_ptr(new_buffer) &= DP_PACKET_OL_SUPPORTED_MASK; + dp_packet_set_tso_segsz(new_buffer, dp_packet_get_tso_segsz(buffer)); + if (dp_packet_rss_valid(buffer)) { dp_packet_set_rss_hash(new_buffer, dp_packet_get_rss_hash(buffer)); } @@ -322,8 +333,12 @@ dp_packet_shift(struct dp_packet *b, int delta) : true); if (delta != 0) { - char *dst = (char *) dp_packet_data(b) + delta; - memmove(dst, dp_packet_data(b), dp_packet_size(b)); + const void *data_dp = dp_packet_data(b); + char *dst = (char *) data_dp + delta; + + ovs_assert(data_dp); + + memmove(dst, data_dp, dp_packet_size(b)); dp_packet_set_data(b, dst); } } @@ -348,7 +363,7 @@ void * dp_packet_put_zeros(struct dp_packet *b, size_t size) { void *dst = dp_packet_put_uninit(b, size); - memset(dst, 0, size); + nullable_memset(dst, 0, size); return dst; } @@ -359,7 +374,7 @@ void * dp_packet_put(struct dp_packet *b, const void *p, size_t size) { void *dst = dp_packet_put_uninit(b, size); - memcpy(dst, p, size); + nullable_memcpy(dst, p, size); return dst; } @@ -431,7 +446,7 @@ void * dp_packet_push_zeros(struct dp_packet *b, size_t size) { void *dst = dp_packet_push_uninit(b, size); - memset(dst, 0, size); + nullable_memset(dst, 0, size); return dst; } @@ -442,7 +457,7 @@ void * dp_packet_push(struct dp_packet *b, const void *p, size_t size) { void *dst = dp_packet_push_uninit(b, size); - memcpy(dst, p, size); + nullable_memcpy(dst, p, size); return dst; } @@ -492,6 +507,8 @@ dp_packet_resize_l2_5(struct dp_packet *b, int increment) /* Adjust layer offsets after l2_5. 
*/ dp_packet_adjust_layer_offset(&b->l3_ofs, increment); dp_packet_adjust_layer_offset(&b->l4_ofs, increment); + dp_packet_adjust_layer_offset(&b->inner_l3_ofs, increment); + dp_packet_adjust_layer_offset(&b->inner_l4_ofs, increment); return dp_packet_data(b); } @@ -514,19 +531,121 @@ dp_packet_compare_offsets(struct dp_packet *b1, struct dp_packet *b2, if ((b1->l2_pad_size != b2->l2_pad_size) || (b1->l2_5_ofs != b2->l2_5_ofs) || (b1->l3_ofs != b2->l3_ofs) || - (b1->l4_ofs != b2->l4_ofs)) { + (b1->l4_ofs != b2->l4_ofs) || + (b1->inner_l3_ofs != b2->inner_l3_ofs) || + (b1->inner_l4_ofs != b2->inner_l4_ofs)) { if (err_str) { ds_put_format(err_str, "Packet offset comparison failed\n"); ds_put_format(err_str, "Buffer 1 offsets: l2_pad_size %u," - " l2_5_ofs : %u l3_ofs %u, l4_ofs %u\n", + " l2_5_ofs : %u l3_ofs %u, l4_ofs %u," + " inner_l3_ofs %u, inner_l4_ofs %u\n", b1->l2_pad_size, b1->l2_5_ofs, - b1->l3_ofs, b1->l4_ofs); + b1->l3_ofs, b1->l4_ofs, + b1->inner_l3_ofs, b1->inner_l4_ofs); ds_put_format(err_str, "Buffer 2 offsets: l2_pad_size %u," - " l2_5_ofs : %u l3_ofs %u, l4_ofs %u\n", + " l2_5_ofs : %u l3_ofs %u, l4_ofs %u," + " inner_l3_ofs %u, inner_l4_ofs %u\n", b2->l2_pad_size, b2->l2_5_ofs, - b2->l3_ofs, b2->l4_ofs); + b2->l3_ofs, b2->l4_ofs, + b2->inner_l3_ofs, b2->inner_l4_ofs); } return false; } return true; } + +void +dp_packet_tnl_outer_ol_send_prepare(struct dp_packet *p, + uint64_t flags) +{ + if (dp_packet_hwol_is_outer_ipv4_cksum(p)) { + if (!(flags & NETDEV_TX_OFFLOAD_OUTER_IP_CKSUM)) { + dp_packet_ip_set_header_csum(p, false); + dp_packet_ol_set_ip_csum_good(p); + dp_packet_hwol_reset_outer_ipv4_csum(p); + } + } + + if (!dp_packet_hwol_is_outer_udp_cksum(p)) { + return; + } + + if (!(flags & NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM)) { + packet_udp_complete_csum(p, false); + dp_packet_ol_set_l4_csum_good(p); + dp_packet_hwol_reset_outer_udp_csum(p); + } +} + +/* Checks if the packet 'p' is compatible with netdev_ol_flags 'flags' + * and if not, updates the 
packet with the software fall back. */ +void +dp_packet_ol_send_prepare(struct dp_packet *p, uint64_t flags) +{ + bool tnl_inner = false; + + if (!dp_packet_hwol_tx_is_any_csum(p)) { + /* Only checksumming needs actions. */ + return; + } + + if (dp_packet_hwol_is_tunnel_geneve(p) || + dp_packet_hwol_is_tunnel_vxlan(p)) { + tnl_inner = true; + + /* If the TX interface doesn't support UDP tunnel offload but does + * support inner checksum offload and an outer UDP checksum is + * required, then we can't offload inner checksum either. As that would + * invalidate the outer checksum. */ + if (!(flags & NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM) && + dp_packet_hwol_is_outer_udp_cksum(p)) { + flags &= ~(NETDEV_TX_OFFLOAD_TCP_CKSUM | + NETDEV_TX_OFFLOAD_UDP_CKSUM | + NETDEV_TX_OFFLOAD_SCTP_CKSUM | + NETDEV_TX_OFFLOAD_IPV4_CKSUM); + } + } + + if (dp_packet_hwol_tx_ip_csum(p)) { + if (dp_packet_ip_checksum_good(p)) { + dp_packet_hwol_reset_tx_ip_csum(p); + } else if (!(flags & NETDEV_TX_OFFLOAD_IPV4_CKSUM)) { + dp_packet_ip_set_header_csum(p, tnl_inner); + dp_packet_ol_set_ip_csum_good(p); + dp_packet_hwol_reset_tx_ip_csum(p); + } + } + + if (!dp_packet_hwol_tx_l4_checksum(p)) { + if (tnl_inner) { + dp_packet_tnl_outer_ol_send_prepare(p, flags); + } + return; + } + + if (dp_packet_l4_checksum_good(p) && !tnl_inner) { + dp_packet_hwol_reset_tx_l4_csum(p); + return; + } + + if (dp_packet_hwol_l4_is_tcp(p) + && !(flags & NETDEV_TX_OFFLOAD_TCP_CKSUM)) { + packet_tcp_complete_csum(p, tnl_inner); + dp_packet_ol_set_l4_csum_good(p); + dp_packet_hwol_reset_tx_l4_csum(p); + } else if (dp_packet_hwol_l4_is_udp(p) + && !(flags & NETDEV_TX_OFFLOAD_UDP_CKSUM)) { + packet_udp_complete_csum(p, tnl_inner); + dp_packet_ol_set_l4_csum_good(p); + dp_packet_hwol_reset_tx_l4_csum(p); + } else if (!(flags & NETDEV_TX_OFFLOAD_SCTP_CKSUM) + && dp_packet_hwol_l4_is_sctp(p)) { + packet_sctp_complete_csum(p, tnl_inner); + dp_packet_ol_set_l4_csum_good(p); + dp_packet_hwol_reset_tx_l4_csum(p); + } + + if 
(tnl_inner) { + dp_packet_tnl_outer_ol_send_prepare(p, flags); + } +} diff --git a/lib/dp-packet.h b/lib/dp-packet.h index 55eeaab2ce8..a75b1c5cdbb 100644 --- a/lib/dp-packet.h +++ b/lib/dp-packet.h @@ -25,6 +25,7 @@ #include #endif +#include "csum.h" #include "netdev-afxdp.h" #include "netdev-dpdk.h" #include "openvswitch/list.h" @@ -83,25 +84,57 @@ enum dp_packet_offload_mask { DEF_OL_FLAG(DP_PACKET_OL_TX_UDP_CKSUM, RTE_MBUF_F_TX_UDP_CKSUM, 0x400), /* Offload SCTP checksum. */ DEF_OL_FLAG(DP_PACKET_OL_TX_SCTP_CKSUM, RTE_MBUF_F_TX_SCTP_CKSUM, 0x800), + /* Offload IP checksum. */ + DEF_OL_FLAG(DP_PACKET_OL_TX_IP_CKSUM, RTE_MBUF_F_TX_IP_CKSUM, 0x1000), + /* Offload packet is tunnel GENEVE. */ + DEF_OL_FLAG(DP_PACKET_OL_TX_TUNNEL_GENEVE, + RTE_MBUF_F_TX_TUNNEL_GENEVE, 0x2000), + /* Offload packet is tunnel VXLAN. */ + DEF_OL_FLAG(DP_PACKET_OL_TX_TUNNEL_VXLAN, + RTE_MBUF_F_TX_TUNNEL_VXLAN, 0x4000), + /* Offload tunnel packet, outer header is IPv4. */ + DEF_OL_FLAG(DP_PACKET_OL_TX_OUTER_IPV4, + RTE_MBUF_F_TX_OUTER_IPV4, 0x8000), + /* Offload tunnel outer IPv4 checksum. */ + DEF_OL_FLAG(DP_PACKET_OL_TX_OUTER_IP_CKSUM, + RTE_MBUF_F_TX_OUTER_IP_CKSUM, 0x10000), + /* Offload tunnel outer UDP checksum. */ + DEF_OL_FLAG(DP_PACKET_OL_TX_OUTER_UDP_CKSUM, + RTE_MBUF_F_TX_OUTER_UDP_CKSUM, 0x20000), + /* Offload tunnel packet, outer header is IPv6. */ + DEF_OL_FLAG(DP_PACKET_OL_TX_OUTER_IPV6, + RTE_MBUF_F_TX_OUTER_IPV6, 0x40000), + /* Adding new field requires adding to DP_PACKET_OL_SUPPORTED_MASK. 
*/ }; -#define DP_PACKET_OL_SUPPORTED_MASK (DP_PACKET_OL_RSS_HASH | \ - DP_PACKET_OL_FLOW_MARK | \ - DP_PACKET_OL_RX_L4_CKSUM_BAD | \ - DP_PACKET_OL_RX_IP_CKSUM_BAD | \ - DP_PACKET_OL_RX_L4_CKSUM_GOOD | \ - DP_PACKET_OL_RX_IP_CKSUM_GOOD | \ - DP_PACKET_OL_TX_TCP_SEG | \ - DP_PACKET_OL_TX_IPV4 | \ - DP_PACKET_OL_TX_IPV6 | \ - DP_PACKET_OL_TX_TCP_CKSUM | \ - DP_PACKET_OL_TX_UDP_CKSUM | \ - DP_PACKET_OL_TX_SCTP_CKSUM) +#define DP_PACKET_OL_SUPPORTED_MASK (DP_PACKET_OL_RSS_HASH | \ + DP_PACKET_OL_FLOW_MARK | \ + DP_PACKET_OL_RX_L4_CKSUM_BAD | \ + DP_PACKET_OL_RX_IP_CKSUM_BAD | \ + DP_PACKET_OL_RX_L4_CKSUM_GOOD | \ + DP_PACKET_OL_RX_IP_CKSUM_GOOD | \ + DP_PACKET_OL_TX_TCP_SEG | \ + DP_PACKET_OL_TX_IPV4 | \ + DP_PACKET_OL_TX_IPV6 | \ + DP_PACKET_OL_TX_TCP_CKSUM | \ + DP_PACKET_OL_TX_UDP_CKSUM | \ + DP_PACKET_OL_TX_SCTP_CKSUM | \ + DP_PACKET_OL_TX_IP_CKSUM | \ + DP_PACKET_OL_TX_TUNNEL_GENEVE | \ + DP_PACKET_OL_TX_TUNNEL_VXLAN | \ + DP_PACKET_OL_TX_OUTER_IPV4 | \ + DP_PACKET_OL_TX_OUTER_IP_CKSUM | \ + DP_PACKET_OL_TX_OUTER_UDP_CKSUM | \ + DP_PACKET_OL_TX_OUTER_IPV6) #define DP_PACKET_OL_TX_L4_MASK (DP_PACKET_OL_TX_TCP_CKSUM | \ DP_PACKET_OL_TX_UDP_CKSUM | \ DP_PACKET_OL_TX_SCTP_CKSUM) +#define DP_PACKET_OL_TX_ANY_CKSUM (DP_PACKET_OL_TX_L4_MASK | \ + DP_PACKET_OL_TX_IP_CKSUM | \ + DP_PACKET_OL_TX_OUTER_IP_CKSUM | \ + DP_PACKET_OL_TX_OUTER_UDP_CKSUM) #define DP_PACKET_OL_RX_IP_CKSUM_MASK (DP_PACKET_OL_RX_IP_CKSUM_GOOD | \ DP_PACKET_OL_RX_IP_CKSUM_BAD) #define DP_PACKET_OL_RX_L4_CKSUM_MASK (DP_PACKET_OL_RX_L4_CKSUM_GOOD | \ @@ -122,6 +155,7 @@ struct dp_packet { uint32_t ol_flags; /* Offloading flags. */ uint32_t rss_hash; /* Packet hash. */ uint32_t flow_mark; /* Packet flow mark. */ + uint16_t tso_segsz; /* TCP segment size. */ #endif enum dp_packet_source source; /* Source of memory allocated as 'base'. */ @@ -134,8 +168,14 @@ struct dp_packet { * or UINT16_MAX. */ uint16_t l4_ofs; /* Transport-level header offset, or UINT16_MAX. 
*/ + uint16_t inner_l3_ofs; /* Inner Network-level header offset, + * or UINT16_MAX. */ + uint16_t inner_l4_ofs; /* Inner Transport-level header offset, + or UINT16_MAX. */ uint32_t cutlen; /* length in bytes to cut from the end. */ ovs_be32 packet_type; /* Packet type as defined in OpenFlow */ + uint16_t csum_start; /* Position to start checksumming from. */ + uint16_t csum_offset; /* Offset to place checksum. */ union { struct pkt_metadata md; uint64_t data[DP_PACKET_CONTEXT_SIZE / 8]; @@ -160,10 +200,14 @@ static inline void dp_packet_set_size(struct dp_packet *, uint32_t); static inline uint16_t dp_packet_get_allocated(const struct dp_packet *); static inline void dp_packet_set_allocated(struct dp_packet *, uint16_t); +static inline uint16_t dp_packet_get_tso_segsz(const struct dp_packet *); +static inline void dp_packet_set_tso_segsz(struct dp_packet *, uint16_t); + void *dp_packet_resize_l2(struct dp_packet *, int increment); void *dp_packet_resize_l2_5(struct dp_packet *, int increment); static inline void *dp_packet_eth(const struct dp_packet *); static inline void dp_packet_reset_offsets(struct dp_packet *); +static inline void dp_packet_reset_offload(struct dp_packet *); static inline uint16_t dp_packet_l2_pad_size(const struct dp_packet *); static inline void dp_packet_set_l2_pad_size(struct dp_packet *, uint16_t); static inline void *dp_packet_l2_5(const struct dp_packet *); @@ -239,6 +283,8 @@ static inline bool dp_packet_equal(const struct dp_packet *, bool dp_packet_compare_offsets(struct dp_packet *good, struct dp_packet *test, struct ds *err_str); +void dp_packet_ol_send_prepare(struct dp_packet *, uint64_t); +void dp_packet_tnl_outer_ol_send_prepare(struct dp_packet *, uint64_t); /* Frees memory that 'b' points to, as well as 'b' itself. 
*/ @@ -247,9 +293,7 @@ dp_packet_delete(struct dp_packet *b) { if (b) { if (b->source == DPBUF_DPDK) { - /* If this dp_packet was allocated by DPDK it must have been - * created as a dp_packet */ - free_dpdk_buf((struct dp_packet*) b); + free_dpdk_buf(b); return; } @@ -259,7 +303,11 @@ dp_packet_delete(struct dp_packet *b) } dp_packet_uninit(b); +#ifdef DPDK_NETDEV + free_cacheline(b); +#else free(b); +#endif } } @@ -333,6 +381,8 @@ dp_packet_clear(struct dp_packet *b) { dp_packet_set_data(b, dp_packet_base(b)); dp_packet_set_size(b, 0); + dp_packet_reset_offsets(b); + dp_packet_reset_offload(b); } /* Removes 'size' bytes from the head end of 'b', which must contain at least @@ -388,6 +438,8 @@ dp_packet_reset_offsets(struct dp_packet *b) b->l2_5_ofs = UINT16_MAX; b->l3_ofs = UINT16_MAX; b->l4_ofs = UINT16_MAX; + b->inner_l3_ofs = UINT16_MAX; + b->inner_l4_ofs = UINT16_MAX; } static inline uint16_t @@ -469,6 +521,32 @@ dp_packet_l4_size(const struct dp_packet *b) : 0; } +static inline void * +dp_packet_inner_l3(const struct dp_packet *b) +{ + return b->inner_l3_ofs != UINT16_MAX + ? (char *) dp_packet_data(b) + b->inner_l3_ofs + : NULL; +} + +static inline void * +dp_packet_inner_l4(const struct dp_packet *b) +{ + return b->inner_l4_ofs != UINT16_MAX + ? (char *) dp_packet_data(b) + b->inner_l4_ofs + : NULL; +} + +static inline size_t +dp_packet_inner_l4_size(const struct dp_packet *b) +{ + return OVS_LIKELY(b->inner_l4_ofs != UINT16_MAX) + ? (const char *) dp_packet_tail(b) + - (const char *) dp_packet_inner_l4(b) + - dp_packet_l2_pad_size(b) + : 0; +} + static inline const void * dp_packet_get_tcp_payload(const struct dp_packet *b) { @@ -606,6 +684,8 @@ dp_packet_set_size(struct dp_packet *b, uint32_t v) * (and thus 'v') will always be <= UINT16_MAX; this means that there is no * loss of accuracy in assigning 'v' to 'data_len'. */ + + ovs_assert(v <= UINT16_MAX); b->mbuf.data_len = (uint16_t)v; /* Current seg length. 
*/ b->mbuf.pkt_len = v; /* Total length of all segments linked to * this segment. */ @@ -635,6 +715,17 @@ dp_packet_set_allocated(struct dp_packet *b, uint16_t s) b->mbuf.buf_len = s; } +static inline uint16_t +dp_packet_get_tso_segsz(const struct dp_packet *p) +{ + return p->mbuf.tso_segsz; +} + +static inline void +dp_packet_set_tso_segsz(struct dp_packet *p, uint16_t s) +{ + p->mbuf.tso_segsz = s; +} #else /* DPDK_NETDEV */ static inline void @@ -691,6 +782,17 @@ dp_packet_set_allocated(struct dp_packet *b, uint16_t s) b->allocated_ = s; } +static inline uint16_t +dp_packet_get_tso_segsz(const struct dp_packet *p) +{ + return p->tso_segsz; +} + +static inline void +dp_packet_set_tso_segsz(struct dp_packet *p, uint16_t s) +{ + p->tso_segsz = s; +} #endif /* DPDK_NETDEV */ static inline void @@ -744,14 +846,6 @@ dp_packet_set_data(struct dp_packet *b, void *data) } } -static inline void -dp_packet_reset_packet(struct dp_packet *b, int off) -{ - dp_packet_set_size(b, dp_packet_size(b) - off); - dp_packet_set_data(b, ((unsigned char *) dp_packet_data(b) + off)); - dp_packet_reset_offsets(b); -} - enum { NETDEV_MAX_BURST = 32 }; /* Maximum number packets in a batch. */ struct dp_packet_batch { @@ -990,6 +1084,13 @@ dp_packet_hwol_is_ipv4(const struct dp_packet *b) return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_IPV4); } +/* Returns 'true' if packet 'p' is marked as IPv6. */ +static inline bool +dp_packet_hwol_tx_ipv6(const struct dp_packet *p) +{ + return !!(*dp_packet_ol_flags_ptr(p) & DP_PACKET_OL_TX_IPV6); +} + /* Returns 'true' if packet 'b' is marked for TCP checksum offloading. */ static inline bool dp_packet_hwol_l4_is_tcp(const struct dp_packet *b) @@ -1014,18 +1115,105 @@ dp_packet_hwol_l4_is_sctp(struct dp_packet *b) DP_PACKET_OL_TX_SCTP_CKSUM; } -/* Mark packet 'b' for IPv4 checksum offloading. */ +/* Returns 'true' if packet 'b' is marked as having an outer IPv6 header. 
*/ +static inline bool +dp_packet_hwol_is_outer_ipv6(const struct dp_packet *b) +{ + return *dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_OUTER_IPV6; +} + +/* Returns 'true' if packet 'b' is marked as having an outer IPv4 header. */ +static inline bool +dp_packet_hwol_is_outer_ipv4(const struct dp_packet *b) +{ + return *dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_OUTER_IPV4; +} + +/* Returns 'true' if packet 'b' is marked for tunnel GENEVE + * checksum offloading. */ +static inline bool +dp_packet_hwol_is_tunnel_geneve(struct dp_packet *b) +{ + return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_TUNNEL_GENEVE); +} + +/* Returns 'true' if packet 'b' is marked for tunnel VXLAN + * checksum offloading. */ +static inline bool +dp_packet_hwol_is_tunnel_vxlan(struct dp_packet *b) +{ + return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_TUNNEL_VXLAN); +} + +/* Returns 'true' if packet 'b' is marked for outer IPv4 checksum offload. */ +static inline bool +dp_packet_hwol_is_outer_ipv4_cksum(const struct dp_packet *b) +{ + return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_OUTER_IP_CKSUM); +} + +/* Returns 'true' if packet 'b' is marked for outer UDP checksum offload. */ +static inline bool +dp_packet_hwol_is_outer_udp_cksum(struct dp_packet *b) +{ + return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_OUTER_UDP_CKSUM); +} + +/* Returns 'true' if packet 'b' is marked for any checksum offload. */ +static inline bool +dp_packet_hwol_tx_is_any_csum(struct dp_packet *b) +{ + return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_ANY_CKSUM); +} + +static inline void +dp_packet_hwol_reset_tx_l4_csum(struct dp_packet *p) +{ + *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_TX_L4_MASK; +} + +/* Mark packet 'p' as IPv4. */ +static inline void +dp_packet_hwol_set_tx_ipv4(struct dp_packet *p) +{ + *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_TX_IPV6; + *dp_packet_ol_flags_ptr(p) |= DP_PACKET_OL_TX_IPV4; +} + +/* Mark packet 'a' as IPv6. 
*/ +static inline void +dp_packet_hwol_set_tx_ipv6(struct dp_packet *a) +{ + *dp_packet_ol_flags_ptr(a) &= ~DP_PACKET_OL_TX_IPV4; + *dp_packet_ol_flags_ptr(a) |= DP_PACKET_OL_TX_IPV6; +} + +/* Mark packet 'a' as a tunnel packet with outer IPv6 header. */ +static inline void +dp_packet_hwol_set_tx_outer_ipv6(struct dp_packet *a) +{ + *dp_packet_ol_flags_ptr(a) &= ~DP_PACKET_OL_TX_OUTER_IPV4; + *dp_packet_ol_flags_ptr(a) |= DP_PACKET_OL_TX_OUTER_IPV6; +} + +/* Returns 'true' if packet 'p' is marked for IPv4 checksum offloading. */ +static inline bool +dp_packet_hwol_tx_ip_csum(const struct dp_packet *p) +{ + return !!(*dp_packet_ol_flags_ptr(p) & DP_PACKET_OL_TX_IP_CKSUM); +} + +/* Marks packet 'p' for IPv4 checksum offloading. */ static inline void -dp_packet_hwol_set_tx_ipv4(struct dp_packet *b) +dp_packet_hwol_set_tx_ip_csum(struct dp_packet *p) { - *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_IPV4; + *dp_packet_ol_flags_ptr(p) |= DP_PACKET_OL_TX_IP_CKSUM; } -/* Mark packet 'b' for IPv6 checksum offloading. */ static inline void -dp_packet_hwol_set_tx_ipv6(struct dp_packet *b) +dp_packet_hwol_reset_tx_ip_csum(struct dp_packet *p) { - *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_IPV6; + *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_TX_IP_CKSUM; } /* Mark packet 'b' for TCP checksum offloading. It implies that either @@ -1061,13 +1249,105 @@ dp_packet_hwol_set_tcp_seg(struct dp_packet *b) *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG; } +/* Mark packet 'b' for tunnel GENEVE offloading. */ +static inline void +dp_packet_hwol_set_tunnel_geneve(struct dp_packet *b) +{ + *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TUNNEL_GENEVE; +} + +/* Mark packet 'b' for tunnel VXLAN offloading. */ +static inline void +dp_packet_hwol_set_tunnel_vxlan(struct dp_packet *b) +{ + *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TUNNEL_VXLAN; +} + +/* Clears tunnel offloading marks. 
*/ +static inline void +dp_packet_hwol_reset_tunnel(struct dp_packet *b) +{ + *dp_packet_ol_flags_ptr(b) &= ~(DP_PACKET_OL_TX_TUNNEL_VXLAN | + DP_PACKET_OL_TX_TUNNEL_GENEVE); +} + +/* Mark packet 'b' as a tunnel packet with outer IPv4 header. */ +static inline void +dp_packet_hwol_set_tx_outer_ipv4(struct dp_packet *b) +{ + *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_OUTER_IPV4; +} + +/* Mark packet 'b' for csum offloading in outer IPv4 header. */ +static inline void +dp_packet_hwol_set_tx_outer_ipv4_csum(struct dp_packet *b) +{ + *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_OUTER_IP_CKSUM; +} + +static inline void +dp_packet_hwol_reset_outer_ipv4_csum(struct dp_packet *p) +{ + *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_TX_OUTER_IP_CKSUM; +} + +static inline void +dp_packet_hwol_reset_outer_udp_csum(struct dp_packet *p) +{ + *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_TX_OUTER_UDP_CKSUM; +} + +/* Mark packet 'b' for csum offloading in outer UDP header. */ +static inline void +dp_packet_hwol_set_outer_udp_csum(struct dp_packet *b) +{ + *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_OUTER_UDP_CKSUM; +} + +/* Resets TCP Segmentation in packet 'p' and adjust flags to indicate + * L3 and L4 checksumming is now required. */ +static inline void +dp_packet_hwol_reset_tcp_seg(struct dp_packet *p) +{ + uint64_t ol_flags = *dp_packet_ol_flags_ptr(p) + | DP_PACKET_OL_TX_TCP_CKSUM; + + ol_flags = ol_flags & ~(DP_PACKET_OL_TX_TCP_SEG + | DP_PACKET_OL_RX_L4_CKSUM_GOOD + | DP_PACKET_OL_RX_IP_CKSUM_GOOD); + + if (ol_flags & DP_PACKET_OL_TX_IPV4) { + ol_flags |= DP_PACKET_OL_TX_IP_CKSUM; + } + + *dp_packet_ol_flags_ptr(p) = ol_flags; +} + +/* Returns 'true' if the IP header has good integrity and the + * checksum in it is complete. 
*/ static inline bool -dp_packet_ip_checksum_valid(const struct dp_packet *p) +dp_packet_ip_checksum_good(const struct dp_packet *p) { return (*dp_packet_ol_flags_ptr(p) & DP_PACKET_OL_RX_IP_CKSUM_MASK) == DP_PACKET_OL_RX_IP_CKSUM_GOOD; } +/* Marks packet 'p' with good IPv4 checksum. */ +static inline void +dp_packet_ol_set_ip_csum_good(struct dp_packet *p) +{ + *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_RX_IP_CKSUM_BAD; + *dp_packet_ol_flags_ptr(p) |= DP_PACKET_OL_RX_IP_CKSUM_GOOD; +} + +/* Resets IP good checksum flag in packet 'p'. */ +static inline void +dp_packet_ol_reset_ip_csum_good(struct dp_packet *p) +{ + *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_RX_IP_CKSUM_GOOD; +} + +/* Marks packet 'p' with bad IPv4 checksum. */ static inline bool dp_packet_ip_checksum_bad(const struct dp_packet *p) { @@ -1075,8 +1355,52 @@ dp_packet_ip_checksum_bad(const struct dp_packet *p) DP_PACKET_OL_RX_IP_CKSUM_BAD; } +/* Return 'true' is packet 'b' is not encapsulated and is marked for IPv4 + * checksum offload, or if 'b' is encapsulated and the outer layer is marked + * for IPv4 checksum offload. IPv6 packets, non offloaded packets, and IPv4 + * packets that are marked as good return 'false'. */ +static inline bool +dp_packet_hwol_l3_csum_ipv4_ol(const struct dp_packet *b) +{ + if (dp_packet_hwol_is_outer_ipv4(b)) { + return dp_packet_hwol_is_outer_ipv4_cksum(b); + } else if (!dp_packet_hwol_is_outer_ipv6(b)) { + return dp_packet_hwol_tx_ip_csum(b) && + !dp_packet_ip_checksum_good(b); + } + return false; +} + +/* Return 'true' is packet 'b' is not encapsulated and is marked for IPv4 + * checksum offload, or if 'b' is encapsulated and the outer layer is marked + * for IPv4 checksum offload. IPv6 packets and non offloaded packets return + * 'false'. 
*/ static inline bool -dp_packet_l4_checksum_valid(const struct dp_packet *p) +dp_packet_hwol_l3_ipv4(const struct dp_packet *b) +{ + if (dp_packet_hwol_is_outer_ipv4(b)) { + return true; + } else if (!dp_packet_hwol_is_outer_ipv6(b)) { + return dp_packet_hwol_tx_ip_csum(b); + } + return false; +} + +/* Calculate and set the IPv4 header checksum in packet 'p'. */ +static inline void +dp_packet_ip_set_header_csum(struct dp_packet *p, bool inner) +{ + struct ip_header *ip = (inner) ? dp_packet_inner_l3(p) : dp_packet_l3(p); + + ovs_assert(ip); + ip->ip_csum = 0; + ip->ip_csum = csum(ip, sizeof *ip); +} + +/* Returns 'true' if the packet 'p' has good integrity and the + * checksum in it is correct. */ +static inline bool +dp_packet_l4_checksum_good(const struct dp_packet *p) { return (*dp_packet_ol_flags_ptr(p) & DP_PACKET_OL_RX_L4_CKSUM_MASK) == DP_PACKET_OL_RX_L4_CKSUM_GOOD; @@ -1089,35 +1413,130 @@ dp_packet_l4_checksum_bad(const struct dp_packet *p) DP_PACKET_OL_RX_L4_CKSUM_BAD; } -static inline void ALWAYS_INLINE -dp_packet_update_rss_hash_ipv4_tcp_udp(struct dp_packet *packet) +/* Returns 'true' if the packet has good integrity though the + * checksum in the packet 'p' is not complete. */ +static inline bool +dp_packet_ol_l4_csum_partial(const struct dp_packet *p) { - if (dp_packet_rss_valid(packet)) { - return; + return (*dp_packet_ol_flags_ptr(p) & DP_PACKET_OL_RX_L4_CKSUM_MASK) == + DP_PACKET_OL_RX_L4_CKSUM_MASK; +} + +/* Marks packet 'p' with good integrity though the checksum in the + * packet is not complete. */ +static inline void +dp_packet_ol_set_l4_csum_partial(struct dp_packet *p) +{ + *dp_packet_ol_flags_ptr(p) |= DP_PACKET_OL_RX_L4_CKSUM_MASK; +} + +/* Marks packet 'p' with good L4 checksum. 
*/ +static inline void +dp_packet_ol_set_l4_csum_good(struct dp_packet *p) +{ + *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_RX_L4_CKSUM_BAD; + *dp_packet_ol_flags_ptr(p) |= DP_PACKET_OL_RX_L4_CKSUM_GOOD; +} + +/* Marks packet 'p' with good L4 checksum as modified. */ +static inline void +dp_packet_ol_reset_l4_csum_good(struct dp_packet *p) +{ + if (!dp_packet_ol_l4_csum_partial(p)) { + *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_RX_L4_CKSUM_GOOD; } +} - const uint8_t *pkt = dp_packet_data(packet); - const uint16_t l3_ofs = packet->l3_ofs; +/* Marks packet 'p' with good integrity if checksum offload locations + * were provided. In the case of encapsulated packets, these values may + * be deeper into the packet than OVS might expect. But the packet + * should still be considered to have good integrity. + * The 'csum_start' is the offset from the begin of the packet headers. + * The 'csum_offset' is the offset from start to place the checksum. + * The csum_start and csum_offset fields are set from the virtio_net_hdr + * struct that may be provided by a netdev on packet ingress. */ +static inline void +dp_packet_ol_l4_csum_check_partial(struct dp_packet *p) +{ + if (p->csum_start && p->csum_offset) { + dp_packet_ol_set_l4_csum_partial(p); + } +} + +static inline void +dp_packet_reset_packet(struct dp_packet *b, int off) +{ + dp_packet_set_size(b, dp_packet_size(b) - off); + dp_packet_set_data(b, ((unsigned char *) dp_packet_data(b) + off)); + dp_packet_reset_offsets(b); + + if (b->csum_start >= off && b->csum_offset) { + /* Adjust values for decapsulation. 
*/ + b->csum_start -= off; + dp_packet_ol_set_l4_csum_partial(b); + } +} + +static inline uint32_t ALWAYS_INLINE +dp_packet_calc_hash_ipv4(const uint8_t *pkt, const uint16_t l3_ofs, + uint32_t hash) +{ const void *ipv4_src = &pkt[l3_ofs + offsetof(struct ip_header, ip_src)]; const void *ipv4_dst = &pkt[l3_ofs + offsetof(struct ip_header, ip_dst)]; - const void *l4_ports = &pkt[packet->l4_ofs]; - uint32_t ip_src, ip_dst, ports; - uint32_t hash = 0; + uint32_t ip_src, ip_dst; memcpy(&ip_src, ipv4_src, sizeof ip_src); memcpy(&ip_dst, ipv4_dst, sizeof ip_dst); - memcpy(&ports, l4_ports, sizeof ports); /* IPv4 Src and Dst. */ hash = hash_add(hash, ip_src); hash = hash_add(hash, ip_dst); + /* IPv4 proto. */ - hash = hash_add(hash, - pkt[l3_ofs + offsetof(struct ip_header, ip_proto)]); + hash = hash_add(hash, pkt[l3_ofs + offsetof(struct ip_header, ip_proto)]); + + return hash; +} + +static inline void ALWAYS_INLINE +dp_packet_update_rss_hash_ipv4(struct dp_packet *packet) +{ + if (dp_packet_rss_valid(packet)) { + return; + } + + const uint8_t *pkt = dp_packet_data(packet); + const uint16_t l3_ofs = packet->l3_ofs; + uint32_t hash = 0; + + /* IPv4 Src, Dst and proto. */ + hash = dp_packet_calc_hash_ipv4(pkt, l3_ofs, hash); + + hash = hash_finish(hash, 42); + dp_packet_set_rss_hash(packet, hash); +} + +static inline void ALWAYS_INLINE +dp_packet_update_rss_hash_ipv4_tcp_udp(struct dp_packet *packet) +{ + if (dp_packet_rss_valid(packet)) { + return; + } + + const uint8_t *pkt = dp_packet_data(packet); + const void *l4_ports = &pkt[packet->l4_ofs]; + const uint16_t l3_ofs = packet->l3_ofs; + uint32_t hash = 0; + uint32_t ports; + + /* IPv4 Src, Dst and proto. */ + hash = dp_packet_calc_hash_ipv4(pkt, l3_ofs, hash); + /* L4 ports. 
*/ + memcpy(&ports, l4_ports, sizeof ports); hash = hash_add(hash, ports); - hash = hash_finish(hash, 42); + hash = hash_finish(hash, 42); dp_packet_set_rss_hash(packet, hash); } diff --git a/lib/dpctl.c b/lib/dpctl.c index 29041fa3e30..f764cf16410 100644 --- a/lib/dpctl.c +++ b/lib/dpctl.c @@ -41,6 +41,7 @@ #include "netlink.h" #include "odp-util.h" #include "openvswitch/ofpbuf.h" +#include "openvswitch/ofp-ct.h" #include "packets.h" #include "openvswitch/shash.h" #include "simap.h" @@ -335,6 +336,12 @@ dpctl_add_if(int argc OVS_UNUSED, const char *argv[], value = ""; } + if (!key) { + dpctl_error(dpctl_p, 0, "Invalid option format"); + error = EINVAL; + goto next; + } + if (!strcmp(key, "type")) { type = value; } else if (!strcmp(key, "port_no")) { @@ -453,6 +460,12 @@ dpctl_set_if(int argc, const char *argv[], struct dpctl_params *dpctl_p) value = ""; } + if (!key) { + dpctl_error(dpctl_p, 0, "Invalid option format"); + error = EINVAL; + goto next_destroy_args; + } + if (!strcmp(key, "type")) { if (strcmp(value, type)) { dpctl_error(dpctl_p, 0, @@ -672,7 +685,7 @@ show_dpif(struct dpif *dpif, struct dpctl_params *dpctl_p) } for (int i = 0; i < n_port_nos; i++) { - if (dpif_port_query_by_number(dpif, port_nos[i], &dpif_port)) { + if (dpif_port_query_by_number(dpif, port_nos[i], &dpif_port, true)) { continue; } @@ -725,8 +738,8 @@ show_dpif(struct dpif *dpif, struct dpctl_params *dpctl_p) continue; } error = netdev_get_stats(netdev, &s); + netdev_close(netdev); if (!error) { - netdev_close(netdev); print_stat(dpctl_p, " RX packets:", s.rx_packets); print_stat(dpctl_p, " errors:", s.rx_errors); print_stat(dpctl_p, " dropped:", s.rx_dropped); @@ -749,6 +762,10 @@ show_dpif(struct dpif *dpif, struct dpctl_params *dpctl_p) print_stat(dpctl_p, " TX bytes:", s.tx_bytes); print_human_size(dpctl_p, s.tx_bytes); dpctl_print(dpctl_p, "\n"); + + print_stat(dpctl_p, " UPCALL packets:", s.upcall_packets); + print_stat(dpctl_p, " errors:", s.upcall_errors); + 
dpctl_print(dpctl_p, "\n"); } else { dpctl_print(dpctl_p, ", could not retrieve stats (%s)", ovs_strerror(error)); @@ -1342,19 +1359,17 @@ static int dpctl_del_flow_dpif(struct dpif *dpif, const char *key_s, struct dpctl_params *dpctl_p) { + struct dpif_port_dump port_dump; struct dpif_flow_stats stats; + bool ufid_generated = false; struct dpif_port dpif_port; - struct dpif_port_dump port_dump; - struct ofpbuf key; + bool ufid_present = false; + struct simap port_names; struct ofpbuf mask; /* To be ignored. */ - + struct ofpbuf key; ovs_u128 ufid; - bool ufid_generated; - bool ufid_present; - struct simap port_names; int n, error; - ufid_present = false; n = odp_ufid_from_string(key_s, &ufid); if (n < 0) { dpctl_error(dpctl_p, -n, "parsing flow ufid"); @@ -1702,42 +1717,82 @@ dpctl_dump_conntrack(int argc, const char *argv[], return error; } +static int +dpctl_dump_conntrack_exp(int argc, const char *argv[], + struct dpctl_params *dpctl_p) +{ + struct ct_dpif_dump_state *dump; + uint16_t zone, *pzone = NULL; + struct ct_dpif_exp cte; + struct dpif *dpif; + int error; + + if (argc > 1 && ovs_scan(argv[argc - 1], "zone=%"SCNu16, &zone)) { + pzone = &zone; + argc--; + } + + error = opt_dpif_open(argc, argv, dpctl_p, 2, &dpif); + if (error) { + return error; + } + + error = ct_exp_dpif_dump_start(dpif, &dump, pzone); + if (error) { + dpctl_error(dpctl_p, error, "starting conntrack expectations dump"); + dpif_close(dpif); + return error; + } + + while (!(error = ct_exp_dpif_dump_next(dump, &cte))) { + struct ds s = DS_EMPTY_INITIALIZER; + + ct_dpif_format_exp_entry(&cte, &s); + + dpctl_print(dpctl_p, "%s\n", ds_cstr(&s)); + ds_destroy(&s); + } + if (error == EOF) { + error = 0; + } else if (error) { + dpctl_error(dpctl_p, error, "dumping conntrack expectation"); + } + + ct_exp_dpif_dump_done(dump); + dpif_close(dpif); + + return error; +} + static int dpctl_flush_conntrack(int argc, const char *argv[], struct dpctl_params *dpctl_p) { struct dpif *dpif = NULL; - struct 
ct_dpif_tuple tuple, *ptuple = NULL; + struct ofp_ct_match match = {0}; struct ds ds = DS_EMPTY_INITIALIZER; - uint16_t zone, *pzone = NULL; + uint16_t zone; int error; int args = argc - 1; + bool with_zone = false; - /* Parse ct tuple */ - if (args && ct_dpif_parse_tuple(&tuple, argv[args], &ds)) { - ptuple = &tuple; - args--; - } - - /* Parse zone */ - if (args && ovs_scan(argv[args], "zone=%"SCNu16, &zone)) { - pzone = &zone; + if (dp_arg_exists(argc, argv)) { args--; } - /* Report error if there are more than one unparsed argument. */ - if (args > 1) { - ds_put_cstr(&ds, "invalid arguments"); + if (args && !ofp_ct_match_parse(&argv[argc - args], args, &ds, &match, + &with_zone, &zone)) { error = EINVAL; goto error; } - error = opt_dpif_open(argc, argv, dpctl_p, 4, &dpif); + error = opt_dpif_open(argc, argv, dpctl_p, 5, &dpif); if (error) { + dpctl_error(dpctl_p, error, "Cannot open dpif"); return error; } - error = ct_dpif_flush(dpif, pzone, ptuple); + error = ct_dpif_flush(dpif, with_zone ? &zone : NULL, &match); if (!error) { dpif_close(dpif); return 0; @@ -2111,13 +2166,20 @@ static int dpctl_ct_set_limits(int argc, const char *argv[], struct dpctl_params *dpctl_p) { - struct dpif *dpif; - struct ds ds = DS_EMPTY_INITIALIZER; - int i = dp_arg_exists(argc, argv) ? 2 : 1; - uint32_t default_limit, *p_default_limit = NULL; struct ovs_list zone_limits = OVS_LIST_INITIALIZER(&zone_limits); + int i = dp_arg_exists(argc, argv) ? 
2 : 1; + struct ds ds = DS_EMPTY_INITIALIZER; + struct dpif *dpif = NULL; + uint32_t default_limit; + int error; + + if (i >= argc) { + ds_put_cstr(&ds, "too few arguments"); + error = EINVAL; + goto error; + } - int error = opt_dpif_open(argc, argv, dpctl_p, INT_MAX, &dpif); + error = opt_dpif_open(argc, argv, dpctl_p, INT_MAX, &dpif); if (error) { return error; } @@ -2125,7 +2187,8 @@ dpctl_ct_set_limits(int argc, const char *argv[], /* Parse default limit */ if (!strncmp(argv[i], "default=", 8)) { if (ovs_scan(argv[i], "default=%"SCNu32, &default_limit)) { - p_default_limit = &default_limit; + ct_dpif_push_zone_limit(&zone_limits, OVS_ZONE_LIMIT_DEFAULT_ZONE, + default_limit, 0); i++; } else { ds_put_cstr(&ds, "invalid default limit"); @@ -2145,7 +2208,14 @@ dpctl_ct_set_limits(int argc, const char *argv[], ct_dpif_push_zone_limit(&zone_limits, zone, limit, 0); } - error = ct_dpif_set_limits(dpif, p_default_limit, &zone_limits); + if (ct_dpif_is_zone_limit_protected(dpif)) { + ds_put_cstr(&ds, "the zone limits are set via database, " + "use 'ovs-vsctl set-zone-limit <...>' instead."); + error = EPERM; + goto error; + } + + error = ct_dpif_set_limits(dpif, &zone_limits); if (!error) { ct_dpif_free_zone_limits(&zone_limits); dpif_close(dpif); @@ -2177,7 +2247,7 @@ parse_ct_limit_zones(const char *argv, struct ovs_list *zone_limits, argcopy = xstrdup(argv + 5); next_zone = strtok_r(argcopy, ",", &save_ptr); - do { + while (next_zone != NULL) { if (ovs_scan(next_zone, "%"SCNu16, &zone)) { ct_dpif_push_zone_limit(zone_limits, zone, 0, 0); } else { @@ -2185,7 +2255,8 @@ parse_ct_limit_zones(const char *argv, struct ovs_list *zone_limits, free(argcopy); return EINVAL; } - } while ((next_zone = strtok_r(NULL, ",", &save_ptr)) != NULL); + next_zone = strtok_r(NULL, ",", &save_ptr); + } free(argcopy); return 0; @@ -2195,19 +2266,41 @@ static int dpctl_ct_del_limits(int argc, const char *argv[], struct dpctl_params *dpctl_p) { - struct dpif *dpif; + struct ovs_list 
zone_limits = OVS_LIST_INITIALIZER(&zone_limits); + int i = dp_arg_exists(argc, argv) ? 2 : 1; struct ds ds = DS_EMPTY_INITIALIZER; + struct dpif *dpif = NULL; int error; - int i = dp_arg_exists(argc, argv) ? 2 : 1; - struct ovs_list zone_limits = OVS_LIST_INITIALIZER(&zone_limits); - error = opt_dpif_open(argc, argv, dpctl_p, 3, &dpif); + if (i >= argc) { + ds_put_cstr(&ds, "too few arguments"); + error = EINVAL; + goto error; + } + + error = opt_dpif_open(argc, argv, dpctl_p, 4, &dpif); if (error) { return error; } - error = parse_ct_limit_zones(argv[i], &zone_limits, &ds); - if (error) { + /* Parse default limit. */ + if (!strcmp(argv[i], "default")) { + ct_dpif_push_zone_limit(&zone_limits, OVS_ZONE_LIMIT_DEFAULT_ZONE, + 0, 0); + i++; + } + + if (argc > i) { + error = parse_ct_limit_zones(argv[i], &zone_limits, &ds); + if (error) { + goto error; + } + } + + if (ct_dpif_is_zone_limit_protected(dpif)) { + ds_put_cstr(&ds, "the zone limits are set via database, " + "use 'ovs-vsctl del-zone-limit <...>' instead."); + error = EPERM; goto error; } @@ -2233,7 +2326,6 @@ dpctl_ct_get_limits(int argc, const char *argv[], { struct dpif *dpif; struct ds ds = DS_EMPTY_INITIALIZER; - uint32_t default_limit; int i = dp_arg_exists(argc, argv) ? 
2 : 1; struct ovs_list list_query = OVS_LIST_INITIALIZER(&list_query); struct ovs_list list_reply = OVS_LIST_INITIALIZER(&list_reply); @@ -2244,16 +2336,17 @@ dpctl_ct_get_limits(int argc, const char *argv[], } if (argc > i) { + ct_dpif_push_zone_limit(&list_query, OVS_ZONE_LIMIT_DEFAULT_ZONE, + 0, 0); error = parse_ct_limit_zones(argv[i], &list_query, &ds); if (error) { goto error; } } - error = ct_dpif_get_limits(dpif, &default_limit, &list_query, - &list_reply); + error = ct_dpif_get_limits(dpif, &list_query, &list_reply); if (!error) { - ct_dpif_format_zone_limits(default_limit, &list_reply, &ds); + ct_dpif_format_zone_limits(&list_reply, &ds); dpctl_print(dpctl_p, "%s\n", ds_cstr(&ds)); goto out; } else { @@ -2271,6 +2364,65 @@ dpctl_ct_get_limits(int argc, const char *argv[], return error; } +static int +dpctl_ct_get_sweep(int argc, const char *argv[], + struct dpctl_params *dpctl_p) +{ + uint32_t sweep_ms = 0; + struct dpif *dpif; + + int error = opt_dpif_open(argc, argv, dpctl_p, 2, &dpif); + if (error) { + return error; + } + + error = ct_dpif_sweep(dpif, &sweep_ms); + if (error) { + dpctl_error(dpctl_p, error, "failed to get the sweep interval"); + } else { + dpctl_print(dpctl_p, "%"PRIu32, sweep_ms); + } + + dpif_close(dpif); + return error; +} + +static int +dpctl_ct_set_sweep(int argc, const char *argv[], + struct dpctl_params *dpctl_p) +{ + struct ds ds = DS_EMPTY_INITIALIZER; + uint32_t sweep_ms = 0; + struct dpif *dpif; + + int error = opt_dpif_open(argc, argv, dpctl_p, 3, &dpif); + if (error) { + return error; + } + + if (!ovs_scan(argv[argc - 1], "%"SCNu32, &sweep_ms) || + sweep_ms == 0) { + ds_put_format(&ds, "invalid sweep value"); + error = EINVAL; + goto error; + } + + error = ct_dpif_sweep(dpif, &sweep_ms); + if (!error) { + dpctl_print(dpctl_p, "setting sweep interval successful\n"); + goto out; + } + + ds_put_format(&ds, "failed to set the sweep interval"); + +error: + dpctl_error(dpctl_p, error, "%s", ds_cstr(&ds)); + ds_destroy(&ds); 
+out: + dpif_close(dpif); + return error; +} + static int ipf_set_enabled__(int argc, const char *argv[], struct dpctl_params *dpctl_p, bool enabled) @@ -2862,8 +3014,11 @@ static const struct dpctl_command all_commands[] = { 0, 1, dpctl_offload_stats_show, DP_RO }, { "dump-conntrack", "[-m] [-s] [dp] [zone=N]", 0, 4, dpctl_dump_conntrack, DP_RO }, - { "flush-conntrack", "[dp] [zone=N] [ct-tuple]", 0, 3, - dpctl_flush_conntrack, DP_RW }, + { "dump-conntrack-exp", "[dp] [zone=N]", + 0, 2, dpctl_dump_conntrack_exp, DP_RO }, + { "flush-conntrack", "[dp] [zone=N] [mark=X[/M]] [labels=Y[/N]] " + "[ct-orig-tuple [ct-reply-tuple]]", + 0, 6, dpctl_flush_conntrack, DP_RW }, { "cache-get-size", "[dp]", 0, 1, dpctl_cache_get_size, DP_RO }, { "cache-set-size", "dp cache ", 3, 3, dpctl_cache_set_size, DP_RW }, { "ct-stats-show", "[dp] [zone=N]", @@ -2880,10 +3035,12 @@ static const struct dpctl_command all_commands[] = { { "ct-get-tcp-seq-chk", "[dp]", 0, 1, dpctl_ct_get_tcp_seq_chk, DP_RO }, { "ct-set-limits", "[dp] [default=L] [zone=N,limit=L]...", 1, INT_MAX, dpctl_ct_set_limits, DP_RO }, - { "ct-del-limits", "[dp] zone=N1[,N2]...", 1, 2, dpctl_ct_del_limits, - DP_RO }, + { "ct-del-limits", "[dp] [default] [zone=N1[,N2]...]", 1, 3, + dpctl_ct_del_limits, DP_RO }, { "ct-get-limits", "[dp] [zone=N1[,N2]...]", 0, 2, dpctl_ct_get_limits, DP_RO }, + { "ct-get-sweep-interval", "[dp]", 0, 1, dpctl_ct_get_sweep, DP_RO }, + { "ct-set-sweep-interval", "[dp] ms", 1, 2, dpctl_ct_set_sweep, DP_RW }, { "ipf-set-enabled", "[dp] v4|v6", 1, 2, dpctl_ipf_set_enabled, DP_RW }, { "ipf-set-disabled", "[dp] v4|v6", 1, 2, dpctl_ipf_set_disabled, DP_RW }, { "ipf-set-min-frag", "[dp] v4|v6 minfragment", 2, 3, diff --git a/lib/dpctl.man b/lib/dpctl.man index 87ea8087bb8..66fc50903b0 100644 --- a/lib/dpctl.man +++ b/lib/dpctl.man @@ -302,22 +302,36 @@ are included. With \fB\-\-statistics\fR timeouts and timestamps are added to the output. . 
.TP -\*(DX\fBflush\-conntrack\fR [\fIdp\fR] [\fBzone=\fIzone\fR] [\fIct-tuple\fR] +\*(DX\fBdump\-conntrack\-exp\fR [\fIdp\fR] [\fBzone=\fIzone\fR] +Prints to the console all the expectation entries in the tracker used by +\fIdp\fR. If \fBzone=\fIzone\fR is specified, only shows the expectations +in \fIzone\fR. Only supported for userspace datapath. +. +.TP +\*(DX\fBflush\-conntrack\fR [\fIdp\fR] [\fBzone=\fIzone\fR] [\fIct-origin-tuple\fR [\fIct-reply-tuple\fR]] Flushes the connection entries in the tracker used by \fIdp\fR based on -\fIzone\fR and connection tracking tuple \fIct-tuple\fR. +\fIzone\fR and connection tracking tuple \fIct-origin-tuple\fR. If \fIct-tuple\fR is not provided, flushes all the connection entries. If \fBzone\fR=\fIzone\fR is specified, only flushes the connections in \fIzone\fR. .IP -If \fIct-tuple\fR is provided, flushes the connection entry specified by -\fIct-tuple\fR in \fIzone\fR. The zone defaults to 0 if it is not provided. -The userspace connection tracker requires flushing with the original pre-NATed -tuple and a warning log will be otherwise generated. -An example of an IPv4 ICMP \fIct-tuple\fR: +If \fIct-[orig|reply]-tuple\fR is provided, flushes the connection entry +specified by \fIct-[orig|reply]-tuple\fR in \fIzone\fR. The zone defaults +to 0 if it is not provided. The userspace connection tracker requires flushing +with the original pre-NATed tuple and a warning log will be otherwise +generated. The tuple can be partial and will remove all connections that are +matching on the specified fields. In order to specify only +\fIct-reply-tuple\fR, provide empty string as \fIct-origin-tuple\fR. +.IP +Note: Currently there is a limitation for matching on ICMP, in order to +partially match on ICMP parameters the \fIct-[orig|reply]-tuple\fR has +to include either source or destination IP. 
+.IP +An example of an IPv4 ICMP \fIct-[orig|reply]-tuple\fR: .IP "ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2,ct_nw_proto=1,icmp_type=8,icmp_code=0,icmp_id=10" .IP -An example of an IPv6 TCP \fIct-tuple\fR: +An example of an IPv6 TCP \fIct-[orig|reply]-tuple\fR: .IP "ct_ipv6_src=fc00::1,ct_ipv6_dst=fc00::2,ct_nw_proto=6,ct_tp_src=1,ct_tp_dst=2" . @@ -374,6 +388,15 @@ Prints whether TCP sequence checking is enabled or disabled on \fIdp\fR. Only supported for the userspace datapath. . .TP +\*(DX\fBct\-set\-sweep\-interval\fR [\fIdp\fR] \fIms\fR +Sets the sweep interval. Only supported for the userspace datapath. +. +.TP +\*(DX\fBct\-get\-sweep\-interval\fR [\fIdp\fR] +Prints the current sweep interval in ms. Only supported for the userspace +datapath. +. +.TP \*(DX\fBct\-set\-limits\fR [\fIdp\fR] [\fBdefault=\fIdefault_limit\fR] [\fBzone=\fIzone\fR,\fBlimit=\fIlimit\fR]... Sets the maximum allowed number of connections in a connection tracking zone. A specific \fIzone\fR may be set to \fIlimit\fR, and multiple zones diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c index 3eee1f485c0..58ebf6cb62c 100644 --- a/lib/dpdk-stub.c +++ b/lib/dpdk-stub.c @@ -49,30 +49,6 @@ dpdk_detach_thread(void) { } -const char * -dpdk_get_vhost_sock_dir(void) -{ - return NULL; -} - -bool -dpdk_vhost_iommu_enabled(void) -{ - return false; -} - -bool -dpdk_vhost_postcopy_enabled(void) -{ - return false; -} - -bool -dpdk_per_port_memory(void) -{ - return false; -} - bool dpdk_available(void) { diff --git a/lib/dpdk.c b/lib/dpdk.c index d909974f91b..940c43c070b 100644 --- a/lib/dpdk.c +++ b/lib/dpdk.c @@ -19,7 +19,6 @@ #include #include -#include #include #include @@ -47,39 +46,8 @@ VLOG_DEFINE_THIS_MODULE(dpdk); static FILE *log_stream = NULL; /* Stream for DPDK log redirection */ -static char *vhost_sock_dir = NULL; /* Location of vhost-user sockets */ -static bool vhost_iommu_enabled = false; /* Status of vHost IOMMU support */ -static bool vhost_postcopy_enabled = false; /* Status of vHost 
POSTCOPY - * support. */ -static bool per_port_memory = false; /* Status of per port memory support */ - /* Indicates successful initialization of DPDK. */ -static atomic_bool dpdk_initialized = ATOMIC_VAR_INIT(false); - -static int -process_vhost_flags(char *flag, const char *default_val, int size, - const struct smap *ovs_other_config, - char **new_val) -{ - const char *val; - int changed = 0; - - val = smap_get(ovs_other_config, flag); - - /* Process the vhost-sock-dir flag if it is provided, otherwise resort to - * default value. - */ - if (val && (strlen(val) <= size)) { - changed = 1; - *new_val = xstrdup(val); - VLOG_INFO("User-provided %s in use: %s", flag, *new_val); - } else { - VLOG_INFO("No %s provided - defaulting to %s", flag, default_val); - *new_val = xstrdup(default_val); - } - - return changed; -} +static atomic_bool dpdk_initialized = false; static bool args_contains(const struct svec *args, const char *value) @@ -345,11 +313,9 @@ malloc_dump_stats_wrapper(FILE *stream) static bool dpdk_init__(const struct smap *ovs_other_config) { - char *sock_dir_subcomponent; char **argv = NULL; int result; bool auto_determine = true; - int err = 0; struct ovs_numa_dump *affinity = NULL; struct svec args = SVEC_EMPTY_INITIALIZER; @@ -361,49 +327,6 @@ dpdk_init__(const struct smap *ovs_other_config) rte_openlog_stream(log_stream); } - if (process_vhost_flags("vhost-sock-dir", ovs_rundir(), - NAME_MAX, ovs_other_config, - &sock_dir_subcomponent)) { - struct stat s; - if (!strstr(sock_dir_subcomponent, "..")) { - vhost_sock_dir = xasprintf("%s/%s", ovs_rundir(), - sock_dir_subcomponent); - - err = stat(vhost_sock_dir, &s); - if (err) { - VLOG_ERR("vhost-user sock directory '%s' does not exist.", - vhost_sock_dir); - } - } else { - vhost_sock_dir = xstrdup(ovs_rundir()); - VLOG_ERR("vhost-user sock directory request '%s/%s' has invalid" - "characters '..' 
- using %s instead.", - ovs_rundir(), sock_dir_subcomponent, ovs_rundir()); - } - free(sock_dir_subcomponent); - } else { - vhost_sock_dir = sock_dir_subcomponent; - } - - vhost_iommu_enabled = smap_get_bool(ovs_other_config, - "vhost-iommu-support", false); - VLOG_INFO("IOMMU support for vhost-user-client %s.", - vhost_iommu_enabled ? "enabled" : "disabled"); - - vhost_postcopy_enabled = smap_get_bool(ovs_other_config, - "vhost-postcopy-support", false); - if (vhost_postcopy_enabled && memory_locked()) { - VLOG_WARN("vhost-postcopy-support and mlockall are not compatible."); - vhost_postcopy_enabled = false; - } - VLOG_INFO("POSTCOPY support for vhost-user-client %s.", - vhost_postcopy_enabled ? "enabled" : "disabled"); - - per_port_memory = smap_get_bool(ovs_other_config, - "per-port-memory", false); - VLOG_INFO("Per port memory for DPDK devices %s.", - per_port_memory ? "enabled" : "disabled"); - svec_add(&args, ovs_get_program_name()); construct_dpdk_args(ovs_other_config, &args); @@ -414,7 +337,9 @@ dpdk_init__(const struct smap *ovs_other_config) } #endif - if (args_contains(&args, "-c") || args_contains(&args, "-l")) { + if (args_contains(&args, "-c") || + args_contains(&args, "-l") || + args_contains(&args, "--lcores")) { auto_determine = false; } @@ -558,30 +483,6 @@ dpdk_init(const struct smap *ovs_other_config) atomic_store_relaxed(&dpdk_initialized, enabled); } -const char * -dpdk_get_vhost_sock_dir(void) -{ - return vhost_sock_dir; -} - -bool -dpdk_vhost_iommu_enabled(void) -{ - return vhost_iommu_enabled; -} - -bool -dpdk_vhost_postcopy_enabled(void) -{ - return vhost_postcopy_enabled; -} - -bool -dpdk_per_port_memory(void) -{ - return per_port_memory; -} - bool dpdk_available(void) { diff --git a/lib/dpdk.h b/lib/dpdk.h index 64ebca47d6d..1b790e682e4 100644 --- a/lib/dpdk.h +++ b/lib/dpdk.h @@ -38,10 +38,6 @@ struct ovsrec_open_vswitch; void dpdk_init(const struct smap *ovs_other_config); bool dpdk_attach_thread(unsigned cpu); void 
dpdk_detach_thread(void); -const char *dpdk_get_vhost_sock_dir(void); -bool dpdk_vhost_iommu_enabled(void); -bool dpdk_vhost_postcopy_enabled(void); -bool dpdk_per_port_memory(void); bool dpdk_available(void); void print_dpdk_version(void); void dpdk_status(const struct ovsrec_open_vswitch *); diff --git a/lib/dpif-netdev-extract-avx512.c b/lib/dpif-netdev-extract-avx512.c index 4afbed97eac..57ca4c71b7c 100644 --- a/lib/dpif-netdev-extract-avx512.c +++ b/lib/dpif-netdev-extract-avx512.c @@ -194,6 +194,7 @@ _mm512_maskz_permutexvar_epi8_selector(__mmask64 k_shuf, __m512i v_shuf, #define PATTERN_IPV4_MASK PATTERN_IPV4_GEN(0xFF, 0xBF, 0xFF, 0xFF) #define PATTERN_IPV4_UDP PATTERN_IPV4_GEN(0x45, 0, 0, 0x11) #define PATTERN_IPV4_TCP PATTERN_IPV4_GEN(0x45, 0, 0, 0x06) +#define PATTERN_IPV4_NVGRE PATTERN_IPV4_GEN(0x45, 0, 0, 0x2f) #define PATTERN_TCP_GEN(data_offset) \ 0, 0, 0, 0, /* sport, dport */ \ @@ -218,6 +219,12 @@ _mm512_maskz_permutexvar_epi8_selector(__mmask64 k_shuf, __m512i v_shuf, NU, NU, NU, NU, NU, NU, NU, NU, 34, 35, 36, 37, NU, NU, NU, NU, /* TCP */ \ NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, /* Unused. */ +#define PATTERN_IPV4_NVGRE_SHUFFLE \ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, NU, NU, /* Ether */ \ + 26, 27, 28, 29, 30, 31, 32, 33, NU, NU, NU, NU, 20, 15, 22, 23, /* IPv4 */ \ + NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, /* Unused */\ + NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, /* Unused */ + #define PATTERN_DT1Q_IPV4_UDP_SHUFFLE \ /* Ether (2 blocks): Note that *VLAN* type is written here. 
*/ \ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 0, 0, \ @@ -286,6 +293,9 @@ _mm512_maskz_permutexvar_epi8_selector(__mmask64 k_shuf, __m512i v_shuf, #define KMASK_DT1Q_IPV6 0xFF0FULL #define KMASK_IPV6_NOHDR 0x00FFULL +#define PATTERN_IPV4_KMASK \ + (KMASK_ETHER | (KMASK_IPV4 << 16)) + #define PATTERN_IPV4_UDP_KMASK \ (KMASK_ETHER | (KMASK_IPV4 << 16) | (KMASK_UDP << 32)) @@ -332,6 +342,7 @@ _mm512_maskz_permutexvar_epi8_selector(__mmask64 k_shuf, __m512i v_shuf, #define PKT_OFFSET_VLAN_IPV6_L4 (PKT_OFFSET_VLAN_L3 + IPV6_HEADER_LEN) #define PKT_OFFSET_IPV6_L4 (PKT_OFFSET_L3 + IPV6_HEADER_LEN) +#define PKT_MIN_ETH_IPV4 (ETH_HEADER_LEN + IP_HEADER_LEN) #define PKT_MIN_ETH_IPV4_UDP (PKT_OFFSET_IPV4_L4 + UDP_HEADER_LEN) #define PKT_MIN_ETH_VLAN_IPV4_UDP (PKT_OFFSET_VLAN_IPV4_L4 + UDP_HEADER_LEN) #define PKT_MIN_ETH_IPV4_TCP (PKT_OFFSET_IPV4_L4 + TCP_HEADER_LEN) @@ -352,8 +363,8 @@ _mm512_maskz_permutexvar_epi8_selector(__mmask64 k_shuf, __m512i v_shuf, | MF_BIT(dl_dst) | MF_BIT(dl_src)| MF_BIT(dl_type)) #define MF_ETH_VLAN (MF_ETH | MF_BIT(vlans)) -#define MF_IPV4_UDP (MF_BIT(nw_src) | MF_BIT(ipv6_label) | MF_BIT(tp_src) | \ - MF_BIT(tp_dst)) +#define MF_IPV4 (MF_BIT(nw_src) | MF_BIT(ipv6_label)) +#define MF_IPV4_UDP (MF_IPV4 | MF_BIT(tp_src) | MF_BIT(tp_dst)) #define MF_IPV4_TCP (MF_IPV4_UDP | MF_BIT(tcp_flags) | MF_BIT(arp_tha.ea[2])) #define MF_IPV6_UDP (MF_BIT(ipv6_label) | MF_WORD(ipv6_src, 2) | \ @@ -449,6 +460,7 @@ enum MFEX_PROFILES { PROFILE_ETH_IPV6_TCP, PROFILE_ETH_VLAN_IPV6_TCP, PROFILE_ETH_VLAN_IPV6_UDP, + PROFILE_ETH_IPV4_NVGRE, PROFILE_COUNT, }; @@ -608,6 +620,21 @@ static const struct mfex_profile mfex_profiles[PROFILE_COUNT] = }, .dp_pkt_min_size = PKT_MIN_ETH_VLAN_IPV6_UDP, }, + + [PROFILE_ETH_IPV4_NVGRE] = { + .probe_mask.u8_data = { PATTERN_ETHERTYPE_MASK PATTERN_IPV4_MASK }, + .probe_data.u8_data = { PATTERN_ETHERTYPE_IPV4 PATTERN_IPV4_NVGRE}, + + .store_shuf.u8_data = { PATTERN_IPV4_NVGRE_SHUFFLE }, + .strip_mask.u8_data = { 
PATTERN_STRIP_IPV4_MASK }, + .store_kmsk = PATTERN_IPV4_KMASK, + + .mf_bits = { MF_ETH, MF_IPV4}, + .dp_pkt_offs = { + 0, UINT16_MAX, PKT_OFFSET_L3, PKT_OFFSET_IPV4_L4, + }, + .dp_pkt_min_size = PKT_MIN_ETH_IPV4, + }, }; /* IPv6 header helper function to fix TC, flow label and next header. */ @@ -731,6 +758,41 @@ mfex_check_tcp_data_offset(const struct tcp_header *tcp) return ret; } +static void +mfex_ipv4_set_hwol(struct dp_packet *pkt) +{ + dp_packet_hwol_set_tx_ipv4(pkt); + if (dp_packet_ip_checksum_good(pkt)) { + dp_packet_hwol_set_tx_ip_csum(pkt); + } +} + +static void +mfex_ipv6_set_hwol(struct dp_packet *pkt) +{ + dp_packet_hwol_set_tx_ipv6(pkt); +} + +static void +mfex_tcp_set_hwol(struct dp_packet *pkt) +{ + dp_packet_ol_l4_csum_check_partial(pkt); + if (dp_packet_l4_checksum_good(pkt) + || dp_packet_ol_l4_csum_partial(pkt)) { + dp_packet_hwol_set_csum_tcp(pkt); + } +} + +static void +mfex_udp_set_hwol(struct dp_packet *pkt) +{ + dp_packet_ol_l4_csum_check_partial(pkt); + if (dp_packet_l4_checksum_good(pkt) + || dp_packet_ol_l4_csum_partial(pkt)) { + dp_packet_hwol_set_csum_udp(pkt); + } +} + /* Generic loop to process any mfex profile. This code is specialized into * multiple actual MFEX implementation functions. Its marked ALWAYS_INLINE * to ensure the compiler specializes each instance. 
The code is marked "hot" @@ -832,6 +894,8 @@ mfex_avx512_process(struct dp_packet_batch *packets, const struct tcp_header *tcp = (void *)&pkt[38]; mfex_handle_tcp_flags(tcp, &blocks[7]); dp_packet_update_rss_hash_ipv4_tcp_udp(packet); + mfex_ipv4_set_hwol(packet); + mfex_tcp_set_hwol(packet); } break; case PROFILE_ETH_VLAN_IPV4_UDP: { @@ -844,6 +908,8 @@ mfex_avx512_process(struct dp_packet_batch *packets, continue; } dp_packet_update_rss_hash_ipv4_tcp_udp(packet); + mfex_ipv4_set_hwol(packet); + mfex_udp_set_hwol(packet); } break; case PROFILE_ETH_IPV4_TCP: { @@ -859,6 +925,8 @@ mfex_avx512_process(struct dp_packet_batch *packets, continue; } dp_packet_update_rss_hash_ipv4_tcp_udp(packet); + mfex_ipv4_set_hwol(packet); + mfex_tcp_set_hwol(packet); } break; case PROFILE_ETH_IPV4_UDP: { @@ -870,6 +938,8 @@ mfex_avx512_process(struct dp_packet_batch *packets, continue; } dp_packet_update_rss_hash_ipv4_tcp_udp(packet); + mfex_ipv4_set_hwol(packet); + mfex_udp_set_hwol(packet); } break; case PROFILE_ETH_IPV6_UDP: { @@ -888,6 +958,8 @@ mfex_avx512_process(struct dp_packet_batch *packets, /* Process UDP header. */ mfex_handle_ipv6_l4((void *)&pkt[54], &blocks[9]); dp_packet_update_rss_hash_ipv6_tcp_udp(packet); + mfex_ipv6_set_hwol(packet); + mfex_udp_set_hwol(packet); } break; case PROFILE_ETH_IPV6_TCP: { @@ -911,6 +983,8 @@ mfex_avx512_process(struct dp_packet_batch *packets, } mfex_handle_tcp_flags(tcp, &blocks[9]); dp_packet_update_rss_hash_ipv6_tcp_udp(packet); + mfex_ipv6_set_hwol(packet); + mfex_tcp_set_hwol(packet); } break; case PROFILE_ETH_VLAN_IPV6_TCP: { @@ -937,6 +1011,8 @@ mfex_avx512_process(struct dp_packet_batch *packets, } mfex_handle_tcp_flags(tcp, &blocks[10]); dp_packet_update_rss_hash_ipv6_tcp_udp(packet); + mfex_ipv6_set_hwol(packet); + mfex_tcp_set_hwol(packet); } break; case PROFILE_ETH_VLAN_IPV6_UDP: { @@ -958,7 +1034,22 @@ mfex_avx512_process(struct dp_packet_batch *packets, /* Process UDP header. 
*/ mfex_handle_ipv6_l4((void *)&pkt[58], &blocks[10]); dp_packet_update_rss_hash_ipv6_tcp_udp(packet); + mfex_ipv6_set_hwol(packet); + mfex_udp_set_hwol(packet); + } break; + + case PROFILE_ETH_IPV4_NVGRE: { + /* Handle dynamic l2_pad_size. */ + uint32_t size_from_ipv4 = size - sizeof(struct eth_header); + struct ip_header *nh = (void *)&pkt[sizeof(struct eth_header)]; + if (mfex_ipv4_set_l2_pad_size(packet, nh, size_from_ipv4, 0)) { + continue; + } + dp_packet_update_rss_hash_ipv4(packet); + mfex_ipv4_set_hwol(packet); + mfex_udp_set_hwol(packet); } break; + default: break; }; @@ -1013,6 +1104,7 @@ DECLARE_MFEX_FUNC(ipv6_udp, PROFILE_ETH_IPV6_UDP) DECLARE_MFEX_FUNC(ipv6_tcp, PROFILE_ETH_IPV6_TCP) DECLARE_MFEX_FUNC(dot1q_ipv6_tcp, PROFILE_ETH_VLAN_IPV6_TCP) DECLARE_MFEX_FUNC(dot1q_ipv6_udp, PROFILE_ETH_VLAN_IPV6_UDP) +DECLARE_MFEX_FUNC(ip_nvgre, PROFILE_ETH_IPV4_NVGRE) #endif /* __CHECKER__ */ #endif /* __x86_64__ */ diff --git a/lib/dpif-netdev-lookup-avx512-gather.c b/lib/dpif-netdev-lookup-avx512-gather.c index 7d3d81151f1..b916b24875e 100644 --- a/lib/dpif-netdev-lookup-avx512-gather.c +++ b/lib/dpif-netdev-lookup-avx512-gather.c @@ -380,7 +380,9 @@ avx512_lookup_impl(struct dpcls_subtable *subtable, DECLARE_OPTIMIZED_LOOKUP_FUNCTION(9, 4) DECLARE_OPTIMIZED_LOOKUP_FUNCTION(9, 1) +DECLARE_OPTIMIZED_LOOKUP_FUNCTION(8, 1) DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 3) +DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 2) DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 1) DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 1) DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 0) @@ -419,7 +421,9 @@ dpcls_subtable_avx512_gather_probe__(uint32_t u0_bits, uint32_t u1_bits, CHECK_LOOKUP_FUNCTION(9, 4, use_vpop); CHECK_LOOKUP_FUNCTION(9, 1, use_vpop); + CHECK_LOOKUP_FUNCTION(8, 1, use_vpop); CHECK_LOOKUP_FUNCTION(5, 3, use_vpop); + CHECK_LOOKUP_FUNCTION(5, 2, use_vpop); CHECK_LOOKUP_FUNCTION(5, 1, use_vpop); CHECK_LOOKUP_FUNCTION(4, 1, use_vpop); CHECK_LOOKUP_FUNCTION(4, 0, use_vpop); diff --git 
a/lib/dpif-netdev-lookup-generic.c b/lib/dpif-netdev-lookup-generic.c index 6c74ac3a1b7..76f92dd5e69 100644 --- a/lib/dpif-netdev-lookup-generic.c +++ b/lib/dpif-netdev-lookup-generic.c @@ -284,7 +284,9 @@ dpcls_subtable_lookup_generic(struct dpcls_subtable *subtable, DECLARE_OPTIMIZED_LOOKUP_FUNCTION(9, 4) DECLARE_OPTIMIZED_LOOKUP_FUNCTION(9, 1) +DECLARE_OPTIMIZED_LOOKUP_FUNCTION(8, 1) DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 3) +DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 2) DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 1) DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 1) DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 0) @@ -308,7 +310,9 @@ dpcls_subtable_generic_probe(uint32_t u0_bits, uint32_t u1_bits) CHECK_LOOKUP_FUNCTION(9, 4); CHECK_LOOKUP_FUNCTION(9, 1); + CHECK_LOOKUP_FUNCTION(8, 1); CHECK_LOOKUP_FUNCTION(5, 3); + CHECK_LOOKUP_FUNCTION(5, 2); CHECK_LOOKUP_FUNCTION(5, 1); CHECK_LOOKUP_FUNCTION(4, 1); CHECK_LOOKUP_FUNCTION(4, 0); diff --git a/lib/dpif-netdev-perf.c b/lib/dpif-netdev-perf.c index a2a7d8f0b88..79ea5e3bef2 100644 --- a/lib/dpif-netdev-perf.c +++ b/lib/dpif-netdev-perf.c @@ -230,18 +230,28 @@ pmd_perf_format_overall_stats(struct ds *str, struct pmd_perf_stats *s, uint64_t tot_iter = histogram_samples(&s->pkts); uint64_t idle_iter = s->pkts.bin[0]; uint64_t busy_iter = tot_iter >= idle_iter ? tot_iter - idle_iter : 0; + uint64_t sleep_iter = stats[PMD_SLEEP_ITER]; + uint64_t tot_sleep_cycles = stats[PMD_CYCLES_SLEEP]; ds_put_format(str, " Iterations: %12"PRIu64" (%.2f us/it)\n" " - Used TSC cycles: %12"PRIu64" (%5.1f %% of total cycles)\n" " - idle iterations: %12"PRIu64" (%5.1f %% of used cycles)\n" - " - busy iterations: %12"PRIu64" (%5.1f %% of used cycles)\n", - tot_iter, tot_cycles * us_per_cycle / tot_iter, + " - busy iterations: %12"PRIu64" (%5.1f %% of used cycles)\n" + " - sleep iterations: %12"PRIu64" (%5.1f %% of iterations)\n" + " Sleep time (us): %12.0f (%3.0f us/iteration avg.)\n", + tot_iter, + tot_iter + ? 
(tot_cycles + tot_sleep_cycles) * us_per_cycle / tot_iter + : 0, tot_cycles, 100.0 * (tot_cycles / duration) / tsc_hz, idle_iter, - 100.0 * stats[PMD_CYCLES_ITER_IDLE] / tot_cycles, + tot_cycles ? 100.0 * stats[PMD_CYCLES_ITER_IDLE] / tot_cycles : 0, busy_iter, - 100.0 * stats[PMD_CYCLES_ITER_BUSY] / tot_cycles); + tot_cycles ? 100.0 * stats[PMD_CYCLES_ITER_BUSY] / tot_cycles : 0, + sleep_iter, tot_iter ? 100.0 * sleep_iter / tot_iter : 0, + tot_sleep_cycles * us_per_cycle, + sleep_iter ? (tot_sleep_cycles * us_per_cycle) / sleep_iter : 0); if (rx_packets > 0) { ds_put_format(str, " Rx packets: %12"PRIu64" (%.0f Kpps, %.0f cycles/pkt)\n" @@ -327,29 +337,32 @@ pmd_perf_format_histograms(struct ds *str, struct pmd_perf_stats *s) ">", s->max_vhost_qfill.bin[i], ">", s->upcalls.bin[i], ">", s->cycles_per_upcall.bin[i]); - if (s->totals.iterations > 0) { - ds_put_cstr(str, - "-----------------------------------------------------" - "-----------------------------------------------------" - "------------------------------------------------\n"); - ds_put_format(str, - " %-21s %-21s %-21s %-21s %-21s %-21s %-21s\n", - "cycles/it", "packets/it", "cycles/pkt", "pkts/batch", - "vhost qlen", "upcalls/it", "cycles/upcall"); - ds_put_format(str, - " %-21"PRIu64" %-21.5f %-21"PRIu64 - " %-21.5f %-21.5f %-21.5f %-21"PRIu32"\n", - s->totals.cycles / s->totals.iterations, - 1.0 * s->totals.pkts / s->totals.iterations, - s->totals.pkts - ? s->totals.busy_cycles / s->totals.pkts : 0, - s->totals.batches - ? 1.0 * s->totals.pkts / s->totals.batches : 0, - 1.0 * s->totals.max_vhost_qfill / s->totals.iterations, - 1.0 * s->totals.upcalls / s->totals.iterations, - s->totals.upcalls - ? 
s->totals.upcall_cycles / s->totals.upcalls : 0); - } + ds_put_cstr(str, + "-----------------------------------------------------" + "-----------------------------------------------------" + "------------------------------------------------\n"); + ds_put_format(str, + " %-21s %-21s %-21s %-21s %-21s %-21s %-21s\n", + "cycles/it", "packets/it", "cycles/pkt", "pkts/batch", + "vhost qlen", "upcalls/it", "cycles/upcall"); + ds_put_format(str, + " %-21"PRIu64" %-21.5f %-21"PRIu64 + " %-21.5f %-21.5f %-21.5f %-21"PRIu32"\n", + s->totals.iterations + ? s->totals.cycles / s->totals.iterations : 0, + s->totals.iterations + ? 1.0 * s->totals.pkts / s->totals.iterations : 0, + s->totals.pkts + ? s->totals.busy_cycles / s->totals.pkts : 0, + s->totals.batches + ? 1.0 * s->totals.pkts / s->totals.batches : 0, + s->totals.iterations + ? 1.0 * s->totals.max_vhost_qfill / s->totals.iterations + : 0, + s->totals.iterations + ? 1.0 * s->totals.upcalls / s->totals.iterations : 0, + s->totals.upcalls + ? s->totals.upcall_cycles / s->totals.upcalls : 0); } void @@ -518,14 +531,15 @@ OVS_REQUIRES(s->stats_mutex) void pmd_perf_end_iteration(struct pmd_perf_stats *s, int rx_packets, - int tx_packets, bool full_metrics) + int tx_packets, uint64_t sleep_cycles, + bool full_metrics) { uint64_t now_tsc = cycles_counter_update(s); struct iter_stats *cum_ms; uint64_t cycles, cycles_per_pkt = 0; char *reason = NULL; - cycles = now_tsc - s->start_tsc; + cycles = now_tsc - s->start_tsc - sleep_cycles; s->current.timestamp = s->iteration_cnt; s->current.cycles = cycles; s->current.pkts = rx_packets; @@ -539,6 +553,11 @@ pmd_perf_end_iteration(struct pmd_perf_stats *s, int rx_packets, histogram_add_sample(&s->cycles, cycles); histogram_add_sample(&s->pkts, rx_packets); + if (sleep_cycles) { + pmd_perf_update_counter(s, PMD_SLEEP_ITER, 1); + pmd_perf_update_counter(s, PMD_CYCLES_SLEEP, sleep_cycles); + } + if (!full_metrics) { return; } diff --git a/lib/dpif-netdev-perf.h b/lib/dpif-netdev-perf.h 
index 9673dddd835..84beced1519 100644 --- a/lib/dpif-netdev-perf.h +++ b/lib/dpif-netdev-perf.h @@ -80,6 +80,8 @@ enum pmd_stat_type { PMD_CYCLES_ITER_IDLE, /* Cycles spent in idle iterations. */ PMD_CYCLES_ITER_BUSY, /* Cycles spent in busy iterations. */ PMD_CYCLES_UPCALL, /* Cycles spent processing upcalls. */ + PMD_SLEEP_ITER, /* Iterations where a sleep has taken place. */ + PMD_CYCLES_SLEEP, /* Total cycles slept to save power. */ PMD_N_STATS }; @@ -408,7 +410,8 @@ void pmd_perf_start_iteration(struct pmd_perf_stats *s); void pmd_perf_end_iteration(struct pmd_perf_stats *s, int rx_packets, - int tx_packets, bool full_metrics); + int tx_packets, uint64_t sleep_cycles, + bool full_metrics); /* Formatting the output of commands. */ diff --git a/lib/dpif-netdev-private-extract.c b/lib/dpif-netdev-private-extract.c index 1a9b354201a..ded08fd3ef2 100644 --- a/lib/dpif-netdev-private-extract.c +++ b/lib/dpif-netdev-private-extract.c @@ -184,6 +184,16 @@ static struct dpif_miniflow_extract_impl mfex_impls[] = { .extract_func = mfex_avx512_dot1q_ipv6_udp, .name = "avx512_dot1q_ipv6_udp", }, +#if HAVE_AVX512VBMI + [MFEX_IMPL_VBMI_IPv4_NVGRE] = { + .probe = mfex_avx512_vbmi_probe, + .extract_func = mfex_avx512_vbmi_ip_nvgre, + .name = "avx512_vbmi_ipv4_nvgre", }, +#endif + [MFEX_IMPL_IPv4_NVGRE] = { + .probe = mfex_avx512_probe, + .extract_func = mfex_avx512_ip_nvgre, + .name = "avx512_ipv4_nvgre", }, #endif }; diff --git a/lib/dpif-netdev-private-extract.h b/lib/dpif-netdev-private-extract.h index 8a7f9b01aff..48549beaa0e 100644 --- a/lib/dpif-netdev-private-extract.h +++ b/lib/dpif-netdev-private-extract.h @@ -117,6 +117,10 @@ enum dpif_miniflow_extract_impl_idx { MFEX_IMPL_VBMI_DOT1Q_IPv6_UDP, #endif MFEX_IMPL_DOT1Q_IPv6_UDP, +#if HAVE_AVX512VBMI + MFEX_IMPL_VBMI_IPv4_NVGRE, +#endif + MFEX_IMPL_IPv4_NVGRE, #endif MFEX_IMPL_MAX }; @@ -230,6 +234,7 @@ DECLARE_AVX512_MFEX_PROTOTYPE(ipv6_udp); DECLARE_AVX512_MFEX_PROTOTYPE(ipv6_tcp); 
DECLARE_AVX512_MFEX_PROTOTYPE(dot1q_ipv6_tcp); DECLARE_AVX512_MFEX_PROTOTYPE(dot1q_ipv6_udp); +DECLARE_AVX512_MFEX_PROTOTYPE(ip_nvgre); #endif /* __x86_64__ */ diff --git a/lib/dpif-netdev-private-thread.h b/lib/dpif-netdev-private-thread.h index 4472b199d5c..8715b383796 100644 --- a/lib/dpif-netdev-private-thread.h +++ b/lib/dpif-netdev-private-thread.h @@ -114,7 +114,7 @@ struct dp_netdev_pmd_thread { atomic_ullong intrvl_cycles; /* Write index for 'busy_cycles_intrvl'. */ - unsigned int intrvl_idx; + atomic_count intrvl_idx; /* Busy cycles in last PMD_INTERVAL_MAX intervals. */ atomic_ullong *busy_cycles_intrvl; @@ -180,6 +180,9 @@ struct dp_netdev_pmd_thread { int numa_id; /* numa node id of this pmd thread. */ bool isolated; + /* Max sleep request in microseconds. */ + atomic_uint64_t max_sleep; + /* Queue id used by this pmd thread to send packets on all netdevs if * XPS disabled for this netdev. All static_tx_qid's are unique and less * than 'cmap_count(dp->poll_threads)'. */ diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index a45b460145c..f0594e5f5ce 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -99,7 +99,7 @@ VLOG_DEFINE_THIS_MODULE(dpif_netdev); #define FLOW_DUMP_MAX_BATCH 50 /* Use per thread recirc_depth to prevent recirculation loop. */ -#define MAX_RECIRC_DEPTH 6 +#define MAX_RECIRC_DEPTH 8 DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0) /* Use instant packet send by default. 
*/ @@ -115,6 +115,7 @@ COVERAGE_DEFINE(datapath_drop_lock_error); COVERAGE_DEFINE(datapath_drop_userspace_action_error); COVERAGE_DEFINE(datapath_drop_tunnel_push_error); COVERAGE_DEFINE(datapath_drop_tunnel_pop_error); +COVERAGE_DEFINE(datapath_drop_tunnel_tso_recirc); COVERAGE_DEFINE(datapath_drop_recirc_error); COVERAGE_DEFINE(datapath_drop_invalid_port); COVERAGE_DEFINE(datapath_drop_invalid_bond); @@ -160,15 +161,30 @@ static struct odp_support dp_netdev_support = { /* Time in microseconds of the interval in which rxq processing cycles used * in rxq to pmd assignments is measured and stored. */ -#define PMD_INTERVAL_LEN 10000000LL +#define PMD_INTERVAL_LEN 5000000LL +/* For converting PMD_INTERVAL_LEN to secs. */ +#define INTERVAL_USEC_TO_SEC 1000000LL /* Number of intervals for which cycles are stored * and used during rxq to pmd assignment. */ -#define PMD_INTERVAL_MAX 6 +#define PMD_INTERVAL_MAX 12 /* Time in microseconds to try RCU quiescing. */ #define PMD_RCU_QUIESCE_INTERVAL 10000LL +/* Timer resolution for PMD threads in nanoseconds. */ +#define PMD_TIMER_RES_NS 1000 + +/* Number of pkts Rx on an interface that will stop pmd thread sleeping. */ +#define PMD_SLEEP_THRESH (NETDEV_MAX_BURST / 2) +/* Time in uS to increment a pmd thread sleep time. */ +#define PMD_SLEEP_INC_US 1 + +struct pmd_sleep { + unsigned core_id; + uint64_t max_sleep; +}; + struct dpcls { struct cmap_node node; /* Within dp_netdev_pmd_thread.classifiers */ odp_port_t in_port; @@ -202,21 +218,21 @@ static void dpcls_remove(struct dpcls *, struct dpcls_rule *); struct dp_meter_band { uint32_t rate; uint32_t burst_size; - uint64_t bucket; /* In 1/1000 packets (for PKTPS), or in bits (for KBPS) */ - uint64_t packet_count; - uint64_t byte_count; + atomic_uint64_t bucket; /* In 1/1000 packets for PKTPS, + * or in bits for KBPS. 
*/ + atomic_uint64_t packet_count; + atomic_uint64_t byte_count; }; struct dp_meter { struct cmap_node node; - struct ovs_mutex lock; uint32_t id; uint16_t flags; uint16_t n_bands; uint32_t max_delta_t; - uint64_t used; - uint64_t packet_count; - uint64_t byte_count; + atomic_uint64_t used; /* Time of a last use in milliseconds. */ + atomic_uint64_t packet_count; + atomic_uint64_t byte_count; struct dp_meter_band bands[]; }; @@ -277,6 +293,8 @@ struct dp_netdev { atomic_uint32_t emc_insert_min; /* Enable collection of PMD performance metrics. */ atomic_bool pmd_perf_metrics; + /* Default max load based sleep request. */ + uint64_t pmd_max_sleep_default; /* Enable the SMC cache from ovsdb config */ atomic_bool smc_enable_db; @@ -314,6 +332,9 @@ struct dp_netdev { /* Cpu mask for pin of pmd threads. */ char *pmd_cmask; + /* PMD max load based sleep request user string. */ + char *max_sleep_list; + uint64_t last_tnl_conf_seq; struct conntrack *conntrack; @@ -428,10 +449,9 @@ struct dp_netdev_rxq { pinned. OVS_CORE_UNSPEC if the queue doesn't need to be pinned to a particular core. */ - unsigned intrvl_idx; /* Write index for 'cycles_intrvl'. */ + atomic_count intrvl_idx; /* Write index for 'cycles_intrvl'. */ struct dp_netdev_pmd_thread *pmd; /* pmd thread that polls this queue. */ bool is_vhost; /* Is rxq of a vhost port. */ - bool hw_miss_api_supported; /* hw_miss_packet_recover() supported.*/ /* Counters of cycles spent successfully polling and processing pkts. 
*/ atomic_ullong cycles[RXQ_N_CYCLES]; @@ -616,6 +636,9 @@ dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned long long cycles); static uint64_t dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx); +static uint64_t +get_interval_values(atomic_ullong *source, atomic_count *cur_idx, + int num_to_read); static void dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd, bool purge); @@ -687,6 +710,7 @@ enum pmd_info_type { PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */ PMD_INFO_SHOW_RXQ, /* Show poll lists of pmd threads. */ PMD_INFO_PERF_SHOW, /* Show pmd performance details. */ + PMD_INFO_SLEEP_SHOW, /* Show max sleep configuration details. */ }; static void @@ -870,14 +894,16 @@ sorted_poll_list(struct dp_netdev_pmd_thread *pmd, struct rxq_poll **list, } static void -pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd) +pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd, + int secs) { if (pmd->core_id != NON_PMD_CORE_ID) { struct rxq_poll *list; size_t n_rxq; - uint64_t total_cycles = 0; - uint64_t busy_cycles = 0; + uint64_t total_pmd_cycles = 0; + uint64_t busy_pmd_cycles = 0; uint64_t total_rxq_proc_cycles = 0; + unsigned int intervals; ds_put_format(reply, "pmd thread numa_id %d core_id %u:\n isolated : %s\n", @@ -888,18 +914,17 @@ pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd) sorted_poll_list(pmd, &list, &n_rxq); /* Get the total pmd cycles for an interval. */ - atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles); + atomic_read_relaxed(&pmd->intrvl_cycles, &total_pmd_cycles); + /* Calculate how many intervals are to be used. */ + intervals = DIV_ROUND_UP(secs, + PMD_INTERVAL_LEN / INTERVAL_USEC_TO_SEC); /* Estimate the cycles to cover all intervals. 
*/ - total_cycles *= PMD_INTERVAL_MAX; - - for (int j = 0; j < PMD_INTERVAL_MAX; j++) { - uint64_t cycles; - - atomic_read_relaxed(&pmd->busy_cycles_intrvl[j], &cycles); - busy_cycles += cycles; - } - if (busy_cycles > total_cycles) { - busy_cycles = total_cycles; + total_pmd_cycles *= intervals; + busy_pmd_cycles = get_interval_values(pmd->busy_cycles_intrvl, + &pmd->intrvl_idx, + intervals); + if (busy_pmd_cycles > total_pmd_cycles) { + busy_pmd_cycles = total_pmd_cycles; } for (int i = 0; i < n_rxq; i++) { @@ -907,18 +932,18 @@ pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd) const char *name = netdev_rxq_get_name(rxq->rx); uint64_t rxq_proc_cycles = 0; - for (int j = 0; j < PMD_INTERVAL_MAX; j++) { - rxq_proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxq, j); - } + rxq_proc_cycles = get_interval_values(rxq->cycles_intrvl, + &rxq->intrvl_idx, + intervals); total_rxq_proc_cycles += rxq_proc_cycles; ds_put_format(reply, " port: %-16s queue-id: %2d", name, netdev_rxq_get_queue_id(list[i].rxq->rx)); ds_put_format(reply, " %s", netdev_rxq_enabled(list[i].rxq->rx) ? 
"(enabled) " : "(disabled)"); ds_put_format(reply, " pmd usage: "); - if (total_cycles) { + if (total_pmd_cycles) { ds_put_format(reply, "%2"PRIu64"", - rxq_proc_cycles * 100 / total_cycles); + rxq_proc_cycles * 100 / total_pmd_cycles); ds_put_cstr(reply, " %"); } else { ds_put_format(reply, "%s", "NOT AVAIL"); @@ -928,14 +953,14 @@ pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd) if (n_rxq > 0) { ds_put_cstr(reply, " overhead: "); - if (total_cycles) { + if (total_pmd_cycles) { uint64_t overhead_cycles = 0; - if (total_rxq_proc_cycles < busy_cycles) { - overhead_cycles = busy_cycles - total_rxq_proc_cycles; + if (total_rxq_proc_cycles < busy_pmd_cycles) { + overhead_cycles = busy_pmd_cycles - total_rxq_proc_cycles; } ds_put_format(reply, "%2"PRIu64" %%", - overhead_cycles * 100 / total_cycles); + overhead_cycles * 100 / total_pmd_cycles); } else { ds_put_cstr(reply, "NOT AVAIL"); } @@ -1412,6 +1437,19 @@ dpif_netdev_pmd_rebalance(struct unixctl_conn *conn, int argc, ds_destroy(&reply); } +static void +pmd_info_show_sleep(struct ds *reply, unsigned core_id, int numa_id, + uint64_t pmd_max_sleep) +{ + if (core_id == NON_PMD_CORE_ID) { + return; + } + ds_put_format(reply, + "pmd thread numa_id %d core_id %d:\n" + " max sleep: %4"PRIu64" us\n", + numa_id, core_id, pmd_max_sleep); +} + static void dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[], void *aux) @@ -1423,6 +1461,11 @@ dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[], unsigned int core_id; bool filter_on_pmd = false; size_t n; + unsigned int secs = 0; + unsigned long long max_secs = (PMD_INTERVAL_LEN * PMD_INTERVAL_MAX) + / INTERVAL_USEC_TO_SEC; + bool show_header = true; + uint64_t max_sleep; ovs_mutex_lock(&dp_netdev_mutex); @@ -1433,6 +1476,14 @@ dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[], } argc -= 2; argv += 2; + } else if (type == PMD_INFO_SHOW_RXQ && + !strcmp(argv[1], "-secs") && + argc > 2) 
{ + if (!str_to_uint(argv[2], 10, &secs)) { + secs = max_secs; + } + argc -= 2; + argv += 2; } else { dp = shash_find_data(&dp_netdevs, argv[1]); argc -= 1; @@ -1462,13 +1513,33 @@ dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[], continue; } if (type == PMD_INFO_SHOW_RXQ) { - pmd_info_show_rxq(&reply, pmd); + if (show_header) { + if (!secs || secs > max_secs) { + secs = max_secs; + } else { + secs = ROUND_UP(secs, + PMD_INTERVAL_LEN / INTERVAL_USEC_TO_SEC); + } + ds_put_format(&reply, "Displaying last %u seconds " + "pmd usage %%\n", secs); + show_header = false; + } + pmd_info_show_rxq(&reply, pmd, secs); } else if (type == PMD_INFO_CLEAR_STATS) { pmd_perf_stats_clear(&pmd->perf_stats); } else if (type == PMD_INFO_SHOW_STATS) { pmd_info_show_stats(&reply, pmd); } else if (type == PMD_INFO_PERF_SHOW) { pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux); + } else if (type == PMD_INFO_SLEEP_SHOW) { + if (show_header) { + ds_put_format(&reply, "Default max sleep: %4"PRIu64" us\n", + dp->pmd_max_sleep_default); + show_header = false; + } + atomic_read_relaxed(&pmd->max_sleep, &max_sleep); + pmd_info_show_sleep(&reply, pmd->core_id, pmd->numa_id, + max_sleep); } } free(pmd_list); @@ -1569,7 +1640,8 @@ dpif_netdev_init(void) { static enum pmd_info_type show_aux = PMD_INFO_SHOW_STATS, clear_aux = PMD_INFO_CLEAR_STATS, - poll_aux = PMD_INFO_SHOW_RXQ; + poll_aux = PMD_INFO_SHOW_RXQ, + sleep_aux = PMD_INFO_SLEEP_SHOW; unixctl_command_register("dpif-netdev/pmd-stats-show", "[-pmd core] [dp]", 0, 3, dpif_netdev_pmd_info, @@ -1577,9 +1649,13 @@ dpif_netdev_init(void) unixctl_command_register("dpif-netdev/pmd-stats-clear", "[-pmd core] [dp]", 0, 3, dpif_netdev_pmd_info, (void *)&clear_aux); - unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] [dp]", - 0, 3, dpif_netdev_pmd_info, + unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] " + "[-secs secs] [dp]", + 0, 5, dpif_netdev_pmd_info, (void *)&poll_aux); + 
unixctl_command_register("dpif-netdev/pmd-sleep-show", "[dp]", + 0, 1, dpif_netdev_pmd_info, + (void *)&sleep_aux); unixctl_command_register("dpif-netdev/pmd-perf-show", "[-nh] [-it iter-history-len]" " [-ms ms-history-len]" @@ -1852,6 +1928,8 @@ create_dp_netdev(const char *name, const struct dpif_class *class, return error; } + dp->max_sleep_list = NULL; + dp->last_tnl_conf_seq = seq_read(tnl_conf_seq); *dpp = dp; return 0; @@ -1961,6 +2039,7 @@ dp_netdev_free(struct dp_netdev *dp) dp_netdev_meter_destroy(dp); + free(dp->max_sleep_list); free(dp->pmd_cmask); free(CONST_CAST(char *, dp->name)); free(dp); @@ -3321,6 +3400,27 @@ netdev_flow_key_init_masked(struct netdev_flow_key *dst, (dst_u64 - miniflow_get_values(&dst->mf)) * 8); } +/* Initializes 'key' as a copy of 'flow'. */ +static inline void +netdev_flow_key_init(struct netdev_flow_key *key, + const struct flow *flow) +{ + uint32_t hash = 0; + uint64_t value; + + miniflow_map_init(&key->mf, flow); + miniflow_init(&key->mf, flow); + + size_t n = miniflow_n_values(&key->mf); + + FLOW_FOR_EACH_IN_MAPS (value, flow, key->mf.map) { + hash = hash_add64(hash, value); + } + + key->hash = hash_finish(hash, n * 8); + key->len = netdev_flow_key_size(n); +} + static inline void emc_change_entry(struct emc_entry *ce, struct dp_netdev_flow *flow, const struct netdev_flow_key *key) @@ -4130,7 +4230,7 @@ flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd, const struct dpif_flow_put *put, struct dpif_flow_stats *stats) { - struct dp_netdev_flow *netdev_flow; + struct dp_netdev_flow *netdev_flow = NULL; int error = 0; if (stats) { @@ -4138,16 +4238,35 @@ flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd, } ovs_mutex_lock(&pmd->flow_mutex); - netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL); - if (!netdev_flow) { - if (put->flags & DPIF_FP_CREATE) { - dp_netdev_flow_add(pmd, match, ufid, put->actions, - put->actions_len, ODPP_NONE); + if (put->ufid) { + netdev_flow = dp_netdev_pmd_find_flow(pmd, put->ufid, + put->key, 
put->key_len); + } else { + /* Use key instead of the locally generated ufid + * to search netdev_flow. */ + netdev_flow = dp_netdev_pmd_lookup_flow(pmd, key, NULL); + } + + if (put->flags & DPIF_FP_CREATE) { + if (!netdev_flow) { + dp_netdev_flow_add(pmd, match, ufid, + put->actions, put->actions_len, ODPP_NONE); } else { - error = ENOENT; + error = EEXIST; } - } else { - if (put->flags & DPIF_FP_MODIFY) { + goto exit; + } + + if (put->flags & DPIF_FP_MODIFY) { + if (!netdev_flow) { + error = ENOENT; + } else { + if (!put->ufid && !flow_equal(&match->flow, &netdev_flow->flow)) { + /* Overlapping flow. */ + error = EINVAL; + goto exit; + } + struct dp_netdev_actions *new_actions; struct dp_netdev_actions *old_actions; @@ -4178,15 +4297,11 @@ flow_put_on_pmd(struct dp_netdev_pmd_thread *pmd, * counter, and subtracting it before outputting the stats */ error = EOPNOTSUPP; } - ovsrcu_postpone(dp_netdev_actions_free, old_actions); - } else if (put->flags & DPIF_FP_CREATE) { - error = EEXIST; - } else { - /* Overlapping flow. */ - error = EINVAL; } } + +exit: ovs_mutex_unlock(&pmd->flow_mutex); return error; } @@ -4195,7 +4310,7 @@ static int dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put) { struct dp_netdev *dp = get_dp_netdev(dpif); - struct netdev_flow_key key, mask; + struct netdev_flow_key key; struct dp_netdev_pmd_thread *pmd; struct match match; ovs_u128 ufid; @@ -4244,9 +4359,12 @@ dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put) /* Must produce a netdev_flow_key for lookup. * Use the same method as employed to create the key when adding - * the flow to the dplcs to make sure they match. */ - netdev_flow_mask_init(&mask, &match); - netdev_flow_key_init_masked(&key, &match.flow, &mask); + * the flow to the dplcs to make sure they match. + * We need to put in the unmasked key as flow_put_on_pmd() will first try + * to see if an entry exists doing a packet type lookup. 
As masked-out + * fields are interpreted as zeros, they could falsely match a wider IP + * address mask. Installation of the flow will use the match variable. */ + netdev_flow_key_init(&key, &match.flow); if (put->pmd_id == PMD_ID_NULL) { if (cmap_count(&dp->poll_threads) == 0) { @@ -4655,6 +4773,10 @@ dpif_netdev_offload_stats_get(struct dpif *dpif, } nb_thread = netdev_offload_thread_nb(); + if (!nb_thread) { + return EINVAL; + } + /* nb_thread counters for the overall total as well. */ stats->size = ARRAY_SIZE(hwol_stats) * (nb_thread + 1); stats->counters = xcalloc(stats->size, sizeof *stats->counters); @@ -4750,6 +4872,209 @@ set_pmd_auto_lb(struct dp_netdev *dp, bool state, bool always_log) } } +static int +parse_pmd_sleep_list(const char *max_sleep_list, + struct pmd_sleep **pmd_sleeps) +{ + char *list, *copy, *key, *value; + int num_vals = 0; + + if (!max_sleep_list) { + return num_vals; + } + + list = copy = xstrdup(max_sleep_list); + + while (ofputil_parse_key_value(&list, &key, &value)) { + uint64_t temp, pmd_max_sleep; + char *error = NULL; + unsigned core; + int i; + + error = str_to_u64(key, &temp); + if (error) { + free(error); + continue; + } + + if (value[0] == '\0') { + /* No value specified. key is dp default. */ + core = UINT_MAX; + pmd_max_sleep = temp; + } else { + error = str_to_u64(value, &pmd_max_sleep); + if (!error && temp < UINT_MAX) { + /* Key is pmd core id. */ + core = (unsigned) temp; + } else { + free(error); + continue; + } + } + + /* Detect duplicate max sleep values. */ + for (i = 0; i < num_vals; i++) { + if ((*pmd_sleeps)[i].core_id == core) { + break; + } + } + if (i == num_vals) { + /* Not duplicate, add a new entry. 
*/ + *pmd_sleeps = xrealloc(*pmd_sleeps, + (num_vals + 1) * sizeof **pmd_sleeps); + num_vals++; + } + + pmd_max_sleep = MIN(PMD_RCU_QUIESCE_INTERVAL, pmd_max_sleep); + + (*pmd_sleeps)[i].core_id = core; + (*pmd_sleeps)[i].max_sleep = pmd_max_sleep; + } + + free(copy); + return num_vals; +} + +static void +log_pmd_sleep(unsigned core_id, int numa_id, uint64_t pmd_max_sleep) +{ + if (core_id == NON_PMD_CORE_ID) { + return; + } + VLOG_INFO("PMD thread on numa_id: %d, core id: %2d, " + "max sleep: %4"PRIu64" us.", numa_id, core_id, pmd_max_sleep); +} + +static void +pmd_init_max_sleep(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd) +{ + uint64_t max_sleep = dp->pmd_max_sleep_default; + struct pmd_sleep *pmd_sleeps = NULL; + int num_vals; + + num_vals = parse_pmd_sleep_list(dp->max_sleep_list, &pmd_sleeps); + + /* Check if the user has set a specific value for this pmd. */ + for (int i = 0; i < num_vals; i++) { + if (pmd_sleeps[i].core_id == pmd->core_id) { + max_sleep = pmd_sleeps[i].max_sleep; + break; + } + } + atomic_init(&pmd->max_sleep, max_sleep); + log_pmd_sleep(pmd->core_id, pmd->numa_id, max_sleep); + free(pmd_sleeps); +} + +static bool +assign_sleep_values_to_pmds(struct dp_netdev *dp, int num_vals, + struct pmd_sleep *pmd_sleeps) +{ + struct dp_netdev_pmd_thread *pmd; + bool value_changed = false; + + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { + uint64_t new_max_sleep, cur_pmd_max_sleep; + + if (pmd->core_id == NON_PMD_CORE_ID) { + continue; + } + + /* Default to global value. */ + new_max_sleep = dp->pmd_max_sleep_default; + + /* Check for pmd specific value. 
*/ + for (int i = 0; i < num_vals; i++) { + if (pmd->core_id == pmd_sleeps[i].core_id) { + new_max_sleep = pmd_sleeps[i].max_sleep; + break; + } + } + atomic_read_relaxed(&pmd->max_sleep, &cur_pmd_max_sleep); + if (new_max_sleep != cur_pmd_max_sleep) { + atomic_store_relaxed(&pmd->max_sleep, new_max_sleep); + value_changed = true; + } + } + return value_changed; +} + +static void +log_all_pmd_sleeps(struct dp_netdev *dp) +{ + struct dp_netdev_pmd_thread **pmd_list = NULL; + struct dp_netdev_pmd_thread *pmd; + size_t n; + + VLOG_INFO("Default PMD thread max sleep: %4"PRIu64" us.", + dp->pmd_max_sleep_default); + + sorted_poll_thread_list(dp, &pmd_list, &n); + + for (size_t i = 0; i < n; i++) { + uint64_t cur_pmd_max_sleep; + + pmd = pmd_list[i]; + atomic_read_relaxed(&pmd->max_sleep, &cur_pmd_max_sleep); + log_pmd_sleep(pmd->core_id, pmd->numa_id, cur_pmd_max_sleep); + } + free(pmd_list); +} + +static bool +set_all_pmd_max_sleeps(struct dp_netdev *dp, const struct smap *config) +{ + const char *max_sleep_list = smap_get(config, "pmd-sleep-max"); + struct pmd_sleep *pmd_sleeps = NULL; + uint64_t default_max_sleep = 0; + bool default_changed = false; + bool pmd_changed = false; + uint64_t pmd_maxsleep; + int num_vals = 0; + + /* Check for deprecated 'pmd-maxsleep' value. */ + pmd_maxsleep = smap_get_ullong(config, "pmd-maxsleep", UINT64_MAX); + if (pmd_maxsleep != UINT64_MAX && !max_sleep_list) { + VLOG_WARN_ONCE("pmd-maxsleep is deprecated. " + "Please use pmd-sleep-max instead."); + default_max_sleep = pmd_maxsleep; + } + + /* Check if there is no change in string or value. */ + if (!!dp->max_sleep_list == !!max_sleep_list) { + if (max_sleep_list + ? nullable_string_is_equal(max_sleep_list, dp->max_sleep_list) + : default_max_sleep == dp->pmd_max_sleep_default) { + return false; + } + } + + /* Free existing string and copy new one (if any). 
*/ + free(dp->max_sleep_list); + dp->max_sleep_list = nullable_xstrdup(max_sleep_list); + + if (max_sleep_list) { + num_vals = parse_pmd_sleep_list(max_sleep_list, &pmd_sleeps); + + /* Check if the user has set a global value. */ + for (int i = 0; i < num_vals; i++) { + if (pmd_sleeps[i].core_id == UINT_MAX) { + default_max_sleep = pmd_sleeps[i].max_sleep; + break; + } + } + } + + if (dp->pmd_max_sleep_default != default_max_sleep) { + dp->pmd_max_sleep_default = default_max_sleep; + default_changed = true; + } + pmd_changed = assign_sleep_values_to_pmds(dp, num_vals, pmd_sleeps); + + free(pmd_sleeps); + return default_changed || pmd_changed; +} + /* Applies datapath configuration from the database. Some of the changes are * actually applied in dpif_netdev_run(). */ static int @@ -4769,6 +5094,7 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config) uint32_t rebalance_load, rebalance_improve; bool log_autolb = false; enum sched_assignment_type pmd_rxq_assign_type; + static bool first_set_config = true; tx_flush_interval = smap_get_int(other_config, "tx-flush-interval", DEFAULT_TX_FLUSH_INTERVAL); @@ -4915,9 +5241,24 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config) bool autolb_state = smap_get_bool(other_config, "pmd-auto-lb", false); set_pmd_auto_lb(dp, autolb_state, log_autolb); + + bool sleep_changed = set_all_pmd_max_sleeps(dp, other_config); + if (first_set_config || sleep_changed) { + log_all_pmd_sleeps(dp); + } + + first_set_config = false; return 0; } +static bool +dpif_netdev_number_handlers_required(struct dpif *dpif_ OVS_UNUSED, + uint32_t *n_handlers) +{ + *n_handlers = 0; + return true; +} + /* Parses affinity list and returns result in 'core_ids'. 
*/ static int parse_affinity_list(const char *affinity_list, unsigned *core_ids, int n_rxq) @@ -5150,7 +5491,7 @@ static void dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned long long cycles) { - unsigned int idx = rx->intrvl_idx++ % PMD_INTERVAL_MAX; + unsigned int idx = atomic_count_inc(&rx->intrvl_idx) % PMD_INTERVAL_MAX; atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles); } @@ -5416,7 +5757,6 @@ port_reconfigure(struct dp_netdev_port *port) port->rxqs[i].port = port; port->rxqs[i].is_vhost = !strncmp(port->type, "dpdkvhost", 9); - port->rxqs[i].hw_miss_api_supported = true; err = netdev_rxq_open(netdev, &port->rxqs[i].rx, i); if (err) { @@ -6078,39 +6418,33 @@ rxq_scheduling(struct dp_netdev *dp) static uint64_t variance(uint64_t a[], int n); static uint64_t -sched_numa_list_variance(struct sched_numa_list *numa_list) +sched_numa_variance(struct sched_numa *numa) { - struct sched_numa *numa; uint64_t *percent_busy = NULL; - unsigned total_pmds = 0; int n_proc = 0; uint64_t var; - HMAP_FOR_EACH (numa, node, &numa_list->numas) { - total_pmds += numa->n_pmds; - percent_busy = xrealloc(percent_busy, - total_pmds * sizeof *percent_busy); + percent_busy = xmalloc(numa->n_pmds * sizeof *percent_busy); - for (unsigned i = 0; i < numa->n_pmds; i++) { - struct sched_pmd *sched_pmd; - uint64_t total_cycles = 0; + for (unsigned i = 0; i < numa->n_pmds; i++) { + struct sched_pmd *sched_pmd; + uint64_t total_cycles = 0; - sched_pmd = &numa->pmds[i]; - /* Exclude isolated PMDs from variance calculations. */ - if (sched_pmd->isolated == true) { - continue; - } - /* Get the total pmd cycles for an interval. */ - atomic_read_relaxed(&sched_pmd->pmd->intrvl_cycles, &total_cycles); - - if (total_cycles) { - /* Estimate the cycles to cover all intervals. 
*/ - total_cycles *= PMD_INTERVAL_MAX; - percent_busy[n_proc++] = (sched_pmd->pmd_proc_cycles * 100) - / total_cycles; - } else { - percent_busy[n_proc++] = 0; - } + sched_pmd = &numa->pmds[i]; + /* Exclude isolated PMDs from variance calculations. */ + if (sched_pmd->isolated == true) { + continue; + } + /* Get the total pmd cycles for an interval. */ + atomic_read_relaxed(&sched_pmd->pmd->intrvl_cycles, &total_cycles); + + if (total_cycles) { + /* Estimate the cycles to cover all intervals. */ + total_cycles *= PMD_INTERVAL_MAX; + percent_busy[n_proc++] = (sched_pmd->pmd_proc_cycles * 100) + / total_cycles; + } else { + percent_busy[n_proc++] = 0; } } var = variance(percent_busy, n_proc); @@ -6184,6 +6518,7 @@ pmd_rebalance_dry_run(struct dp_netdev *dp) struct sched_numa_list numa_list_est; bool thresh_met = false; uint64_t current_var, estimate_var; + struct sched_numa *numa_cur, *numa_est; uint64_t improvement = 0; VLOG_DBG("PMD auto load balance performing dry run."); @@ -6202,25 +6537,29 @@ pmd_rebalance_dry_run(struct dp_netdev *dp) sched_numa_list_count(&numa_list_est) == 1) { /* Calculate variances. 
*/ - current_var = sched_numa_list_variance(&numa_list_cur); - estimate_var = sched_numa_list_variance(&numa_list_est); - - if (estimate_var < current_var) { - improvement = ((current_var - estimate_var) * 100) / current_var; - } - VLOG_DBG("Current variance %"PRIu64" Estimated variance %"PRIu64".", - current_var, estimate_var); - VLOG_DBG("Variance improvement %"PRIu64"%%.", improvement); - - if (improvement >= dp->pmd_alb.rebalance_improve_thresh) { - thresh_met = true; - VLOG_DBG("PMD load variance improvement threshold %u%% " - "is met.", dp->pmd_alb.rebalance_improve_thresh); - } else { - VLOG_DBG("PMD load variance improvement threshold " - "%u%% is not met.", - dp->pmd_alb.rebalance_improve_thresh); + HMAP_FOR_EACH (numa_cur, node, &numa_list_cur.numas) { + numa_est = sched_numa_list_lookup(&numa_list_est, + numa_cur->numa_id); + if (!numa_est) { + continue; + } + current_var = sched_numa_variance(numa_cur); + estimate_var = sched_numa_variance(numa_est); + if (estimate_var < current_var) { + improvement = ((current_var - estimate_var) * 100) + / current_var; + } + VLOG_DBG("Numa node %d. Current variance %"PRIu64" Estimated " + "variance %"PRIu64". Variance improvement %"PRIu64"%%.", + numa_cur->numa_id, current_var, + estimate_var, improvement); + if (improvement >= dp->pmd_alb.rebalance_improve_thresh) { + thresh_met = true; + } } + VLOG_DBG("PMD load variance improvement threshold %u%% is %s.", + dp->pmd_alb.rebalance_improve_thresh, + thresh_met ? "met" : "not met"); } else { VLOG_DBG("PMD auto load balance detected cross-numa polling with " "multiple numa nodes. 
Unable to accurately estimate."); @@ -6877,6 +7216,7 @@ pmd_thread_main(void *f_) int poll_cnt; int i; int process_packets = 0; + uint64_t sleep_time = 0; poll_list = NULL; @@ -6887,10 +7227,14 @@ pmd_thread_main(void *f_) poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list); dfc_cache_init(&pmd->flow_cache); pmd_alloc_static_tx_qid(pmd); + set_timer_resolution(PMD_TIMER_RES_NS); reload: atomic_count_init(&pmd->pmd_overloaded, 0); + pmd->intrvl_tsc_prev = 0; + atomic_store_relaxed(&pmd->intrvl_cycles, 0); + if (!dpdk_attached) { dpdk_attached = dpdk_attach_thread(pmd->core_id); } @@ -6922,12 +7266,10 @@ pmd_thread_main(void *f_) } } - pmd->intrvl_tsc_prev = 0; - atomic_store_relaxed(&pmd->intrvl_cycles, 0); for (i = 0; i < PMD_INTERVAL_MAX; i++) { atomic_store_relaxed(&pmd->busy_cycles_intrvl[i], 0); } - pmd->intrvl_idx = 0; + atomic_count_set(&pmd->intrvl_idx, 0); cycles_counter_update(s); pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL; @@ -6936,10 +7278,13 @@ pmd_thread_main(void *f_) ovs_mutex_lock(&pmd->perf_stats.stats_mutex); for (;;) { uint64_t rx_packets = 0, tx_packets = 0; + uint64_t time_slept = 0; + uint64_t max_sleep; pmd_perf_start_iteration(s); atomic_read_relaxed(&pmd->dp->smc_enable_db, &pmd->ctx.smc_enable_db); + atomic_read_relaxed(&pmd->max_sleep, &max_sleep); for (i = 0; i < poll_cnt; i++) { @@ -6958,6 +7303,9 @@ pmd_thread_main(void *f_) dp_netdev_process_rxq_port(pmd, poll_list[i].rxq, poll_list[i].port_no); rx_packets += process_packets; + if (process_packets >= PMD_SLEEP_THRESH) { + sleep_time = 0; + } } if (!rx_packets) { @@ -6965,7 +7313,30 @@ pmd_thread_main(void *f_) * Check if we need to send something. * There was no time updates on current iteration. */ pmd_thread_ctx_time_update(pmd); - tx_packets = dp_netdev_pmd_flush_output_packets(pmd, false); + tx_packets = dp_netdev_pmd_flush_output_packets(pmd, + max_sleep && sleep_time + ? 
true : false); + } + + if (max_sleep) { + /* Check if a sleep should happen on this iteration. */ + if (sleep_time) { + struct cycle_timer sleep_timer; + + cycle_timer_start(&pmd->perf_stats, &sleep_timer); + xnanosleep_no_quiesce(sleep_time * 1000); + time_slept = cycle_timer_stop(&pmd->perf_stats, &sleep_timer); + pmd_thread_ctx_time_update(pmd); + } + if (sleep_time < max_sleep) { + /* Increase sleep time for next iteration. */ + sleep_time += PMD_SLEEP_INC_US; + } else { + sleep_time = max_sleep; + } + } else { + /* Reset sleep time as max sleep policy may have been changed. */ + sleep_time = 0; } /* Do RCU synchronization at fixed interval. This ensures that @@ -7005,7 +7376,7 @@ pmd_thread_main(void *f_) break; } - pmd_perf_end_iteration(s, rx_packets, tx_packets, + pmd_perf_end_iteration(s, rx_packets, tx_packets, time_slept, pmd_perf_metrics_enabled(pmd)); } ovs_mutex_unlock(&pmd->perf_stats.stats_mutex); @@ -7057,22 +7428,56 @@ dpif_netdev_meter_get_features(const struct dpif * dpif OVS_UNUSED, features->max_color = 0; } +/* Tries to atomically add 'n' to 'value' in terms of saturation arithmetic, + * i.e., if the result will be larger than 'max_value', will store 'max_value' + * instead. */ +static void +atomic_sat_add(atomic_uint64_t *value, uint64_t n, uint64_t max_value) +{ + uint64_t current, new_value; + + atomic_read_relaxed(value, &current); + do { + new_value = current + n; + new_value = MIN(new_value, max_value); + } while (!atomic_compare_exchange_weak_relaxed(value, &current, + new_value)); +} + +/* Tries to atomically subtract 'n' from 'value'. Does not perform the + * operation and returns 'false' if the result will be less than 'min_value'. + * Otherwise, stores the result and returns 'true'.
*/ +static bool +atomic_bound_sub(atomic_uint64_t *value, uint64_t n, uint64_t min_value) +{ + uint64_t current; + + atomic_read_relaxed(value, &current); + do { + if (current < min_value + n) { + return false; + } + } while (!atomic_compare_exchange_weak_relaxed(value, &current, + current - n)); + return true; +} + /* Applies the meter identified by 'meter_id' to 'packets_'. Packets * that exceed a band are dropped in-place. */ static void dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_, - uint32_t meter_id, long long int now) + uint32_t meter_id, long long int now_ms) { - struct dp_meter *meter; - struct dp_meter_band *band; - struct dp_packet *packet; - long long int long_delta_t; /* msec */ - uint32_t delta_t; /* msec */ const size_t cnt = dp_packet_batch_size(packets_); - uint32_t bytes, volume; - int exceeded_band[NETDEV_MAX_BURST]; uint32_t exceeded_rate[NETDEV_MAX_BURST]; - int exceeded_pkt = cnt; /* First packet that exceeded a band rate. */ + uint32_t exceeded_band[NETDEV_MAX_BURST]; + uint64_t bytes, volume, meter_used, old; + uint64_t band_packets[MAX_BANDS]; + uint64_t band_bytes[MAX_BANDS]; + struct dp_meter_band *band; + struct dp_packet *packet; + struct dp_meter *meter; + bool exceeded = false; if (meter_id >= MAX_METERS) { return; @@ -7088,116 +7493,101 @@ dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_, /* Initialize as zeroes. */ memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate); - ovs_mutex_lock(&meter->lock); - /* All packets will hit the meter at the same time. */ - long_delta_t = now / 1000 - meter->used / 1000; /* msec */ + atomic_read_relaxed(&meter->used, &meter_used); + do { + if (meter_used >= now_ms) { + /* The '>' condition means that we have several threads hitting the + * same meter, and the other one already advanced the time.
*/ + meter_used = now_ms; + break; + } + } while (!atomic_compare_exchange_weak_relaxed(&meter->used, + &meter_used, now_ms)); - if (long_delta_t < 0) { - /* This condition means that we have several threads fighting for a - meter lock, and the one who received the packets a bit later wins. - Assuming that all racing threads received packets at the same time - to avoid overflow. */ - long_delta_t = 0; - } + /* Refill all buckets right away, since other threads may use them. */ + if (meter_used < now_ms) { + /* All packets will hit the meter at the same time. */ + uint64_t delta_t = now_ms - meter_used; - /* Make sure delta_t will not be too large, so that bucket will not - * wrap around below. */ - delta_t = (long_delta_t > (long long int)meter->max_delta_t) - ? meter->max_delta_t : (uint32_t)long_delta_t; + /* Make sure delta_t will not be too large, so that bucket will not + * wrap around below. */ + delta_t = MIN(delta_t, meter->max_delta_t); + + for (int m = 0; m < meter->n_bands; m++) { + band = &meter->bands[m]; + /* Update band's bucket. We can't just use atomic add here, + * because we should never add above the max capacity. */ + atomic_sat_add(&band->bucket, delta_t * band->rate, + band->burst_size * 1000ULL); + } + } /* Update meter stats. */ - meter->used = now; - meter->packet_count += cnt; + atomic_add_relaxed(&meter->packet_count, cnt, &old); bytes = 0; DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { bytes += dp_packet_size(packet); } - meter->byte_count += bytes; + atomic_add_relaxed(&meter->byte_count, bytes, &old); /* Meters can operate in terms of packets per second or kilobits per * second. */ if (meter->flags & OFPMF13_PKTPS) { - /* Rate in packets/second, bucket 1/1000 packets. */ - /* msec * packets/sec = 1/1000 packets. */ + /* Rate in packets/second, bucket 1/1000 packets. + * msec * packets/sec = 1/1000 packets. */ volume = cnt * 1000; /* Take 'cnt' packets from the bucket. */ } else { - /* Rate in kbps, bucket in bits. 
*/ - /* msec * kbps = bits */ + /* Rate in kbps, bucket in bits. + * msec * kbps = bits */ volume = bytes * 8; } - /* Update all bands and find the one hit with the highest rate for each - * packet (if any). */ - for (int m = 0; m < meter->n_bands; ++m) { - uint64_t max_bucket_size; - + /* Find the band hit with the highest rate for each packet (if any). */ + for (int m = 0; m < meter->n_bands; m++) { band = &meter->bands[m]; - max_bucket_size = band->burst_size * 1000ULL; - /* Update band's bucket. */ - band->bucket += (uint64_t) delta_t * band->rate; - if (band->bucket > max_bucket_size) { - band->bucket = max_bucket_size; - } /* Drain the bucket for all the packets, if possible. */ - if (band->bucket >= volume) { - band->bucket -= volume; - } else { - int band_exceeded_pkt; - - /* Band limit hit, must process packet-by-packet. */ - if (meter->flags & OFPMF13_PKTPS) { - band_exceeded_pkt = band->bucket / 1000; - band->bucket %= 1000; /* Remainder stays in bucket. */ - - /* Update the exceeding band for each exceeding packet. - * (Only one band will be fired by a packet, and that - * can be different for each packet.) */ - for (int i = band_exceeded_pkt; i < cnt; i++) { - if (band->rate > exceeded_rate[i]) { - exceeded_rate[i] = band->rate; - exceeded_band[i] = m; - } - } - } else { - /* Packet sizes differ, must process one-by-one. */ - band_exceeded_pkt = cnt; - DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { - uint32_t bits = dp_packet_size(packet) * 8; - - if (band->bucket >= bits) { - band->bucket -= bits; - } else { - if (i < band_exceeded_pkt) { - band_exceeded_pkt = i; - } - /* Update the exceeding band for the exceeding packet. - * (Only one band will be fired by a packet, and that - * can be different for each packet.) */ - if (band->rate > exceeded_rate[i]) { - exceeded_rate[i] = band->rate; - exceeded_band[i] = m; - } - } + if (atomic_bound_sub(&band->bucket, volume, 0)) { + continue; + } + + /* Band limit hit, must process packet-by-packet. 
*/ + DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) { + uint64_t packet_volume = (meter->flags & OFPMF13_PKTPS) + ? 1000 : (dp_packet_size(packet) * 8); + + if (!atomic_bound_sub(&band->bucket, packet_volume, 0)) { + /* Update the exceeding band for the exceeding packet. + * Only one band will be fired by a packet, and that can + * be different for each packet. */ + if (band->rate > exceeded_rate[i]) { + exceeded_rate[i] = band->rate; + exceeded_band[i] = m; + exceeded = true; } } - /* Remember the first exceeding packet. */ - if (exceeded_pkt > band_exceeded_pkt) { - exceeded_pkt = band_exceeded_pkt; - } } } + /* No need to iterate over packets if there are no drops. */ + if (!exceeded) { + return; + } + /* Fire the highest rate band exceeded by each packet, and drop * packets if needed. */ + + memset(band_packets, 0, sizeof band_packets); + memset(band_bytes, 0, sizeof band_bytes); + size_t j; DP_PACKET_BATCH_REFILL_FOR_EACH (j, cnt, packet, packets_) { - if (exceeded_band[j] >= 0) { + uint32_t m = exceeded_band[j]; + + if (m != UINT32_MAX) { /* Meter drop packet. */ - band = &meter->bands[exceeded_band[j]]; - band->packet_count += 1; - band->byte_count += dp_packet_size(packet); - COVERAGE_INC(datapath_drop_meter); + band_packets[m]++; + band_bytes[m] += dp_packet_size(packet); dp_packet_delete(packet); } else { /* Meter accepts packet. */ @@ -7205,7 +7595,15 @@ dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_, } } - ovs_mutex_unlock(&meter->lock); + for (int m = 0; m < meter->n_bands; m++) { + if (!band_packets[m]) { + continue; + } + band = &meter->bands[m]; + atomic_add_relaxed(&band->packet_count, band_packets[m], &old); + atomic_add_relaxed(&band->byte_count, band_bytes[m], &old); + COVERAGE_ADD(datapath_drop_meter, band_packets[m]); + } } /* Meter set/get/del processing is still single-threaded. 
*/ @@ -7246,13 +7644,13 @@ dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id, meter->flags = config->flags; meter->n_bands = config->n_bands; meter->max_delta_t = 0; - meter->used = time_usec(); meter->id = mid; - ovs_mutex_init_adaptive(&meter->lock); + atomic_init(&meter->used, time_msec()); /* set up bands */ for (i = 0; i < config->n_bands; ++i) { uint32_t band_max_delta_t; + uint64_t bucket_size; /* Set burst size to a workable value if none specified. */ if (config->bands[i].burst_size == 0) { @@ -7262,11 +7660,11 @@ dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id meter_id, meter->bands[i].rate = config->bands[i].rate; meter->bands[i].burst_size = config->bands[i].burst_size; /* Start with a full bucket. */ - meter->bands[i].bucket = meter->bands[i].burst_size * 1000ULL; + bucket_size = meter->bands[i].burst_size * 1000ULL; + atomic_init(&meter->bands[i].bucket, bucket_size); /* Figure out max delta_t that is enough to fill any bucket. */ - band_max_delta_t - = meter->bands[i].bucket / meter->bands[i].rate; + band_max_delta_t = bucket_size / meter->bands[i].rate; if (band_max_delta_t > meter->max_delta_t) { meter->max_delta_t = band_max_delta_t; } @@ -7289,7 +7687,7 @@ dpif_netdev_meter_get(const struct dpif *dpif, { struct dp_netdev *dp = get_dp_netdev(dpif); uint32_t meter_id = meter_id_.uint32; - const struct dp_meter *meter; + struct dp_meter *meter; if (meter_id >= MAX_METERS) { return EFBIG; @@ -7303,17 +7701,15 @@ dpif_netdev_meter_get(const struct dpif *dpif, if (stats) { int i = 0; - ovs_mutex_lock(&meter->lock); - - stats->packet_in_count = meter->packet_count; - stats->byte_in_count = meter->byte_count; + atomic_read_relaxed(&meter->packet_count, &stats->packet_in_count); + atomic_read_relaxed(&meter->byte_count, &stats->byte_in_count); for (i = 0; i < n_bands && i < meter->n_bands; ++i) { - stats->bands[i].packet_count = meter->bands[i].packet_count; - stats->bands[i].byte_count = meter->bands[i].byte_count; + 
atomic_read_relaxed(&meter->bands[i].packet_count, + &stats->bands[i].packet_count); + atomic_read_relaxed(&meter->bands[i].byte_count, + &stats->bands[i].byte_count); } - - ovs_mutex_unlock(&meter->lock); stats->n_bands = i; } @@ -7475,6 +7871,8 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp, hmap_init(&pmd->send_port_cache); cmap_init(&pmd->tx_bonds); + pmd_init_max_sleep(dp, pmd); + /* Initialize DPIF function pointer to the default configured version. */ atomic_init(&pmd->netdev_input_func, dp_netdev_impl_get_default()); @@ -7805,6 +8203,10 @@ dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_, ds_destroy(&ds); } + if (type != DPIF_UC_MISS) { + dp_packet_ol_send_prepare(packet_, 0); + } + return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata, actions, wc, put_actions, dp->upcall_aux); } @@ -8034,17 +8436,15 @@ dp_netdev_hw_flow(const struct dp_netdev_pmd_thread *pmd, #ifdef ALLOW_EXPERIMENTAL_API /* Packet restoration API required. */ /* Restore the packet if HW processing was terminated before completion. */ struct dp_netdev_rxq *rxq = pmd->ctx.last_rxq; + bool miss_api_supported; - if (rxq->hw_miss_api_supported) { + atomic_read_relaxed(&rxq->port->netdev->hw_info.miss_api_supported, + &miss_api_supported); + if (miss_api_supported) { int err = netdev_hw_miss_packet_recover(rxq->port->netdev, packet); - if (err) { - if (err != EOPNOTSUPP) { - COVERAGE_INC(datapath_drop_hw_miss_recover); - return -1; - } else { - /* API unsupported by the port; avoid subsequent calls. 
*/ - rxq->hw_miss_api_supported = false; - } + if (err && err != EOPNOTSUPP) { + COVERAGE_INC(datapath_drop_hw_miss_recover); + return -1; } } #endif @@ -8521,6 +8921,34 @@ static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *pmd, struct dp_packet_batch *packets) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + size_t i, size = dp_packet_batch_size(packets); + struct dp_packet *packet; + + DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, packets) { + if (dp_packet_hwol_is_tunnel_geneve(packet) || + dp_packet_hwol_is_tunnel_vxlan(packet)) { + + if (dp_packet_hwol_is_tso(packet)) { + /* Can't perform GSO in the middle of a pipeline. */ + COVERAGE_INC(datapath_drop_tunnel_tso_recirc); + dp_packet_delete(packet); + VLOG_WARN_RL(&rl, "Recirculating tunnel packets with " + "TSO is not supported"); + continue; + } + /* Have to fix all the checksums before re-parsing, because the + * packet will be treated as having a single set of headers. */ + dp_packet_ol_send_prepare(packet, 0); + /* This packet must not be marked with anything tunnel-related. */ + dp_packet_hwol_reset_tunnel(packet); + /* Clear inner offsets. Other ones are collateral, but they will + * be re-initialized on re-parsing. 
*/ + dp_packet_reset_offsets(packet); + } + dp_packet_batch_refill(packets, packet, i); + } + dp_netdev_input__(pmd, packets, true, 0); } @@ -9018,9 +9446,13 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_, nl_attr_get_u16(b_nest); proto_num_max_specified = true; break; + case OVS_NAT_ATTR_PROTO_RANDOM: + nat_action_info.nat_flags |= NAT_RANGE_RANDOM; + break; case OVS_NAT_ATTR_PERSISTENT: + nat_action_info.nat_flags |= NAT_PERSISTENT; + break; case OVS_NAT_ATTR_PROTO_HASH: - case OVS_NAT_ATTR_PROTO_RANDOM: break; case OVS_NAT_ATTR_UNSPEC: case __OVS_NAT_ATTR_MAX: @@ -9057,15 +9489,14 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_, } conntrack_execute(dp->conntrack, packets_, aux->flow->dl_type, force, - commit, zone, setmark, setlabel, aux->flow->tp_src, - aux->flow->tp_dst, helper, nat_action_info_ref, - pmd->ctx.now / 1000, tp_id); + commit, zone, setmark, setlabel, helper, + nat_action_info_ref, pmd->ctx.now / 1000, tp_id); break; } case OVS_ACTION_ATTR_METER: dp_netdev_run_meter(pmd->dp, packets_, nl_attr_get_u32(a), - pmd->ctx.now); + pmd->ctx.now / 1000); break; case OVS_ACTION_ATTR_PUSH_VLAN: @@ -9087,6 +9518,8 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_, case OVS_ACTION_ATTR_CHECK_PKT_LEN: case OVS_ACTION_ATTR_DROP: case OVS_ACTION_ATTR_ADD_MPLS: + case OVS_ACTION_ATTR_DEC_TTL: + case OVS_ACTION_ATTR_PSAMPLE: case __OVS_ACTION_ATTR_MAX: OVS_NOT_REACHED(); } @@ -9159,6 +9592,53 @@ dpif_netdev_ct_dump_done(struct dpif *dpif OVS_UNUSED, return err; } +static int +dpif_netdev_ct_exp_dump_start(struct dpif *dpif, + struct ct_dpif_dump_state **dump_, + const uint16_t *pzone) +{ + struct dp_netdev *dp = get_dp_netdev(dpif); + struct dp_netdev_ct_dump *dump; + + dump = xzalloc(sizeof *dump); + dump->dp = dp; + dump->ct = dp->conntrack; + + conntrack_exp_dump_start(dp->conntrack, &dump->dump, pzone); + + *dump_ = &dump->up; + + return 0; +} + +static int +dpif_netdev_ct_exp_dump_next(struct dpif *dpif OVS_UNUSED, + 
struct ct_dpif_dump_state *dump_, + struct ct_dpif_exp *entry) +{ + struct dp_netdev_ct_dump *dump; + + INIT_CONTAINER(dump, dump_, up); + + return conntrack_exp_dump_next(&dump->dump, entry); +} + +static int +dpif_netdev_ct_exp_dump_done(struct dpif *dpif OVS_UNUSED, + struct ct_dpif_dump_state *dump_) +{ + struct dp_netdev_ct_dump *dump; + int err; + + INIT_CONTAINER(dump, dump_, up); + + err = conntrack_exp_dump_done(&dump->dump); + + free(dump); + + return err; +} + static int dpif_netdev_ct_flush(struct dpif *dpif, const uint16_t *zone, const struct ct_dpif_tuple *tuple) @@ -9211,19 +9691,27 @@ dpif_netdev_ct_get_tcp_seq_chk(struct dpif *dpif, bool *enabled) return 0; } +static int +dpif_netdev_ct_set_sweep_interval(struct dpif *dpif, uint32_t ms) +{ + struct dp_netdev *dp = get_dp_netdev(dpif); + return conntrack_set_sweep_interval(dp->conntrack, ms); +} + +static int +dpif_netdev_ct_get_sweep_interval(struct dpif *dpif, uint32_t *ms) +{ + struct dp_netdev *dp = get_dp_netdev(dpif); + *ms = conntrack_get_sweep_interval(dp->conntrack); + return 0; +} + static int dpif_netdev_ct_set_limits(struct dpif *dpif, - const uint32_t *default_limits, const struct ovs_list *zone_limits) { int err = 0; struct dp_netdev *dp = get_dp_netdev(dpif); - if (default_limits) { - err = zone_limit_update(dp->conntrack, DEFAULT_ZONE, *default_limits); - if (err != 0) { - return err; - } - } struct ct_dpif_zone_limit *zone_limit; LIST_FOR_EACH (zone_limit, node, zone_limits) { @@ -9238,20 +9726,12 @@ dpif_netdev_ct_set_limits(struct dpif *dpif, static int dpif_netdev_ct_get_limits(struct dpif *dpif, - uint32_t *default_limit, const struct ovs_list *zone_limits_request, struct ovs_list *zone_limits_reply) { struct dp_netdev *dp = get_dp_netdev(dpif); struct conntrack_zone_limit czl; - czl = zone_limit_get(dp->conntrack, DEFAULT_ZONE); - if (czl.zone == DEFAULT_ZONE) { - *default_limit = czl.limit; - } else { - return EINVAL; - } - if (!ovs_list_is_empty(zone_limits_request)) { struct 
ct_dpif_zone_limit *zone_limit; LIST_FOR_EACH (zone_limit, node, zone_limits_request) { @@ -9265,6 +9745,12 @@ dpif_netdev_ct_get_limits(struct dpif *dpif, } } } else { + czl = zone_limit_get(dp->conntrack, DEFAULT_ZONE); + if (czl.zone == DEFAULT_ZONE) { + ct_dpif_push_zone_limit(zone_limits_reply, DEFAULT_ZONE, + czl.limit, 0); + } + for (int z = MIN_ZONE; z <= MAX_ZONE; z++) { czl = zone_limit_get(dp->conntrack, z); if (czl.zone == z) { @@ -9510,6 +9996,7 @@ dpif_netdev_bond_stats_get(struct dpif *dpif, uint32_t bond_id, const struct dpif_class dpif_netdev_class = { "netdev", true, /* cleanup_required */ + true, /* synced_dp_layers */ dpif_netdev_init, dpif_netdev_enumerate, dpif_netdev_port_open_type, @@ -9541,7 +10028,7 @@ const struct dpif_class dpif_netdev_class = { dpif_netdev_offload_stats_get, NULL, /* recv_set */ NULL, /* handlers_set */ - NULL, /* number_handlers_required */ + dpif_netdev_number_handlers_required, dpif_netdev_set_config, dpif_netdev_queue_to_priority, NULL, /* recv */ @@ -9555,12 +10042,17 @@ const struct dpif_class dpif_netdev_class = { dpif_netdev_ct_dump_start, dpif_netdev_ct_dump_next, dpif_netdev_ct_dump_done, + dpif_netdev_ct_exp_dump_start, + dpif_netdev_ct_exp_dump_next, + dpif_netdev_ct_exp_dump_done, dpif_netdev_ct_flush, dpif_netdev_ct_set_maxconns, dpif_netdev_ct_get_maxconns, dpif_netdev_ct_get_nconns, dpif_netdev_ct_set_tcp_seq_chk, dpif_netdev_ct_get_tcp_seq_chk, + dpif_netdev_ct_set_sweep_interval, + dpif_netdev_ct_get_sweep_interval, dpif_netdev_ct_set_limits, dpif_netdev_ct_get_limits, dpif_netdev_ct_del_limits, @@ -9858,7 +10350,7 @@ dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd, struct polled_queue *poll_list, int poll_cnt) { struct dpcls *cls; - uint64_t tot_idle = 0, tot_proc = 0; + uint64_t tot_idle = 0, tot_proc = 0, tot_sleep = 0; unsigned int pmd_load = 0; if (pmd->ctx.now > pmd->next_cycle_store) { @@ -9875,10 +10367,13 @@ dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd, 
pmd->prev_stats[PMD_CYCLES_ITER_IDLE]; tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] - pmd->prev_stats[PMD_CYCLES_ITER_BUSY]; + tot_sleep = pmd->perf_stats.counters.n[PMD_CYCLES_SLEEP] - + pmd->prev_stats[PMD_CYCLES_SLEEP]; if (pmd_alb->is_enabled && !pmd->isolated) { if (tot_proc) { - pmd_load = ((tot_proc * 100) / (tot_idle + tot_proc)); + pmd_load = ((tot_proc * 100) / + (tot_idle + tot_proc + tot_sleep)); } atomic_read_relaxed(&pmd_alb->rebalance_load_thresh, @@ -9895,6 +10390,8 @@ dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd, pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE]; pmd->prev_stats[PMD_CYCLES_ITER_BUSY] = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY]; + pmd->prev_stats[PMD_CYCLES_SLEEP] = + pmd->perf_stats.counters.n[PMD_CYCLES_SLEEP]; /* Get the cycles that were used to process each queue and store. */ for (unsigned i = 0; i < poll_cnt; i++) { @@ -9910,7 +10407,7 @@ dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd, atomic_store_relaxed(&pmd->intrvl_cycles, curr_tsc - pmd->intrvl_tsc_prev); } - idx = pmd->intrvl_idx++ % PMD_INTERVAL_MAX; + idx = atomic_count_inc(&pmd->intrvl_idx) % PMD_INTERVAL_MAX; atomic_store_relaxed(&pmd->busy_cycles_intrvl[idx], tot_proc); pmd->intrvl_tsc_prev = curr_tsc; /* Start new measuring interval */ @@ -9933,6 +10430,27 @@ dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd, } } +/* Returns the sum of a specified number of newest to + * oldest interval values. 'cur_idx' is where the next + * write will be and wrap around needs to be handled. + */ +static uint64_t +get_interval_values(atomic_ullong *source, atomic_count *cur_idx, + int num_to_read) { + unsigned int i; + uint64_t total = 0; + + i = atomic_count_get(cur_idx) % PMD_INTERVAL_MAX; + for (int read = 0; read < num_to_read; read++) { + uint64_t interval_value; + + i = i ? 
i - 1 : PMD_INTERVAL_MAX - 1; + atomic_read_relaxed(&source[i], &interval_value); + total += interval_value; + } + return total; +} + /* Insert 'rule' into 'cls'. */ static void dpcls_insert(struct dpcls *cls, struct dpcls_rule *rule, diff --git a/lib/dpif-netlink-rtnl.c b/lib/dpif-netlink-rtnl.c index f0b1fcc0cea..174f3b7c699 100644 --- a/lib/dpif-netlink-rtnl.c +++ b/lib/dpif-netlink-rtnl.c @@ -130,6 +130,8 @@ vport_type_to_kind(enum ovs_vport_type type, } case OVS_VPORT_TYPE_GTPU: return NULL; + case OVS_VPORT_TYPE_SRV6: + return "srv6"; case OVS_VPORT_TYPE_BAREUDP: return "bareudp"; case OVS_VPORT_TYPE_NETDEV: @@ -320,6 +322,7 @@ dpif_netlink_rtnl_verify(const struct netdev_tunnel_config *tnl_cfg, case OVS_VPORT_TYPE_LISP: case OVS_VPORT_TYPE_STT: case OVS_VPORT_TYPE_GTPU: + case OVS_VPORT_TYPE_SRV6: case OVS_VPORT_TYPE_UNSPEC: case __OVS_VPORT_TYPE_MAX: default: @@ -412,6 +415,7 @@ dpif_netlink_rtnl_create(const struct netdev_tunnel_config *tnl_cfg, case OVS_VPORT_TYPE_LISP: case OVS_VPORT_TYPE_STT: case OVS_VPORT_TYPE_GTPU: + case OVS_VPORT_TYPE_SRV6: case OVS_VPORT_TYPE_UNSPEC: case __OVS_VPORT_TYPE_MAX: default: @@ -520,6 +524,7 @@ dpif_netlink_rtnl_port_destroy(const char *name, const char *type) case OVS_VPORT_TYPE_ERSPAN: case OVS_VPORT_TYPE_IP6ERSPAN: case OVS_VPORT_TYPE_IP6GRE: + case OVS_VPORT_TYPE_SRV6: case OVS_VPORT_TYPE_BAREUDP: return dpif_netlink_rtnl_destroy(name); case OVS_VPORT_TYPE_NETDEV: @@ -562,6 +567,7 @@ dpif_netlink_rtnl_probe_oot_tunnels(void) tnl_cfg = netdev_get_tunnel_config(netdev); if (!tnl_cfg) { + netdev_close(netdev); return true; } diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c index a620a6ec52d..84e2bd8eaf5 100644 --- a/lib/dpif-netlink.c +++ b/lib/dpif-netlink.c @@ -395,7 +395,7 @@ dpif_netlink_open(const struct dpif_class *class OVS_UNUSED, const char *name, dp_request.user_features |= OVS_DP_F_UNALIGNED; dp_request.user_features |= OVS_DP_F_VPORT_PIDS; dp_request.user_features |= OVS_DP_F_UNSUPPORTED; - error = 
dpif_netlink_dp_transact(&dp_request, &dp, &buf); + error = dpif_netlink_dp_transact(&dp_request, NULL, NULL); if (error) { /* The Open vSwitch kernel module has two modes for dispatching * upcalls: per-vport and per-cpu. @@ -919,6 +919,9 @@ get_vport_type(const struct dpif_netlink_vport *vport) case OVS_VPORT_TYPE_GTPU: return "gtpu"; + case OVS_VPORT_TYPE_SRV6: + return "srv6"; + case OVS_VPORT_TYPE_BAREUDP: return "bareudp"; @@ -957,6 +960,8 @@ netdev_to_ovs_vport_type(const char *type) return OVS_VPORT_TYPE_GRE; } else if (!strcmp(type, "gtpu")) { return OVS_VPORT_TYPE_GTPU; + } else if (!strcmp(type, "srv6")) { + return OVS_VPORT_TYPE_SRV6; } else if (!strcmp(type, "bareudp")) { return OVS_VPORT_TYPE_BAREUDP; } else { @@ -2582,7 +2587,7 @@ dpif_netlink_calculate_n_handlers(void) n_handlers = MIN(next_prime_num, total_cores); } - return n_handlers; + return MAX(n_handlers, 1); } static int @@ -3355,7 +3360,6 @@ dpif_netlink_ct_flush(struct dpif *dpif OVS_UNUSED, const uint16_t *zone, static int dpif_netlink_ct_set_limits(struct dpif *dpif OVS_UNUSED, - const uint32_t *default_limits, const struct ovs_list *zone_limits) { if (ovs_ct_limit_family < 0) { @@ -3373,13 +3377,6 @@ dpif_netlink_ct_set_limits(struct dpif *dpif OVS_UNUSED, size_t opt_offset; opt_offset = nl_msg_start_nested(request, OVS_CT_LIMIT_ATTR_ZONE_LIMIT); - if (default_limits) { - struct ovs_zone_limit req_zone_limit = { - .zone_id = OVS_ZONE_LIMIT_DEFAULT_ZONE, - .limit = *default_limits, - }; - nl_msg_put(request, &req_zone_limit, sizeof req_zone_limit); - } if (!ovs_list_is_empty(zone_limits)) { struct ct_dpif_zone_limit *zone_limit; @@ -3401,7 +3398,6 @@ dpif_netlink_ct_set_limits(struct dpif *dpif OVS_UNUSED, static int dpif_netlink_zone_limits_from_ofpbuf(const struct ofpbuf *buf, - uint32_t *default_limit, struct ovs_list *zone_limits) { static const struct nl_policy ovs_ct_limit_policy[] = { @@ -3434,11 +3430,8 @@ dpif_netlink_zone_limits_from_ofpbuf(const struct ofpbuf *buf, 
nl_attr_get(attr[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]); while (rem >= sizeof *zone_limit) { - if (zone_limit->zone_id == OVS_ZONE_LIMIT_DEFAULT_ZONE) { - *default_limit = zone_limit->limit; - } else if (zone_limit->zone_id < OVS_ZONE_LIMIT_DEFAULT_ZONE || - zone_limit->zone_id > UINT16_MAX) { - } else { + if (zone_limit->zone_id >= OVS_ZONE_LIMIT_DEFAULT_ZONE && + zone_limit->zone_id <= UINT16_MAX) { ct_dpif_push_zone_limit(zone_limits, zone_limit->zone_id, zone_limit->limit, zone_limit->count); } @@ -3451,7 +3444,6 @@ dpif_netlink_zone_limits_from_ofpbuf(const struct ofpbuf *buf, static int dpif_netlink_ct_get_limits(struct dpif *dpif OVS_UNUSED, - uint32_t *default_limit, const struct ovs_list *zone_limits_request, struct ovs_list *zone_limits_reply) { @@ -3472,14 +3464,11 @@ dpif_netlink_ct_get_limits(struct dpif *dpif OVS_UNUSED, size_t opt_offset = nl_msg_start_nested(request, OVS_CT_LIMIT_ATTR_ZONE_LIMIT); - struct ovs_zone_limit req_zone_limit = { - .zone_id = OVS_ZONE_LIMIT_DEFAULT_ZONE, - }; - nl_msg_put(request, &req_zone_limit, sizeof req_zone_limit); - struct ct_dpif_zone_limit *zone_limit; LIST_FOR_EACH (zone_limit, node, zone_limits_request) { - req_zone_limit.zone_id = zone_limit->zone; + struct ovs_zone_limit req_zone_limit = { + .zone_id = zone_limit->zone, + }; nl_msg_put(request, &req_zone_limit, sizeof req_zone_limit); } @@ -3492,8 +3481,7 @@ dpif_netlink_ct_get_limits(struct dpif *dpif OVS_UNUSED, goto out; } - err = dpif_netlink_zone_limits_from_ofpbuf(reply, default_limit, - zone_limits_reply); + err = dpif_netlink_zone_limits_from_ofpbuf(reply, zone_limits_reply); out: ofpbuf_delete(request); @@ -4105,7 +4093,6 @@ dpif_netlink_meter_get_features(const struct dpif *dpif_, struct ofputil_meter_features *features) { if (probe_broken_meters(CONST_CAST(struct dpif *, dpif_))) { - features = NULL; return; } @@ -4516,6 +4503,7 @@ dpif_netlink_cache_set_size(struct dpif *dpif_, uint32_t level, uint32_t size) const struct dpif_class dpif_netlink_class = { 
"system", false, /* cleanup_required */ + false, /* synced_dp_layers */ NULL, /* init */ dpif_netlink_enumerate, NULL, @@ -4561,12 +4549,17 @@ const struct dpif_class dpif_netlink_class = { dpif_netlink_ct_dump_start, dpif_netlink_ct_dump_next, dpif_netlink_ct_dump_done, + NULL, /* ct_exp_dump_start */ + NULL, /* ct_exp_dump_next */ + NULL, /* ct_exp_dump_done */ dpif_netlink_ct_flush, NULL, /* ct_set_maxconns */ NULL, /* ct_get_maxconns */ NULL, /* ct_get_nconns */ NULL, /* ct_set_tcp_seq_chk */ NULL, /* ct_get_tcp_seq_chk */ + NULL, /* ct_set_sweep_interval */ + NULL, /* ct_get_sweep_interval */ dpif_netlink_ct_set_limits, dpif_netlink_ct_get_limits, dpif_netlink_ct_del_limits, @@ -4686,6 +4679,8 @@ dpif_netlink_vport_from_ofpbuf(struct dpif_netlink_vport *vport, .optional = true }, [OVS_VPORT_ATTR_OPTIONS] = { .type = NL_A_NESTED, .optional = true }, [OVS_VPORT_ATTR_NETNSID] = { .type = NL_A_U32, .optional = true }, + [OVS_VPORT_ATTR_UPCALL_STATS] = { .type = NL_A_NESTED, + .optional = true }, }; dpif_netlink_vport_init(vport); @@ -4717,6 +4712,21 @@ dpif_netlink_vport_from_ofpbuf(struct dpif_netlink_vport *vport, if (a[OVS_VPORT_ATTR_STATS]) { vport->stats = nl_attr_get(a[OVS_VPORT_ATTR_STATS]); } + if (a[OVS_VPORT_ATTR_UPCALL_STATS]) { + const struct nlattr *nla; + size_t left; + + NL_NESTED_FOR_EACH (nla, left, a[OVS_VPORT_ATTR_UPCALL_STATS]) { + if (nl_attr_type(nla) == OVS_VPORT_UPCALL_ATTR_SUCCESS) { + vport->upcall_success = nl_attr_get_u64(nla); + } else if (nl_attr_type(nla) == OVS_VPORT_UPCALL_ATTR_FAIL) { + vport->upcall_fail = nl_attr_get_u64(nla); + } + } + } else { + vport->upcall_success = UINT64_MAX; + vport->upcall_fail = UINT64_MAX; + } if (a[OVS_VPORT_ATTR_OPTIONS]) { vport->options = nl_attr_get(a[OVS_VPORT_ATTR_OPTIONS]); vport->options_len = nl_attr_get_size(a[OVS_VPORT_ATTR_OPTIONS]); diff --git a/lib/dpif-netlink.h b/lib/dpif-netlink.h index 24294bc42dc..4909fe16089 100644 --- a/lib/dpif-netlink.h +++ b/lib/dpif-netlink.h @@ -44,6 +44,8 
@@ struct dpif_netlink_vport { uint32_t n_upcall_pids; const uint32_t *upcall_pids; /* OVS_VPORT_ATTR_UPCALL_PID. */ const struct ovs_vport_stats *stats; /* OVS_VPORT_ATTR_STATS. */ + uint64_t upcall_success; /* OVS_VPORT_UPCALL_ATTR_SUCCESS. */ + uint64_t upcall_fail; /* OVS_VPORT_UPCALL_ATTR_FAIL. */ const struct nlattr *options; /* OVS_VPORT_ATTR_OPTIONS. */ size_t options_len; }; diff --git a/lib/dpif-provider.h b/lib/dpif-provider.h index 12477a24fee..520e21e68db 100644 --- a/lib/dpif-provider.h +++ b/lib/dpif-provider.h @@ -79,6 +79,7 @@ dpif_flow_dump_thread_init(struct dpif_flow_dump_thread *thread, struct ct_dpif_dump_state; struct ct_dpif_entry; +struct ct_dpif_exp; struct ct_dpif_tuple; struct ct_dpif_timeout_policy; enum ct_features; @@ -127,6 +128,14 @@ struct dpif_class { * datapaths that can not exist without it (e.g. netdev datapath). */ bool cleanup_required; + /* If 'true' the specific dpif implementation synchronizes the various + * datapath implementation layers, i.e., the dpif's layer in combination + * with the underlying netdev offload layers. For example, dpif-netlink + * does not sync its kernel flows with the tc ones, i.e., only one gets + * installed. On the other hand, dpif-netdev installs both flows, + * internally keeps track of both, and represents them as one. */ + bool synced_dp_layers; + /* Called when the dpif provider is registered, typically at program * startup. Returning an error from this function will prevent any * datapath with this class from being created. @@ -463,6 +472,16 @@ struct dpif_class { struct ct_dpif_entry *entry); int (*ct_dump_done)(struct dpif *, struct ct_dpif_dump_state *state); + /* Starts the dump initializing the structures involved and the zone + * filter. */ + int (*ct_exp_dump_start)(struct dpif *, struct ct_dpif_dump_state **state, + const uint16_t *zone); + /* Fill the expectation 'entry' with the related information. 
*/ + int (*ct_exp_dump_next)(struct dpif *, struct ct_dpif_dump_state *state, + struct ct_dpif_exp *entry); + /* Ends the dump cleaning up any potential pending state, if any. */ + int (*ct_exp_dump_done)(struct dpif *, struct ct_dpif_dump_state *state); + /* Flushes the connection tracking tables. The arguments have the * following behavior: * @@ -485,6 +504,10 @@ struct dpif_class { int (*ct_set_tcp_seq_chk)(struct dpif *, bool enabled); /* Get the TCP sequence checking configuration. */ int (*ct_get_tcp_seq_chk)(struct dpif *, bool *enabled); + /* Updates the sweep interval for the CT sweeper. */ + int (*ct_set_sweep_interval)(struct dpif *, uint32_t ms); + /* Get the current value of the sweep interval for the CT sweeper. */ + int (*ct_get_sweep_interval)(struct dpif *, uint32_t *ms); /* Connection tracking per zone limit */ @@ -497,19 +520,17 @@ struct dpif_class { /* Sets the max connections allowed per zone according to 'zone_limits', * a list of 'struct ct_dpif_zone_limit' entries (the 'count' member - * is not used when setting limits). If 'default_limit' is not NULL, - * modifies the default limit to '*default_limit'. */ - int (*ct_set_limits)(struct dpif *, const uint32_t *default_limit, - const struct ovs_list *zone_limits); - - /* Looks up the default per zone limit and stores that in - * 'default_limit'. Look up the per zone limits for all zones in - * the 'zone_limits_in' list of 'struct ct_dpif_zone_limit' entries - * (the 'limit' and 'count' members are not used), and stores the - * reply that includes the zone, the per zone limit, and the number - * of connections in the zone into 'zone_limits_out' list. */ - int (*ct_get_limits)(struct dpif *, uint32_t *default_limit, - const struct ovs_list *zone_limits_in, + * is not used when setting limits). 
*/ + int (*ct_set_limits)(struct dpif *, const struct ovs_list *zone_limits); + + /* Looks up the per zone limits for all zones in the 'zone_limits_in' list + * of 'struct ct_dpif_zone_limit' entries (the 'limit' and 'count' members + * are not used), and stores the reply that includes the zone, the per + * zone limit, and the number of connections in the zone into + * 'zone_limits_out' list. If the 'zone_limits_in' list is empty the + * report will contain all previously set zone limits and the default + * limit. Note: The default zone limit "count" is not used. */ + int (*ct_get_limits)(struct dpif *, const struct ovs_list *zone_limits_in, struct ovs_list *zone_limits_out); /* Deletes per zone limit of all zones specified in 'zone_limits', a diff --git a/lib/dpif.c b/lib/dpif.c index 40f5fe44606..ab633fd274d 100644 --- a/lib/dpif.c +++ b/lib/dpif.c @@ -28,6 +28,7 @@ #include "dpctl.h" #include "dpif-netdev.h" #include "flow.h" +#include "netdev-offload.h" #include "netdev-provider.h" #include "netdev.h" #include "netlink.h" @@ -55,18 +56,22 @@ VLOG_DEFINE_THIS_MODULE(dpif); COVERAGE_DEFINE(dpif_destroy); -COVERAGE_DEFINE(dpif_port_add); -COVERAGE_DEFINE(dpif_port_del); +COVERAGE_DEFINE(dpif_execute); +COVERAGE_DEFINE(dpif_execute_error); +COVERAGE_DEFINE(dpif_execute_with_help); +COVERAGE_DEFINE(dpif_flow_del); +COVERAGE_DEFINE(dpif_flow_del_error); COVERAGE_DEFINE(dpif_flow_flush); COVERAGE_DEFINE(dpif_flow_get); +COVERAGE_DEFINE(dpif_flow_get_error); COVERAGE_DEFINE(dpif_flow_put); -COVERAGE_DEFINE(dpif_flow_del); -COVERAGE_DEFINE(dpif_execute); -COVERAGE_DEFINE(dpif_purge); -COVERAGE_DEFINE(dpif_execute_with_help); -COVERAGE_DEFINE(dpif_meter_set); -COVERAGE_DEFINE(dpif_meter_get); +COVERAGE_DEFINE(dpif_flow_put_error); COVERAGE_DEFINE(dpif_meter_del); +COVERAGE_DEFINE(dpif_meter_get); +COVERAGE_DEFINE(dpif_meter_set); +COVERAGE_DEFINE(dpif_port_add); +COVERAGE_DEFINE(dpif_port_del); +COVERAGE_DEFINE(dpif_purge); static const struct dpif_class 
*base_dpif_classes[] = { #if defined(__linux__) || defined(_WIN32) @@ -701,13 +706,14 @@ dpif_port_set_config(struct dpif *dpif, odp_port_t port_no, * initializes '*port' appropriately; on failure, returns a positive errno * value. * - * Retuns ENODEV if the port doesn't exist. + * Retuns ENODEV if the port doesn't exist. Will not log a warning in this + * case unless 'warn_if_not_found' is true. * * The caller owns the data in 'port' and must free it with * dpif_port_destroy() when it is no longer needed. */ int dpif_port_query_by_number(const struct dpif *dpif, odp_port_t port_no, - struct dpif_port *port) + struct dpif_port *port, bool warn_if_not_found) { int error = dpif->dpif_class->port_query_by_number(dpif, port_no, port); if (!error) { @@ -715,8 +721,13 @@ dpif_port_query_by_number(const struct dpif *dpif, odp_port_t port_no, dpif_name(dpif), port_no, port->name); } else { memset(port, 0, sizeof *port); - VLOG_WARN_RL(&error_rl, "%s: failed to query port %"PRIu32": %s", - dpif_name(dpif), port_no, ovs_strerror(error)); + if (error == ENODEV && !warn_if_not_found) { + VLOG_DBG_RL(&dpmsg_rl, "%s: failed to query port %"PRIu32": %s", + dpif_name(dpif), port_no, ovs_strerror(error)); + } else { + VLOG_WARN_RL(&error_rl, "%s: failed to query port %"PRIu32": %s", + dpif_name(dpif), port_no, ovs_strerror(error)); + } } return error; } @@ -784,7 +795,7 @@ dpif_port_get_name(struct dpif *dpif, odp_port_t port_no, ovs_assert(name_size > 0); - error = dpif_port_query_by_number(dpif, port_no, &port); + error = dpif_port_query_by_number(dpif, port_no, &port, true); if (!error) { ovs_strlcpy(name, port.name, name_size); dpif_port_destroy(&port); @@ -1181,6 +1192,8 @@ dpif_execute_helper_cb(void *aux_, struct dp_packet_batch *packets_, case OVS_ACTION_ATTR_TUNNEL_PUSH: case OVS_ACTION_ATTR_TUNNEL_POP: case OVS_ACTION_ATTR_USERSPACE: + case OVS_ACTION_ATTR_PSAMPLE: + case OVS_ACTION_ATTR_SAMPLE: case OVS_ACTION_ATTR_RECIRC: { struct dpif_execute execute; struct ofpbuf 
execute_actions; @@ -1213,7 +1226,7 @@ dpif_execute_helper_cb(void *aux_, struct dp_packet_batch *packets_, /* The Linux kernel datapath throws away the tunnel information * that we supply as metadata. We have to use a "set" action to * supply it. */ - if (md->tunnel.ip_dst) { + if (flow_tnl_dst_is_set(&md->tunnel)) { odp_put_tunnel_action(&md->tunnel, &execute_actions, NULL); } ofpbuf_put(&execute_actions, action, NLA_ALIGN(action->nla_len)); @@ -1267,7 +1280,6 @@ dpif_execute_helper_cb(void *aux_, struct dp_packet_batch *packets_, case OVS_ACTION_ATTR_POP_MPLS: case OVS_ACTION_ATTR_SET: case OVS_ACTION_ATTR_SET_MASKED: - case OVS_ACTION_ATTR_SAMPLE: case OVS_ACTION_ATTR_TRUNC: case OVS_ACTION_ATTR_PUSH_ETH: case OVS_ACTION_ATTR_POP_ETH: @@ -1279,6 +1291,7 @@ dpif_execute_helper_cb(void *aux_, struct dp_packet_batch *packets_, case OVS_ACTION_ATTR_CHECK_PKT_LEN: case OVS_ACTION_ATTR_DROP: case OVS_ACTION_ATTR_ADD_MPLS: + case OVS_ACTION_ATTR_DEC_TTL: case __OVS_ACTION_ATTR_MAX: OVS_NOT_REACHED(); } @@ -1381,8 +1394,11 @@ dpif_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops, COVERAGE_INC(dpif_flow_put); log_flow_put_message(dpif, &this_module, put, error); - if (error && put->stats) { - memset(put->stats, 0, sizeof *put->stats); + if (error) { + COVERAGE_INC(dpif_flow_put_error); + if (put->stats) { + memset(put->stats, 0, sizeof *put->stats); + } } break; } @@ -1392,10 +1408,10 @@ dpif_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops, COVERAGE_INC(dpif_flow_get); if (error) { + COVERAGE_INC(dpif_flow_get_error); memset(get->flow, 0, sizeof *get->flow); } log_flow_get_message(dpif, &this_module, get, error); - break; } @@ -1404,8 +1420,11 @@ dpif_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops, COVERAGE_INC(dpif_flow_del); log_flow_del_message(dpif, &this_module, del, error); - if (error && del->stats) { - memset(del->stats, 0, sizeof *del->stats); + if (error) { + COVERAGE_INC(dpif_flow_del_error); + if (del->stats) { + 
memset(del->stats, 0, sizeof *del->stats); + } } break; } @@ -1414,6 +1433,9 @@ dpif_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops, COVERAGE_INC(dpif_execute); log_execute_message(dpif, &this_module, &op->execute, false, error); + if (error) { + COVERAGE_INC(dpif_execute_error); + } break; } } @@ -1915,9 +1937,10 @@ dpif_supports_tnl_push_pop(const struct dpif *dpif) } bool -dpif_supports_explicit_drop_action(const struct dpif *dpif) +dpif_may_support_explicit_drop_action(const struct dpif *dpif) { - return dpif_is_netdev(dpif); + /* TC does not support offloading this action. */ + return dpif_is_netdev(dpif) || !netdev_is_flow_api_enabled(); } bool @@ -1930,6 +1953,13 @@ dpif_supports_lb_output_action(const struct dpif *dpif) return dpif_is_netdev(dpif); } +bool +dpif_may_support_psample(const struct dpif *dpif) +{ + /* Userspace datapath does not support this action. */ + return !dpif_is_netdev(dpif); +} + /* Meters */ void dpif_meter_get_features(const struct dpif *dpif, @@ -2109,3 +2139,9 @@ dpif_cache_set_size(struct dpif *dpif, uint32_t level, uint32_t size) ? dpif->dpif_class->cache_set_size(dpif, level, size) : EOPNOTSUPP; } + +bool +dpif_synced_dp_layers(struct dpif *dpif) +{ + return dpif->dpif_class->synced_dp_layers; +} diff --git a/lib/dpif.h b/lib/dpif.h index 6cb4dae6d8d..6bef7d5b304 100644 --- a/lib/dpif.h +++ b/lib/dpif.h @@ -91,7 +91,9 @@ * * - Carrier status (netdev_get_carrier()). * - * - Speed (netdev_get_features()). + * - Link features (netdev_get_features()). + * + * - Speed (netdev_get_speed()). * * - QoS queue configuration (netdev_get_queue(), netdev_set_queue() and * related functions.) 
@@ -461,7 +463,7 @@ void dpif_port_clone(struct dpif_port *, const struct dpif_port *); void dpif_port_destroy(struct dpif_port *); bool dpif_port_exists(const struct dpif *dpif, const char *devname); int dpif_port_query_by_number(const struct dpif *, odp_port_t port_no, - struct dpif_port *); + struct dpif_port *, bool warn_if_not_found); int dpif_port_query_by_name(const struct dpif *, const char *devname, struct dpif_port *); int dpif_port_get_name(struct dpif *, odp_port_t port_no, @@ -938,7 +940,9 @@ int dpif_get_pmds_for_port(const struct dpif * dpif, odp_port_t port_no, char *dpif_get_dp_version(const struct dpif *); bool dpif_supports_tnl_push_pop(const struct dpif *); -bool dpif_supports_explicit_drop_action(const struct dpif *); +bool dpif_may_support_explicit_drop_action(const struct dpif *); +bool dpif_may_support_psample(const struct dpif *); +bool dpif_synced_dp_layers(struct dpif *); /* Log functions. */ struct vlog_module; diff --git a/lib/fatal-signal.c b/lib/fatal-signal.c index bbb31ef2751..95315007478 100644 --- a/lib/fatal-signal.c +++ b/lib/fatal-signal.c @@ -35,10 +35,14 @@ #include "openvswitch/type-props.h" -#ifdef HAVE_UNWIND +#if defined(HAVE_UNWIND) || defined(HAVE_BACKTRACE) #include "daemon-private.h" #endif +#ifdef HAVE_BACKTRACE +#include +#endif + #ifndef SIG_ATOMIC_MAX #define SIG_ATOMIC_MAX TYPE_MAXIMUM(sig_atomic_t) #endif @@ -78,6 +82,39 @@ static void call_hooks(int sig_nr); static BOOL WINAPI ConsoleHandlerRoutine(DWORD dwCtrlType); #endif +/* Sets up a pipe or event handle that will be used to wake up the current + * process after signal is received, so it can be processed outside of the + * signal handler context in fatal_signal_run(). 
*/ +static void +fatal_signal_create_wakeup_events(void) +{ +#ifndef _WIN32 + xpipe_nonblocking(signal_fds); +#else + wevent = CreateEvent(NULL, TRUE, FALSE, NULL); + if (!wevent) { + char *msg_buf = ovs_lasterror_to_string(); + VLOG_FATAL("Failed to create a event (%s).", msg_buf); + } +#endif +} + +static void +fatal_signal_destroy_wakeup_events(void) +{ +#ifndef _WIN32 + close(signal_fds[0]); + signal_fds[0] = -1; + close(signal_fds[1]); + signal_fds[1] = -1; +#else + ResetEvent(wevent); + CloseHandle(wevent); + wevent = NULL; +#endif +} + + /* Initializes the fatal signal handling module. Calling this function is * optional, because calling any other function in the module will also * initialize it. However, in a multithreaded program, the module must be @@ -94,15 +131,16 @@ fatal_signal_init(void) inited = true; ovs_mutex_init_recursive(&mutex); -#ifndef _WIN32 - xpipe_nonblocking(signal_fds); -#else - wevent = CreateEvent(NULL, TRUE, FALSE, NULL); - if (!wevent) { - char *msg_buf = ovs_lasterror_to_string(); - VLOG_FATAL("Failed to create a event (%s).", msg_buf); - } + /* The dummy backtrace is needed. + * See comment for send_backtrace_to_monitor(). */ + struct backtrace dummy_bt; + + backtrace_capture(&dummy_bt); + + fatal_signal_create_wakeup_events(); + +#ifdef _WIN32 /* Register a function to handle Ctrl+C. */ SetConsoleCtrlHandler(ConsoleHandlerRoutine, true); #endif @@ -181,7 +219,8 @@ llong_to_hex_str(unsigned long long value, char *str) * library functions used here must be async-signal-safe. */ static inline void -send_backtrace_to_monitor(void) { +send_backtrace_to_monitor(void) +{ /* volatile added to prevent a "clobbered" error on ppc64le with gcc */ volatile int dep; struct unw_backtrace unw_bt[UNW_MAX_DEPTH]; @@ -211,11 +250,10 @@ send_backtrace_to_monitor(void) { /* Since there is no monitor daemon running, write backtrace * in current process. 
*/ - char str[] = "SIGSEGV detected, backtrace:\n"; char ip_str[16], offset_str[6]; char line[64], fn_name[UNW_MAX_FUNCN]; - vlog_direct_write_to_log_file_unsafe(str); + vlog_direct_write_to_log_file_unsafe(BACKTRACE_DUMP_MSG); for (int i = 0; i < dep; i++) { memset(line, 0, sizeof line); @@ -239,6 +277,36 @@ send_backtrace_to_monitor(void) { } } } +#elif HAVE_BACKTRACE +/* Send the backtrace to monitor thread. + * + * Note that this runs in the signal handling context, any system + * library functions used here must be async-signal-safe. + * backtrace() is only signal safe if the "libgcc" or equivalent was loaded + * before the signal handler. In order to keep it safe the fatal_signal_init() + * should always call backtrace_capture which will ensure that "libgcc" or + * equivlent is loaded. + */ +static inline void +send_backtrace_to_monitor(void) +{ + struct backtrace bt; + + backtrace_capture(&bt); + + if (monitor && daemonize_fd > -1) { + ignore(write(daemonize_fd, &bt, sizeof bt)); + } else { + int log_fd = vlog_get_log_file_fd_unsafe(); + + if (log_fd < 0) { + return; + } + + vlog_direct_write_to_log_file_unsafe(BACKTRACE_DUMP_MSG); + backtrace_symbols_fd(bt.frames, bt.n_frames, log_fd); + } +} #else static inline void send_backtrace_to_monitor(void) { @@ -456,6 +524,9 @@ do_unlink_files(void) * hooks passed a 'cancel_cb' function to fatal_signal_add_hook(), then those * functions will be called, allowing them to free resources, etc. * + * Also re-creates wake-up events, so signals in one of the processes do not + * wake up the other one. + * * Following a fork, one of the resulting processes can call this function to * allow it to terminate without calling the hooks registered before calling * this function. 
New hooks registered after calling this function will take @@ -467,6 +538,9 @@ fatal_signal_fork(void) assert_single_threaded(); + fatal_signal_destroy_wakeup_events(); + fatal_signal_create_wakeup_events(); + for (i = 0; i < n_hooks; i++) { struct hook *h = &hooks[i]; if (h->cancel_cb) { diff --git a/lib/flow.c b/lib/flow.c index c3a3aa3ce45..9be4375246a 100644 --- a/lib/flow.c +++ b/lib/flow.c @@ -408,7 +408,8 @@ parse_ethertype(const void **datap, size_t *sizep) static inline bool parse_icmpv6(const void **datap, size_t *sizep, const struct icmp6_data_header *icmp6, - ovs_be32 *rso_flags, const struct in6_addr **nd_target, + ovs_be32 *rso_flags, + const union ovs_16aligned_in6_addr **nd_target, struct eth_addr arp_buf[2], uint8_t *opt_type) { if (icmp6->icmp6_base.icmp6_code != 0 || @@ -479,9 +480,17 @@ parse_icmpv6(const void **datap, size_t *sizep, static inline bool parse_ipv6_ext_hdrs__(const void **datap, size_t *sizep, uint8_t *nw_proto, uint8_t *nw_frag, - const struct ovs_16aligned_ip6_frag **frag_hdr) + const struct ovs_16aligned_ip6_frag **frag_hdr, + const struct ip6_rt_hdr **rt_hdr) { - *frag_hdr = NULL; + if (frag_hdr) { + *frag_hdr = NULL; + } + + if (rt_hdr) { + *rt_hdr = NULL; + } + while (1) { if (OVS_LIKELY((*nw_proto != IPPROTO_HOPOPTS) && (*nw_proto != IPPROTO_ROUTING) @@ -504,7 +513,6 @@ parse_ipv6_ext_hdrs__(const void **datap, size_t *sizep, uint8_t *nw_proto, } if ((*nw_proto == IPPROTO_HOPOPTS) - || (*nw_proto == IPPROTO_ROUTING) || (*nw_proto == IPPROTO_DSTOPTS)) { /* These headers, while different, have the fields we care * about in the same location and with the same @@ -515,6 +523,18 @@ parse_ipv6_ext_hdrs__(const void **datap, size_t *sizep, uint8_t *nw_proto, (ext_hdr->ip6e_len + 1) * 8))) { return false; } + } else if (*nw_proto == IPPROTO_ROUTING) { + const struct ip6_rt_hdr *tmp; + if (!rt_hdr) { + rt_hdr = &tmp; + } + + *rt_hdr = *datap; + *nw_proto = (*rt_hdr)->nexthdr; + if (OVS_UNLIKELY(!data_try_pull(datap, sizep, + 
((*rt_hdr)->hdrlen + 1) * 8))) { + return false; + } } else if (*nw_proto == IPPROTO_AH) { /* A standard AH definition isn't available, but the fields * we care about are in the same location as the generic @@ -527,6 +547,11 @@ parse_ipv6_ext_hdrs__(const void **datap, size_t *sizep, uint8_t *nw_proto, return false; } } else if (*nw_proto == IPPROTO_FRAGMENT) { + const struct ovs_16aligned_ip6_frag *tmp; + if (!frag_hdr) { + frag_hdr = &tmp; + } + *frag_hdr = *datap; *nw_proto = (*frag_hdr)->ip6f_nxt; @@ -561,15 +586,19 @@ parse_ipv6_ext_hdrs__(const void **datap, size_t *sizep, uint8_t *nw_proto, * has FLOW_NW_FRAG_LATER set. Both first and later fragments have * FLOW_NW_FRAG_ANY set in 'nw_frag'. * + * If a routing header is found, '*rt_hdr' is set to the routing + * header and otherwise set to NULL. + * * A return value of false indicates that there was a problem parsing * the extension headers.*/ bool parse_ipv6_ext_hdrs(const void **datap, size_t *sizep, uint8_t *nw_proto, uint8_t *nw_frag, - const struct ovs_16aligned_ip6_frag **frag_hdr) + const struct ovs_16aligned_ip6_frag **frag_hdr, + const struct ip6_rt_hdr **rt_hdr) { return parse_ipv6_ext_hdrs__(datap, sizep, nw_proto, nw_frag, - frag_hdr); + frag_hdr, rt_hdr); } bool @@ -907,6 +936,10 @@ miniflow_extract(struct dp_packet *packet, struct miniflow *dst) nw_proto = nh->ip_proto; nw_frag = ipv4_get_nw_frag(nh); data_pull(&data, &size, ip_len); + dp_packet_hwol_set_tx_ipv4(packet); + if (dp_packet_ip_checksum_good(packet)) { + dp_packet_hwol_set_tx_ip_csum(packet); + } } else if (dl_type == htons(ETH_TYPE_IPV6)) { const struct ovs_16aligned_ip6_hdr *nh = data; ovs_be32 tc_flow; @@ -920,6 +953,7 @@ miniflow_extract(struct dp_packet *packet, struct miniflow *dst) } data_pull(&data, &size, sizeof *nh); + dp_packet_hwol_set_tx_ipv6(packet); plen = ntohs(nh->ip6_plen); dp_packet_set_l2_pad_size(packet, size - plen); size = plen; /* Never pull padding. 
*/ @@ -945,9 +979,8 @@ miniflow_extract(struct dp_packet *packet, struct miniflow *dst) nw_ttl = nh->ip6_hlim; nw_proto = nh->ip6_nxt; - const struct ovs_16aligned_ip6_frag *frag_hdr; - if (!parse_ipv6_ext_hdrs__(&data, &size, &nw_proto, &nw_frag, - &frag_hdr)) { + if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag, + NULL, NULL)) { goto out; } @@ -1022,6 +1055,11 @@ miniflow_extract(struct dp_packet *packet, struct miniflow *dst) } else if (dl_type == htons(ETH_TYPE_IPV6)) { dp_packet_update_rss_hash_ipv6_tcp_udp(packet); } + dp_packet_ol_l4_csum_check_partial(packet); + if (dp_packet_l4_checksum_good(packet) + || dp_packet_ol_l4_csum_partial(packet)) { + dp_packet_hwol_set_csum_tcp(packet); + } } } } else if (OVS_LIKELY(nw_proto == IPPROTO_UDP)) { @@ -1037,6 +1075,11 @@ miniflow_extract(struct dp_packet *packet, struct miniflow *dst) } else if (dl_type == htons(ETH_TYPE_IPV6)) { dp_packet_update_rss_hash_ipv6_tcp_udp(packet); } + dp_packet_ol_l4_csum_check_partial(packet); + if (dp_packet_l4_checksum_good(packet) + || dp_packet_ol_l4_csum_partial(packet)) { + dp_packet_hwol_set_csum_udp(packet); + } } } else if (OVS_LIKELY(nw_proto == IPPROTO_SCTP)) { if (OVS_LIKELY(size >= SCTP_HEADER_LEN)) { @@ -1046,6 +1089,11 @@ miniflow_extract(struct dp_packet *packet, struct miniflow *dst) miniflow_push_be16(mf, tp_dst, sctp->sctp_dst); miniflow_push_be16(mf, ct_tp_src, ct_tp_src); miniflow_push_be16(mf, ct_tp_dst, ct_tp_dst); + dp_packet_ol_l4_csum_check_partial(packet); + if (dp_packet_l4_checksum_good(packet) + || dp_packet_ol_l4_csum_partial(packet)) { + dp_packet_hwol_set_csum_sctp(packet); + } } } else if (OVS_LIKELY(nw_proto == IPPROTO_ICMP)) { if (OVS_LIKELY(size >= ICMP_HEADER_LEN)) { @@ -1070,7 +1118,7 @@ miniflow_extract(struct dp_packet *packet, struct miniflow *dst) } } else if (OVS_LIKELY(nw_proto == IPPROTO_ICMPV6)) { if (OVS_LIKELY(size >= sizeof(struct icmp6_data_header))) { - const struct in6_addr *nd_target; + const union ovs_16aligned_in6_addr 
*nd_target; struct eth_addr arp_buf[2]; /* This will populate whether we received Option 1 * or Option 2. */ @@ -1200,10 +1248,9 @@ parse_tcp_flags(struct dp_packet *packet, plen = ntohs(nh->ip6_plen); /* Never pull padding. */ dp_packet_set_l2_pad_size(packet, size - plen); size = plen; - const struct ovs_16aligned_ip6_frag *frag_hdr; nw_proto = nh->ip6_nxt; - if (!parse_ipv6_ext_hdrs__(&data, &size, &nw_proto, &nw_frag, - &frag_hdr)) { + if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag, + NULL, NULL)) { return 0; } } else { @@ -3165,6 +3212,7 @@ flow_compose_l4_csum(struct dp_packet *p, const struct flow *flow, tcp->tcp_csum = 0; tcp->tcp_csum = csum_finish(csum_continue(pseudo_hdr_csum, tcp, l4_len)); + dp_packet_ol_set_l4_csum_good(p); } else if (flow->nw_proto == IPPROTO_UDP) { struct udp_header *udp = dp_packet_l4(p); @@ -3174,6 +3222,7 @@ flow_compose_l4_csum(struct dp_packet *p, const struct flow *flow, if (!udp->udp_csum) { udp->udp_csum = htons(0xffff); } + dp_packet_ol_set_l4_csum_good(p); } else if (flow->nw_proto == IPPROTO_ICMP) { struct icmp_header *icmp = dp_packet_l4(p); @@ -3221,9 +3270,12 @@ packet_expand(struct dp_packet *p, const struct flow *flow, size_t size) struct ip_header *ip = dp_packet_l3(p); ip->ip_tot_len = htons(p->l4_ofs - p->l3_ofs + l4_len); - ip->ip_csum = 0; - ip->ip_csum = csum(ip, sizeof *ip); - + if (dp_packet_hwol_tx_ip_csum(p)) { + dp_packet_ol_reset_ip_csum_good(p); + } else { + dp_packet_ip_set_header_csum(p, false); + dp_packet_ol_set_ip_csum_good(p); + } pseudo_hdr_csum = packet_csum_pseudoheader(ip); } else { /* ETH_TYPE_IPV6 */ struct ovs_16aligned_ip6_hdr *nh = dp_packet_l3(p); @@ -3249,6 +3301,8 @@ packet_expand(struct dp_packet *p, const struct flow *flow, size_t size) * (This is useful only for testing, obviously, and the packet isn't really * valid. Lots of fields are just zeroed.) * + * If 'bad_csum' is true, the final IP checksum is invalid. 
+ * * For packets whose protocols can encapsulate arbitrary L7 payloads, 'l7' and * 'l7_len' determine that payload: * @@ -3261,7 +3315,7 @@ packet_expand(struct dp_packet *p, const struct flow *flow, size_t size) * from 'l7'. */ void flow_compose(struct dp_packet *p, const struct flow *flow, - const void *l7, size_t l7_len) + const void *l7, size_t l7_len, bool bad_csum) { /* Add code to this function (or its callees) for emitting new fields or * protocols. (This isn't essential, so it can be skipped for initial @@ -3313,6 +3367,18 @@ flow_compose(struct dp_packet *p, const struct flow *flow, /* Checksum has already been zeroed by put_zeros call. */ ip->ip_csum = csum(ip, sizeof *ip); + if (bad_csum) { + /* + * Internet checksum is a sum complement to zero, so any other + * value will result in an invalid checksum. Here, we flip one + * bit. + */ + ip->ip_csum ^= (OVS_FORCE ovs_be16) 0x1; + dp_packet_ip_checksum_bad(p); + } else { + dp_packet_ol_set_ip_csum_good(p); + } + pseudo_hdr_csum = packet_csum_pseudoheader(ip); flow_compose_l4_csum(p, flow, pseudo_hdr_csum); } else if (flow->dl_type == htons(ETH_TYPE_IPV6)) { @@ -3355,6 +3421,24 @@ flow_compose(struct dp_packet *p, const struct flow *flow, arp->ar_sha = flow->arp_sha; arp->ar_tha = flow->arp_tha; } + } else if (flow->dl_type == htons(ETH_TYPE_NSH)) { + struct nsh_hdr *nsh; + + nsh = dp_packet_put_zeros(p, sizeof *nsh); + dp_packet_set_l3(p, nsh); + + nsh_set_flags_ttl_len(nsh, flow->nsh.flags, flow->nsh.ttl, + flow->nsh.mdtype == NSH_M_TYPE1 + ? 
NSH_M_TYPE1_LEN : NSH_BASE_HDR_LEN); + nsh->next_proto = flow->nsh.np; + nsh->md_type = flow->nsh.mdtype; + put_16aligned_be32(&nsh->path_hdr, flow->nsh.path_hdr); + + if (flow->nsh.mdtype == NSH_M_TYPE1) { + for (size_t i = 0; i < 4; i++) { + put_16aligned_be32(&nsh->md1.context[i], flow->nsh.context[i]); + } + } } if (eth_type_mpls(flow->dl_type)) { diff --git a/lib/flow.h b/lib/flow.h index c647ad83c25..60ec4b0d780 100644 --- a/lib/flow.h +++ b/lib/flow.h @@ -127,12 +127,13 @@ void flow_set_mpls_bos(struct flow *, int idx, uint8_t stack); void flow_set_mpls_lse(struct flow *, int idx, ovs_be32 lse); void flow_compose(struct dp_packet *, const struct flow *, - const void *l7, size_t l7_len); + const void *l7, size_t l7_len, bool bad_csum); void packet_expand(struct dp_packet *, const struct flow *, size_t size); bool parse_ipv6_ext_hdrs(const void **datap, size_t *sizep, uint8_t *nw_proto, uint8_t *nw_frag, - const struct ovs_16aligned_ip6_frag **frag_hdr); + const struct ovs_16aligned_ip6_frag **frag_hdr, + const struct ip6_rt_hdr **rt_hdr); bool parse_nsh(const void **datap, size_t *sizep, struct ovs_key_nsh *key); uint16_t parse_tcp_flags(struct dp_packet *packet, ovs_be16 *dl_type_p, uint8_t *nw_frag_p, ovs_be16 *first_vlan_tci_p); @@ -938,6 +939,15 @@ flow_union_with_miniflow(struct flow *dst, const struct miniflow *src) flow_union_with_miniflow_subset(dst, src, src->map); } +/* Perform a bitwise OR of minimask 'src' mask data with the equivalent + * fields in 'dst', storing the result in 'dst'. 
*/ +static inline void +flow_wildcards_union_with_minimask(struct flow_wildcards *dst, + const struct minimask *src) +{ + flow_union_with_miniflow_subset(&dst->masks, &src->masks, src->masks.map); +} + static inline bool is_ct_valid(const struct flow *flow, const struct flow_wildcards *mask, struct flow_wildcards *wc) diff --git a/lib/hash.c b/lib/hash.c index c722f3c3cc2..3d574de9b44 100644 --- a/lib/hash.c +++ b/lib/hash.c @@ -29,15 +29,16 @@ hash_3words(uint32_t a, uint32_t b, uint32_t c) uint32_t hash_bytes(const void *p_, size_t n, uint32_t basis) { - const uint32_t *p = p_; + const uint8_t *p = p_; size_t orig_n = n; uint32_t hash; hash = basis; while (n >= 4) { - hash = hash_add(hash, get_unaligned_u32(p)); + hash = hash_add(hash, + get_unaligned_u32(ALIGNED_CAST(const uint32_t *, p))); n -= 4; - p += 1; + p += 4; } if (n) { diff --git a/lib/hash.h b/lib/hash.h index 60a39a40b8a..307309fd059 100644 --- a/lib/hash.h +++ b/lib/hash.h @@ -187,21 +187,83 @@ static inline uint32_t hash_finish(uint64_t hash, uint64_t final) return hash ^ (uint32_t)hash >> 16; /* Increase entropy in LSBs. */ } +static inline uint32_t +hash_finish32(uint64_t hash, uint32_t final, uint32_t semifinal) +{ + /* The finishing multiplier 0x805204f3 has been experimentally + * derived to pass the testsuite hash tests. */ + hash = _mm_crc32_u32(hash, semifinal); + hash = _mm_crc32_u32(hash, final) * 0x805204f3; + return hash ^ ((uint32_t) hash >> 16); /* Increase entropy in LSBs. 
*/ +} + +static inline uint32_t +hash_words_32aligned(const uint32_t *p, size_t n_words, uint32_t basis) +{ + uint32_t hash1 = basis; + uint32_t hash2 = 0; + uint32_t hash3 = n_words; + const uint32_t *endp = (const uint32_t *) p + n_words; + const uint32_t *limit = p + n_words - 6; + + while (p <= limit) { + hash1 = _mm_crc32_u32(hash1, p[0]); + hash1 = _mm_crc32_u32(hash1, p[1]); + hash2 = _mm_crc32_u32(hash2, p[2]); + hash2 = _mm_crc32_u32(hash2, p[3]); + hash3 = _mm_crc32_u32(hash3, p[4]); + hash3 = _mm_crc32_u32(hash3, p[5]); + p += 6; + } + switch (endp - (const uint32_t *) p) { + case 1: + hash1 = _mm_crc32_u32(hash1, p[0]); + break; + case 2: + hash1 = _mm_crc32_u32(hash1, p[0]); + hash1 = _mm_crc32_u32(hash1, p[1]); + break; + case 3: + hash1 = _mm_crc32_u32(hash1, p[0]); + hash1 = _mm_crc32_u32(hash1, p[1]); + hash2 = _mm_crc32_u32(hash2, p[2]); + break; + case 4: + hash1 = _mm_crc32_u32(hash1, p[0]); + hash1 = _mm_crc32_u32(hash1, p[1]); + hash2 = _mm_crc32_u32(hash2, p[2]); + hash2 = _mm_crc32_u32(hash2, p[3]); + break; + case 5: + hash1 = _mm_crc32_u32(hash1, p[0]); + hash1 = _mm_crc32_u32(hash1, p[1]); + hash2 = _mm_crc32_u32(hash2, p[2]); + hash2 = _mm_crc32_u32(hash2, p[3]); + hash3 = _mm_crc32_u32(hash3, p[4]); + break; + } + return hash_finish32(hash1, hash2, hash3); +} + /* Returns the hash of the 'n' 32-bit words at 'p_', starting from 'basis'. * We access 'p_' as a uint64_t pointer, which is fine for __SSE_4_2__. * * This is inlined for the compiler to have access to the 'n_words', which * in many cases is a constant. 
*/ static inline uint32_t -hash_words_inline(const uint32_t p_[], size_t n_words, uint32_t basis) +hash_words_inline(const uint32_t *p_, size_t n_words, uint32_t basis) { - const uint64_t *p = (const void *)p_; + const uint64_t *p = ALIGNED_CAST(const uint64_t *, p_); uint64_t hash1 = basis; uint64_t hash2 = 0; uint64_t hash3 = n_words; const uint32_t *endp = (const uint32_t *)p + n_words; const uint64_t *limit = p + n_words / 2 - 3; + if (OVS_UNLIKELY(((intptr_t) p & ((sizeof(uint64_t)) - 1)) != 0)) { + return hash_words_32aligned(p_, n_words, basis); + } + while (p <= limit) { hash1 = _mm_crc32_u64(hash1, p[0]); hash2 = _mm_crc32_u64(hash2, p[1]); diff --git a/lib/ipf.c b/lib/ipf.c index d452663743c..2d715f5e9d3 100644 --- a/lib/ipf.c +++ b/lib/ipf.c @@ -433,7 +433,9 @@ ipf_reassemble_v4_frags(struct ipf_list *ipf_list) len += rest_len; l3 = dp_packet_l3(pkt); ovs_be16 new_ip_frag_off = l3->ip_frag_off & ~htons(IP_MORE_FRAGMENTS); - if (!dp_packet_hwol_is_ipv4(pkt)) { + if (dp_packet_hwol_tx_ip_csum(pkt)) { + dp_packet_ol_reset_ip_csum_good(pkt); + } else { l3->ip_csum = recalc_csum16(l3->ip_csum, l3->ip_frag_off, new_ip_frag_off); l3->ip_csum = recalc_csum16(l3->ip_csum, l3->ip_tot_len, htons(len)); @@ -485,9 +487,9 @@ ipf_reassemble_v6_frags(struct ipf_list *ipf_list) const void *data = l3 + 1; size_t datasize = pl; - const struct ovs_16aligned_ip6_frag *frag_hdr = NULL; - if (!parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag, &frag_hdr) - || !nw_frag || !frag_hdr) { + const struct ovs_16aligned_ip6_frag *frag_hdr; + if (!parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag, &frag_hdr, + NULL) || !nw_frag || !frag_hdr) { ipf_print_reass_packet("Unparsed reassembled v6 packet; v6 hdr:", l3); dp_packet_delete(pkt); @@ -504,13 +506,15 @@ ipf_reassemble_v6_frags(struct ipf_list *ipf_list) } /* Called when a frag list state transitions to another state. 
This is - * triggered by new fragment for the list being received.*/ -static void +* triggered by new fragment for the list being received. Returns a reassembled +* packet if this fragment has completed one. */ +static struct reassembled_pkt * ipf_list_state_transition(struct ipf *ipf, struct ipf_list *ipf_list, bool ff, bool lf, bool v6) OVS_REQUIRES(ipf->ipf_lock) { enum ipf_list_state curr_state = ipf_list->state; + struct reassembled_pkt *ret = NULL; enum ipf_list_state next_state; switch (curr_state) { case IPF_LIST_STATE_UNUSED: @@ -560,12 +564,15 @@ ipf_list_state_transition(struct ipf *ipf, struct ipf_list *ipf_list, ipf_reassembled_list_add(&ipf->reassembled_pkt_list, rp); ipf_expiry_list_remove(ipf_list); next_state = IPF_LIST_STATE_COMPLETED; + ret = rp; } else { next_state = IPF_LIST_STATE_REASS_FAIL; } } } ipf_list->state = next_state; + + return ret; } /* Some sanity checks are redundant, but prudent, in case code paths for @@ -608,8 +615,7 @@ ipf_is_valid_v4_frag(struct ipf *ipf, struct dp_packet *pkt) goto invalid_pkt; } - if (OVS_UNLIKELY(!dp_packet_ip_checksum_valid(pkt) - && !dp_packet_hwol_is_ipv4(pkt) + if (OVS_UNLIKELY(!dp_packet_ip_checksum_good(pkt) && csum(l3, ip_hdr_len) != 0)) { COVERAGE_INC(ipf_l3csum_err); goto invalid_pkt; @@ -678,9 +684,9 @@ ipf_is_valid_v6_frag(struct ipf *ipf, struct dp_packet *pkt) uint8_t nw_proto = l3->ip6_nxt; const void *data = l3 + 1; size_t datasize = l3_size - l3_hdr_size; - const struct ovs_16aligned_ip6_frag *frag_hdr = NULL; + const struct ovs_16aligned_ip6_frag *frag_hdr; if (!parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag, - &frag_hdr) || !nw_frag || !frag_hdr) { + &frag_hdr, NULL) || !nw_frag || !frag_hdr) { return false; } @@ -721,9 +727,10 @@ ipf_v6_key_extract(struct dp_packet *pkt, ovs_be16 dl_type, uint16_t zone, uint8_t nw_proto = l3->ip6_nxt; const void *data = l3 + 1; size_t datasize = dp_packet_l3_size(pkt) - sizeof *l3; - const struct ovs_16aligned_ip6_frag *frag_hdr = NULL; + 
const struct ovs_16aligned_ip6_frag *frag_hdr; - parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag, &frag_hdr); + parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag, &frag_hdr, + NULL); ovs_assert(nw_frag && frag_hdr); ovs_be16 ip6f_offlg = frag_hdr->ip6f_offlg; *start_data_byte = ntohs(ip6f_offlg & IP6F_OFF_MASK) + @@ -797,7 +804,8 @@ ipf_is_frag_duped(const struct ipf_frag *frag_list, int last_inuse_idx, static bool ipf_process_frag(struct ipf *ipf, struct ipf_list *ipf_list, struct dp_packet *pkt, uint16_t start_data_byte, - uint16_t end_data_byte, bool ff, bool lf, bool v6) + uint16_t end_data_byte, bool ff, bool lf, bool v6, + struct reassembled_pkt **rp) OVS_REQUIRES(ipf->ipf_lock) { bool duped_frag = ipf_is_frag_duped(ipf_list->frag_list, @@ -818,7 +826,7 @@ ipf_process_frag(struct ipf *ipf, struct ipf_list *ipf_list, ipf_list->last_inuse_idx++; atomic_count_inc(&ipf->nfrag); ipf_count(ipf, v6, IPF_NFRAGS_ACCEPTED); - ipf_list_state_transition(ipf, ipf_list, ff, lf, v6); + *rp = ipf_list_state_transition(ipf, ipf_list, ff, lf, v6); } else { OVS_NOT_REACHED(); } @@ -851,7 +859,8 @@ ipf_list_init(struct ipf_list *ipf_list, struct ipf_list_key *key, * to a list of fragemnts. */ static bool ipf_handle_frag(struct ipf *ipf, struct dp_packet *pkt, ovs_be16 dl_type, - uint16_t zone, long long now, uint32_t hash_basis) + uint16_t zone, long long now, uint32_t hash_basis, + struct reassembled_pkt **rp) OVS_REQUIRES(ipf->ipf_lock) { struct ipf_list_key key; @@ -920,7 +929,7 @@ ipf_handle_frag(struct ipf *ipf, struct dp_packet *pkt, ovs_be16 dl_type, } return ipf_process_frag(ipf, ipf_list, pkt, start_data_byte, - end_data_byte, ff, lf, v6); + end_data_byte, ff, lf, v6, rp); } /* Filters out fragments from a batch of fragments and adjust the batch. 
*/ @@ -939,11 +948,17 @@ ipf_extract_frags_from_batch(struct ipf *ipf, struct dp_packet_batch *pb, || (dl_type == htons(ETH_TYPE_IPV6) && ipf_is_valid_v6_frag(ipf, pkt)))) { + struct reassembled_pkt *rp = NULL; ovs_mutex_lock(&ipf->ipf_lock); - if (!ipf_handle_frag(ipf, pkt, dl_type, zone, now, hash_basis)) { + if (!ipf_handle_frag(ipf, pkt, dl_type, zone, now, hash_basis, + &rp)) { dp_packet_batch_refill(pb, pkt, pb_idx); } else { + if (rp && !dp_packet_batch_is_full(pb)) { + dp_packet_batch_refill(pb, rp->pkt, pb_idx); + rp->list->reass_execute_ctx = rp->pkt; + } dp_packet_delete(pkt); } ovs_mutex_unlock(&ipf->ipf_lock); @@ -1061,6 +1076,9 @@ ipf_send_completed_frags(struct ipf *ipf, struct dp_packet_batch *pb, struct ipf_list *ipf_list; LIST_FOR_EACH_SAFE (ipf_list, list_node, &ipf->frag_complete_list) { + if ((ipf_list->key.dl_type == htons(ETH_TYPE_IPV6)) != v6) { + continue; + } if (ipf_send_frags_in_list(ipf, ipf_list, pb, IPF_FRAG_COMPLETED_LIST, v6, now)) { ipf_completed_list_clean(&ipf->frag_lists, ipf_list); @@ -1094,6 +1112,9 @@ ipf_send_expired_frags(struct ipf *ipf, struct dp_packet_batch *pb, size_t lists_removed = 0; LIST_FOR_EACH_SAFE (ipf_list, list_node, &ipf->frag_exp_list) { + if ((ipf_list->key.dl_type == htons(ETH_TYPE_IPV6)) != v6) { + continue; + } if (now <= ipf_list->expiration || lists_removed >= IPF_FRAG_LIST_MAX_EXPIRED) { break; @@ -1114,7 +1135,8 @@ ipf_send_expired_frags(struct ipf *ipf, struct dp_packet_batch *pb, /* Adds a reassmebled packet to a packet batch to be processed by the caller. 
*/ static void -ipf_execute_reass_pkts(struct ipf *ipf, struct dp_packet_batch *pb) +ipf_execute_reass_pkts(struct ipf *ipf, struct dp_packet_batch *pb, + ovs_be16 dl_type) { if (ovs_list_is_empty(&ipf->reassembled_pkt_list)) { return; @@ -1125,6 +1147,7 @@ ipf_execute_reass_pkts(struct ipf *ipf, struct dp_packet_batch *pb) LIST_FOR_EACH_SAFE (rp, rp_list_node, &ipf->reassembled_pkt_list) { if (!rp->list->reass_execute_ctx && + rp->list->key.dl_type == dl_type && ipf_dp_packet_batch_add(pb, rp->pkt, false)) { rp->list->reass_execute_ctx = rp->pkt; } @@ -1185,7 +1208,9 @@ ipf_post_execute_reass_pkts(struct ipf *ipf, } else { struct ip_header *l3_frag = dp_packet_l3(frag_i->pkt); struct ip_header *l3_reass = dp_packet_l3(pkt); - if (!dp_packet_hwol_is_ipv4(frag_i->pkt)) { + if (dp_packet_hwol_tx_ip_csum(frag_i->pkt)) { + dp_packet_ol_reset_ip_csum_good(frag_i->pkt); + } else { ovs_be32 reass_ip = get_16aligned_be32(&l3_reass->ip_src); ovs_be32 frag_ip = @@ -1233,7 +1258,7 @@ ipf_preprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb, } if (ipf_get_enabled(ipf) || atomic_count_get(&ipf->nfrag)) { - ipf_execute_reass_pkts(ipf, pb); + ipf_execute_reass_pkts(ipf, pb, dl_type); } } diff --git a/lib/jhash.c b/lib/jhash.c index c59b51b6113..a8e3f457b94 100644 --- a/lib/jhash.c +++ b/lib/jhash.c @@ -96,18 +96,18 @@ jhash_words(const uint32_t *p, size_t n, uint32_t basis) uint32_t jhash_bytes(const void *p_, size_t n, uint32_t basis) { - const uint32_t *p = p_; + const uint8_t *p = p_; uint32_t a, b, c; a = b = c = 0xdeadbeef + n + basis; while (n >= 12) { - a += get_unaligned_u32(p); - b += get_unaligned_u32(p + 1); - c += get_unaligned_u32(p + 2); + a += get_unaligned_u32(ALIGNED_CAST(const uint32_t *, p)); + b += get_unaligned_u32(ALIGNED_CAST(const uint32_t *, p + 4)); + c += get_unaligned_u32(ALIGNED_CAST(const uint32_t *, p + 8)); jhash_mix(&a, &b, &c); n -= 12; - p += 3; + p += 12; } if (n) { diff --git a/lib/json.c b/lib/json.c index 3267a619633..001f6e6ab79 
100644 --- a/lib/json.c +++ b/lib/json.c @@ -24,13 +24,21 @@ #include #include +#include "cooperative-multitasking.h" #include "openvswitch/dynamic-string.h" #include "hash.h" +#include "json.h" #include "openvswitch/shash.h" #include "unicode.h" #include "util.h" #include "uuid.h" +/* Non-public JSSF_* flags. Must not overlap with public ones defined + * in include/openvswitch/json.h. */ +enum { + JSSF_YIELD = 1 << 7, +}; + /* The type of a JSON token. */ enum json_token_type { T_EOF = 0, @@ -189,6 +197,14 @@ json_serialized_object_create(const struct json *src) return json; } +struct json * +json_serialized_object_create_with_yield(const struct json *src) +{ + struct json *json = json_create(JSON_SERIALIZED_OBJECT); + json->string = json_to_string(src, JSSF_SORT | JSSF_YIELD); + return json; +} + struct json * json_array_create_empty(void) { @@ -257,6 +273,21 @@ json_array_create_3(struct json *elem0, struct json *elem1, struct json *elem2) return json_array_create(elems, 3); } +bool +json_array_contains_string(const struct json *json, const char *str) +{ + ovs_assert(json->type == JSON_ARRAY); + + for (size_t i = 0; i < json->array.n; i++) { + const struct json *elem = json->array.elems[i]; + + if (elem->type == JSON_STRING && !strcmp(json_string(elem), str)) { + return true; + } + } + return false; +} + struct json * json_object_create(void) { @@ -360,20 +391,20 @@ json_integer(const struct json *json) return json->integer; } -static void json_destroy_object(struct shash *object); -static void json_destroy_array(struct json_array *array); +static void json_destroy_object(struct shash *object, bool yield); +static void json_destroy_array(struct json_array *array, bool yield); /* Frees 'json' and everything it points to, recursively. 
*/ void -json_destroy__(struct json *json) +json_destroy__(struct json *json, bool yield) { switch (json->type) { case JSON_OBJECT: - json_destroy_object(json->object); + json_destroy_object(json->object, yield); break; case JSON_ARRAY: - json_destroy_array(&json->array); + json_destroy_array(&json->array, yield); break; case JSON_STRING: @@ -395,14 +426,22 @@ json_destroy__(struct json *json) } static void -json_destroy_object(struct shash *object) +json_destroy_object(struct shash *object, bool yield) { struct shash_node *node; + if (yield) { + cooperative_multitasking_yield(); + } + SHASH_FOR_EACH_SAFE (node, object) { struct json *value = node->data; - json_destroy(value); + if (yield) { + json_destroy_with_yield(value); + } else { + json_destroy(value); + } shash_delete(object, node); } shash_destroy(object); @@ -410,18 +449,26 @@ json_destroy_object(struct shash *object) } static void -json_destroy_array(struct json_array *array) +json_destroy_array(struct json_array *array, bool yield) { size_t i; + if (yield) { + cooperative_multitasking_yield(); + } + for (i = 0; i < array->n; i++) { - json_destroy(array->elems[i]); + if (yield) { + json_destroy_with_yield(array->elems[i]); + } else { + json_destroy(array->elems[i]); + } } free(array->elems); } -static struct json *json_clone_object(const struct shash *object); -static struct json *json_clone_array(const struct json_array *array); +static struct json *json_deep_clone_object(const struct shash *object); +static struct json *json_deep_clone_array(const struct json_array *array); /* Returns a deep copy of 'json'. 
*/ struct json * @@ -429,10 +476,10 @@ json_deep_clone(const struct json *json) { switch (json->type) { case JSON_OBJECT: - return json_clone_object(json->object); + return json_deep_clone_object(json->object); case JSON_ARRAY: - return json_clone_array(&json->array); + return json_deep_clone_array(&json->array); case JSON_STRING: return json_string_create(json->string); @@ -464,7 +511,7 @@ json_nullable_clone(const struct json *json) } static struct json * -json_clone_object(const struct shash *object) +json_deep_clone_object(const struct shash *object) { struct shash_node *node; struct json *json; @@ -472,20 +519,20 @@ json_clone_object(const struct shash *object) json = json_object_create(); SHASH_FOR_EACH (node, object) { struct json *value = node->data; - json_object_put(json, node->name, json_clone(value)); + json_object_put(json, node->name, json_deep_clone(value)); } return json; } static struct json * -json_clone_array(const struct json_array *array) +json_deep_clone_array(const struct json_array *array) { struct json **elems; size_t i; elems = xmalloc(array->n * sizeof *elems); for (i = 0; i < array->n; i++) { - elems[i] = json_clone(array->elems[i]); + elems[i] = json_deep_clone(array->elems[i]); } return json_array_create(elems, array->n); } @@ -1649,6 +1696,10 @@ json_serialize_object(const struct shash *object, struct json_serializer *s) s->depth++; indent_line(s); + if (s->flags & JSSF_YIELD) { + cooperative_multitasking_yield(); + } + if (s->flags & JSSF_SORT) { const struct shash_node **nodes; size_t n, i; @@ -1682,6 +1733,10 @@ json_serialize_array(const struct json_array *array, struct json_serializer *s) ds_put_char(ds, '['); s->depth++; + if (s->flags & JSSF_YIELD) { + cooperative_multitasking_yield(); + } + if (array->n > 0) { indent_line(s); diff --git a/lib/json.h b/lib/json.h new file mode 100644 index 00000000000..4ad440b396f --- /dev/null +++ b/lib/json.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024 Canonical Ltd. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef JSON_H +#define JSON_H 1 + +#include "openvswitch/json.h" + +static inline void +json_destroy_with_yield(struct json *json) +{ + if (json && !--json->count) { + json_destroy__(json, true); + } +} + +struct json *json_serialized_object_create_with_yield(const struct json *); + +#endif /* JSON_H */ diff --git a/lib/jsonrpc.c b/lib/jsonrpc.c index c8ce5362e16..f1ef709502c 100644 --- a/lib/jsonrpc.c +++ b/lib/jsonrpc.c @@ -221,19 +221,19 @@ jsonrpc_log_msg(const struct jsonrpc *rpc, const char *title, } if (msg->params) { ds_put_cstr(&s, ", params="); - json_to_ds(msg->params, 0, &s); + json_to_ds(msg->params, JSSF_SORT, &s); } if (msg->result) { ds_put_cstr(&s, ", result="); - json_to_ds(msg->result, 0, &s); + json_to_ds(msg->result, JSSF_SORT, &s); } if (msg->error) { ds_put_cstr(&s, ", error="); - json_to_ds(msg->error, 0, &s); + json_to_ds(msg->error, JSSF_SORT, &s); } if (msg->id) { ds_put_cstr(&s, ", id="); - json_to_ds(msg->id, 0, &s); + json_to_ds(msg->id, JSSF_SORT, &s); } VLOG_DBG("%s: %s %s%s", rpc->name, title, jsonrpc_msg_type_to_string(msg->type), ds_cstr(&s)); @@ -1337,6 +1337,15 @@ jsonrpc_session_set_dscp(struct jsonrpc_session *s, uint8_t dscp) } } +void +jsonrpc_session_set_options(struct jsonrpc_session *s, + const struct jsonrpc_session_options *options) +{ + jsonrpc_session_set_max_backoff(s, options->max_backoff); + jsonrpc_session_set_probe_interval(s, 
options->probe_interval); + jsonrpc_session_set_dscp(s, options->dscp); +} + /* Sets thresholds for send backlog. If send backlog contains more than * 'max_n_msgs' messages or is larger than 'max_backlog_bytes' bytes, * connection will be closed (then reconnected, if that feature is enabled). */ diff --git a/lib/jsonrpc.h b/lib/jsonrpc.h index 2aa97d3fe6d..1baffcd8071 100644 --- a/lib/jsonrpc.h +++ b/lib/jsonrpc.h @@ -139,6 +139,14 @@ void jsonrpc_session_enable_reconnect(struct jsonrpc_session *); void jsonrpc_session_force_reconnect(struct jsonrpc_session *); void jsonrpc_session_reset_backoff(struct jsonrpc_session *); +struct jsonrpc_session_options { + int max_backoff; /* Maximum reconnection backoff, in msec. */ + int probe_interval; /* Max idle time before probing, in msec. */ + uint8_t dscp; /* Dscp value for passive connections. */ +}; + +void jsonrpc_session_set_options(struct jsonrpc_session *, + const struct jsonrpc_session_options *); void jsonrpc_session_set_max_backoff(struct jsonrpc_session *, int max_backoff); void jsonrpc_session_set_probe_interval(struct jsonrpc_session *, diff --git a/lib/lacp.h b/lib/lacp.h index 1ca06f762ba..5ba17c36a5c 100644 --- a/lib/lacp.h +++ b/lib/lacp.h @@ -24,7 +24,7 @@ /* LACP Protocol Implementation. */ enum lacp_status { - LACP_NEGOTIATED, /* Successful LACP negotations. */ + LACP_NEGOTIATED, /* Successful LACP negotiations. */ LACP_CONFIGURED, /* LACP is enabled but not negotiated. */ LACP_DISABLED /* LACP is not enabled. 
*/ }; diff --git a/lib/latch-unix.c b/lib/latch-unix.c index f4d10c39a03..c62bb024b44 100644 --- a/lib/latch-unix.c +++ b/lib/latch-unix.c @@ -71,7 +71,7 @@ latch_set(struct latch *latch) bool latch_is_set(const struct latch *latch) { - struct pollfd pfd; + struct pollfd pfd = {0}; int retval; pfd.fd = latch->fds[0]; diff --git a/lib/learn.c b/lib/learn.c index a40209ec0b8..a62add2fda0 100644 --- a/lib/learn.c +++ b/lib/learn.c @@ -241,7 +241,7 @@ static char * OVS_WARN_UNUSED_RESULT learn_parse_spec(const char *orig, char *name, char *value, const struct ofputil_port_map *port_map, struct ofpact_learn_spec *spec, - struct ofpbuf *ofpacts, struct match *match) + struct ofpbuf *ofpacts) { /* Parse destination and check prerequisites. */ struct mf_subfield dst; @@ -275,14 +275,14 @@ learn_parse_spec(const char *orig, char *name, char *value, } else { char *tail; /* Partial field value. */ - if (parse_int_string(value, (uint8_t *)&imm, + if (parse_int_string(value, imm.b, dst.field->n_bytes, &tail) || *tail != 0) { imm_error = xasprintf("%s: cannot parse integer value", orig); } if (!imm_error && - !bitwise_is_all_zeros(&imm, dst.field->n_bytes, + !bitwise_is_all_zeros(imm.b, dst.field->n_bytes, dst.n_bits, dst.field->n_bytes * 8 - dst.n_bits)) { struct ds ds; @@ -304,15 +304,13 @@ learn_parse_spec(const char *orig, char *name, char *value, spec->src_type = NX_LEARN_SRC_IMMEDIATE; - /* Update 'match' to allow for satisfying destination - * prerequisites. */ - mf_write_subfield_value(&dst, &imm, match); - /* Push value last, as this may reallocate 'spec'! 
*/ unsigned int imm_bytes = DIV_ROUND_UP(dst.n_bits, 8); uint8_t *src_imm = ofpbuf_put_zeros(ofpacts, OFPACT_ALIGN(imm_bytes)); - memcpy(src_imm, &imm, imm_bytes); + + memcpy(src_imm, &imm.b[dst.field->n_bytes - imm_bytes], + imm_bytes); free(error); return NULL; @@ -391,7 +389,6 @@ learn_parse__(char *orig, char *arg, const struct ofputil_port_map *port_map, struct ofpbuf *ofpacts) { struct ofpact_learn *learn; - struct match match; char *name, *value; learn = ofpact_put_LEARN(ofpacts); @@ -400,7 +397,6 @@ learn_parse__(char *orig, char *arg, const struct ofputil_port_map *port_map, learn->priority = OFP_DEFAULT_PRIORITY; learn->table_id = 1; - match_init_catchall(&match); while (ofputil_parse_key_value(&arg, &name, &value)) { if (!strcmp(name, "table")) { if (!ofputil_table_from_string(value, table_map, @@ -448,7 +444,7 @@ learn_parse__(char *orig, char *arg, const struct ofputil_port_map *port_map, spec = ofpbuf_put_zeros(ofpacts, sizeof *spec); error = learn_parse_spec(orig, name, value, port_map, - spec, ofpacts, &match); + spec, ofpacts); if (error) { return error; } diff --git a/lib/learning-switch.c b/lib/learning-switch.c index 8102475cae5..cdf42935c1d 100644 --- a/lib/learning-switch.c +++ b/lib/learning-switch.c @@ -569,6 +569,7 @@ process_packet_in(struct lswitch *sw, const struct ofp_header *oh) } /* Prepare packet_out in case we need one. 
*/ + match_init_catchall(&po.flow_metadata); po.buffer_id = buffer_id; if (buffer_id == UINT32_MAX) { po.packet = dp_packet_data(&pkt); diff --git a/lib/libopenvswitch.pc.in b/lib/libopenvswitch.pc.in index 44fbb1f9fd2..a5f4d39479a 100644 --- a/lib/libopenvswitch.pc.in +++ b/lib/libopenvswitch.pc.in @@ -7,5 +7,5 @@ Name: libopenvswitch Description: Open vSwitch library Version: @VERSION@ Libs: -L${libdir} -lopenvswitch -Libs.private: @LIBS@ @SSL_LIBS@ @CAPNG_LDADD@ @LIBBPF_LDADD@ +Libs.private: @LIBS@ @SSL_LIBS@ @CAPNG_LDADD@ Cflags: -I${includedir} diff --git a/lib/lldp/lldp.c b/lib/lldp/lldp.c index dfeb2a80024..6fdcfef5694 100644 --- a/lib/lldp/lldp.c +++ b/lib/lldp/lldp.c @@ -583,6 +583,7 @@ lldp_decode(struct lldpd *cfg OVS_UNUSED, char *frame, int s, switch(tlv_subtype) { case LLDP_TLV_AA_ELEMENT_SUBTYPE: + CHECK_TLV_SIZE(50, "ELEMENT"); PEEK_BYTES(&msg_auth_digest, sizeof msg_auth_digest); aa_element_dword = PEEK_UINT32; @@ -629,6 +630,7 @@ lldp_decode(struct lldpd *cfg OVS_UNUSED, char *frame, int s, break; case LLDP_TLV_AA_ISID_VLAN_ASGNS_SUBTYPE: + CHECK_TLV_SIZE(36, "ISID_VLAN_ASGNS"); PEEK_BYTES(&msg_auth_digest, sizeof msg_auth_digest); /* Subtract off tlv type and length (2Bytes) + OUI (3B) + diff --git a/lib/mac-learning.c b/lib/mac-learning.c index 85b97f07c4d..5d498cfd568 100644 --- a/lib/mac-learning.c +++ b/lib/mac-learning.c @@ -182,12 +182,18 @@ get_lru(struct mac_learning *ml, struct mac_entry **e) OVS_REQ_RDLOCK(ml->rwlock) { if (!ovs_list_is_empty(&ml->lrus)) { - *e = mac_entry_from_lru_node(ml->lrus.next); - return true; - } else { - *e = NULL; - return false; + struct mac_entry *entry; + + LIST_FOR_EACH (entry, lru_node, &ml->lrus) { + if (entry->expires != MAC_ENTRY_AGE_STATIC_ENTRY) { + *e = entry; + return true; + } + } } + + *e = NULL; + return false; } static unsigned int @@ -456,12 +462,15 @@ bool is_mac_learning_update_needed(const struct mac_learning *ml, struct eth_addr src, int vlan, bool is_gratuitous_arp, bool is_bond, - void 
*in_port) + const void *in_port, bool *is_static_move) OVS_REQ_RDLOCK(ml->rwlock) { struct mac_entry *mac; + bool is_port_move; int age; + *is_static_move = false; + if (!mac_learning_may_learn(ml, src, vlan)) { return false; } @@ -472,14 +481,13 @@ is_mac_learning_update_needed(const struct mac_learning *ml, return true; } + /* Check whether address is on a different port. */ + is_port_move = mac_entry_get_port(ml, mac) != in_port; + age = mac_entry_age(ml, mac); /* If mac is a static entry, then there is no need to update. */ if (age == MAC_ENTRY_AGE_STATIC_ENTRY) { - /* Coverage counter to increment when a packet with same - * static-mac appears on a different port. */ - if (mac_entry_get_port(ml, mac) != in_port) { - COVERAGE_INC(mac_learning_static_none_move); - } + *is_static_move = is_port_move; return false; } @@ -500,7 +508,7 @@ is_mac_learning_update_needed(const struct mac_learning *ml, } } - return mac_entry_get_port(ml, mac) != in_port /* ofbundle */; + return is_port_move; } /* Updates MAC learning table 'ml' given that a packet matching 'src' was @@ -568,7 +576,8 @@ mac_learning_update(struct mac_learning *ml, struct eth_addr src, void *in_port) OVS_EXCLUDED(ml->rwlock) { - bool need_update; + bool is_static_move = false; + bool need_update = false; bool updated = false; /* Don't learn the OFPP_NONE port. */ @@ -576,8 +585,14 @@ mac_learning_update(struct mac_learning *ml, struct eth_addr src, /* First try the common case: no change to MAC learning table. */ ovs_rwlock_rdlock(&ml->rwlock); need_update = is_mac_learning_update_needed(ml, src, vlan, - is_gratuitous_arp, is_bond, - in_port); + is_gratuitous_arp, + is_bond, in_port, + &is_static_move); + if (is_static_move) { + /* Coverage counter to increment when a packet with same + * static-mac appears on a different port. 
*/ + COVERAGE_INC(mac_learning_static_none_move); + } ovs_rwlock_unlock(&ml->rwlock); if (need_update) { @@ -644,25 +659,10 @@ mac_learning_expire(struct mac_learning *ml, struct mac_entry *e) void mac_learning_flush(struct mac_learning *ml) { - struct mac_entry *e, *first_static_mac = NULL; - - while (get_lru(ml, &e) && (e != first_static_mac)) { - - /* Static mac should not be evicted. */ - if (MAC_ENTRY_AGE_STATIC_ENTRY == e->expires) { - - /* Make note of first static-mac encountered, so that this while - * loop will break on visting this mac again via get_lru(). */ - if (!first_static_mac) { - first_static_mac = e; - } + struct mac_entry *e; - /* Remove from lru head and append it to tail. */ - ovs_list_remove(&e->lru_node); - ovs_list_push_back(&ml->lrus, &e->lru_node); - } else { - mac_learning_expire(ml, e); - } + while (get_lru(ml, &e)) { + mac_learning_expire(ml, e); } hmap_shrink(&ml->table); } diff --git a/lib/mac-learning.h b/lib/mac-learning.h index 0502b281153..f1e1e8e2ee7 100644 --- a/lib/mac-learning.h +++ b/lib/mac-learning.h @@ -230,11 +230,10 @@ bool mac_learning_may_learn(const struct mac_learning *ml, const struct eth_addr src_mac, uint16_t vlan) OVS_REQ_RDLOCK(ml->rwlock); -bool -is_mac_learning_update_needed(const struct mac_learning *ml, - struct eth_addr src, int vlan, - bool is_gratuitous_arp, bool is_bond, - void *in_port) +bool is_mac_learning_update_needed(const struct mac_learning *ml, + struct eth_addr src, int vlan, + bool is_gratuitous_arp, bool is_bond, + const void *in_port, bool *is_static_move) OVS_REQ_RDLOCK(ml->rwlock); struct mac_entry *mac_learning_insert(struct mac_learning *ml, const struct eth_addr src, diff --git a/lib/match.c b/lib/match.c index 0b9dc4278c1..9b7e06e0c7f 100644 --- a/lib/match.c +++ b/lib/match.c @@ -1618,7 +1618,7 @@ match_format(const struct match *match, ds_put_char(s, ','); } for (i = 0; i < FLOW_MAX_VLAN_HEADERS; i++) { - char str_i[8]; + char str_i[12]; if (!wc->masks.vlans[i].tci) { break; diff 
--git a/lib/mcast-snooping.c b/lib/mcast-snooping.c index 029ca28558b..dc5164b41c7 100644 --- a/lib/mcast-snooping.c +++ b/lib/mcast-snooping.c @@ -57,6 +57,30 @@ mcast_snooping_flood_unreg(const struct mcast_snooping *ms) return ms->flood_unreg; } +char * +mcast_snooping_group_protocol_str(enum mcast_group_proto grp_proto) +{ + switch (grp_proto) { + case MCAST_GROUP_IGMPV1: + return "IGMPv1"; + break; + case MCAST_GROUP_IGMPV2: + return "IGMPv2"; + break; + case MCAST_GROUP_IGMPV3: + return "IGMPv3"; + break; + case MCAST_GROUP_MLDV1: + return "MLDv1"; + break; + case MCAST_GROUP_MLDV2: + return "MLDv2"; + break; + default: + return "UNKNOWN"; + } +} + bool mcast_snooping_is_query(ovs_be16 igmp_type) { @@ -389,7 +413,8 @@ mcast_snooping_prune_expired(struct mcast_snooping *ms, bool mcast_snooping_add_group(struct mcast_snooping *ms, const struct in6_addr *addr, - uint16_t vlan, void *port) + uint16_t vlan, void *port, + enum mcast_group_proto grp_proto) OVS_REQ_WRLOCK(ms->rwlock) { bool learned; @@ -424,6 +449,9 @@ mcast_snooping_add_group(struct mcast_snooping *ms, } mcast_group_insert_bundle(ms, grp, port, ms->idle_time); + /* update the protocol version. */ + grp->protocol_version = grp_proto; + /* Mark 'grp' as recently used. 
*/ ovs_list_push_back(&ms->group_lru, &grp->group_node); return learned; @@ -431,11 +459,12 @@ mcast_snooping_add_group(struct mcast_snooping *ms, bool mcast_snooping_add_group4(struct mcast_snooping *ms, ovs_be32 ip4, - uint16_t vlan, void *port) + uint16_t vlan, void *port, + enum mcast_group_proto grp_proto) OVS_REQ_WRLOCK(ms->rwlock) { struct in6_addr addr = in6_addr_mapped_ipv4(ip4); - return mcast_snooping_add_group(ms, &addr, vlan, port); + return mcast_snooping_add_group(ms, &addr, vlan, port, grp_proto); } int @@ -478,7 +507,8 @@ mcast_snooping_add_report(struct mcast_snooping *ms, || record->type == IGMPV3_CHANGE_TO_INCLUDE_MODE)) { ret = mcast_snooping_leave_group4(ms, ip4, vlan, port); } else { - ret = mcast_snooping_add_group4(ms, ip4, vlan, port); + ret = mcast_snooping_add_group4(ms, ip4, vlan, port, + MCAST_GROUP_IGMPV3); } if (ret) { count++; @@ -513,7 +543,8 @@ mcast_snooping_add_mld(struct mcast_snooping *ms, switch (mld->type) { case MLD_REPORT: - ret = mcast_snooping_add_group(ms, addr, vlan, port); + ret = mcast_snooping_add_group(ms, addr, vlan, port, + MCAST_GROUP_MLDV1); if (ret) { count++; } @@ -545,7 +576,8 @@ mcast_snooping_add_mld(struct mcast_snooping *ms, || record->type == IGMPV3_CHANGE_TO_INCLUDE_MODE)) { ret = mcast_snooping_leave_group(ms, addr, vlan, port); } else { - ret = mcast_snooping_add_group(ms, addr, vlan, port); + ret = mcast_snooping_add_group(ms, addr, vlan, port, + MCAST_GROUP_MLDV2); } if (ret) { count++; @@ -946,8 +978,9 @@ mcast_snooping_wait(struct mcast_snooping *ms) void mcast_snooping_flush_bundle(struct mcast_snooping *ms, void *port) { - struct mcast_group *g; struct mcast_mrouter_bundle *m; + struct mcast_port_bundle *p; + struct mcast_group *g; if (!mcast_snooping_enabled(ms)) { return; @@ -971,5 +1004,19 @@ mcast_snooping_flush_bundle(struct mcast_snooping *ms, void *port) } } + LIST_FOR_EACH_SAFE (p, node, &ms->fport_list) { + if (p->port == port) { + mcast_snooping_flush_port(p); + ms->need_revalidate = 
true; + } + } + + LIST_FOR_EACH_SAFE (p, node, &ms->rport_list) { + if (p->port == port) { + mcast_snooping_flush_port(p); + ms->need_revalidate = true; + } + } + ovs_rwlock_unlock(&ms->rwlock); } diff --git a/lib/mcast-snooping.h b/lib/mcast-snooping.h index f120405da57..de42cf826ba 100644 --- a/lib/mcast-snooping.h +++ b/lib/mcast-snooping.h @@ -39,6 +39,15 @@ struct mcast_snooping; /* Time, in seconds, before expiring a mrouter_port due to inactivity. */ #define MCAST_MROUTER_PORT_IDLE_TIME 180 +/* Multicast group protocol. */ +enum mcast_group_proto { + MCAST_GROUP_IGMPV1 = 0, + MCAST_GROUP_IGMPV2, + MCAST_GROUP_IGMPV3, + MCAST_GROUP_MLDV1, + MCAST_GROUP_MLDV2, +}; + /* Multicast group entry. * Guarded by owning 'mcast_snooping''s rwlock. */ struct mcast_group { @@ -51,6 +60,9 @@ struct mcast_group { /* VLAN tag. */ uint16_t vlan; + /* Multicast group IPv6/IPv4 Protocol version IGMPv1,2,3 or MLDv1,2 */ + enum mcast_group_proto protocol_version; + /* Node in parent struct mcast_snooping group_lru. */ struct ovs_list group_node OVS_GUARDED; @@ -185,10 +197,12 @@ mcast_snooping_lookup4(const struct mcast_snooping *ms, ovs_be32 ip4, /* Learning. */ bool mcast_snooping_add_group(struct mcast_snooping *ms, const struct in6_addr *addr, - uint16_t vlan, void *port) + uint16_t vlan, void *port, + enum mcast_group_proto grp_proto) OVS_REQ_WRLOCK(ms->rwlock); bool mcast_snooping_add_group4(struct mcast_snooping *ms, ovs_be32 ip4, - uint16_t vlan, void *port) + uint16_t vlan, void *port, + enum mcast_group_proto grp_proto) OVS_REQ_WRLOCK(ms->rwlock); int mcast_snooping_add_report(struct mcast_snooping *ms, const struct dp_packet *p, @@ -210,6 +224,7 @@ bool mcast_snooping_add_mrouter(struct mcast_snooping *ms, uint16_t vlan, OVS_REQ_WRLOCK(ms->rwlock); bool mcast_snooping_is_query(ovs_be16 igmp_type); bool mcast_snooping_is_membership(ovs_be16 igmp_type); +char *mcast_snooping_group_protocol_str(enum mcast_group_proto grp_proto); /* Flush. 
*/ void mcast_snooping_mdb_flush(struct mcast_snooping *ms); diff --git a/lib/meta-flow.c b/lib/meta-flow.c index c576ae6202a..499be04b608 100644 --- a/lib/meta-flow.c +++ b/lib/meta-flow.c @@ -71,8 +71,10 @@ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); #define MF_VALUE_EXACT_64 MF_VALUE_EXACT_32, MF_VALUE_EXACT_32 #define MF_VALUE_EXACT_128 MF_VALUE_EXACT_64, MF_VALUE_EXACT_64 #define MF_VALUE_EXACT_INITIALIZER { .tun_metadata = { MF_VALUE_EXACT_128 } } +#define MF_SUBVALUE_EXACT_INITIALIZER { .u8 = { MF_VALUE_EXACT_128 } } const union mf_value exact_match_mask = MF_VALUE_EXACT_INITIALIZER; +const union mf_subvalue exact_sub_match_mask = MF_SUBVALUE_EXACT_INITIALIZER; static void nxm_init(void); @@ -2751,8 +2753,8 @@ static char * mf_from_integer_string(const struct mf_field *mf, const char *s, uint8_t *valuep, uint8_t *maskp) { + const char *err_str; char *tail; - const char *err_str = ""; int err; err = parse_int_string(s, valuep, mf->n_bytes, &tail); @@ -2785,8 +2787,8 @@ mf_from_integer_string(const struct mf_field *mf, const char *s, static char * mf_from_packet_type_string(const char *s, ovs_be32 *packet_type) { + const char *err_str; char *tail; - const char *err_str = ""; int err; if (*s != '(') { @@ -3676,3 +3678,28 @@ mf_bitmap_not(struct mf_bitmap x) bitmap_not(x.bm, MFF_N_IDS); return x; } + +void +mf_set_mask_l3_prereqs(const struct mf_field *mf, const struct flow *fl, + struct flow_wildcards *wc) +{ + if (is_ip_any(fl) && + ((mf->id == MFF_IPV4_SRC) || + (mf->id == MFF_IPV4_DST) || + (mf->id == MFF_IPV6_SRC) || + (mf->id == MFF_IPV6_DST) || + (mf->id == MFF_IPV6_LABEL) || + (mf->id == MFF_IP_DSCP) || + (mf->id == MFF_IP_ECN) || + (mf->id == MFF_IP_TTL))) { + WC_MASK_FIELD(wc, nw_proto); + } else if ((fl->dl_type == htons(ETH_TYPE_ARP)) && + ((mf->id == MFF_ARP_OP) || + (mf->id == MFF_ARP_SHA) || + (mf->id == MFF_ARP_THA) || + (mf->id == MFF_ARP_SPA) || + (mf->id == MFF_ARP_TPA))) { + /* mask only the lower 8 bits. 
*/ + wc->masks.nw_proto = 0xff; + } +} diff --git a/lib/meta-flow.xml b/lib/meta-flow.xml index a1a20366d40..ac72a44bce4 100644 --- a/lib/meta-flow.xml +++ b/lib/meta-flow.xml @@ -3517,23 +3517,24 @@ actions=clone(load:0->NXM_OF_IN_PORT[],output:123)

+tab(;); r r r r r. -Criteria OpenFlow 1.0 OpenFlow 1.1 OpenFlow 1.2+ NXM -\_ \_ \_ \_ \_ -[1] \fL????\fR/\fL1\fR,\fL??\fR/\fL?\fR \fL????\fR/\fL1\fR,\fL??\fR/\fL?\fR \fL0000\fR/\fL0000\fR,\fL--\fR \fL0000\fR/\fL0000\fR -[2] \fLffff\fR/\fL0\fR,\fL??\fR/\fL?\fR \fLffff\fR/\fL0\fR,\fL??\fR/\fL?\fR \fL0000\fR/\fLffff\fR,\fL--\fR \fL0000\fR/\fLffff\fR -[3] \fL0xxx\fR/\fL0\fR,\fL??\fR/\fL1\fR \fL0xxx\fR/\fL0\fR,\fL??\fR/\fL1\fR \fL1xxx\fR/\fLffff\fR,\fL--\fR \fL1xxx\fR/\fL1fff\fR -[4] \fL????\fR/\fL1\fR,\fL0y\fR/\fL0\fR \fLfffe\fR/\fL0\fR,\fL0y\fR/\fL0\fR \fL1000\fR/\fL1000\fR,\fL0y\fR \fLz000\fR/\fLf000\fR -[5] \fL0xxx\fR/\fL0\fR,\fL0y\fR/\fL0\fR \fL0xxx\fR/\fL0\fR,\fL0y\fR/\fL0\fR \fL1xxx\fR/\fLffff\fR,\fL0y\fR \fLzxxx\fR/\fLffff\fR +Criteria;OpenFlow 1.0;OpenFlow 1.1;OpenFlow 1.2+;NXM +\_;\_;\_;\_;\_ +[1];\fL????\fR/\fL1\fR,\fL??\fR/\fL?\fR;\fL????\fR/\fL1\fR,\fL??\fR/\fL?\fR;\fL0000\fR/\fL0000\fR,\fL--\fR;\fL0000\fR/\fL0000\fR +[2];\fLffff\fR/\fL0\fR,\fL??\fR/\fL?\fR;\fLffff\fR/\fL0\fR,\fL??\fR/\fL?\fR;\fL0000\fR/\fLffff\fR,\fL--\fR;\fL0000\fR/\fLffff\fR +[3];\fL0xxx\fR/\fL0\fR,\fL??\fR/\fL1\fR;\fL0xxx\fR/\fL0\fR,\fL??\fR/\fL1\fR;\fL1xxx\fR/\fLffff\fR,\fL--\fR;\fL1xxx\fR/\fL1fff\fR +[4];\fL????\fR/\fL1\fR,\fL0y\fR/\fL0\fR;\fLfffe\fR/\fL0\fR,\fL0y\fR/\fL0\fR;\fL1000\fR/\fL1000\fR,\fL0y\fR;\fLz000\fR/\fLf000\fR +[5];\fL0xxx\fR/\fL0\fR,\fL0y\fR/\fL0\fR;\fL0xxx\fR/\fL0\fR,\fL0y\fR/\fL0\fR;\fL1xxx\fR/\fLffff\fR,\fL0y\fR;\fLzxxx\fR/\fLffff\fR .T& -r r c c r. -[6] (none) (none) \fL1001\fR/\fL1001\fR,\fL--\fR \fL1001\fR/\fL1001\fR +r c c r r. +[6];(none);(none);\fL1001\fR/\fL1001\fR,\fL--\fR;\fL1001\fR/\fL1001\fR .T& -r r c c c. -[7] (none) (none) (none) \fL3000\fR/\fL3000\fR -[8] (none) (none) (none) \fL0000\fR/\fL0fff\fR -[9] (none) (none) (none) \fL0000\fR/\fLf000\fR -[10] (none) (none) (none) \fL0000\fR/\fLefff\fR +r c c c r. 
+[7];(none);(none);(none);\fL3000\fR/\fL3000\fR +[8];(none);(none);(none);\fL0000\fR/\fL0fff\fR +[9];(none);(none);(none);\fL0000\fR/\fLf000\fR +[10];(none);(none);(none);\fL0000\fR/\fLefff\fR

@@ -4312,9 +4313,9 @@ r r c c c. - + - + diff --git a/lib/mpsc-queue.h b/lib/mpsc-queue.h index 8c7109621a1..70c2d7a01ec 100644 --- a/lib/mpsc-queue.h +++ b/lib/mpsc-queue.h @@ -116,9 +116,9 @@ struct mpsc_queue { }; #define MPSC_QUEUE_INITIALIZER(Q) { \ - .head = ATOMIC_VAR_INIT(&(Q)->stub), \ - .tail = ATOMIC_VAR_INIT(&(Q)->stub), \ - .stub = { .next = ATOMIC_VAR_INIT(NULL) }, \ + .head = &(Q)->stub, \ + .tail = &(Q)->stub, \ + .stub = { .next = NULL }, \ .read_lock = OVS_MUTEX_INITIALIZER, \ } diff --git a/lib/netdev-afxdp-pool.c b/lib/netdev-afxdp-pool.c index 3386d2dcf78..f56a7b29ece 100644 --- a/lib/netdev-afxdp-pool.c +++ b/lib/netdev-afxdp-pool.c @@ -15,6 +15,8 @@ */ #include +#include + #include "dp-packet.h" #include "netdev-afxdp-pool.h" #include "openvswitch/util.h" diff --git a/lib/netdev-afxdp-pool.h b/lib/netdev-afxdp-pool.h index f929b9489c7..6681cf539e9 100644 --- a/lib/netdev-afxdp-pool.h +++ b/lib/netdev-afxdp-pool.h @@ -19,12 +19,7 @@ #ifdef HAVE_AF_XDP -#include -#include -#include - #include "openvswitch/thread.h" -#include "ovs-atomic.h" /* LIFO ptr_array. 
*/ struct umem_pool { diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c index ca3f2431eac..54029722e0b 100644 --- a/lib/netdev-afxdp.c +++ b/lib/netdev-afxdp.c @@ -21,6 +21,11 @@ #include "netdev-afxdp.h" #include "netdev-afxdp-pool.h" +#ifdef HAVE_LIBXDP +#include +#else +#include +#endif #include #include #include @@ -29,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -44,6 +50,7 @@ #include "openvswitch/list.h" #include "openvswitch/thread.h" #include "openvswitch/vlog.h" +#include "ovs-atomic.h" #include "ovs-numa.h" #include "packets.h" #include "socket-util.h" @@ -72,7 +79,7 @@ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20); #define PROD_NUM_DESCS XSK_RING_PROD__DEFAULT_NUM_DESCS #define CONS_NUM_DESCS XSK_RING_CONS__DEFAULT_NUM_DESCS -#ifdef HAVE_XDP_NEED_WAKEUP +#ifdef XDP_USE_NEED_WAKEUP #define NEED_WAKEUP_DEFAULT true #else #define NEED_WAKEUP_DEFAULT false @@ -169,7 +176,7 @@ struct netdev_afxdp_tx_lock { ); }; -#ifdef HAVE_XDP_NEED_WAKEUP +#ifdef XDP_USE_NEED_WAKEUP static inline void xsk_rx_wakeup_if_needed(struct xsk_umem_info *umem, struct netdev *netdev, int fd) @@ -201,7 +208,7 @@ xsk_tx_need_wakeup(struct xsk_socket_info *xsk_info) return xsk_ring_prod__needs_wakeup(&xsk_info->tx); } -#else /* !HAVE_XDP_NEED_WAKEUP */ +#else /* !XDP_USE_NEED_WAKEUP */ static inline void xsk_rx_wakeup_if_needed(struct xsk_umem_info *umem OVS_UNUSED, struct netdev *netdev OVS_UNUSED, @@ -215,7 +222,7 @@ xsk_tx_need_wakeup(struct xsk_socket_info *xsk_info OVS_UNUSED) { return true; } -#endif /* HAVE_XDP_NEED_WAKEUP */ +#endif /* XDP_USE_NEED_WAKEUP */ static void netdev_afxdp_cleanup_unused_pool(struct unused_pool *pool) @@ -351,7 +358,7 @@ xsk_configure_socket(struct xsk_umem_info *umem, uint32_t ifindex, cfg.bind_flags = xdp_modes[mode].bind_flags; cfg.xdp_flags = xdp_modes[mode].xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST; -#ifdef HAVE_XDP_NEED_WAKEUP +#ifdef XDP_USE_NEED_WAKEUP if (use_need_wakeup) { cfg.bind_flags |= 
XDP_USE_NEED_WAKEUP; } @@ -377,7 +384,11 @@ xsk_configure_socket(struct xsk_umem_info *umem, uint32_t ifindex, } /* Make sure the built-in AF_XDP program is loaded. */ +#ifdef HAVE_BPF_XDP_QUERY_ID + ret = bpf_xdp_query_id(ifindex, cfg.xdp_flags, &prog_id); +#else ret = bpf_get_link_xdp_id(ifindex, &prog_id, cfg.xdp_flags); +#endif if (ret || !prog_id) { if (ret) { VLOG_ERR("Get XDP prog ID failed (%s)", ovs_strerror(errno)); @@ -423,7 +434,11 @@ xsk_configure(int ifindex, int xdp_queue_id, enum afxdp_mode mode, /* Umem memory region. */ bufs = xmalloc_pagealign(NUM_FRAMES * FRAME_SIZE); +#ifndef __CHECKER__ + /* Sparse complains about a very large memset, but it is OK in this case. + * So, hiding it from the checker. */ memset(bufs, 0, NUM_FRAMES * FRAME_SIZE); +#endif /* Create AF_XDP socket. */ umem = xsk_configure_umem(bufs, NUM_FRAMES * FRAME_SIZE); @@ -630,9 +645,9 @@ netdev_afxdp_set_config(struct netdev *netdev, const struct smap *args, } need_wakeup = smap_get_bool(args, "use-need-wakeup", NEED_WAKEUP_DEFAULT); -#ifndef HAVE_XDP_NEED_WAKEUP +#ifndef XDP_USE_NEED_WAKEUP if (need_wakeup) { - VLOG_WARN("XDP need_wakeup is not supported in libbpf."); + VLOG_WARN("XDP need_wakeup is not supported in libbpf/libxdp."); need_wakeup = false; } #endif @@ -657,8 +672,6 @@ netdev_afxdp_get_config(const struct netdev *netdev, struct smap *args) ovs_mutex_lock(&dev->mutex); smap_add_format(args, "n_rxq", "%d", netdev->n_rxq); smap_add_format(args, "xdp-mode", "%s", xdp_modes[dev->xdp_mode].name); - smap_add_format(args, "xdp-mode-in-use", "%s", - xdp_modes[dev->xdp_mode_in_use].name); smap_add_format(args, "use-need-wakeup", "%s", dev->use_need_wakeup ? "true" : "false"); ovs_mutex_unlock(&dev->mutex); @@ -742,7 +755,11 @@ xsk_remove_xdp_program(uint32_t ifindex, enum afxdp_mode mode) uint32_t ret, prog_id = 0; /* Check whether XDP program is loaded. 
*/ +#ifdef HAVE_BPF_XDP_QUERY_ID + ret = bpf_xdp_query_id(ifindex, flags, &prog_id); +#else ret = bpf_get_link_xdp_id(ifindex, &prog_id, flags); +#endif if (ret) { VLOG_ERR("Failed to get XDP prog id (%s)", ovs_strerror(errno)); return; @@ -753,7 +770,14 @@ xsk_remove_xdp_program(uint32_t ifindex, enum afxdp_mode mode) return; } - bpf_set_link_xdp_fd(ifindex, -1, flags); +#ifdef HAVE_BPF_XDP_DETACH + if (bpf_xdp_detach(ifindex, flags, NULL) != 0) { +#else + if (bpf_set_link_xdp_fd(ifindex, -1, flags) != 0) { +#endif + VLOG_ERR("Failed to detach XDP program (%s) at ifindex %d", + ovs_strerror(errno), ifindex); + } } void @@ -868,9 +892,22 @@ netdev_afxdp_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch, OVS_XDP_HEADROOM); dp_packet_set_size(packet, len); +#if __GNUC__ >= 11 && !__clang__ + /* GCC 11+ generates a false-positive warning about free() being + * called on DPBUF_AFXDP packet, but it is an impossible code path. + * Disabling a warning to avoid build failures. + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108187 */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wfree-nonheap-object" +#endif + /* Add packet into batch, increase batch->count. */ dp_packet_batch_add(batch, packet); +#if __GNUC__ && !__clang__ +#pragma GCC diagnostic pop +#endif + idx_rx++; } /* Release the RX queue. */ @@ -1156,18 +1193,18 @@ libbpf_print(enum libbpf_print_level level, return 0; } -int netdev_afxdp_init(void) -{ - libbpf_set_print(libbpf_print); - return 0; -} - int netdev_afxdp_construct(struct netdev *netdev) { + static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; struct netdev_linux *dev = netdev_linux_cast(netdev); int ret; + if (ovsthread_once_start(&once)) { + libbpf_set_print(libbpf_print); + ovsthread_once_done(&once); + } + /* Configure common netdev-linux first.
*/ ret = netdev_linux_construct(netdev); if (ret) { @@ -1328,3 +1365,22 @@ netdev_afxdp_get_stats(const struct netdev *netdev, return error; } + +int +netdev_afxdp_get_status(const struct netdev *netdev, struct smap *args) +{ + int error = netdev_linux_get_status(netdev, args); + + if (error) { + return error; + } + + struct netdev_linux *dev = netdev_linux_cast(netdev); + + ovs_mutex_lock(&dev->mutex); + smap_add_format(args, "xdp-mode", "%s", + xdp_modes[dev->xdp_mode_in_use].name); + ovs_mutex_unlock(&dev->mutex); + + return error; +} diff --git a/lib/netdev-afxdp.h b/lib/netdev-afxdp.h index e91cd102d28..236a37cc844 100644 --- a/lib/netdev-afxdp.h +++ b/lib/netdev-afxdp.h @@ -47,7 +47,6 @@ struct xsk_socket_info; int netdev_afxdp_rxq_construct(struct netdev_rxq *rxq_); void netdev_afxdp_rxq_destruct(struct netdev_rxq *rxq_); -int netdev_afxdp_init(void); int netdev_afxdp_construct(struct netdev *netdev_); void netdev_afxdp_destruct(struct netdev *netdev_); int netdev_afxdp_verify_mtu_size(const struct netdev *netdev, int mtu); @@ -63,6 +62,7 @@ int netdev_afxdp_set_config(struct netdev *netdev, const struct smap *args, int netdev_afxdp_get_config(const struct netdev *netdev, struct smap *args); int netdev_afxdp_get_stats(const struct netdev *netdev_, struct netdev_stats *stats); +int netdev_afxdp_get_status(const struct netdev *netdev, struct smap *args); int netdev_afxdp_get_custom_stats(const struct netdev *netdev, struct netdev_custom_stats *custom_stats); diff --git a/lib/netdev-bsd.c b/lib/netdev-bsd.c index 7875636cc3c..8596741aa17 100644 --- a/lib/netdev-bsd.c +++ b/lib/netdev-bsd.c @@ -1168,6 +1168,27 @@ netdev_bsd_get_features(const struct netdev *netdev, return error; } +static int +netdev_bsd_get_speed(const struct netdev *netdev, uint32_t *current, + uint32_t *max) +{ + enum netdev_features f_current, f_supported, f_advertised, f_peer; + int error; + + error = netdev_bsd_get_features(netdev, &f_current, &f_advertised, + &f_supported, &f_peer); + if 
(error) { + return error; + } + + *current = MIN(UINT32_MAX, + netdev_features_to_bps(f_current, 0) / 1000000ULL); + *max = MIN(UINT32_MAX, + netdev_features_to_bps(f_supported, 0) / 1000000ULL); + + return 0; +} + /* * Assigns 'addr' as 'netdev''s IPv4 address and 'mask' as its netmask. If * 'addr' is INADDR_ANY, 'netdev''s IPv4 address is cleared. Returns a @@ -1493,6 +1514,7 @@ netdev_bsd_update_flags(struct netdev *netdev_, enum netdev_flags off, .get_carrier = netdev_bsd_get_carrier, \ .get_stats = netdev_bsd_get_stats, \ .get_features = netdev_bsd_get_features, \ + .get_speed = netdev_bsd_get_speed, \ .set_in4 = netdev_bsd_set_in4, \ .get_addr_list = netdev_bsd_get_addr_list, \ .get_next_hop = netdev_bsd_get_next_hop, \ diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 0dd655507b5..02cef6e4513 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -24,11 +24,13 @@ #include #include #include +#include #include -#include +#include #include #include +#include #include #include #include @@ -47,6 +49,7 @@ #include "dpif-netdev.h" #include "fatal-signal.h" #include "if-notifier.h" +#include "mpsc-queue.h" #include "netdev-provider.h" #include "netdev-vport.h" #include "odp-util.h" @@ -55,6 +58,7 @@ #include "openvswitch/match.h" #include "openvswitch/ofp-parse.h" #include "openvswitch/ofp-print.h" +#include "openvswitch/poll-loop.h" #include "openvswitch/shash.h" #include "openvswitch/vlog.h" #include "ovs-numa.h" @@ -76,7 +80,12 @@ VLOG_DEFINE_THIS_MODULE(netdev_dpdk); static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20); COVERAGE_DEFINE(vhost_tx_contention); -COVERAGE_DEFINE(vhost_notification); + +static char *vhost_sock_dir = NULL; /* Location of vhost-user sockets */ +static bool vhost_iommu_enabled = false; /* Status of vHost IOMMU support */ +static bool vhost_postcopy_enabled = false; /* Status of vHost POSTCOPY + * support. 
*/ +static bool per_port_memory = false; /* Status of per port memory support */ #define DPDK_PORT_WATCHDOG_INTERVAL 5 @@ -159,7 +168,6 @@ typedef uint16_t dpdk_port_t; static const struct rte_eth_conf port_conf = { .rxmode = { - .split_hdr_size = 0, .offloads = 0, }, .rx_adv_conf = { @@ -181,7 +189,6 @@ static int new_device(int vid); static void destroy_device(int vid); static int vring_state_changed(int vid, uint16_t queue_id, int enable); static void destroy_connection(int vid); -static void vhost_guest_notified(int vid); static const struct rte_vhost_device_ops virtio_net_device_ops = { @@ -191,7 +198,6 @@ static const struct rte_vhost_device_ops virtio_net_device_ops = .features_changed = NULL, .new_connection = NULL, .destroy_connection = destroy_connection, - .guest_notified = vhost_guest_notified, }; /* Custom software stats for dpdk ports */ @@ -406,10 +412,33 @@ enum dpdk_hw_ol_features { NETDEV_RX_CHECKSUM_OFFLOAD = 1 << 0, NETDEV_RX_HW_CRC_STRIP = 1 << 1, NETDEV_RX_HW_SCATTER = 1 << 2, - NETDEV_TX_TSO_OFFLOAD = 1 << 3, - NETDEV_TX_SCTP_CHECKSUM_OFFLOAD = 1 << 4, + NETDEV_TX_IPV4_CKSUM_OFFLOAD = 1 << 3, + NETDEV_TX_TCP_CKSUM_OFFLOAD = 1 << 4, + NETDEV_TX_UDP_CKSUM_OFFLOAD = 1 << 5, + NETDEV_TX_SCTP_CKSUM_OFFLOAD = 1 << 6, + NETDEV_TX_TSO_OFFLOAD = 1 << 7, + NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD = 1 << 8, + NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD = 1 << 9, + NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD = 1 << 10, + NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD = 1 << 11, +}; + +enum dpdk_rx_steer_flags { + DPDK_RX_STEER_LACP = 1 << 0, }; +/* Flags for the netdev_dpdk virtio_features_state field. + * This is used for the virtio features recovery mechanism linked to TSO + * support. 
*/ +#define OVS_VIRTIO_F_CLEAN (UINT8_C(1) << 0) +#define OVS_VIRTIO_F_WORKAROUND (UINT8_C(1) << 1) +#define OVS_VIRTIO_F_NEGOTIATED (UINT8_C(1) << 2) +#define OVS_VIRTIO_F_RECONF_PENDING (UINT8_C(1) << 3) +#define OVS_VIRTIO_F_CLEAN_NEGOTIATED \ + (OVS_VIRTIO_F_CLEAN | OVS_VIRTIO_F_NEGOTIATED) +#define OVS_VIRTIO_F_WORKAROUND_NEGOTIATED \ + (OVS_VIRTIO_F_WORKAROUND | OVS_VIRTIO_F_NEGOTIATED) + /* * In order to avoid confusion in variables names, following naming convention * should be used, if possible: @@ -435,9 +464,8 @@ struct netdev_dpdk { bool attached; /* If true, rte_eth_dev_start() was successfully called */ bool started; - bool reset_needed; - /* 1 pad byte here. */ struct eth_addr hwaddr; + /* 2 pad bytes here. */ int mtu; int socket_id; int buf_size; @@ -466,7 +494,11 @@ struct netdev_dpdk { bool vhost_reconfigured; atomic_uint8_t vhost_tx_retries_max; - /* 2 pad bytes here. */ + + /* Flags for virtio features recovery mechanism. */ + uint8_t virtio_features_state; + + /* 1 pad byte here. */ ); PADDED_MEMBERS(CACHE_LINE_SIZE, @@ -484,6 +516,9 @@ struct netdev_dpdk { /* Array of vhost rxq states, see vring_state_changed. */ bool *vhost_rxq_enabled; + + /* Ensures that Rx metadata delivery is configured only once. */ + bool rx_metadata_delivery_configured; ); PADDED_MEMBERS(CACHE_LINE_SIZE, @@ -500,6 +535,12 @@ struct netdev_dpdk { * netdev_dpdk*_reconfigure() is called */ int requested_mtu; int requested_n_txq; + /* User input for n_rxq (see dpdk_set_rxq_config). */ + int user_n_rxq; + /* user_n_rxq + an optional rx steering queue (see + * netdev_dpdk_reconfigure). This field is different from the other + * requested_* fields as it may contain a different value than the user + * input. */ int requested_n_rxq; int requested_rxq_size; int requested_txq_size; @@ -529,6 +570,13 @@ struct netdev_dpdk { /* VF configuration. */ struct eth_addr requested_hwaddr; + + /* Requested rx queue steering flags, + * from the enum set 'dpdk_rx_steer_flags'. 
*/ + uint64_t requested_rx_steer_flags; + uint64_t rx_steer_flags; + size_t rx_steer_flows_num; + struct rte_flow **rx_steer_flows; ); PADDED_MEMBERS(CACHE_LINE_SIZE, @@ -558,6 +606,9 @@ int netdev_dpdk_get_vid(const struct netdev_dpdk *dev); struct ingress_policer * netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev); +static void netdev_dpdk_mbuf_dump(const char *prefix, const char *message, + const struct rte_mbuf *); + static bool is_dpdk_class(const struct netdev_class *class) { @@ -687,11 +738,11 @@ dpdk_mp_sweep(void) OVS_REQUIRES(dpdk_mp_mutex) * calculating. */ static uint32_t -dpdk_calculate_mbufs(struct netdev_dpdk *dev, int mtu, bool per_port_mp) +dpdk_calculate_mbufs(struct netdev_dpdk *dev, int mtu) { uint32_t n_mbufs; - if (!per_port_mp) { + if (!per_port_memory) { /* Shared memory are being used. * XXX: this is a really rough method of provisioning memory. * It's impossible to determine what the exact memory requirements are @@ -722,7 +773,7 @@ dpdk_calculate_mbufs(struct netdev_dpdk *dev, int mtu, bool per_port_mp) } static struct dpdk_mp * -dpdk_mp_create(struct netdev_dpdk *dev, int mtu, bool per_port_mp) +dpdk_mp_create(struct netdev_dpdk *dev, int mtu) { char mp_name[RTE_MEMPOOL_NAMESIZE]; const char *netdev_name = netdev_get_name(&dev->up); @@ -747,7 +798,7 @@ dpdk_mp_create(struct netdev_dpdk *dev, int mtu, bool per_port_mp) /* Get the size of each mbuf, based on the MTU */ mbuf_size = MTU_TO_FRAME_LEN(mtu); - n_mbufs = dpdk_calculate_mbufs(dev, mtu, per_port_mp); + n_mbufs = dpdk_calculate_mbufs(dev, mtu); do { /* Full DPDK memory pool name must be unique and cannot be @@ -833,15 +884,15 @@ dpdk_mp_create(struct netdev_dpdk *dev, int mtu, bool per_port_mp) } static struct dpdk_mp * -dpdk_mp_get(struct netdev_dpdk *dev, int mtu, bool per_port_mp) +dpdk_mp_get(struct netdev_dpdk *dev, int mtu) { - struct dpdk_mp *dmp, *next; + struct dpdk_mp *dmp = NULL, *next; bool reuse = false; ovs_mutex_lock(&dpdk_mp_mutex); /* Check if shared 
memory is being used, if so check existing mempools * to see if reuse is possible. */ - if (!per_port_mp) { + if (!per_port_memory) { /* If user has provided defined mempools, check if one is suitable * and get new buffer size.*/ mtu = dpdk_get_user_adjusted_mtu(mtu, dev->requested_mtu, @@ -860,7 +911,7 @@ dpdk_mp_get(struct netdev_dpdk *dev, int mtu, bool per_port_mp) dpdk_mp_sweep(); if (!reuse) { - dmp = dpdk_mp_create(dev, mtu, per_port_mp); + dmp = dpdk_mp_create(dev, mtu); if (dmp) { /* Shared memory will hit the reuse case above so will not * request a mempool that already exists but we need to check @@ -870,7 +921,7 @@ dpdk_mp_get(struct netdev_dpdk *dev, int mtu, bool per_port_mp) * dmp to point to the existing entry and increment the refcount * to avoid being freed at a later stage. */ - if (per_port_mp && rte_errno == EEXIST) { + if (per_port_memory && rte_errno == EEXIST) { LIST_FOR_EACH (next, list_node, &dpdk_mp_list) { if (dmp->mp == next->mp) { rte_free(dmp); @@ -915,17 +966,16 @@ netdev_dpdk_mempool_configure(struct netdev_dpdk *dev) uint32_t buf_size = dpdk_buf_size(dev->requested_mtu); struct dpdk_mp *dmp; int ret = 0; - bool per_port_mp = dpdk_per_port_memory(); /* With shared memory we do not need to configure a mempool if the MTU * and socket ID have not changed, the previous configuration is still * valid so return 0 */ - if (!per_port_mp && dev->mtu == dev->requested_mtu + if (!per_port_memory && dev->mtu == dev->requested_mtu && dev->socket_id == dev->requested_socket_id) { return ret; } - dmp = dpdk_mp_get(dev, FRAME_LEN_TO_MTU(buf_size), per_port_mp); + dmp = dpdk_mp_get(dev, FRAME_LEN_TO_MTU(buf_size)); if (!dmp) { VLOG_ERR("Failed to create memory pool for netdev " "%s, with MTU %d on socket %d: %s\n", @@ -1003,6 +1053,45 @@ dpdk_watchdog(void *dummy OVS_UNUSED) return NULL; } +static void +netdev_dpdk_update_netdev_flag(struct netdev_dpdk *dev, + enum dpdk_hw_ol_features hw_ol_features, + enum netdev_ol_flags flag) + 
OVS_REQUIRES(dev->mutex) +{ + struct netdev *netdev = &dev->up; + + if (dev->hw_ol_features & hw_ol_features) { + netdev->ol_flags |= flag; + } else { + netdev->ol_flags &= ~flag; + } +} + +static void +netdev_dpdk_update_netdev_flags(struct netdev_dpdk *dev) + OVS_REQUIRES(dev->mutex) +{ + netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_IPV4_CKSUM_OFFLOAD, + NETDEV_TX_OFFLOAD_IPV4_CKSUM); + netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_TCP_CKSUM_OFFLOAD, + NETDEV_TX_OFFLOAD_TCP_CKSUM); + netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_UDP_CKSUM_OFFLOAD, + NETDEV_TX_OFFLOAD_UDP_CKSUM); + netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_SCTP_CKSUM_OFFLOAD, + NETDEV_TX_OFFLOAD_SCTP_CKSUM); + netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_TSO_OFFLOAD, + NETDEV_TX_OFFLOAD_TCP_TSO); + netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD, + NETDEV_TX_VXLAN_TNL_TSO); + netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD, + NETDEV_TX_GENEVE_TNL_TSO); + netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD, + NETDEV_TX_OFFLOAD_OUTER_IP_CKSUM); + netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD, + NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM); +} + static int dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq) { @@ -1035,11 +1124,40 @@ dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq) conf.rxmode.offloads |= RTE_ETH_RX_OFFLOAD_KEEP_CRC; } + if (dev->hw_ol_features & NETDEV_TX_IPV4_CKSUM_OFFLOAD) { + conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_IPV4_CKSUM; + } + + if (dev->hw_ol_features & NETDEV_TX_TCP_CKSUM_OFFLOAD) { + conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_TCP_CKSUM; + } + + if (dev->hw_ol_features & NETDEV_TX_UDP_CKSUM_OFFLOAD) { + conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_UDP_CKSUM; + } + + if (dev->hw_ol_features & NETDEV_TX_SCTP_CKSUM_OFFLOAD) { + conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_SCTP_CKSUM; + } + if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) { - 
conf.txmode.offloads |= DPDK_TX_TSO_OFFLOAD_FLAGS; - if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) { - conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_SCTP_CKSUM; - } + conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_TCP_TSO; + } + + if (dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD) { + conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO; + } + + if (dev->hw_ol_features & NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD) { + conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO; + } + + if (dev->hw_ol_features & NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD) { + conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_OUTER_IPV4_CKSUM; + } + + if (dev->hw_ol_features & NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD) { + conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM; } /* Limit configured rss hash functions to only those supported @@ -1136,6 +1254,45 @@ dpdk_eth_flow_ctrl_setup(struct netdev_dpdk *dev) OVS_REQUIRES(dev->mutex) } } +static void +dpdk_eth_dev_init_rx_metadata(struct netdev_dpdk *dev) +{ + uint64_t rx_metadata = 0; + int ret; + + if (dev->rx_metadata_delivery_configured) { + return; + } + + /* For the fallback offload (non-"transfer" rules). */ + rx_metadata |= RTE_ETH_RX_METADATA_USER_MARK; + +#ifdef ALLOW_EXPERIMENTAL_API + /* For the tunnel offload. */ + rx_metadata |= RTE_ETH_RX_METADATA_TUNNEL_ID; +#endif /* ALLOW_EXPERIMENTAL_API */ + + ret = rte_eth_rx_metadata_negotiate(dev->port_id, &rx_metadata); + if (ret == 0) { + if (!(rx_metadata & RTE_ETH_RX_METADATA_USER_MARK)) { + VLOG_DBG("%s: The NIC will not provide per-packet USER_MARK", + netdev_get_name(&dev->up)); + } +#ifdef ALLOW_EXPERIMENTAL_API + if (!(rx_metadata & RTE_ETH_RX_METADATA_TUNNEL_ID)) { + VLOG_DBG("%s: The NIC will not provide per-packet TUNNEL_ID", + netdev_get_name(&dev->up)); + } +#endif /* ALLOW_EXPERIMENTAL_API */ + } else { + VLOG(ret == -ENOTSUP ? 
VLL_DBG : VLL_WARN, + "%s: Cannot negotiate Rx metadata: %s", + netdev_get_name(&dev->up), rte_strerror(-ret)); + } + + dev->rx_metadata_delivery_configured = true; +} + static int dpdk_eth_dev_init(struct netdev_dpdk *dev) OVS_REQUIRES(dev->mutex) @@ -1145,11 +1302,22 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev) struct rte_ether_addr eth_addr; int diag; int n_rxq, n_txq; - uint32_t tx_tso_offload_capa = DPDK_TX_TSO_OFFLOAD_FLAGS; uint32_t rx_chksm_offload_capa = RTE_ETH_RX_OFFLOAD_UDP_CKSUM | RTE_ETH_RX_OFFLOAD_TCP_CKSUM | RTE_ETH_RX_OFFLOAD_IPV4_CKSUM; + if (netdev_is_flow_api_enabled()) { + /* + * Full tunnel offload requires that tunnel ID metadata be + * delivered with "miss" packets from the hardware to the + * PMD. The same goes for megaflow mark metadata which is + * used in MARK + RSS offload scenario. + * + * Request delivery of such metadata. + */ + dpdk_eth_dev_init_rx_metadata(dev); + } + rte_eth_dev_info_get(dev->port_id, &info); if (strstr(info.driver_name, "vf") != NULL) { @@ -1175,22 +1343,88 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev) dev->hw_ol_features &= ~NETDEV_RX_HW_SCATTER; } + if (!strcmp(info.driver_name, "net_tap")) { + /* FIXME: L4 checksum offloading is broken in DPDK net/tap driver. + * This workaround can be removed once the fix makes it to a DPDK + * LTS release used by OVS. */ + VLOG_INFO("%s: disabled Tx L4 checksum offloads for a net/tap port.", + netdev_get_name(&dev->up)); + info.tx_offload_capa &= ~RTE_ETH_TX_OFFLOAD_UDP_CKSUM; + info.tx_offload_capa &= ~RTE_ETH_TX_OFFLOAD_TCP_CKSUM; + } + + if (!strcmp(info.driver_name, "net_ice") + || !strcmp(info.driver_name, "net_i40e") + || !strcmp(info.driver_name, "net_iavf")) { + /* FIXME: Driver advertises the capability but doesn't seem + * to actually support it correctly. Can remove this once + * the driver is fixed on DPDK side. 
*/ + VLOG_INFO("%s: disabled Tx outer udp checksum offloads for a " + "net/ice, net/i40e or net/iavf port.", + netdev_get_name(&dev->up)); + info.tx_offload_capa &= ~RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM; + info.tx_offload_capa &= ~RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO; + info.tx_offload_capa &= ~RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO; + } + + if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_IPV4_CKSUM) { + dev->hw_ol_features |= NETDEV_TX_IPV4_CKSUM_OFFLOAD; + } else { + dev->hw_ol_features &= ~NETDEV_TX_IPV4_CKSUM_OFFLOAD; + } + + if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_TCP_CKSUM) { + dev->hw_ol_features |= NETDEV_TX_TCP_CKSUM_OFFLOAD; + } else { + dev->hw_ol_features &= ~NETDEV_TX_TCP_CKSUM_OFFLOAD; + } + + if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_UDP_CKSUM) { + dev->hw_ol_features |= NETDEV_TX_UDP_CKSUM_OFFLOAD; + } else { + dev->hw_ol_features &= ~NETDEV_TX_UDP_CKSUM_OFFLOAD; + } + + if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_SCTP_CKSUM) { + dev->hw_ol_features |= NETDEV_TX_SCTP_CKSUM_OFFLOAD; + } else { + dev->hw_ol_features &= ~NETDEV_TX_SCTP_CKSUM_OFFLOAD; + } + + if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_OUTER_IPV4_CKSUM) { + dev->hw_ol_features |= NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD; + } else { + dev->hw_ol_features &= ~NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD; + } + + if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM) { + dev->hw_ol_features |= NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD; + } else { + dev->hw_ol_features &= ~NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD; + } + dev->hw_ol_features &= ~NETDEV_TX_TSO_OFFLOAD; if (userspace_tso_enabled()) { - if ((info.tx_offload_capa & tx_tso_offload_capa) - == tx_tso_offload_capa) { + if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_TCP_TSO) { dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD; - if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_SCTP_CKSUM) { - dev->hw_ol_features |= NETDEV_TX_SCTP_CHECKSUM_OFFLOAD; - } else { - VLOG_WARN("%s: Tx SCTP checksum offload is not supported, " - "SCTP packets sent to this device will be 
dropped", - netdev_get_name(&dev->up)); - } } else { VLOG_WARN("%s: Tx TSO offload is not supported.", netdev_get_name(&dev->up)); } + + if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO) { + dev->hw_ol_features |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD; + } else { + VLOG_WARN("%s: Tx Vxlan tunnel TSO offload is not supported.", + netdev_get_name(&dev->up)); + } + + if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO) { + dev->hw_ol_features |= NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD; + } else { + VLOG_WARN("%s: Tx Geneve tunnel TSO offload is not supported.", + netdev_get_name(&dev->up)); + } } n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq); @@ -1293,9 +1527,9 @@ common_construct(struct netdev *netdev, dpdk_port_t port_no, dev->requested_lsc_interrupt_mode = 0; ovsrcu_index_init(&dev->vid, -1); dev->vhost_reconfigured = false; + dev->virtio_features_state = OVS_VIRTIO_F_CLEAN; dev->attached = false; dev->started = false; - dev->reset_needed = false; ovsrcu_init(&dev->qos_conf, NULL); @@ -1305,10 +1539,15 @@ common_construct(struct netdev *netdev, dpdk_port_t port_no, netdev->n_rxq = 0; netdev->n_txq = 0; + dev->user_n_rxq = NR_QUEUE; dev->requested_n_rxq = NR_QUEUE; dev->requested_n_txq = NR_QUEUE; dev->requested_rxq_size = NIC_PORT_DEFAULT_RXQ_SIZE; dev->requested_txq_size = NIC_PORT_DEFAULT_TXQ_SIZE; + dev->requested_rx_steer_flags = 0; + dev->rx_steer_flags = 0; + dev->rx_steer_flows_num = 0; + dev->rx_steer_flows = NULL; /* Initialize the flow control to NULL */ memset(&dev->fc_conf, 0, sizeof dev->fc_conf); @@ -1316,6 +1555,8 @@ common_construct(struct netdev *netdev, dpdk_port_t port_no, /* Initilize the hardware offload flags to 0 */ dev->hw_ol_features = 0; + dev->rx_metadata_delivery_configured = false; + dev->flags = NETDEV_UP | NETDEV_PROMISC; ovs_list_push_back(&dpdk_list, &dev->list_node); @@ -1379,7 +1620,7 @@ netdev_dpdk_vhost_construct(struct netdev *netdev) /* Take the name of the vhost-user port and append it to the location where * the 
socket is to be created, then register the socket. */ - dev->vhost_id = xasprintf("%s/%s", dpdk_get_vhost_sock_dir(), name); + dev->vhost_id = xasprintf("%s/%s", vhost_sock_dir, name); dev->vhost_driver_flags &= ~RTE_VHOST_USER_CLIENT; @@ -1483,6 +1724,8 @@ common_destruct(struct netdev_dpdk *dev) ovs_mutex_destroy(&dev->mutex); } +static void dpdk_rx_steer_unconfigure(struct netdev_dpdk *); + static void netdev_dpdk_destruct(struct netdev *netdev) { @@ -1490,6 +1733,9 @@ netdev_dpdk_destruct(struct netdev *netdev) ovs_mutex_lock(&dpdk_mutex); + /* Destroy any rx-steering flows to allow RXQs to be removed. */ + dpdk_rx_steer_unconfigure(dev); + rte_eth_dev_stop(dev->port_id); dev->started = false; @@ -1729,39 +1975,41 @@ netdev_dpdk_get_config(const struct netdev *netdev, struct smap *args) ovs_mutex_lock(&dev->mutex); - smap_add_format(args, "requested_rx_queues", "%d", dev->requested_n_rxq); - smap_add_format(args, "configured_rx_queues", "%d", netdev->n_rxq); - smap_add_format(args, "requested_tx_queues", "%d", dev->requested_n_txq); - smap_add_format(args, "configured_tx_queues", "%d", netdev->n_txq); - smap_add_format(args, "mtu", "%d", dev->mtu); + if (dev->devargs && dev->devargs[0]) { + smap_add_format(args, "dpdk-devargs", "%s", dev->devargs); + } - if (dev->type == DPDK_DEV_ETH) { - smap_add_format(args, "requested_rxq_descriptors", "%d", - dev->requested_rxq_size); - smap_add_format(args, "configured_rxq_descriptors", "%d", - dev->rxq_size); - smap_add_format(args, "requested_txq_descriptors", "%d", - dev->requested_txq_size); - smap_add_format(args, "configured_txq_descriptors", "%d", - dev->txq_size); - if (dev->hw_ol_features & NETDEV_RX_CHECKSUM_OFFLOAD) { - smap_add(args, "rx_csum_offload", "true"); - } else { - smap_add(args, "rx_csum_offload", "false"); - } - if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) { - smap_add(args, "tx_tso_offload", "true"); - } else { - smap_add(args, "tx_tso_offload", "false"); - } - smap_add(args, 
"lsc_interrupt_mode", - dev->lsc_interrupt_mode ? "true" : "false"); + smap_add_format(args, "n_rxq", "%d", dev->user_n_rxq); - if (dpdk_port_is_representor(dev)) { - smap_add_format(args, "dpdk-vf-mac", ETH_ADDR_FMT, - ETH_ADDR_ARGS(dev->requested_hwaddr)); - } + if (dev->fc_conf.mode == RTE_ETH_FC_TX_PAUSE || + dev->fc_conf.mode == RTE_ETH_FC_FULL) { + smap_add(args, "rx-flow-ctrl", "true"); + } + + if (dev->fc_conf.mode == RTE_ETH_FC_RX_PAUSE || + dev->fc_conf.mode == RTE_ETH_FC_FULL) { + smap_add(args, "tx-flow-ctrl", "true"); + } + + if (dev->fc_conf.autoneg) { + smap_add(args, "flow-ctrl-autoneg", "true"); + } + + smap_add_format(args, "n_rxq_desc", "%d", dev->rxq_size); + smap_add_format(args, "n_txq_desc", "%d", dev->txq_size); + + if (dev->rx_steer_flags == DPDK_RX_STEER_LACP) { + smap_add(args, "rx-steering", "rss+lacp"); + } + + smap_add(args, "dpdk-lsc-interrupt", + dev->lsc_interrupt_mode ? "true" : "false"); + + if (dpdk_port_is_representor(dev)) { + smap_add_format(args, "dpdk-vf-mac", ETH_ADDR_FMT, + ETH_ADDR_ARGS(dev->requested_hwaddr)); } + ovs_mutex_unlock(&dev->mutex); return 0; @@ -1823,7 +2071,7 @@ static dpdk_port_t netdev_dpdk_get_port_by_devargs(const char *devargs) } /* - * Normally, a PCI id (optionally followed by a representor number) + * Normally, a PCI id (optionally followed by a representor identifier) * is enough for identifying a specific DPDK port. * However, for some NICs having multiple ports sharing the same PCI * id, using PCI id won't work then. 
@@ -1869,32 +2117,71 @@ netdev_dpdk_process_devargs(struct netdev_dpdk *dev, return new_port_id; } +static struct seq *netdev_dpdk_reset_seq; +static uint64_t netdev_dpdk_last_reset_seq; +static atomic_bool netdev_dpdk_pending_reset[RTE_MAX_ETHPORTS]; + +static void +netdev_dpdk_wait(const struct netdev_class *netdev_class OVS_UNUSED) +{ + uint64_t last_reset_seq = seq_read(netdev_dpdk_reset_seq); + + if (netdev_dpdk_last_reset_seq == last_reset_seq) { + seq_wait(netdev_dpdk_reset_seq, netdev_dpdk_last_reset_seq); + } else { + poll_immediate_wake(); + } +} + +static void +netdev_dpdk_run(const struct netdev_class *netdev_class OVS_UNUSED) +{ + uint64_t reset_seq = seq_read(netdev_dpdk_reset_seq); + + if (reset_seq != netdev_dpdk_last_reset_seq) { + dpdk_port_t port_id; + + netdev_dpdk_last_reset_seq = reset_seq; + + for (port_id = 0; port_id < RTE_MAX_ETHPORTS; port_id++) { + struct netdev_dpdk *dev; + bool pending_reset; + + atomic_read_relaxed(&netdev_dpdk_pending_reset[port_id], + &pending_reset); + if (!pending_reset) { + continue; + } + + ovs_mutex_lock(&dpdk_mutex); + dev = netdev_dpdk_lookup_by_port_id(port_id); + if (dev) { + ovs_mutex_lock(&dev->mutex); + netdev_request_reconfigure(&dev->up); + VLOG_DBG_RL(&rl, "%s: Device reset requested.", + netdev_get_name(&dev->up)); + ovs_mutex_unlock(&dev->mutex); + } + ovs_mutex_unlock(&dpdk_mutex); + } + } +} + static int dpdk_eth_event_callback(dpdk_port_t port_id, enum rte_eth_event_type type, void *param OVS_UNUSED, void *ret_param OVS_UNUSED) { - struct netdev_dpdk *dev; - switch ((int) type) { case RTE_ETH_EVENT_INTR_RESET: - ovs_mutex_lock(&dpdk_mutex); - dev = netdev_dpdk_lookup_by_port_id(port_id); - if (dev) { - ovs_mutex_lock(&dev->mutex); - dev->reset_needed = true; - netdev_request_reconfigure(&dev->up); - VLOG_DBG_RL(&rl, "%s: Device reset requested.", - netdev_get_name(&dev->up)); - ovs_mutex_unlock(&dev->mutex); - } - ovs_mutex_unlock(&dpdk_mutex); + 
atomic_store_relaxed(&netdev_dpdk_pending_reset[port_id], true); + seq_change(netdev_dpdk_reset_seq); break; default: /* Ignore all other types. */ break; - } - return 0; + } + return 0; } static void @@ -1904,25 +2191,98 @@ dpdk_set_rxq_config(struct netdev_dpdk *dev, const struct smap *args) int new_n_rxq; new_n_rxq = MAX(smap_get_int(args, "n_rxq", NR_QUEUE), 1); - if (new_n_rxq != dev->requested_n_rxq) { - dev->requested_n_rxq = new_n_rxq; + if (new_n_rxq != dev->user_n_rxq) { + dev->user_n_rxq = new_n_rxq; netdev_request_reconfigure(&dev->up); } } static void dpdk_process_queue_size(struct netdev *netdev, const struct smap *args, - const char *flag, int default_size, int *new_size) + struct rte_eth_dev_info *info, bool is_rx) { - int queue_size = smap_get_int(args, flag, default_size); + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); + struct rte_eth_desc_lim *lim; + int default_size, queue_size, cur_size, new_requested_size; + int *cur_requested_size; + bool reconfig = false; + + if (is_rx) { + default_size = NIC_PORT_DEFAULT_RXQ_SIZE; + new_requested_size = smap_get_int(args, "n_rxq_desc", default_size); + cur_size = dev->rxq_size; + cur_requested_size = &dev->requested_rxq_size; + lim = info ? &info->rx_desc_lim : NULL; + } else { + default_size = NIC_PORT_DEFAULT_TXQ_SIZE; + new_requested_size = smap_get_int(args, "n_txq_desc", default_size); + cur_size = dev->txq_size; + cur_requested_size = &dev->requested_txq_size; + lim = info ? &info->tx_desc_lim : NULL; + } + queue_size = new_requested_size; + + /* Check for OVS limits. */ if (queue_size <= 0 || queue_size > NIC_PORT_MAX_Q_SIZE || !is_pow2(queue_size)) { queue_size = default_size; } - if (queue_size != *new_size) { - *new_size = queue_size; + if (lim) { + /* Check for device limits. 
*/ + if (lim->nb_align) { + queue_size = ROUND_UP(queue_size, lim->nb_align); + } + queue_size = MIN(queue_size, lim->nb_max); + queue_size = MAX(queue_size, lim->nb_min); + } + + *cur_requested_size = queue_size; + + if (cur_size != queue_size) { + netdev_request_reconfigure(netdev); + reconfig = true; + } + if (new_requested_size != queue_size) { + VLOG(reconfig ? VLL_INFO : VLL_DBG, + "%s: Unable to set the number of %s descriptors to %d. " + "Adjusted to %d.", netdev_get_name(netdev), + is_rx ? "rx": "tx", new_requested_size, queue_size); + } +} + +static void +dpdk_set_rx_steer_config(struct netdev *netdev, struct netdev_dpdk *dev, + const struct smap *args, char **errp) +{ + const char *arg = smap_get_def(args, "rx-steering", "rss"); + uint64_t flags = 0; + + if (!strcmp(arg, "rss+lacp")) { + flags = DPDK_RX_STEER_LACP; + } else if (strcmp(arg, "rss")) { + VLOG_WARN_BUF(errp, "%s: options:rx-steering " + "unsupported parameter value '%s'", + netdev_get_name(netdev), arg); + } + + if (flags && dev->type != DPDK_DEV_ETH) { + VLOG_WARN_BUF(errp, "%s: options:rx-steering " + "is only supported on ethernet ports", + netdev_get_name(netdev)); + flags = 0; + } + + if (flags && netdev_is_flow_api_enabled()) { + VLOG_WARN_BUF(errp, "%s: options:rx-steering " + "is incompatible with hw-offload", + netdev_get_name(netdev)); + flags = 0; + } + + if (flags != dev->requested_rx_steer_flags) { + dev->requested_rx_steer_flags = flags; netdev_request_reconfigure(netdev); } } @@ -1939,21 +2299,18 @@ netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args, {RTE_ETH_FC_NONE, RTE_ETH_FC_TX_PAUSE}, {RTE_ETH_FC_RX_PAUSE, RTE_ETH_FC_FULL } }; + struct rte_eth_dev_info info; const char *new_devargs; const char *vf_mac; int err = 0; + int ret; ovs_mutex_lock(&dpdk_mutex); ovs_mutex_lock(&dev->mutex); - dpdk_set_rxq_config(dev, args); + dpdk_set_rx_steer_config(netdev, dev, args, errp); - dpdk_process_queue_size(netdev, args, "n_rxq_desc", - NIC_PORT_DEFAULT_RXQ_SIZE, - 
&dev->requested_rxq_size); - dpdk_process_queue_size(netdev, args, "n_txq_desc", - NIC_PORT_DEFAULT_TXQ_SIZE, - &dev->requested_txq_size); + dpdk_set_rxq_config(dev, args); new_devargs = smap_get(args, "dpdk-devargs"); @@ -2010,29 +2367,44 @@ netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args, goto out; } + ret = rte_eth_dev_info_get(dev->port_id, &info); + + dpdk_process_queue_size(netdev, args, !ret ? &info : NULL, true); + dpdk_process_queue_size(netdev, args, !ret ? &info : NULL, false); + vf_mac = smap_get(args, "dpdk-vf-mac"); if (vf_mac) { struct eth_addr mac; if (!dpdk_port_is_representor(dev)) { - VLOG_WARN_BUF(errp, "'%s' is trying to set the VF MAC '%s' " - "but 'options:dpdk-vf-mac' is only supported for " - "VF representors.", - netdev_get_name(netdev), vf_mac); + VLOG_WARN("'%s' is trying to set the VF MAC '%s' " + "but 'options:dpdk-vf-mac' is only supported for " + "VF representors.", + netdev_get_name(netdev), vf_mac); } else if (!eth_addr_from_string(vf_mac, &mac)) { - VLOG_WARN_BUF(errp, "interface '%s': cannot parse VF MAC '%s'.", - netdev_get_name(netdev), vf_mac); + VLOG_WARN("interface '%s': cannot parse VF MAC '%s'.", + netdev_get_name(netdev), vf_mac); } else if (eth_addr_is_multicast(mac)) { - VLOG_WARN_BUF(errp, - "interface '%s': cannot set VF MAC to multicast " - "address '%s'.", netdev_get_name(netdev), vf_mac); + VLOG_WARN("interface '%s': cannot set VF MAC to multicast " + "address '%s'.", netdev_get_name(netdev), vf_mac); } else if (!eth_addr_equals(dev->requested_hwaddr, mac)) { dev->requested_hwaddr = mac; netdev_request_reconfigure(netdev); } } - lsc_interrupt_mode = smap_get_bool(args, "dpdk-lsc-interrupt", false); + lsc_interrupt_mode = smap_get_bool(args, "dpdk-lsc-interrupt", true); + if (lsc_interrupt_mode && !(*info.dev_flags & RTE_ETH_DEV_INTR_LSC)) { + if (smap_get(args, "dpdk-lsc-interrupt")) { + VLOG_WARN_BUF(errp, "'%s': link status interrupt is not " + "supported.", netdev_get_name(netdev)); + err = 
EINVAL; + goto out; + } + VLOG_DBG("'%s': not enabling link status interrupt.", + netdev_get_name(netdev)); + lsc_interrupt_mode = false; + } if (dev->requested_lsc_interrupt_mode != lsc_interrupt_mode) { dev->requested_lsc_interrupt_mode = lsc_interrupt_mode; netdev_request_reconfigure(netdev); @@ -2062,8 +2434,8 @@ netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args, } err = 0; /* Not fatal. */ } else { - VLOG_WARN("%s: Cannot get flow control parameters: %s", - netdev_get_name(netdev), rte_strerror(err)); + VLOG_WARN_BUF(errp, "%s: Cannot get flow control parameters: %s", + netdev_get_name(netdev), rte_strerror(err)); } goto out; } @@ -2081,6 +2453,29 @@ netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args, return err; } +static int +netdev_dpdk_vhost_client_get_config(const struct netdev *netdev, + struct smap *args) +{ + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); + int tx_retries_max; + + ovs_mutex_lock(&dev->mutex); + + if (dev->vhost_id) { + smap_add(args, "vhost-server-path", dev->vhost_id); + } + + atomic_read_relaxed(&dev->vhost_tx_retries_max, &tx_retries_max); + if (tx_retries_max != VHOST_ENQ_RETRY_DEF) { + smap_add_format(args, "tx-retries-max", "%d", tx_retries_max); + } + + ovs_mutex_unlock(&dev->mutex); + + return 0; +} + static int netdev_dpdk_vhost_client_set_config(struct netdev *netdev, const struct smap *args, @@ -2195,31 +2590,133 @@ static bool netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) { struct dp_packet *pkt = CONTAINER_OF(mbuf, struct dp_packet, mbuf); + void *l2; + void *l3; + void *l4; + + const uint64_t all_inner_requests = (RTE_MBUF_F_TX_IP_CKSUM | + RTE_MBUF_F_TX_L4_MASK | + RTE_MBUF_F_TX_TCP_SEG); + const uint64_t all_outer_requests = (RTE_MBUF_F_TX_OUTER_IP_CKSUM | + RTE_MBUF_F_TX_OUTER_UDP_CKSUM); + const uint64_t all_requests = all_inner_requests | all_outer_requests; + const uint64_t all_inner_marks = (RTE_MBUF_F_TX_IPV4 | + RTE_MBUF_F_TX_IPV6); + 
const uint64_t all_outer_marks = (RTE_MBUF_F_TX_OUTER_IPV4 | + RTE_MBUF_F_TX_OUTER_IPV6 | + RTE_MBUF_F_TX_TUNNEL_MASK); + const uint64_t all_marks = all_inner_marks | all_outer_marks; + + if (!(mbuf->ol_flags & all_requests)) { + /* No offloads requested, no marks should be set. */ + mbuf->ol_flags &= ~all_marks; + + uint64_t unexpected = mbuf->ol_flags & RTE_MBUF_F_TX_OFFLOAD_MASK; + if (OVS_UNLIKELY(unexpected)) { + VLOG_WARN_RL(&rl, "%s: Unexpected Tx offload flags: %#"PRIx64, + netdev_get_name(&dev->up), unexpected); + netdev_dpdk_mbuf_dump(netdev_get_name(&dev->up), + "Packet with unexpected ol_flags", mbuf); + return false; + } + return true; + } + + const uint64_t tunnel_type = mbuf->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK; + if (OVS_UNLIKELY(tunnel_type && + tunnel_type != RTE_MBUF_F_TX_TUNNEL_GENEVE && + tunnel_type != RTE_MBUF_F_TX_TUNNEL_VXLAN)) { + VLOG_WARN_RL(&rl, "%s: Unexpected tunnel type: %#"PRIx64, + netdev_get_name(&dev->up), tunnel_type); + netdev_dpdk_mbuf_dump(netdev_get_name(&dev->up), + "Packet with unexpected tunnel type", mbuf); + return false; + } + + if (tunnel_type && (mbuf->ol_flags & all_inner_requests)) { + if (mbuf->ol_flags & all_outer_requests) { + mbuf->outer_l2_len = (char *) dp_packet_l3(pkt) - + (char *) dp_packet_eth(pkt); + mbuf->outer_l3_len = (char *) dp_packet_l4(pkt) - + (char *) dp_packet_l3(pkt); - if (mbuf->ol_flags & RTE_MBUF_F_TX_L4_MASK) { - mbuf->l2_len = (char *)dp_packet_l3(pkt) - (char *)dp_packet_eth(pkt); - mbuf->l3_len = (char *)dp_packet_l4(pkt) - (char *)dp_packet_l3(pkt); + /* Inner L2 length must account for the tunnel header length. */ + l2 = dp_packet_l4(pkt); + l3 = dp_packet_inner_l3(pkt); + l4 = dp_packet_inner_l4(pkt); + } else { + /* If no outer offloading is requested, clear outer marks. */ + mbuf->ol_flags &= ~all_outer_marks; + mbuf->outer_l2_len = 0; + mbuf->outer_l3_len = 0; + + /* Skip outer headers. 
*/ + l2 = dp_packet_eth(pkt); + l3 = dp_packet_inner_l3(pkt); + l4 = dp_packet_inner_l4(pkt); + } + } else { + if (tunnel_type) { + /* No inner offload is requested, fallback to non tunnel + * checksum offloads. */ + mbuf->ol_flags &= ~all_inner_marks; + if (mbuf->ol_flags & RTE_MBUF_F_TX_OUTER_IP_CKSUM) { + mbuf->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM; + mbuf->ol_flags |= RTE_MBUF_F_TX_IPV4; + } + if (mbuf->ol_flags & RTE_MBUF_F_TX_OUTER_UDP_CKSUM) { + mbuf->ol_flags |= RTE_MBUF_F_TX_UDP_CKSUM; + mbuf->ol_flags |= mbuf->ol_flags & RTE_MBUF_F_TX_OUTER_IPV4 + ? RTE_MBUF_F_TX_IPV4 : RTE_MBUF_F_TX_IPV6; + } + mbuf->ol_flags &= ~(all_outer_requests | all_outer_marks); + } mbuf->outer_l2_len = 0; mbuf->outer_l3_len = 0; + + l2 = dp_packet_eth(pkt); + l3 = dp_packet_l3(pkt); + l4 = dp_packet_l4(pkt); } + ovs_assert(l4); + + mbuf->l2_len = (char *) l3 - (char *) l2; + mbuf->l3_len = (char *) l4 - (char *) l3; + if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) { - struct tcp_header *th = dp_packet_l4(pkt); + struct tcp_header *th = l4; + uint16_t link_tso_segsz; + int hdr_len; - if (!th) { - VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header" - " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len); - return false; + mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4; + if (tunnel_type) { + link_tso_segsz = dev->mtu - mbuf->l2_len - mbuf->l3_len - + mbuf->l4_len - mbuf->outer_l3_len; + } else { + link_tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len; } - mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4; - mbuf->ol_flags |= RTE_MBUF_F_TX_TCP_CKSUM; - mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len; + if (mbuf->tso_segsz > link_tso_segsz) { + mbuf->tso_segsz = link_tso_segsz; + } - if (mbuf->ol_flags & RTE_MBUF_F_TX_IPV4) { - mbuf->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM; + hdr_len = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len; + if (OVS_UNLIKELY((hdr_len + mbuf->tso_segsz) > dev->max_packet_len)) { + VLOG_WARN_RL(&rl, "%s: Oversized TSO packet. 
hdr: %"PRIu32", " + "gso: %"PRIu32", max len: %"PRIu32"", + dev->up.name, hdr_len, mbuf->tso_segsz, + dev->max_packet_len); + return false; } } + + /* If L4 checksum is requested, IPv4 should be requested as well. */ + if (mbuf->ol_flags & RTE_MBUF_F_TX_L4_MASK + && mbuf->ol_flags & RTE_MBUF_F_TX_IPV4) { + mbuf->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM; + } + return true; } @@ -2250,6 +2747,35 @@ netdev_dpdk_prep_hwol_batch(struct netdev_dpdk *dev, struct rte_mbuf **pkts, return cnt; } +static void +netdev_dpdk_mbuf_dump(const char *prefix, const char *message, + const struct rte_mbuf *mbuf) +{ + static struct vlog_rate_limit dump_rl = VLOG_RATE_LIMIT_INIT(5, 5); + char *response = NULL; + FILE *stream; + size_t size; + + if (VLOG_DROP_DBG(&dump_rl)) { + return; + } + + stream = open_memstream(&response, &size); + if (!stream) { + VLOG_ERR("Unable to open memstream for mbuf dump: %s.", + ovs_strerror(errno)); + return; + } + + rte_pktmbuf_dump(stream, mbuf, rte_pktmbuf_pkt_len(mbuf)); + + fclose(stream); + + VLOG_DBG(prefix ? "%s: %s:\n%s" : "%s%s:\n%s", + prefix ? prefix : "", message, response); + free(response); +} + /* Tries to transmit 'pkts' to txq 'qid' of device 'dev'. Takes ownership of * 'pkts', even in case of failure. * @@ -2261,13 +2787,13 @@ netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid, uint32_t nb_tx = 0; uint16_t nb_tx_prep = cnt; - if (userspace_tso_enabled()) { - nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt); - if (nb_tx_prep != cnt) { - VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. " - "Only %u/%u are valid: %s", dev->up.name, nb_tx_prep, - cnt, rte_strerror(rte_errno)); - } + nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt); + if (nb_tx_prep != cnt) { + VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. 
" + "Only %u/%u are valid: %s", netdev_get_name(&dev->up), + nb_tx_prep, cnt, rte_strerror(rte_errno)); + netdev_dpdk_mbuf_dump(netdev_get_name(&dev->up), + "First invalid packet", pkts[nb_tx_prep]); } while (nb_tx != nb_tx_prep) { @@ -2283,13 +2809,8 @@ netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid, } if (OVS_UNLIKELY(nb_tx != cnt)) { - /* Free buffers, which we couldn't transmit, one at a time (each - * packet could come from a different mempool) */ - int i; - - for (i = nb_tx; i < cnt; i++) { - rte_pktmbuf_free(pkts[i]); - } + /* Free buffers, which we couldn't transmit. */ + rte_pktmbuf_free_bulk(&pkts[nb_tx], cnt - nb_tx); } return cnt - nb_tx; @@ -2357,78 +2878,12 @@ is_vhost_running(struct netdev_dpdk *dev) return (netdev_dpdk_get_vid(dev) >= 0 && dev->vhost_reconfigured); } -static inline void -netdev_dpdk_vhost_update_rx_size_counters(struct netdev_stats *stats, - unsigned int packet_size) -{ - /* Hard-coded search for the size bucket. */ - if (packet_size < 256) { - if (packet_size >= 128) { - stats->rx_128_to_255_packets++; - } else if (packet_size <= 64) { - stats->rx_1_to_64_packets++; - } else { - stats->rx_65_to_127_packets++; - } - } else { - if (packet_size >= 1523) { - stats->rx_1523_to_max_packets++; - } else if (packet_size >= 1024) { - stats->rx_1024_to_1522_packets++; - } else if (packet_size < 512) { - stats->rx_256_to_511_packets++; - } else { - stats->rx_512_to_1023_packets++; - } - } -} - -static inline void -netdev_dpdk_vhost_update_rx_counters(struct netdev_dpdk *dev, - struct dp_packet **packets, int count, - int qos_drops) -{ - struct netdev_stats *stats = &dev->stats; - struct dp_packet *packet; - unsigned int packet_size; - int i; - - stats->rx_packets += count; - stats->rx_dropped += qos_drops; - for (i = 0; i < count; i++) { - packet = packets[i]; - packet_size = dp_packet_size(packet); - - if (OVS_UNLIKELY(packet_size < ETH_HEADER_LEN)) { - /* This only protects the following multicast counting from - * too short 
packets, but it does not stop the packet from - * further processing. */ - stats->rx_errors++; - stats->rx_length_errors++; - continue; - } - - netdev_dpdk_vhost_update_rx_size_counters(stats, packet_size); - - struct eth_header *eh = (struct eth_header *) dp_packet_data(packet); - if (OVS_UNLIKELY(eth_addr_is_multicast(eh->eth_dst))) { - stats->multicast++; - } - - stats->rx_bytes += packet_size; - } - - if (OVS_UNLIKELY(qos_drops)) { - dev->sw_stats->rx_qos_drops += qos_drops; - } -} - -/* - * The receive path for the vhost port is the TX path out from guest. - */ -static int -netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq, - struct dp_packet_batch *batch, int *qfill) +/* + * The receive path for the vhost port is the TX path out from guest. + */ +static int +netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq, + struct dp_packet_batch *batch, int *qfill) { struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev); struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev); @@ -2467,10 +2922,12 @@ netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq, qos_drops -= nb_rx; } - rte_spinlock_lock(&dev->stats_lock); - netdev_dpdk_vhost_update_rx_counters(dev, batch->packets, - nb_rx, qos_drops); - rte_spinlock_unlock(&dev->stats_lock); + if (OVS_UNLIKELY(qos_drops)) { + rte_spinlock_lock(&dev->stats_lock); + dev->stats.rx_dropped += qos_drops; + dev->sw_stats->rx_qos_drops += qos_drops; + rte_spinlock_unlock(&dev->stats_lock); + } batch->count = nb_rx; dp_packet_batch_init_packet_fields(batch); @@ -2560,7 +3017,8 @@ netdev_dpdk_filter_packet_len(struct netdev_dpdk *dev, struct rte_mbuf **pkts, int cnt = 0; struct rte_mbuf *pkt; - /* Filter oversized packets, unless are marked for TSO. */ + /* Filter oversized packets. The TSO packets are filtered out + * during the offloading preparation for performance reasons. 
*/ for (i = 0; i < pkt_cnt; i++) { pkt = pkts[i]; if (OVS_UNLIKELY((pkt->pkt_len > dev->max_packet_len) @@ -2581,38 +3039,6 @@ netdev_dpdk_filter_packet_len(struct netdev_dpdk *dev, struct rte_mbuf **pkts, return cnt; } -static inline void -netdev_dpdk_vhost_update_tx_counters(struct netdev_dpdk *dev, - struct dp_packet **packets, - int attempted, - struct netdev_dpdk_sw_stats *sw_stats_add) -{ - int dropped = sw_stats_add->tx_mtu_exceeded_drops + - sw_stats_add->tx_qos_drops + - sw_stats_add->tx_failure_drops + - sw_stats_add->tx_invalid_hwol_drops; - struct netdev_stats *stats = &dev->stats; - int sent = attempted - dropped; - int i; - - stats->tx_packets += sent; - stats->tx_dropped += dropped; - - for (i = 0; i < sent; i++) { - stats->tx_bytes += dp_packet_size(packets[i]); - } - - if (OVS_UNLIKELY(dropped || sw_stats_add->tx_retries)) { - struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats; - - sw_stats->tx_retries += sw_stats_add->tx_retries; - sw_stats->tx_failure_drops += sw_stats_add->tx_failure_drops; - sw_stats->tx_mtu_exceeded_drops += sw_stats_add->tx_mtu_exceeded_drops; - sw_stats->tx_qos_drops += sw_stats_add->tx_qos_drops; - sw_stats->tx_invalid_hwol_drops += sw_stats_add->tx_invalid_hwol_drops; - } -} - static void netdev_dpdk_extbuf_free(void *addr OVS_UNUSED, void *opaque) { @@ -2703,15 +3129,24 @@ dpdk_copy_dp_packet_to_mbuf(struct rte_mempool *mp, struct dp_packet *pkt_orig) mbuf_dest->packet_type = pkt_orig->mbuf.packet_type; mbuf_dest->ol_flags |= (pkt_orig->mbuf.ol_flags & ~(RTE_MBUF_F_EXTERNAL | RTE_MBUF_F_INDIRECT)); + mbuf_dest->tso_segsz = pkt_orig->mbuf.tso_segsz; memcpy(&pkt_dest->l2_pad_size, &pkt_orig->l2_pad_size, sizeof(struct dp_packet) - offsetof(struct dp_packet, l2_pad_size)); - if (mbuf_dest->ol_flags & RTE_MBUF_F_TX_L4_MASK) { - mbuf_dest->l2_len = (char *)dp_packet_l3(pkt_dest) - - (char *)dp_packet_eth(pkt_dest); - mbuf_dest->l3_len = (char *)dp_packet_l4(pkt_dest) + if (dp_packet_l3(pkt_dest)) { + if 
(dp_packet_eth(pkt_dest)) { + mbuf_dest->l2_len = (char *) dp_packet_l3(pkt_dest) + - (char *) dp_packet_eth(pkt_dest); + } else { + mbuf_dest->l2_len = 0; + } + if (dp_packet_l4(pkt_dest)) { + mbuf_dest->l3_len = (char *) dp_packet_l4(pkt_dest) - (char *) dp_packet_l3(pkt_dest); + } else { + mbuf_dest->l3_len = 0; + } } return pkt_dest; @@ -2753,11 +3188,20 @@ netdev_dpdk_common_send(struct netdev *netdev, struct dp_packet_batch *batch, struct rte_mbuf **pkts = (struct rte_mbuf **) batch->packets; struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); size_t cnt, pkt_cnt = dp_packet_batch_size(batch); + struct dp_packet *packet; + bool need_copy = false; memset(stats, 0, sizeof *stats); + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { + if (packet->source != DPBUF_DPDK) { + need_copy = true; + break; + } + } + /* Copy dp-packets to mbufs. */ - if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) { + if (OVS_UNLIKELY(need_copy)) { cnt = dpdk_copy_batch_to_mbuf(netdev, batch); stats->tx_failure_drops += pkt_cnt - cnt; pkt_cnt = cnt; @@ -2769,11 +3213,9 @@ netdev_dpdk_common_send(struct netdev *netdev, struct dp_packet_batch *batch, pkt_cnt = cnt; /* Prepare each mbuf for hardware offloading. */ - if (userspace_tso_enabled()) { - cnt = netdev_dpdk_prep_hwol_batch(dev, pkts, pkt_cnt); - stats->tx_invalid_hwol_drops += pkt_cnt - cnt; - pkt_cnt = cnt; - } + cnt = netdev_dpdk_prep_hwol_batch(dev, pkts, pkt_cnt); + stats->tx_invalid_hwol_drops += pkt_cnt - cnt; + pkt_cnt = cnt; /* Apply Quality of Service policy. 
*/ cnt = netdev_dpdk_qos_run(dev, pkts, pkt_cnt, true); @@ -2793,6 +3235,7 @@ netdev_dpdk_vhost_send(struct netdev *netdev, int qid, int vid = netdev_dpdk_get_vid(dev); struct netdev_dpdk_sw_stats stats; struct rte_mbuf **pkts; + int dropped; int retries; batch_cnt = cnt = dp_packet_batch_size(batch); @@ -2812,6 +3255,7 @@ netdev_dpdk_vhost_send(struct netdev *netdev, int qid, } cnt = netdev_dpdk_common_send(netdev, batch, &stats); + dropped = batch_cnt - cnt; pkts = (struct rte_mbuf **) batch->packets; vhost_batch_cnt = cnt; @@ -2842,18 +3286,25 @@ netdev_dpdk_vhost_send(struct netdev *netdev, int qid, rte_spinlock_unlock(&dev->tx_q[qid].tx_lock); stats.tx_failure_drops += cnt; + dropped += cnt; stats.tx_retries = MIN(retries, max_retries); - rte_spinlock_lock(&dev->stats_lock); - netdev_dpdk_vhost_update_tx_counters(dev, batch->packets, batch_cnt, - &stats); - rte_spinlock_unlock(&dev->stats_lock); + if (OVS_UNLIKELY(dropped || stats.tx_retries)) { + struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats; - pkts = (struct rte_mbuf **) batch->packets; - for (int i = 0; i < vhost_batch_cnt; i++) { - rte_pktmbuf_free(pkts[i]); + rte_spinlock_lock(&dev->stats_lock); + dev->stats.tx_dropped += dropped; + sw_stats->tx_retries += stats.tx_retries; + sw_stats->tx_failure_drops += stats.tx_failure_drops; + sw_stats->tx_mtu_exceeded_drops += stats.tx_mtu_exceeded_drops; + sw_stats->tx_qos_drops += stats.tx_qos_drops; + sw_stats->tx_invalid_hwol_drops += stats.tx_invalid_hwol_drops; + rte_spinlock_unlock(&dev->stats_lock); } + pkts = (struct rte_mbuf **) batch->packets; + rte_pktmbuf_free_bulk(pkts, vhost_batch_cnt); + return 0; } @@ -2882,9 +3333,9 @@ netdev_dpdk_eth_send(struct netdev *netdev, int qid, cnt = netdev_dpdk_common_send(netdev, batch, &stats); - dropped = batch_cnt - cnt; - - dropped += netdev_dpdk_eth_tx_burst(dev, qid, pkts, cnt); + dropped = netdev_dpdk_eth_tx_burst(dev, qid, pkts, cnt); + stats.tx_failure_drops += dropped; + dropped += batch_cnt - cnt; if 
(OVS_UNLIKELY(dropped)) { struct netdev_dpdk_sw_stats *sw_stats = dev->sw_stats; @@ -3001,41 +3452,305 @@ netdev_dpdk_set_mtu(struct netdev *netdev, int mtu) return 0; } -static int -netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier); - static int netdev_dpdk_vhost_get_stats(const struct netdev *netdev, struct netdev_stats *stats) { + struct rte_vhost_stat_name *vhost_stats_names = NULL; struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); + struct rte_vhost_stat *vhost_stats = NULL; + int vhost_stats_count; + int err; + int qid; + int vid; ovs_mutex_lock(&dev->mutex); + if (!is_vhost_running(dev)) { + err = EPROTO; + goto out; + } + + vid = netdev_dpdk_get_vid(dev); + + /* We expect all rxqs have the same number of stats, only query rxq0. */ + qid = 0 * VIRTIO_QNUM + VIRTIO_TXQ; + err = rte_vhost_vring_stats_get_names(vid, qid, NULL, 0); + if (err < 0) { + err = EPROTO; + goto out; + } + + vhost_stats_count = err; + vhost_stats_names = xcalloc(vhost_stats_count, sizeof *vhost_stats_names); + vhost_stats = xcalloc(vhost_stats_count, sizeof *vhost_stats); + + err = rte_vhost_vring_stats_get_names(vid, qid, vhost_stats_names, + vhost_stats_count); + if (err != vhost_stats_count) { + err = EPROTO; + goto out; + } + +#define VHOST_RXQ_STATS \ + VHOST_RXQ_STAT(rx_packets, "good_packets") \ + VHOST_RXQ_STAT(rx_bytes, "good_bytes") \ + VHOST_RXQ_STAT(rx_broadcast_packets, "broadcast_packets") \ + VHOST_RXQ_STAT(multicast, "multicast_packets") \ + VHOST_RXQ_STAT(rx_undersized_errors, "undersize_packets") \ + VHOST_RXQ_STAT(rx_1_to_64_packets, "size_64_packets") \ + VHOST_RXQ_STAT(rx_65_to_127_packets, "size_65_127_packets") \ + VHOST_RXQ_STAT(rx_128_to_255_packets, "size_128_255_packets") \ + VHOST_RXQ_STAT(rx_256_to_511_packets, "size_256_511_packets") \ + VHOST_RXQ_STAT(rx_512_to_1023_packets, "size_512_1023_packets") \ + VHOST_RXQ_STAT(rx_1024_to_1522_packets, "size_1024_1518_packets") \ + VHOST_RXQ_STAT(rx_1523_to_max_packets, "size_1519_max_packets") + 
+#define VHOST_RXQ_STAT(MEMBER, NAME) dev->stats.MEMBER = 0; + VHOST_RXQ_STATS; +#undef VHOST_RXQ_STAT + + for (int q = 0; q < dev->up.n_rxq; q++) { + qid = q * VIRTIO_QNUM + VIRTIO_TXQ; + + err = rte_vhost_vring_stats_get(vid, qid, vhost_stats, + vhost_stats_count); + if (err != vhost_stats_count) { + err = EPROTO; + goto out; + } + + for (int i = 0; i < vhost_stats_count; i++) { +#define VHOST_RXQ_STAT(MEMBER, NAME) \ + if (string_ends_with(vhost_stats_names[i].name, NAME)) { \ + dev->stats.MEMBER += vhost_stats[i].value; \ + continue; \ + } + VHOST_RXQ_STATS; +#undef VHOST_RXQ_STAT + } + } + + /* OVS reports 64 bytes and smaller packets into "rx_1_to_64_packets". + * Since vhost only reports good packets and has no error counter, + * rx_undersized_errors is highjacked (see above) to retrieve + * "undersize_packets". */ + dev->stats.rx_1_to_64_packets += dev->stats.rx_undersized_errors; + memset(&dev->stats.rx_undersized_errors, 0xff, + sizeof dev->stats.rx_undersized_errors); + +#define VHOST_RXQ_STAT(MEMBER, NAME) stats->MEMBER = dev->stats.MEMBER; + VHOST_RXQ_STATS; +#undef VHOST_RXQ_STAT + + free(vhost_stats_names); + vhost_stats_names = NULL; + free(vhost_stats); + vhost_stats = NULL; + + /* We expect all txqs have the same number of stats, only query txq0. 
*/ + qid = 0 * VIRTIO_QNUM; + err = rte_vhost_vring_stats_get_names(vid, qid, NULL, 0); + if (err < 0) { + err = EPROTO; + goto out; + } + + vhost_stats_count = err; + vhost_stats_names = xcalloc(vhost_stats_count, sizeof *vhost_stats_names); + vhost_stats = xcalloc(vhost_stats_count, sizeof *vhost_stats); + + err = rte_vhost_vring_stats_get_names(vid, qid, vhost_stats_names, + vhost_stats_count); + if (err != vhost_stats_count) { + err = EPROTO; + goto out; + } + +#define VHOST_TXQ_STATS \ + VHOST_TXQ_STAT(tx_packets, "good_packets") \ + VHOST_TXQ_STAT(tx_bytes, "good_bytes") \ + VHOST_TXQ_STAT(tx_broadcast_packets, "broadcast_packets") \ + VHOST_TXQ_STAT(tx_multicast_packets, "multicast_packets") \ + VHOST_TXQ_STAT(rx_undersized_errors, "undersize_packets") \ + VHOST_TXQ_STAT(tx_1_to_64_packets, "size_64_packets") \ + VHOST_TXQ_STAT(tx_65_to_127_packets, "size_65_127_packets") \ + VHOST_TXQ_STAT(tx_128_to_255_packets, "size_128_255_packets") \ + VHOST_TXQ_STAT(tx_256_to_511_packets, "size_256_511_packets") \ + VHOST_TXQ_STAT(tx_512_to_1023_packets, "size_512_1023_packets") \ + VHOST_TXQ_STAT(tx_1024_to_1522_packets, "size_1024_1518_packets") \ + VHOST_TXQ_STAT(tx_1523_to_max_packets, "size_1519_max_packets") + +#define VHOST_TXQ_STAT(MEMBER, NAME) dev->stats.MEMBER = 0; + VHOST_TXQ_STATS; +#undef VHOST_TXQ_STAT + + for (int q = 0; q < dev->up.n_txq; q++) { + qid = q * VIRTIO_QNUM; + + err = rte_vhost_vring_stats_get(vid, qid, vhost_stats, + vhost_stats_count); + if (err != vhost_stats_count) { + err = EPROTO; + goto out; + } + + for (int i = 0; i < vhost_stats_count; i++) { +#define VHOST_TXQ_STAT(MEMBER, NAME) \ + if (string_ends_with(vhost_stats_names[i].name, NAME)) { \ + dev->stats.MEMBER += vhost_stats[i].value; \ + continue; \ + } + VHOST_TXQ_STATS; +#undef VHOST_TXQ_STAT + } + } + + /* OVS reports 64 bytes and smaller packets into "tx_1_to_64_packets". + * Same as for rx, rx_undersized_errors is highjacked. 
*/ + dev->stats.tx_1_to_64_packets += dev->stats.rx_undersized_errors; + memset(&dev->stats.rx_undersized_errors, 0xff, + sizeof dev->stats.rx_undersized_errors); + +#define VHOST_TXQ_STAT(MEMBER, NAME) stats->MEMBER = dev->stats.MEMBER; + VHOST_TXQ_STATS; +#undef VHOST_TXQ_STAT + rte_spinlock_lock(&dev->stats_lock); - /* Supported Stats */ - stats->rx_packets = dev->stats.rx_packets; - stats->tx_packets = dev->stats.tx_packets; stats->rx_dropped = dev->stats.rx_dropped; stats->tx_dropped = dev->stats.tx_dropped; - stats->multicast = dev->stats.multicast; - stats->rx_bytes = dev->stats.rx_bytes; - stats->tx_bytes = dev->stats.tx_bytes; - stats->rx_errors = dev->stats.rx_errors; - stats->rx_length_errors = dev->stats.rx_length_errors; - - stats->rx_1_to_64_packets = dev->stats.rx_1_to_64_packets; - stats->rx_65_to_127_packets = dev->stats.rx_65_to_127_packets; - stats->rx_128_to_255_packets = dev->stats.rx_128_to_255_packets; - stats->rx_256_to_511_packets = dev->stats.rx_256_to_511_packets; - stats->rx_512_to_1023_packets = dev->stats.rx_512_to_1023_packets; - stats->rx_1024_to_1522_packets = dev->stats.rx_1024_to_1522_packets; - stats->rx_1523_to_max_packets = dev->stats.rx_1523_to_max_packets; - rte_spinlock_unlock(&dev->stats_lock); + err = 0; +out: + + ovs_mutex_unlock(&dev->mutex); + + free(vhost_stats); + free(vhost_stats_names); + + return err; +} + +static int +netdev_dpdk_vhost_get_custom_stats(const struct netdev *netdev, + struct netdev_custom_stats *custom_stats) +{ + struct rte_vhost_stat_name *vhost_stats_names = NULL; + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); + struct rte_vhost_stat *vhost_stats = NULL; + int vhost_rxq_stats_count; + int vhost_txq_stats_count; + int stat_offset; + int err; + int qid; + int vid; + + netdev_dpdk_get_sw_custom_stats(netdev, custom_stats); + stat_offset = custom_stats->size; + + ovs_mutex_lock(&dev->mutex); + + if (!is_vhost_running(dev)) { + goto out; + } + + vid = netdev_dpdk_get_vid(dev); + + qid = 0 * 
VIRTIO_QNUM + VIRTIO_TXQ; + err = rte_vhost_vring_stats_get_names(vid, qid, NULL, 0); + if (err < 0) { + goto out; + } + vhost_rxq_stats_count = err; + + qid = 0 * VIRTIO_QNUM; + err = rte_vhost_vring_stats_get_names(vid, qid, NULL, 0); + if (err < 0) { + goto out; + } + vhost_txq_stats_count = err; + + stat_offset += dev->up.n_rxq * vhost_rxq_stats_count; + stat_offset += dev->up.n_txq * vhost_txq_stats_count; + custom_stats->counters = xrealloc(custom_stats->counters, + stat_offset * + sizeof *custom_stats->counters); + stat_offset = custom_stats->size; + + vhost_stats_names = xcalloc(vhost_rxq_stats_count, + sizeof *vhost_stats_names); + vhost_stats = xcalloc(vhost_rxq_stats_count, sizeof *vhost_stats); + + for (int q = 0; q < dev->up.n_rxq; q++) { + qid = q * VIRTIO_QNUM + VIRTIO_TXQ; + + err = rte_vhost_vring_stats_get_names(vid, qid, vhost_stats_names, + vhost_rxq_stats_count); + if (err != vhost_rxq_stats_count) { + goto out; + } + + err = rte_vhost_vring_stats_get(vid, qid, vhost_stats, + vhost_rxq_stats_count); + if (err != vhost_rxq_stats_count) { + goto out; + } + + for (int i = 0; i < vhost_rxq_stats_count; i++) { + ovs_strlcpy(custom_stats->counters[stat_offset + i].name, + vhost_stats_names[i].name, + NETDEV_CUSTOM_STATS_NAME_SIZE); + custom_stats->counters[stat_offset + i].value = + vhost_stats[i].value; + } + stat_offset += vhost_rxq_stats_count; + } + + free(vhost_stats_names); + vhost_stats_names = NULL; + free(vhost_stats); + vhost_stats = NULL; + + vhost_stats_names = xcalloc(vhost_txq_stats_count, + sizeof *vhost_stats_names); + vhost_stats = xcalloc(vhost_txq_stats_count, sizeof *vhost_stats); + + for (int q = 0; q < dev->up.n_txq; q++) { + qid = q * VIRTIO_QNUM; + + err = rte_vhost_vring_stats_get_names(vid, qid, vhost_stats_names, + vhost_txq_stats_count); + if (err != vhost_txq_stats_count) { + goto out; + } + + err = rte_vhost_vring_stats_get(vid, qid, vhost_stats, + vhost_txq_stats_count); + if (err != vhost_txq_stats_count) { + goto out; 
+ } + + for (int i = 0; i < vhost_txq_stats_count; i++) { + ovs_strlcpy(custom_stats->counters[stat_offset + i].name, + vhost_stats_names[i].name, + NETDEV_CUSTOM_STATS_NAME_SIZE); + custom_stats->counters[stat_offset + i].value = + vhost_stats[i].value; + } + stat_offset += vhost_txq_stats_count; + } + + free(vhost_stats_names); + vhost_stats_names = NULL; + free(vhost_stats); + vhost_stats = NULL; + +out: ovs_mutex_unlock(&dev->mutex); + custom_stats->size = stat_offset; + return 0; } @@ -3082,6 +3797,9 @@ netdev_dpdk_convert_xstats(struct netdev_stats *stats, #undef DPDK_XSTATS } +static int +netdev_dpdk_get_carrier(const struct netdev *netdev, bool *carrier); + static int netdev_dpdk_get_stats(const struct netdev *netdev, struct netdev_stats *stats) { @@ -3326,6 +4044,57 @@ netdev_dpdk_get_features(const struct netdev *netdev, return 0; } +static int +netdev_dpdk_get_speed(const struct netdev *netdev, uint32_t *current, + uint32_t *max) +{ + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); + struct rte_eth_dev_info dev_info; + struct rte_eth_link link; + + ovs_mutex_lock(&dev->mutex); + link = dev->link; + rte_eth_dev_info_get(dev->port_id, &dev_info); + ovs_mutex_unlock(&dev->mutex); + + *current = link.link_speed != RTE_ETH_SPEED_NUM_UNKNOWN + ? 
link.link_speed : 0; + + if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_200G) { + *max = RTE_ETH_SPEED_NUM_200G; + } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_100G) { + *max = RTE_ETH_SPEED_NUM_100G; + } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_56G) { + *max = RTE_ETH_SPEED_NUM_56G; + } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_50G) { + *max = RTE_ETH_SPEED_NUM_50G; + } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_40G) { + *max = RTE_ETH_SPEED_NUM_40G; + } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_25G) { + *max = RTE_ETH_SPEED_NUM_25G; + } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_20G) { + *max = RTE_ETH_SPEED_NUM_20G; + } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_10G) { + *max = RTE_ETH_SPEED_NUM_10G; + } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_5G) { + *max = RTE_ETH_SPEED_NUM_5G; + } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_2_5G) { + *max = RTE_ETH_SPEED_NUM_2_5G; + } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_1G) { + *max = RTE_ETH_SPEED_NUM_1G; + } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_100M || + dev_info.speed_capa & RTE_ETH_LINK_SPEED_100M_HD) { + *max = RTE_ETH_SPEED_NUM_100M; + } else if (dev_info.speed_capa & RTE_ETH_LINK_SPEED_10M || + dev_info.speed_capa & RTE_ETH_LINK_SPEED_10M_HD) { + *max = RTE_ETH_SPEED_NUM_10M; + } else { + *max = 0; + } + + return 0; +} + static struct ingress_policer * netdev_dpdk_policer_construct(uint32_t rate, uint32_t burst) { @@ -3530,6 +4299,7 @@ netdev_dpdk_update_flags__(struct netdev_dpdk *dev, if (NETDEV_UP & on) { rte_spinlock_lock(&dev->stats_lock); memset(&dev->stats, 0, sizeof dev->stats); + memset(dev->sw_stats, 0, sizeof *dev->sw_stats); rte_spinlock_unlock(&dev->stats_lock); } } @@ -3606,6 +4376,15 @@ netdev_dpdk_vhost_user_get_status(const struct netdev *netdev, xasprintf("%d", vring.size)); } + if (userspace_tso_enabled() + && dev->virtio_features_state & OVS_VIRTIO_F_WORKAROUND) { + + smap_add_format(args, "userspace-tso", 
"disabled"); + } + + smap_add_format(args, "n_rxq", "%d", netdev->n_rxq); + smap_add_format(args, "n_txq", "%d", netdev->n_txq); + ovs_mutex_unlock(&dev->mutex); return 0; } @@ -3639,8 +4418,12 @@ netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args) { struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); struct rte_eth_dev_info dev_info; + size_t rx_steer_flows_num; + uint64_t rx_steer_flags; + const char *bus_info; uint32_t link_speed; uint32_t dev_flags; + int n_rxq; if (!rte_eth_dev_is_valid_port(dev->port_id)) { return ENODEV; @@ -3651,19 +4434,11 @@ netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args) rte_eth_dev_info_get(dev->port_id, &dev_info); link_speed = dev->link.link_speed; dev_flags = *dev_info.dev_flags; + bus_info = rte_dev_bus_info(dev_info.device); + rx_steer_flags = dev->rx_steer_flags; + rx_steer_flows_num = dev->rx_steer_flows_num; + n_rxq = netdev->n_rxq; ovs_mutex_unlock(&dev->mutex); - const struct rte_bus *bus; - const struct rte_pci_device *pci_dev; - uint16_t vendor_id = RTE_PCI_ANY_ID; - uint16_t device_id = RTE_PCI_ANY_ID; - bus = rte_bus_find_by_device(dev_info.device); - if (bus && !strcmp(bus->name, "pci")) { - pci_dev = RTE_DEV_TO_PCI(dev_info.device); - if (pci_dev) { - vendor_id = pci_dev->id.vendor_id; - device_id = pci_dev->id.device_id; - } - } ovs_mutex_unlock(&dpdk_mutex); smap_add_format(args, "port_no", DPDK_PORT_ID_FMT, dev->port_id); @@ -3680,6 +4455,13 @@ netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args) smap_add_format(args, "max_vfs", "%u", dev_info.max_vfs); smap_add_format(args, "max_vmdq_pools", "%u", dev_info.max_vmdq_pools); + smap_add_format(args, "n_rxq", "%d", netdev->n_rxq); + smap_add_format(args, "n_txq", "%d", netdev->n_txq); + + smap_add(args, "rx_csum_offload", + dev->hw_ol_features & NETDEV_RX_CHECKSUM_OFFLOAD + ? "true" : "false"); + /* Querying the DPDK library for iftype may be done in future, pending * support; cf. RFC 3635 Section 3.2.4. 
*/ enum { IF_TYPE_ETHERNETCSMACD = 6 }; @@ -3687,8 +4469,10 @@ netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args) smap_add_format(args, "if_type", "%"PRIu32, IF_TYPE_ETHERNETCSMACD); smap_add_format(args, "if_descr", "%s %s", rte_version(), dev_info.driver_name); - smap_add_format(args, "pci-vendor_id", "0x%x", vendor_id); - smap_add_format(args, "pci-device_id", "0x%x", device_id); + smap_add_format(args, "bus_info", "bus_name=%s%s%s", + rte_bus_name(rte_dev_bus(dev_info.device)), + bus_info != NULL ? ", " : "", + bus_info != NULL ? bus_info : ""); /* Not all link speeds are defined in the OpenFlow specs e.g. 25 Gbps. * In that case the speed will not be reported as part of the usual @@ -3703,6 +4487,24 @@ netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args) ETH_ADDR_ARGS(dev->hwaddr)); } + if (rx_steer_flags && !rx_steer_flows_num) { + smap_add(args, "rx-steering", "unsupported"); + } else if (rx_steer_flags == DPDK_RX_STEER_LACP) { + smap_add(args, "rx-steering", "rss+lacp"); + } else { + ovs_assert(!rx_steer_flags); + smap_add(args, "rx-steering", "rss"); + } + + if (rx_steer_flags && rx_steer_flows_num) { + smap_add_format(args, "rx_steering_queue", "%d", n_rxq - 1); + if (n_rxq > 2) { + smap_add_format(args, "rss_queues", "0-%d", n_rxq - 2); + } else { + smap_add(args, "rss_queues", "0"); + } + } + return 0; } @@ -3964,6 +4766,7 @@ new_device(int vid) ovs_mutex_lock(&dev->mutex); if (nullable_string_is_equal(ifname, dev->vhost_id)) { uint32_t qp_num = rte_vhost_get_vring_num(vid) / VIRTIO_QNUM; + uint64_t features; /* Get NUMA information */ newnode = rte_vhost_get_numa_node(vid); @@ -3975,6 +4778,8 @@ new_device(int vid) newnode = dev->socket_id; } + dev->virtio_features_state |= OVS_VIRTIO_F_NEGOTIATED; + if (dev->requested_n_txq < qp_num || dev->requested_n_rxq < qp_num || dev->requested_socket_id != newnode @@ -3988,6 +4793,38 @@ new_device(int vid) dev->vhost_reconfigured = true; } + if 
(rte_vhost_get_negotiated_features(vid, &features)) { + VLOG_INFO("Error checking guest features for " + "vHost Device '%s'", dev->vhost_id); + } else { + if (features & (1ULL << VIRTIO_NET_F_GUEST_CSUM)) { + dev->hw_ol_features |= NETDEV_TX_TCP_CKSUM_OFFLOAD; + dev->hw_ol_features |= NETDEV_TX_UDP_CKSUM_OFFLOAD; + dev->hw_ol_features |= NETDEV_TX_SCTP_CKSUM_OFFLOAD; + } + + if (userspace_tso_enabled() + && dev->virtio_features_state & OVS_VIRTIO_F_CLEAN) { + + if (features & (1ULL << VIRTIO_NET_F_GUEST_TSO4) + && features & (1ULL << VIRTIO_NET_F_GUEST_TSO6)) { + + dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD; + VLOG_DBG("%s: TSO enabled on vhost port", + netdev_get_name(&dev->up)); + } else { + VLOG_WARN("%s: Tx TSO offload is not supported.", + netdev_get_name(&dev->up)); + } + } + } + + /* There is no support in virtio net to offload IPv4 csum, + * but the vhost library handles IPv4 csum offloading fine. */ + dev->hw_ol_features |= NETDEV_TX_IPV4_CKSUM_OFFLOAD; + + netdev_dpdk_update_netdev_flags(dev); + ovsrcu_index_set(&dev->vid, vid); exists = true; @@ -4051,6 +4888,10 @@ destroy_device(int vid) dev->up.n_rxq * sizeof *dev->vhost_rxq_enabled); netdev_dpdk_txq_map_clear(dev); + /* Clear offload capabilities before next new_device. 
*/ + dev->hw_ol_features = 0; + netdev_dpdk_update_netdev_flags(dev); + netdev_change_seq_changed(&dev->up); ovs_mutex_unlock(&dev->mutex); exists = true; @@ -4077,30 +4918,38 @@ destroy_device(int vid) } } -static int -vring_state_changed(int vid, uint16_t queue_id, int enable) +static struct mpsc_queue vhost_state_change_queue + = MPSC_QUEUE_INITIALIZER(&vhost_state_change_queue); +static atomic_uint64_t vhost_state_change_queue_size; + +struct vhost_state_change { + struct mpsc_queue_node node; + char ifname[IF_NAME_SZ]; + uint16_t queue_id; + int enable; +}; + +static void +vring_state_changed__(struct vhost_state_change *sc) { struct netdev_dpdk *dev; bool exists = false; - int qid = queue_id / VIRTIO_QNUM; - bool is_rx = (queue_id % VIRTIO_QNUM) == VIRTIO_TXQ; - char ifname[IF_NAME_SZ]; - - rte_vhost_get_ifname(vid, ifname, sizeof ifname); + int qid = sc->queue_id / VIRTIO_QNUM; + bool is_rx = (sc->queue_id % VIRTIO_QNUM) == VIRTIO_TXQ; ovs_mutex_lock(&dpdk_mutex); LIST_FOR_EACH (dev, list_node, &dpdk_list) { ovs_mutex_lock(&dev->mutex); - if (nullable_string_is_equal(ifname, dev->vhost_id)) { + if (nullable_string_is_equal(sc->ifname, dev->vhost_id)) { if (is_rx) { bool old_state = dev->vhost_rxq_enabled[qid]; - dev->vhost_rxq_enabled[qid] = enable != 0; + dev->vhost_rxq_enabled[qid] = sc->enable != 0; if (old_state != dev->vhost_rxq_enabled[qid]) { netdev_change_seq_changed(&dev->up); } } else { - if (enable) { + if (sc->enable) { dev->tx_q[qid].map = qid; } else { dev->tx_q[qid].map = OVS_VHOST_QUEUE_DISABLED; @@ -4117,11 +4966,69 @@ vring_state_changed(int vid, uint16_t queue_id, int enable) if (exists) { VLOG_INFO("State of queue %d ( %s_qid %d ) of vhost device '%s' " - "changed to \'%s\'", queue_id, is_rx == true ? "rx" : "tx", - qid, ifname, (enable == 1) ? "enabled" : "disabled"); + "changed to \'%s\'", sc->queue_id, is_rx ? "rx" : "tx", + qid, sc->ifname, sc->enable == 1 ? 
"enabled" : "disabled"); } else { - VLOG_INFO("vHost Device '%s' not found", ifname); - return -1; + VLOG_INFO("vHost Device '%s' not found", sc->ifname); + } +} + +#define NETDEV_DPDK_VHOST_EVENTS_BACKOFF_MIN 1 +#define NETDEV_DPDK_VHOST_EVENTS_BACKOFF_MAX 64 +static void * +netdev_dpdk_vhost_events_main(void *arg OVS_UNUSED) +{ + mpsc_queue_acquire(&vhost_state_change_queue); + + for (;;) { + struct mpsc_queue_node *node; + uint64_t backoff; + + backoff = NETDEV_DPDK_VHOST_EVENTS_BACKOFF_MIN; + while (mpsc_queue_tail(&vhost_state_change_queue) == NULL) { + xnanosleep(backoff * 1E6); + if (backoff < NETDEV_DPDK_VHOST_EVENTS_BACKOFF_MAX) { + backoff <<= 1; + } + } + + MPSC_QUEUE_FOR_EACH_POP (node, &vhost_state_change_queue) { + struct vhost_state_change *sc; + + sc = CONTAINER_OF(node, struct vhost_state_change, node); + vring_state_changed__(sc); + free(sc); + atomic_count_dec64(&vhost_state_change_queue_size); + } + } + + OVS_NOT_REACHED(); + mpsc_queue_release(&vhost_state_change_queue); + + return NULL; +} + +static int +vring_state_changed(int vid, uint16_t queue_id, int enable) +{ + static struct vlog_rate_limit vhost_rl = VLOG_RATE_LIMIT_INIT(5, 5); + struct vhost_state_change *sc; + + sc = xmalloc(sizeof *sc); + if (!rte_vhost_get_ifname(vid, sc->ifname, sizeof sc->ifname)) { + uint64_t queue_size; + + sc->queue_id = queue_id; + sc->enable = enable; + mpsc_queue_insert(&vhost_state_change_queue, &sc->node); + queue_size = atomic_count_inc64(&vhost_state_change_queue_size); + if (queue_size >= 1000) { + VLOG_WARN_RL(&vhost_rl, "vring state change queue has %"PRIu64" " + "entries. Last update was for socket %s.", queue_size, + sc->ifname); + } + } else { + free(sc); } return 0; @@ -4154,6 +5061,45 @@ destroy_connection(int vid) dev->requested_n_txq = qp_num; netdev_request_reconfigure(&dev->up); } + + if (!(dev->virtio_features_state & OVS_VIRTIO_F_NEGOTIATED)) { + /* The socket disconnected before reaching new_device. 
It + * likely means that the guest did not agree with the virtio + * features. */ + VLOG_WARN_RL(&rl, "Connection on socket '%s' closed during " + "initialization.", dev->vhost_id); + } + if (!(dev->virtio_features_state & OVS_VIRTIO_F_RECONF_PENDING)) { + switch (dev->virtio_features_state) { + case OVS_VIRTIO_F_CLEAN: + dev->virtio_features_state = OVS_VIRTIO_F_WORKAROUND; + break; + + case OVS_VIRTIO_F_WORKAROUND: + dev->virtio_features_state = OVS_VIRTIO_F_CLEAN; + break; + + case OVS_VIRTIO_F_CLEAN_NEGOTIATED: + /* The virtio features were clean and got accepted by the + * guest. We expect it will be the case in the future and + * change nothing. */ + break; + + case OVS_VIRTIO_F_WORKAROUND_NEGOTIATED: + /* Let's try to go with clean virtio features on a next + * connection. */ + dev->virtio_features_state = OVS_VIRTIO_F_CLEAN; + break; + + default: + OVS_NOT_REACHED(); + } + if (!(dev->virtio_features_state & OVS_VIRTIO_F_NEGOTIATED)) { + dev->virtio_features_state |= OVS_VIRTIO_F_RECONF_PENDING; + netdev_request_reconfigure(&dev->up); + } + } + ovs_mutex_unlock(&dev->mutex); exists = true; break; @@ -4169,12 +5115,6 @@ destroy_connection(int vid) } } -static -void vhost_guest_notified(int vid OVS_UNUSED) -{ - COVERAGE_INC(vhost_notification); -} - /* * Retrieve the DPDK virtio device ID (vid) associated with a vhostuser * or vhostuserclient netdev. 
@@ -4194,12 +5134,6 @@ netdev_dpdk_get_vid(const struct netdev_dpdk *dev) return ovsrcu_index_get(&dev->vid); } -struct ingress_policer * -netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev) -{ - return ovsrcu_get(struct ingress_policer *, &dev->ingress_policer); -} - static int netdev_dpdk_class_init(void) { @@ -4223,6 +5157,8 @@ netdev_dpdk_class_init(void) "[netdev]", 0, 1, netdev_dpdk_get_mempool_info, NULL); + netdev_dpdk_reset_seq = seq_create(); + netdev_dpdk_last_reset_seq = seq_read(netdev_dpdk_reset_seq); ret = rte_eth_dev_callback_register(RTE_ETH_ALL, RTE_ETH_EVENT_INTR_RESET, dpdk_eth_event_callback, NULL); @@ -4237,8 +5173,27 @@ netdev_dpdk_class_init(void) return 0; } +static int +netdev_dpdk_vhost_class_init(void) +{ + static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; + + if (ovsthread_once_start(&once)) { + ovs_thread_create("ovs_vhost", netdev_dpdk_vhost_events_main, NULL); + ovsthread_once_done(&once); + } + + return 0; +} + /* QoS Functions */ +struct ingress_policer * +netdev_dpdk_get_ingress_policer(const struct netdev_dpdk *dev) +{ + return ovsrcu_get(struct ingress_policer *, &dev->ingress_policer); +} + /* * Initialize QoS configuration operations. 
*/ @@ -4933,32 +5888,238 @@ static const struct dpdk_qos_ops trtcm_policer_ops = { .qos_queue_dump_state_init = trtcm_policer_qos_queue_dump_state_init }; +static int +dpdk_rx_steer_add_flow(struct netdev_dpdk *dev, + const struct rte_flow_item items[], + const char *desc) +{ + const struct rte_flow_attr attr = { .ingress = 1 }; + const struct rte_flow_action actions[] = { + { + .type = RTE_FLOW_ACTION_TYPE_QUEUE, + .conf = &(const struct rte_flow_action_queue) { + .index = dev->up.n_rxq - 1, + }, + }, + { .type = RTE_FLOW_ACTION_TYPE_END }, + }; + struct rte_flow_error error; + struct rte_flow *flow; + size_t num; + int err; + + set_error(&error, RTE_FLOW_ERROR_TYPE_NONE); + err = rte_flow_validate(dev->port_id, &attr, items, actions, &error); + if (err) { + VLOG_WARN("%s: rx-steering: device does not support %s flow: %s", + netdev_get_name(&dev->up), desc, + error.message ? error.message : ""); + goto out; + } + + set_error(&error, RTE_FLOW_ERROR_TYPE_NONE); + flow = rte_flow_create(dev->port_id, &attr, items, actions, &error); + if (flow == NULL) { + VLOG_WARN("%s: rx-steering: failed to add %s flow: %s", + netdev_get_name(&dev->up), desc, + error.message ? 
error.message : ""); + err = rte_errno; + goto out; + } + + num = dev->rx_steer_flows_num + 1; + dev->rx_steer_flows = xrealloc(dev->rx_steer_flows, num * sizeof flow); + dev->rx_steer_flows[dev->rx_steer_flows_num] = flow; + dev->rx_steer_flows_num = num; + + VLOG_INFO("%s: rx-steering: redirected %s traffic to rx queue %d", + netdev_get_name(&dev->up), desc, dev->up.n_rxq - 1); +out: + return err; +} + +#define RETA_CONF_SIZE (RTE_ETH_RSS_RETA_SIZE_512 / RTE_ETH_RETA_GROUP_SIZE) + +static int +dpdk_rx_steer_rss_configure(struct netdev_dpdk *dev, int rss_n_rxq) +{ + struct rte_eth_rss_reta_entry64 reta_conf[RETA_CONF_SIZE]; + struct rte_eth_dev_info info; + int err; + + rte_eth_dev_info_get(dev->port_id, &info); + + if (info.reta_size % rss_n_rxq != 0 && + info.reta_size < RTE_ETH_RSS_RETA_SIZE_128) { + /* + * Some drivers set reta_size equal to the total number of rxqs that + * are configured when it is a power of two. Since we are actually + * reconfiguring the redirection table to exclude the last rxq, we may + * end up with an imbalanced redirection table. For example, such + * configuration: + * + * options:n_rxq=3 options:rx-steering=rss+lacp + * + * Will actually configure 4 rxqs on the NIC, and the default reta to: + * + * [0, 1, 2, 3] + * + * And dpdk_rx_steer_rss_configure() will reconfigure reta to: + * + * [0, 1, 2, 0] + * + * Causing queue 0 to receive twice as much traffic as queues 1 and 2. + * + * Work around that corner case by forcing a bigger redirection table + * size to 128 entries when reta_size is not a multiple of rss_n_rxq + * and when reta_size is less than 128. This value seems to be + * supported by most of the drivers that also support rte_flow. 
+ */ + info.reta_size = RTE_ETH_RSS_RETA_SIZE_128; + } + + memset(reta_conf, 0, sizeof reta_conf); + for (uint16_t i = 0; i < info.reta_size; i++) { + uint16_t idx = i / RTE_ETH_RETA_GROUP_SIZE; + uint16_t shift = i % RTE_ETH_RETA_GROUP_SIZE; + + reta_conf[idx].mask |= 1ULL << shift; + reta_conf[idx].reta[shift] = i % rss_n_rxq; + } + + err = rte_eth_dev_rss_reta_update(dev->port_id, reta_conf, info.reta_size); + if (err < 0) { + VLOG_WARN("%s: failed to configure RSS redirection table: err=%d", + netdev_get_name(&dev->up), err); + } + + return err; +} + +static int +dpdk_rx_steer_configure(struct netdev_dpdk *dev) +{ + int err = 0; + + if (dev->up.n_rxq < 2) { + err = ENOTSUP; + VLOG_WARN("%s: rx-steering: not enough available rx queues", + netdev_get_name(&dev->up)); + goto out; + } + + if (dev->requested_rx_steer_flags & DPDK_RX_STEER_LACP) { + const struct rte_flow_item items[] = { + { + .type = RTE_FLOW_ITEM_TYPE_ETH, + .spec = &(const struct rte_flow_item_eth){ + .type = htons(ETH_TYPE_LACP), + }, + .mask = &(const struct rte_flow_item_eth){ + .type = htons(0xffff), + }, + }, + { .type = RTE_FLOW_ITEM_TYPE_END }, + }; + err = dpdk_rx_steer_add_flow(dev, items, "lacp"); + if (err) { + goto out; + } + } + + if (dev->rx_steer_flows_num) { + /* Reconfigure RSS reta in all but the rx steering queue. 
*/ + err = dpdk_rx_steer_rss_configure(dev, dev->up.n_rxq - 1); + if (err) { + goto out; + } + if (dev->up.n_rxq == 2) { + VLOG_INFO("%s: rx-steering: redirected other traffic to " + "rx queue 0", netdev_get_name(&dev->up)); + } else { + VLOG_INFO("%s: rx-steering: applied rss on rx queues 0-%u", + netdev_get_name(&dev->up), dev->up.n_rxq - 2); + } + } + +out: + return err; +} + +static void +dpdk_rx_steer_unconfigure(struct netdev_dpdk *dev) +{ + struct rte_flow_error error; + + if (!dev->rx_steer_flows_num) { + return; + } + + VLOG_DBG("%s: rx-steering: reset flows", netdev_get_name(&dev->up)); + + for (int i = 0; i < dev->rx_steer_flows_num; i++) { + set_error(&error, RTE_FLOW_ERROR_TYPE_NONE); + if (rte_flow_destroy(dev->port_id, dev->rx_steer_flows[i], &error)) { + VLOG_WARN("%s: rx-steering: failed to destroy flow: %s", + netdev_get_name(&dev->up), + error.message ? error.message : ""); + } + } + free(dev->rx_steer_flows); + dev->rx_steer_flows_num = 0; + dev->rx_steer_flows = NULL; + /* + * Most DPDK drivers seem to reset their RSS redirection table in + * rte_eth_dev_configure() or rte_eth_dev_start(), both of which are + * called in dpdk_eth_dev_init(). No need to explicitly reset it. 
+ */ +} + static int netdev_dpdk_reconfigure(struct netdev *netdev) { struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); + bool pending_reset; + bool try_rx_steer; int err = 0; ovs_mutex_lock(&dev->mutex); + try_rx_steer = dev->requested_rx_steer_flags != 0; + dev->requested_n_rxq = dev->user_n_rxq; + if (try_rx_steer) { + dev->requested_n_rxq += 1; + } + + atomic_read_relaxed(&netdev_dpdk_pending_reset[dev->port_id], + &pending_reset); + if (netdev->n_txq == dev->requested_n_txq && netdev->n_rxq == dev->requested_n_rxq + && dev->rx_steer_flags == dev->requested_rx_steer_flags && dev->mtu == dev->requested_mtu && dev->lsc_interrupt_mode == dev->requested_lsc_interrupt_mode && dev->rxq_size == dev->requested_rxq_size && dev->txq_size == dev->requested_txq_size && eth_addr_equals(dev->hwaddr, dev->requested_hwaddr) && dev->socket_id == dev->requested_socket_id - && dev->started && !dev->reset_needed) { + && dev->started && !pending_reset) { /* Reconfiguration is unnecessary */ goto out; } - if (dev->reset_needed) { +retry: + dpdk_rx_steer_unconfigure(dev); + + if (pending_reset) { + /* + * Set false before reset to avoid missing a new reset interrupt event + * in a race with event callback. 
+ */ + atomic_store_relaxed(&netdev_dpdk_pending_reset[dev->port_id], false); rte_eth_dev_reset(dev->port_id); if_notifier_manual_report(); - dev->reset_needed = false; } else { rte_eth_dev_stop(dev->port_id); } @@ -4979,6 +6140,7 @@ netdev_dpdk_reconfigure(struct netdev *netdev) dev->txq_size = dev->requested_txq_size; rte_free(dev->tx_q); + dev->tx_q = NULL; if (!eth_addr_equals(dev->hwaddr, dev->requested_hwaddr)) { err = netdev_dpdk_set_etheraddr__(dev, dev->requested_hwaddr); @@ -4988,15 +6150,7 @@ netdev_dpdk_reconfigure(struct netdev *netdev) } err = dpdk_eth_dev_init(dev); - if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) { - netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO; - netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM; - netdev->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM; - netdev->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM; - if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) { - netdev->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM; - } - } + netdev_dpdk_update_netdev_flags(dev); /* If both requested and actual hwaddr were previously * unset (initialized to 0), then first device init above @@ -5010,6 +6164,23 @@ netdev_dpdk_reconfigure(struct netdev *netdev) */ dev->requested_hwaddr = dev->hwaddr; + if (try_rx_steer) { + err = dpdk_rx_steer_configure(dev); + if (err) { + /* No hw support, disable & recover gracefully. */ + try_rx_steer = false; + /* + * The extra queue must be explicitly removed here to ensure that + * it is unconfigured immediately. 
+ */ + dev->requested_n_rxq = dev->user_n_rxq; + goto retry; + } + } else { + VLOG_INFO("%s: rx-steering: default rss", netdev_get_name(&dev->up)); + } + dev->rx_steer_flags = dev->requested_rx_steer_flags; + dev->tx_q = netdev_dpdk_alloc_txq(netdev->n_txq); if (!dev->tx_q) { err = ENOMEM; @@ -5038,10 +6209,10 @@ dpdk_vhost_reconfigure_helper(struct netdev_dpdk *dev) dev->tx_q[0].map = 0; } - if (userspace_tso_enabled()) { - dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD; - VLOG_DBG("%s: TSO enabled on vhost port", netdev_get_name(&dev->up)); - } + rte_spinlock_lock(&dev->stats_lock); + memset(&dev->stats, 0, sizeof dev->stats); + memset(dev->sw_stats, 0, sizeof *dev->sw_stats); + rte_spinlock_unlock(&dev->stats_lock); netdev_dpdk_remap_txqs(dev); @@ -5063,6 +6234,8 @@ dpdk_vhost_reconfigure_helper(struct netdev_dpdk *dev) } } + netdev_dpdk_update_netdev_flags(dev); + return 0; } @@ -5083,9 +6256,28 @@ static int netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev) { struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); + bool unregister = false; + char *vhost_id; int err; - uint64_t vhost_flags = 0; - uint64_t vhost_unsup_flags; + + ovs_mutex_lock(&dev->mutex); + + if (dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT && dev->vhost_id + && dev->virtio_features_state & OVS_VIRTIO_F_RECONF_PENDING) { + + /* This vhost-user port was registered to the vhost library already, + * but a socket disconnection happened and configuration must be + * re-evaluated wrt dev->virtio_features_state. */ + dev->vhost_driver_flags &= ~RTE_VHOST_USER_CLIENT; + vhost_id = dev->vhost_id; + unregister = true; + } + + ovs_mutex_unlock(&dev->mutex); + + if (unregister) { + dpdk_vhost_driver_unregister(dev, vhost_id); + } ovs_mutex_lock(&dev->mutex); @@ -5095,24 +6287,35 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev) * 2. A path has been specified. 
*/ if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT) && dev->vhost_id) { + uint64_t virtio_unsup_features = 0; + uint64_t vhost_flags = 0; + bool enable_tso; + + enable_tso = userspace_tso_enabled() + && dev->virtio_features_state & OVS_VIRTIO_F_CLEAN; + dev->virtio_features_state &= ~OVS_VIRTIO_F_RECONF_PENDING; + /* Register client-mode device. */ vhost_flags |= RTE_VHOST_USER_CLIENT; + /* Extended per vq statistics. */ + vhost_flags |= RTE_VHOST_USER_NET_STATS_ENABLE; + /* There is no support for multi-segments buffers. */ vhost_flags |= RTE_VHOST_USER_LINEARBUF_SUPPORT; /* Enable IOMMU support, if explicitly requested. */ - if (dpdk_vhost_iommu_enabled()) { + if (vhost_iommu_enabled) { vhost_flags |= RTE_VHOST_USER_IOMMU_SUPPORT; } /* Enable POSTCOPY support, if explicitly requested. */ - if (dpdk_vhost_postcopy_enabled()) { + if (vhost_postcopy_enabled) { vhost_flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT; } /* Enable External Buffers if TCP Segmentation Offload is enabled. */ - if (userspace_tso_enabled()) { + if (enable_tso) { vhost_flags |= RTE_VHOST_USER_EXTBUF_SUPPORT; } @@ -5137,23 +6340,23 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev) goto unlock; } - if (userspace_tso_enabled()) { - netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO; - netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM; - netdev->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM; - netdev->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM; - netdev->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM; - vhost_unsup_flags = 1ULL << VIRTIO_NET_F_HOST_ECN - | 1ULL << VIRTIO_NET_F_HOST_UFO; + if (enable_tso) { + virtio_unsup_features = 1ULL << VIRTIO_NET_F_HOST_ECN + | 1ULL << VIRTIO_NET_F_HOST_UFO; + VLOG_DBG("%s: TSO enabled on vhost port", + netdev_get_name(&dev->up)); } else { - /* This disables checksum offloading and all the features - * that depends on it (TSO, UFO, ECN) according to virtio - * specification. 
*/ - vhost_unsup_flags = 1ULL << VIRTIO_NET_F_CSUM; + /* Advertise checksum offloading to the guest, but explicitly + * disable TSO and friends. + * NOTE: we can't disable HOST_ECN which may have been wrongly + * negotiated by a running guest. */ + virtio_unsup_features = 1ULL << VIRTIO_NET_F_HOST_TSO4 + | 1ULL << VIRTIO_NET_F_HOST_TSO6 + | 1ULL << VIRTIO_NET_F_HOST_UFO; } err = rte_vhost_driver_disable_features(dev->vhost_id, - vhost_unsup_flags); + virtio_unsup_features); if (err) { VLOG_ERR("rte_vhost_driver_disable_features failed for " "vhost user client port: %s\n", dev->up.name); @@ -5214,6 +6417,13 @@ netdev_dpdk_flow_api_supported(struct netdev *netdev) dev = netdev_dpdk_cast(netdev); ovs_mutex_lock(&dev->mutex); if (dev->type == DPDK_DEV_ETH) { + if (dev->requested_rx_steer_flags) { + VLOG_WARN("%s: rx-steering is mutually exclusive with hw-offload," + " falling back to default rss mode", + netdev_get_name(netdev)); + dev->requested_rx_steer_flags = 0; + netdev_request_reconfigure(netdev); + } /* TODO: Check if we able to offload some minimal flow. */ ret = true; } @@ -5389,8 +6599,18 @@ netdev_dpdk_rte_flow_tunnel_item_release(struct netdev *netdev, #endif /* ALLOW_EXPERIMENTAL_API */ static void -parse_user_mempools_list(const char *mtus) +parse_mempool_config(const struct smap *ovs_other_config) { + per_port_memory = smap_get_bool(ovs_other_config, + "per-port-memory", false); + VLOG_INFO("Per port memory for DPDK devices %s.", + per_port_memory ? 
"enabled" : "disabled"); +} + +static void +parse_user_mempools_list(const struct smap *ovs_other_config) +{ + const char *mtus = smap_get(ovs_other_config, "shared-mempool-config"); char *list, *copy, *key, *value; int error = 0; @@ -5438,11 +6658,79 @@ parse_user_mempools_list(const char *mtus) free(copy); } +static int +process_vhost_flags(char *flag, const char *default_val, int size, + const struct smap *ovs_other_config, + char **new_val) +{ + const char *val; + int changed = 0; + + val = smap_get(ovs_other_config, flag); + + /* Process the vhost-sock-dir flag if it is provided, otherwise resort to + * default value. + */ + if (val && (strlen(val) <= size)) { + changed = 1; + *new_val = xstrdup(val); + VLOG_INFO("User-provided %s in use: %s", flag, *new_val); + } else { + VLOG_INFO("No %s provided - defaulting to %s", flag, default_val); + *new_val = xstrdup(default_val); + } + + return changed; +} + +static void +parse_vhost_config(const struct smap *ovs_other_config) +{ + char *sock_dir_subcomponent; + + if (process_vhost_flags("vhost-sock-dir", ovs_rundir(), + NAME_MAX, ovs_other_config, + &sock_dir_subcomponent)) { + struct stat s; + + if (!strstr(sock_dir_subcomponent, "..")) { + vhost_sock_dir = xasprintf("%s/%s", ovs_rundir(), + sock_dir_subcomponent); + + if (stat(vhost_sock_dir, &s)) { + VLOG_ERR("vhost-user sock directory '%s' does not exist.", + vhost_sock_dir); + } + } else { + vhost_sock_dir = xstrdup(ovs_rundir()); + VLOG_ERR("vhost-user sock directory request '%s/%s' has invalid" + "characters '..' - using %s instead.", + ovs_rundir(), sock_dir_subcomponent, ovs_rundir()); + } + free(sock_dir_subcomponent); + } else { + vhost_sock_dir = sock_dir_subcomponent; + } + + vhost_iommu_enabled = smap_get_bool(ovs_other_config, + "vhost-iommu-support", false); + VLOG_INFO("IOMMU support for vhost-user-client %s.", + vhost_iommu_enabled ? 
"enabled" : "disabled"); + + vhost_postcopy_enabled = smap_get_bool(ovs_other_config, + "vhost-postcopy-support", false); + if (vhost_postcopy_enabled && memory_all_locked()) { + VLOG_WARN("vhost-postcopy-support and mlockall are not compatible."); + vhost_postcopy_enabled = false; + } + VLOG_INFO("POSTCOPY support for vhost-user-client %s.", + vhost_postcopy_enabled ? "enabled" : "disabled"); +} + #define NETDEV_DPDK_CLASS_COMMON \ .is_pmd = true, \ .alloc = netdev_dpdk_alloc, \ .dealloc = netdev_dpdk_dealloc, \ - .get_config = netdev_dpdk_get_config, \ .get_numa_id = netdev_dpdk_get_numa_id, \ .set_etheraddr = netdev_dpdk_set_etheraddr, \ .get_etheraddr = netdev_dpdk_get_etheraddr, \ @@ -5471,12 +6759,15 @@ parse_user_mempools_list(const char *mtus) #define NETDEV_DPDK_CLASS_BASE \ NETDEV_DPDK_CLASS_COMMON, \ .init = netdev_dpdk_class_init, \ + .run = netdev_dpdk_run, \ + .wait = netdev_dpdk_wait, \ .destruct = netdev_dpdk_destruct, \ .set_tx_multiq = netdev_dpdk_set_tx_multiq, \ .get_carrier = netdev_dpdk_get_carrier, \ .get_stats = netdev_dpdk_get_stats, \ .get_custom_stats = netdev_dpdk_get_custom_stats, \ .get_features = netdev_dpdk_get_features, \ + .get_speed = netdev_dpdk_get_speed, \ .get_status = netdev_dpdk_get_status, \ .reconfigure = netdev_dpdk_reconfigure, \ .rxq_recv = netdev_dpdk_rxq_recv @@ -5485,6 +6776,7 @@ static const struct netdev_class dpdk_class = { .type = "dpdk", NETDEV_DPDK_CLASS_BASE, .construct = netdev_dpdk_construct, + .get_config = netdev_dpdk_get_config, .set_config = netdev_dpdk_set_config, .send = netdev_dpdk_eth_send, }; @@ -5492,12 +6784,13 @@ static const struct netdev_class dpdk_class = { static const struct netdev_class dpdk_vhost_class = { .type = "dpdkvhostuser", NETDEV_DPDK_CLASS_COMMON, + .init = netdev_dpdk_vhost_class_init, .construct = netdev_dpdk_vhost_construct, .destruct = netdev_dpdk_vhost_destruct, .send = netdev_dpdk_vhost_send, .get_carrier = netdev_dpdk_vhost_get_carrier, .get_stats = 
netdev_dpdk_vhost_get_stats, - .get_custom_stats = netdev_dpdk_get_sw_custom_stats, + .get_custom_stats = netdev_dpdk_vhost_get_custom_stats, .get_status = netdev_dpdk_vhost_user_get_status, .reconfigure = netdev_dpdk_vhost_reconfigure, .rxq_recv = netdev_dpdk_vhost_rxq_recv, @@ -5507,13 +6800,15 @@ static const struct netdev_class dpdk_vhost_class = { static const struct netdev_class dpdk_vhost_client_class = { .type = "dpdkvhostuserclient", NETDEV_DPDK_CLASS_COMMON, + .init = netdev_dpdk_vhost_class_init, .construct = netdev_dpdk_vhost_client_construct, .destruct = netdev_dpdk_vhost_destruct, + .get_config = netdev_dpdk_vhost_client_get_config, .set_config = netdev_dpdk_vhost_client_set_config, .send = netdev_dpdk_vhost_send, .get_carrier = netdev_dpdk_vhost_get_carrier, .get_stats = netdev_dpdk_vhost_get_stats, - .get_custom_stats = netdev_dpdk_get_sw_custom_stats, + .get_custom_stats = netdev_dpdk_vhost_get_custom_stats, .get_status = netdev_dpdk_vhost_user_get_status, .reconfigure = netdev_dpdk_vhost_client_reconfigure, .rxq_recv = netdev_dpdk_vhost_rxq_recv, @@ -5523,10 +6818,10 @@ static const struct netdev_class dpdk_vhost_client_class = { void netdev_dpdk_register(const struct smap *ovs_other_config) { - const char *mempoolcfg = smap_get(ovs_other_config, - "shared-mempool-config"); + parse_mempool_config(ovs_other_config); + parse_user_mempools_list(ovs_other_config); + parse_vhost_config(ovs_other_config); - parse_user_mempools_list(mempoolcfg); netdev_register_provider(&dpdk_class); netdev_register_provider(&dpdk_vhost_class); netdev_register_provider(&dpdk_vhost_client_class); diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h index 7d2f64af23e..86df7a1e83c 100644 --- a/lib/netdev-dpdk.h +++ b/lib/netdev-dpdk.h @@ -52,6 +52,17 @@ netdev_dpdk_rte_flow_query_count(struct netdev *netdev, int netdev_dpdk_get_port_id(struct netdev *netdev); +static inline void +set_error(struct rte_flow_error *error, enum rte_flow_error_type type) +{ + if (!error) { + 
return; + } + error->type = type; + error->cause = NULL; + error->message = NULL; +} + #ifdef ALLOW_EXPERIMENTAL_API int netdev_dpdk_rte_flow_tunnel_decap_set(struct netdev *, @@ -79,17 +90,6 @@ int netdev_dpdk_rte_flow_tunnel_item_release(struct netdev *, #else -static inline void -set_error(struct rte_flow_error *error, enum rte_flow_error_type type) -{ - if (!error) { - return; - } - error->type = type; - error->cause = NULL; - error->message = NULL; -} - static inline int netdev_dpdk_rte_flow_tunnel_decap_set( struct netdev *netdev OVS_UNUSED, @@ -150,11 +150,6 @@ netdev_dpdk_rte_flow_tunnel_item_release( #else -static inline void -netdev_dpdk_register(const struct smap *ovs_other_config OVS_UNUSED) -{ - /* Nothing */ -} static inline void free_dpdk_buf(struct dp_packet *buf OVS_UNUSED) { diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c index 72cb9547110..e8bbf8d514d 100644 --- a/lib/netdev-dummy.c +++ b/lib/netdev-dummy.c @@ -39,11 +39,13 @@ #include "pcap-file.h" #include "openvswitch/poll-loop.h" #include "openvswitch/shash.h" +#include "ovs-router.h" #include "sset.h" #include "stream.h" #include "unaligned.h" #include "timeval.h" #include "unixctl.h" +#include "userspace-tso.h" #include "reconnect.h" VLOG_DEFINE_THIS_MODULE(netdev_dummy); @@ -136,8 +138,7 @@ struct netdev_dummy { struct pcap_file *tx_pcap, *rxq_pcap OVS_GUARDED; - struct in_addr address, netmask; - struct in6_addr ipv6, ipv6_mask; + struct ovs_list addrs OVS_GUARDED; struct ovs_list rxes OVS_GUARDED; /* List of child "netdev_rxq_dummy"s. */ struct hmap offloaded_flows OVS_GUARDED; @@ -148,6 +149,13 @@ struct netdev_dummy { int requested_n_txq OVS_GUARDED; int requested_n_rxq OVS_GUARDED; int requested_numa_id OVS_GUARDED; + + /* Enable netdev IP csum offload. */ + bool ol_ip_csum OVS_GUARDED; + /* Flag RX packet with good csum. */ + bool ol_ip_csum_set_good OVS_GUARDED; + /* Set the segment size for netdev TSO support. 
*/ + int ol_tso_segsz OVS_GUARDED; }; /* Max 'recv_queue_len' in struct netdev_dummy. */ @@ -161,6 +169,12 @@ struct netdev_rxq_dummy { struct seq *seq; /* Reports newly queued packets. */ }; +struct netdev_addr_dummy { + struct in6_addr address; + struct in6_addr netmask; + struct ovs_list node; /* In netdev_dummy's "addrs" list. */ +}; + static unixctl_cb_func netdev_dummy_set_admin_state; static int netdev_dummy_construct(struct netdev *); static void netdev_dummy_queue_packet(struct netdev_dummy *, @@ -169,6 +183,7 @@ static void netdev_dummy_queue_packet(struct netdev_dummy *, static void dummy_packet_stream_close(struct dummy_packet_stream *); static void pkt_list_delete(struct ovs_list *); +static void addr_list_delete(struct ovs_list *); static bool is_dummy_class(const struct netdev_class *class) @@ -204,7 +219,7 @@ dummy_packet_stream_create(struct stream *stream) { struct dummy_packet_stream *s; - s = xzalloc(sizeof *s); + s = xzalloc_cacheline(sizeof *s); dummy_packet_stream_init(s, stream); return s; @@ -350,7 +365,7 @@ dummy_packet_conn_close(struct dummy_packet_conn *conn) pstream_close(pconn->pstream); for (i = 0; i < pconn->n_streams; i++) { dummy_packet_stream_close(pconn->streams[i]); - free(pconn->streams[i]); + free_cacheline(pconn->streams[i]); } free(pconn->streams); pconn->pstream = NULL; @@ -359,7 +374,7 @@ dummy_packet_conn_close(struct dummy_packet_conn *conn) case ACTIVE: dummy_packet_stream_close(rconn->rstream); - free(rconn->rstream); + free_cacheline(rconn->rstream); rconn->rstream = NULL; reconnect_destroy(rconn->reconnect); rconn->reconnect = NULL; @@ -469,7 +484,7 @@ dummy_pconn_run(struct netdev_dummy *dev) pconn->streams = xrealloc(pconn->streams, ((pconn->n_streams + 1) * sizeof s)); - s = xmalloc(sizeof *s); + s = xmalloc_cacheline(sizeof *s); pconn->streams[pconn->n_streams++] = s; dummy_packet_stream_init(s, new_stream); } else if (error != EAGAIN) { @@ -489,7 +504,7 @@ dummy_pconn_run(struct netdev_dummy *dev) 
stream_get_name(s->stream), ovs_retval_to_string(error)); dummy_packet_stream_close(s); - free(s); + free_cacheline(s); pconn->streams[i] = pconn->streams[--pconn->n_streams]; } else { i++; @@ -720,6 +735,7 @@ netdev_dummy_construct(struct netdev *netdev_) dummy_packet_conn_init(&netdev->conn); ovs_list_init(&netdev->rxes); + ovs_list_init(&netdev->addrs); hmap_init(&netdev->offloaded_flows); ovs_mutex_unlock(&netdev->mutex); @@ -756,6 +772,7 @@ netdev_dummy_destruct(struct netdev *netdev_) free(off_flow); } hmap_destroy(&netdev->offloaded_flows); + addr_list_delete(&netdev->addrs); ovs_mutex_unlock(&netdev->mutex); ovs_mutex_destroy(&netdev->mutex); @@ -782,14 +799,29 @@ netdev_dummy_get_config(const struct netdev *dev, struct smap *args) dummy_packet_conn_get_config(&netdev->conn, args); + /* pcap, rxq_pcap and tx_pcap cannot be recovered because filenames have + * been discarded after opening file descriptors */ + + if (netdev->ol_ip_csum) { + smap_add_format(args, "ol_ip_csum", "%s", "true"); + } + + if (netdev->ol_ip_csum_set_good) { + smap_add_format(args, "ol_ip_csum_set_good", "%s", "true"); + } + + if (netdev->ol_tso_segsz && userspace_tso_enabled()) { + smap_add_format(args, "ol_tso_segsz", "%d", netdev->ol_tso_segsz); + } + /* 'dummy-pmd' specific config. 
*/ if (!netdev_is_pmd(dev)) { goto exit; } - smap_add_format(args, "requested_rx_queues", "%d", netdev->requested_n_rxq); - smap_add_format(args, "configured_rx_queues", "%d", dev->n_rxq); - smap_add_format(args, "requested_tx_queues", "%d", netdev->requested_n_txq); - smap_add_format(args, "configured_tx_queues", "%d", dev->n_txq); + + smap_add_format(args, "n_rxq", "%d", netdev->requested_n_rxq); + smap_add_format(args, "n_txq", "%d", netdev->requested_n_txq); + smap_add_format(args, "numa_id", "%d", netdev->requested_numa_id); exit: ovs_mutex_unlock(&netdev->mutex); @@ -803,32 +835,24 @@ netdev_dummy_get_addr_list(const struct netdev *netdev_, struct in6_addr **paddr struct netdev_dummy *netdev = netdev_dummy_cast(netdev_); int cnt = 0, i = 0, err = 0; struct in6_addr *addr, *mask; + struct netdev_addr_dummy *addr_dummy; ovs_mutex_lock(&netdev->mutex); - if (netdev->address.s_addr != INADDR_ANY) { - cnt++; - } - if (ipv6_addr_is_set(&netdev->ipv6)) { - cnt++; - } + cnt = ovs_list_size(&netdev->addrs); if (!cnt) { err = EADDRNOTAVAIL; goto out; } addr = xmalloc(sizeof *addr * cnt); mask = xmalloc(sizeof *mask * cnt); - if (netdev->address.s_addr != INADDR_ANY) { - in6_addr_set_mapped_ipv4(&addr[i], netdev->address.s_addr); - in6_addr_set_mapped_ipv4(&mask[i], netdev->netmask.s_addr); - i++; - } - if (ipv6_addr_is_set(&netdev->ipv6)) { - memcpy(&addr[i], &netdev->ipv6, sizeof *addr); - memcpy(&mask[i], &netdev->ipv6_mask, sizeof *mask); + LIST_FOR_EACH (addr_dummy, node, &netdev->addrs) { + memcpy(&addr[i], &addr_dummy->address, sizeof *addr); + memcpy(&mask[i], &addr_dummy->netmask, sizeof *mask); i++; } + if (paddr) { *paddr = addr; *pmask = mask; @@ -844,14 +868,16 @@ netdev_dummy_get_addr_list(const struct netdev *netdev_, struct in6_addr **paddr } static int -netdev_dummy_set_in4(struct netdev *netdev_, struct in_addr address, +netdev_dummy_add_in4(struct netdev *netdev_, struct in_addr address, struct in_addr netmask) { struct netdev_dummy *netdev = 
netdev_dummy_cast(netdev_); + struct netdev_addr_dummy *addr_dummy = xmalloc(sizeof *addr_dummy); ovs_mutex_lock(&netdev->mutex); - netdev->address = address; - netdev->netmask = netmask; + in6_addr_set_mapped_ipv4(&addr_dummy->address, address.s_addr); + in6_addr_set_mapped_ipv4(&addr_dummy->netmask, netmask.s_addr); + ovs_list_push_back(&netdev->addrs, &addr_dummy->node); netdev_change_seq_changed(netdev_); ovs_mutex_unlock(&netdev->mutex); @@ -859,14 +885,16 @@ netdev_dummy_set_in4(struct netdev *netdev_, struct in_addr address, } static int -netdev_dummy_set_in6(struct netdev *netdev_, struct in6_addr *in6, +netdev_dummy_add_in6(struct netdev *netdev_, struct in6_addr *in6, struct in6_addr *mask) { struct netdev_dummy *netdev = netdev_dummy_cast(netdev_); + struct netdev_addr_dummy *addr_dummy = xmalloc(sizeof *addr_dummy); ovs_mutex_lock(&netdev->mutex); - netdev->ipv6 = *in6; - netdev->ipv6_mask = *mask; + addr_dummy->address = *in6; + addr_dummy->netmask = *mask; + ovs_list_push_back(&netdev->addrs, &addr_dummy->node); netdev_change_seq_changed(netdev_); ovs_mutex_unlock(&netdev->mutex); @@ -910,6 +938,21 @@ netdev_dummy_set_config(struct netdev *netdev_, const struct smap *args, } } + netdev->ol_ip_csum_set_good = smap_get_bool(args, "ol_ip_csum_set_good", + false); + netdev->ol_ip_csum = smap_get_bool(args, "ol_ip_csum", false); + if (netdev->ol_ip_csum) { + netdev_->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM; + } + + if (userspace_tso_enabled()) { + netdev->ol_tso_segsz = smap_get_int(args, "ol_tso_segsz", 0); + if (netdev->ol_tso_segsz) { + netdev_->ol_flags |= (NETDEV_TX_OFFLOAD_TCP_TSO + | NETDEV_TX_OFFLOAD_TCP_CKSUM); + } + } + netdev_change_seq_changed(netdev_); /* 'dummy-pmd' specific config. 
*/ @@ -1088,6 +1131,17 @@ netdev_dummy_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch, netdev->rxq_stats[rxq_->queue_id].bytes += dp_packet_size(packet); netdev->custom_stats[0].value++; netdev->custom_stats[1].value++; + if (netdev->ol_ip_csum_set_good) { + /* The netdev hardware sets the flag when the packet has good csum. */ + dp_packet_ol_set_ip_csum_good(packet); + } + + if (userspace_tso_enabled() && netdev->ol_tso_segsz) { + dp_packet_set_tso_segsz(packet, netdev->ol_tso_segsz); + dp_packet_hwol_set_tcp_seg(packet); + dp_packet_hwol_set_csum_tcp(packet); + } + ovs_mutex_unlock(&netdev->mutex); dp_packet_batch_init_packet(batch, packet); @@ -1143,6 +1197,12 @@ netdev_dummy_send(struct netdev *netdev, int qid, DP_PACKET_BATCH_FOR_EACH(i, packet, batch) { const void *buffer = dp_packet_data(packet); size_t size = dp_packet_size(packet); + bool is_tso; + + ovs_mutex_lock(&dev->mutex); + is_tso = userspace_tso_enabled() && dev->ol_tso_segsz && + dp_packet_hwol_is_tso(packet); + ovs_mutex_unlock(&dev->mutex); if (!dp_packet_is_eth(packet)) { error = EPFNOSUPPORT; @@ -1163,12 +1223,18 @@ netdev_dummy_send(struct netdev *netdev, int qid, if (eth->eth_type == htons(ETH_TYPE_VLAN)) { max_size += VLAN_HEADER_LEN; } - if (size > max_size) { + if (size > max_size && !is_tso) { error = EMSGSIZE; break; } } + if (dp_packet_hwol_tx_ip_csum(packet) && + !dp_packet_ip_checksum_good(packet)) { + dp_packet_ip_set_header_csum(packet, false); + dp_packet_ol_set_ip_csum_good(packet); + } + ovs_mutex_lock(&dev->mutex); dev->stats.tx_packets++; dev->txq_stats[qid].packets++; @@ -1178,7 +1244,10 @@ netdev_dummy_send(struct netdev *netdev, int qid, dummy_packet_conn_send(&dev->conn, buffer, size); /* Reply to ARP requests for 'dev''s assigned IP address. 
*/ - if (dev->address.s_addr) { + struct netdev_addr_dummy *addr_dummy; + LIST_FOR_EACH (addr_dummy, node, &dev->addrs) { + ovs_be32 address = in6_addr_get_mapped_ipv4(&addr_dummy->address); + struct dp_packet dp; struct flow flow; @@ -1186,11 +1255,12 @@ netdev_dummy_send(struct netdev *netdev, int qid, flow_extract(&dp, &flow); if (flow.dl_type == htons(ETH_TYPE_ARP) && flow.nw_proto == ARP_OP_REQUEST - && flow.nw_dst == dev->address.s_addr) { + && flow.nw_dst == address) { struct dp_packet *reply = dp_packet_new(0); compose_arp(reply, ARP_OP_REPLY, dev->hwaddr, flow.dl_src, false, flow.nw_dst, flow.nw_src); netdev_dummy_queue_packet(dev, reply, NULL, 0); + break; } } @@ -1677,6 +1747,16 @@ pkt_list_delete(struct ovs_list *l) } } +static void +addr_list_delete(struct ovs_list *l) +{ + struct netdev_addr_dummy *addr_dummy; + + LIST_FOR_EACH_POP (addr_dummy, node, l) { + free(addr_dummy); + } +} + static struct dp_packet * eth_from_packet(const char *s) { @@ -1718,7 +1798,7 @@ eth_from_flow_str(const char *s, size_t packet_size, packet = dp_packet_new(0); if (packet_size) { - flow_compose(packet, flow, NULL, 0); + flow_compose(packet, flow, NULL, 0, false); if (dp_packet_size(packet) < packet_size) { packet_expand(packet, flow, packet_size); } else if (dp_packet_size(packet) > packet_size){ @@ -1726,7 +1806,7 @@ eth_from_flow_str(const char *s, size_t packet_size, packet = NULL; } } else { - flow_compose(packet, flow, NULL, 64); + flow_compose(packet, flow, NULL, 64, false); } ofpbuf_uninit(&odp_key); @@ -2005,11 +2085,20 @@ netdev_dummy_ip4addr(struct unixctl_conn *conn, int argc OVS_UNUSED, if (netdev && is_dummy_class(netdev->netdev_class)) { struct in_addr ip, mask; + struct in6_addr ip6; + uint32_t plen; char *error; - error = ip_parse_masked(argv[2], &ip.s_addr, &mask.s_addr); + error = ip_parse_cidr(argv[2], &ip.s_addr, &plen); if (!error) { - netdev_dummy_set_in4(netdev, ip, mask); + mask.s_addr = be32_prefix_mask(plen); + netdev_dummy_add_in4(netdev, ip, 
mask); + + /* Insert local route entry for the new address. */ + in6_addr_set_mapped_ipv4(&ip6, ip.s_addr); + ovs_router_force_insert(0, &ip6, plen + 96, true, argv[1], + &in6addr_any, &ip6); + unixctl_command_reply(conn, "OK"); } else { unixctl_command_reply_error(conn, error); @@ -2038,7 +2127,12 @@ netdev_dummy_ip6addr(struct unixctl_conn *conn, int argc OVS_UNUSED, struct in6_addr mask; mask = ipv6_create_mask(plen); - netdev_dummy_set_in6(netdev, &ip6, &mask); + netdev_dummy_add_in6(netdev, &ip6, &mask); + + /* Insert local route entry for the new address. */ + ovs_router_force_insert(0, &ip6, plen, true, argv[1], + &in6addr_any, &ip6); + unixctl_command_reply(conn, "OK"); } else { unixctl_command_reply_error(conn, error); diff --git a/lib/netdev-linux-private.h b/lib/netdev-linux-private.h index deb015bdb80..8e572e3b3b1 100644 --- a/lib/netdev-linux-private.h +++ b/lib/netdev-linux-private.h @@ -50,6 +50,7 @@ struct netdev_rxq_linux { }; int netdev_linux_construct(struct netdev *); +int netdev_linux_get_status(const struct netdev *, struct smap *); void netdev_linux_run(const struct netdev_class *); int get_stats_via_netlink(const struct netdev *netdev_, @@ -92,6 +93,7 @@ struct netdev_linux { enum netdev_features current; /* Cached from ETHTOOL_GSET. */ enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */ enum netdev_features supported; /* Cached from ETHTOOL_GSET. */ + uint32_t current_speed; /* Cached from ETHTOOL_GSET. */ struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */ struct tc *tc; @@ -103,7 +105,7 @@ struct netdev_linux { uint64_t rx_dropped; /* Packets dropped while recv from kernel. */ /* LAG information. */ - bool is_lag_master; /* True if the netdev is a LAG master. */ + bool is_lag_primary; /* True if the netdev is a LAG primary. */ int numa_id; /* NUMA node id. 
*/ diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index cdc66246ced..c316238cd56 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include @@ -484,9 +483,9 @@ static const struct tc_ops *const tcs[] = { NULL }; -static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks); -static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size); -static unsigned int tc_buffer_per_jiffy(unsigned int rate); +static unsigned int tc_ticks_to_bytes(uint64_t rate, unsigned int ticks); +static unsigned int tc_bytes_to_ticks(uint64_t rate, unsigned int size); +static unsigned int tc_buffer_per_jiffy(uint64_t rate); static uint32_t tc_time_to_ticks(uint32_t time); static struct tcmsg *netdev_linux_tc_make_request(const struct netdev *, @@ -494,7 +493,7 @@ static struct tcmsg *netdev_linux_tc_make_request(const struct netdev *, unsigned int flags, struct ofpbuf *); -static int tc_add_policer(struct netdev *, uint32_t kbits_rate, +static int tc_add_policer(struct netdev *, uint64_t kbits_rate, uint32_t kbits_burst, uint32_t kpkts_rate, uint32_t kpkts_burst); @@ -510,12 +509,15 @@ static int tc_delete_class(const struct netdev *, unsigned int handle); static int tc_del_qdisc(struct netdev *netdev); static int tc_query_qdisc(const struct netdev *netdev); +static void tc_policer_init(struct tc_police *tc_police, uint64_t kbits_rate, + uint64_t kbits_burst); void -tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate); +tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate, + uint64_t rate64); static int tc_calc_cell_log(unsigned int mtu); static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu); -static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes); +static int tc_calc_buffer(uint64_t Bps, int mtu, uint64_t burst_bytes); /* This is set pretty low because we probably won't learn anything 
from the @@ -530,8 +532,13 @@ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20); * changes in the device miimon status, so we can use atomic_count. */ static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0); +/* Very old kernels from the 2.6 era don't support vnet headers with the tun + * device. We can detect this while constructing a netdev, but need this for + * packet rx/tx. */ +static bool tap_supports_vnet_hdr = true; + static int netdev_linux_parse_vnet_hdr(struct dp_packet *b); -static void netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu); +static int netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu); static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *, int cmd, const char *cmd_name); static int get_flags(const struct netdev *, unsigned int *flags); @@ -550,6 +557,7 @@ static bool netdev_linux_miimon_enabled(void); static void netdev_linux_miimon_run(void); static void netdev_linux_miimon_wait(void); static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup); +static void netdev_linux_set_ol(struct netdev *netdev); static bool is_tap_netdev(const struct netdev *netdev) @@ -677,26 +685,26 @@ netdev_linux_update_lag(struct rtnetlink_change *change) lag = shash_find_data(&lag_shash, change->ifname); if (!lag) { - struct netdev *master_netdev; - char master_name[IFNAMSIZ]; + struct netdev *primary_netdev; + char primary_name[IFNAMSIZ]; uint32_t block_id; int error = 0; - if (!if_indextoname(change->master_ifindex, master_name)) { + if (!if_indextoname(change->master_ifindex, primary_name)) { return; } - master_netdev = netdev_from_name(master_name); - if (!master_netdev) { + primary_netdev = netdev_from_name(primary_name); + if (!primary_netdev) { return; } - /* If LAG master is not attached to ovs, ingress block on LAG - * members shoud not be updated. 
*/ - if (!master_netdev->auto_classified && - is_netdev_linux_class(master_netdev->netdev_class)) { - block_id = netdev_get_block_id(master_netdev); + /* If LAG primary member is not attached to ovs, + * ingress block on LAG members should not be updated. */ + if (!primary_netdev->auto_classified && + is_netdev_linux_class(primary_netdev->netdev_class)) { + block_id = netdev_get_block_id(primary_netdev); if (!block_id) { - netdev_close(master_netdev); + netdev_close(primary_netdev); return; } @@ -706,7 +714,7 @@ netdev_linux_update_lag(struct rtnetlink_change *change) /* delete ingress block in case it exists */ tc_add_del_qdisc(change->if_index, false, 0, TC_INGRESS); - /* LAG master is linux netdev so add member to same block. */ + /* LAG primary is linux netdev so add member to same block. */ error = tc_add_del_qdisc(change->if_index, true, block_id, TC_INGRESS); if (error) { @@ -717,7 +725,7 @@ netdev_linux_update_lag(struct rtnetlink_change *change) } } - netdev_close(master_netdev); + netdev_close(primary_netdev); } } else if (change->master_ifindex == 0) { /* Check if this was a lag member that has been removed. 
*/ @@ -876,7 +884,7 @@ netdev_linux_update__(struct netdev_linux *dev, } if (change->primary && netdev_linux_kind_is_lag(change->primary)) { - dev->is_lag_master = true; + dev->is_lag_primary = true; } dev->ifindex = change->if_index; @@ -938,14 +946,6 @@ netdev_linux_common_construct(struct netdev *netdev_) netnsid_unset(&netdev->netnsid); ovs_mutex_init(&netdev->mutex); - if (userspace_tso_enabled()) { - netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO; - netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM; - netdev_->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM; - netdev_->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM; - netdev_->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM; - } - return 0; } @@ -959,6 +959,14 @@ netdev_linux_construct(struct netdev *netdev_) return error; } + if (userspace_tso_enabled()) { + /* The AF_PACKET socket interface uses the same option to facilitate + * both csum and segmentation offloading. However, these features can + * be toggled off or on individually at the interface level. The netdev + * flags are set based on the features indicated by ethtool. */ + netdev_linux_set_ol(netdev_); + } + error = get_flags(&netdev->up, &netdev->ifi_flags); if (error == ENODEV) { if (netdev->up.netdev_class != &netdev_internal_class) { @@ -984,9 +992,12 @@ netdev_linux_construct(struct netdev *netdev_) static int netdev_linux_construct_tap(struct netdev *netdev_) { + static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; struct netdev_linux *netdev = netdev_linux_cast(netdev_); static const char tap_dev[] = "/dev/net/tun"; const char *name = netdev_->name; + unsigned long oflags; + unsigned int up; struct ifreq ifr; int error = netdev_linux_common_construct(netdev_); @@ -1004,8 +1015,21 @@ netdev_linux_construct_tap(struct netdev *netdev_) /* Create tap device. 
*/ get_flags(&netdev->up, &netdev->ifi_flags); + + if (ovsthread_once_start(&once)) { + if (ioctl(netdev->tap_fd, TUNGETFEATURES, &up) == -1) { + VLOG_WARN("%s: querying tap features failed: %s", name, + ovs_strerror(errno)); + tap_supports_vnet_hdr = false; + } else if (!(up & IFF_VNET_HDR)) { + VLOG_WARN("TAP interfaces do not support virtio-net headers"); + tap_supports_vnet_hdr = false; + } + ovsthread_once_done(&once); + } + ifr.ifr_flags = IFF_TAP | IFF_NO_PI; - if (userspace_tso_enabled()) { + if (tap_supports_vnet_hdr) { ifr.ifr_flags |= IFF_VNET_HDR; } @@ -1030,21 +1054,23 @@ netdev_linux_construct_tap(struct netdev *netdev_) goto error_close; } + oflags = TUN_F_CSUM; if (userspace_tso_enabled()) { - /* Old kernels don't support TUNSETOFFLOAD. If TUNSETOFFLOAD is - * available, it will return EINVAL when a flag is unknown. - * Therefore, try enabling offload with no flags to check - * if TUNSETOFFLOAD support is available or not. */ - if (ioctl(netdev->tap_fd, TUNSETOFFLOAD, 0) == 0 || errno != EINVAL) { - unsigned long oflags = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6; - - if (ioctl(netdev->tap_fd, TUNSETOFFLOAD, oflags) == -1) { - VLOG_WARN("%s: enabling tap offloading failed: %s", name, - ovs_strerror(errno)); - error = errno; - goto error_close; - } + oflags |= (TUN_F_TSO4 | TUN_F_TSO6); + } + + if (tap_supports_vnet_hdr + && ioctl(netdev->tap_fd, TUNSETOFFLOAD, oflags) == 0) { + netdev_->ol_flags |= (NETDEV_TX_OFFLOAD_IPV4_CKSUM + | NETDEV_TX_OFFLOAD_TCP_CKSUM + | NETDEV_TX_OFFLOAD_UDP_CKSUM); + + if (userspace_tso_enabled()) { + netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO; } + } else { + VLOG_INFO("%s: Disabling checksum and segment offloading due to " + "missing kernel support", name); } netdev->present = true; @@ -1344,18 +1370,23 @@ netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, int mtu, pkt = buffers[i]; } - if (virtio_net_hdr_size && netdev_linux_parse_vnet_hdr(pkt)) { - struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up); - 
struct netdev_linux *netdev = netdev_linux_cast(netdev_); + if (virtio_net_hdr_size) { + int ret = netdev_linux_parse_vnet_hdr(pkt); + if (OVS_UNLIKELY(ret)) { + struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up); + struct netdev_linux *netdev = netdev_linux_cast(netdev_); - /* Unexpected error situation: the virtio header is not present - * or corrupted. Drop the packet but continue in case next ones - * are correct. */ - dp_packet_delete(pkt); - netdev->rx_dropped += 1; - VLOG_WARN_RL(&rl, "%s: Dropped packet: Invalid virtio net header", - netdev_get_name(netdev_)); - continue; + /* Unexpected error situation: the virtio header is not + * present or corrupted or contains unsupported features. + * Drop the packet but continue in case next ones are + * correct. */ + dp_packet_delete(pkt); + netdev->rx_dropped += 1; + VLOG_WARN_RL(&rl, "%s: Dropped packet: vnet header is missing " + "or corrupt: %s", netdev_get_name(netdev_), + ovs_strerror(ret)); + continue; + } } for (cmsg = CMSG_FIRSTHDR(&mmsgs[i].msg_hdr); cmsg; @@ -1413,10 +1444,13 @@ netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu, /* Use the buffer from the allocated packet below to receive MTU * sized packets and an aux_buf for extra TSO data. */ iovlen = IOV_TSO_SIZE; - virtio_net_hdr_size = sizeof(struct virtio_net_hdr); } else { /* Use only the buffer from the allocated packet. 
*/ iovlen = IOV_STD_SIZE; + } + if (OVS_LIKELY(tap_supports_vnet_hdr)) { + virtio_net_hdr_size = sizeof(struct virtio_net_hdr); + } else { virtio_net_hdr_size = 0; } @@ -1462,7 +1496,8 @@ netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu, pkt = buffer; } - if (virtio_net_hdr_size && netdev_linux_parse_vnet_hdr(pkt)) { + if (OVS_LIKELY(virtio_net_hdr_size) && + netdev_linux_parse_vnet_hdr(pkt)) { struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up); struct netdev_linux *netdev = netdev_linux_cast(netdev_); @@ -1561,9 +1596,10 @@ netdev_linux_rxq_drain(struct netdev_rxq *rxq_) } static int -netdev_linux_sock_batch_send(int sock, int ifindex, bool tso, int mtu, - struct dp_packet_batch *batch) +netdev_linux_sock_batch_send(struct netdev *netdev_, int sock, int ifindex, + bool tso, int mtu, struct dp_packet_batch *batch) { + struct netdev_linux *netdev = netdev_linux_cast(netdev_); const size_t size = dp_packet_batch_size(batch); /* We don't bother setting most fields in sockaddr_ll because the * kernel ignores them for SOCK_RAW. */ @@ -1572,26 +1608,36 @@ netdev_linux_sock_batch_send(int sock, int ifindex, bool tso, int mtu, struct mmsghdr *mmsg = xmalloc(sizeof(*mmsg) * size); struct iovec *iov = xmalloc(sizeof(*iov) * size); - struct dp_packet *packet; + int cnt = 0; + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { if (tso) { - netdev_linux_prepend_vnet_hdr(packet, mtu); - } + int ret = netdev_linux_prepend_vnet_hdr(packet, mtu); + + if (OVS_UNLIKELY(ret)) { + netdev->tx_dropped += 1; + VLOG_WARN_RL(&rl, "%s: Prepend vnet hdr failed, packet " + "dropped. 
%s", netdev_get_name(netdev_), + ovs_strerror(ret)); + continue; + } + } - iov[i].iov_base = dp_packet_data(packet); - iov[i].iov_len = dp_packet_size(packet); - mmsg[i].msg_hdr = (struct msghdr) { .msg_name = &sll, - .msg_namelen = sizeof sll, - .msg_iov = &iov[i], - .msg_iovlen = 1 }; + iov[cnt].iov_base = dp_packet_data(packet); + iov[cnt].iov_len = dp_packet_size(packet); + mmsg[cnt].msg_hdr = (struct msghdr) { .msg_name = &sll, + .msg_namelen = sizeof sll, + .msg_iov = &iov[cnt], + .msg_iovlen = 1 }; + cnt++; } int error = 0; - for (uint32_t ofs = 0; ofs < size; ) { + for (uint32_t ofs = 0; ofs < cnt;) { ssize_t retval; do { - retval = sendmmsg(sock, mmsg + ofs, size - ofs, 0); + retval = sendmmsg(sock, mmsg + ofs, cnt - ofs, 0); error = retval < 0 ? errno : 0; } while (error == EINTR); if (error) { @@ -1611,7 +1657,7 @@ netdev_linux_sock_batch_send(int sock, int ifindex, bool tso, int mtu, * on other interface types because we attach a socket filter to the rx * socket. */ static int -netdev_linux_tap_batch_send(struct netdev *netdev_, bool tso, int mtu, +netdev_linux_tap_batch_send(struct netdev *netdev_, int mtu, struct dp_packet_batch *batch) { struct netdev_linux *netdev = netdev_linux_cast(netdev_); @@ -1632,8 +1678,15 @@ netdev_linux_tap_batch_send(struct netdev *netdev_, bool tso, int mtu, ssize_t retval; int error; - if (tso) { - netdev_linux_prepend_vnet_hdr(packet, mtu); + if (OVS_LIKELY(tap_supports_vnet_hdr)) { + error = netdev_linux_prepend_vnet_hdr(packet, mtu); + if (OVS_UNLIKELY(error)) { + netdev->tx_dropped++; + VLOG_WARN_RL(&rl, "%s: Prepend vnet hdr failed, packet " + "dropped. 
%s", netdev_get_name(netdev_), + ovs_strerror(error)); + continue; + } } size = dp_packet_size(packet); @@ -1763,9 +1816,10 @@ netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED, goto free_batch; } - error = netdev_linux_sock_batch_send(sock, ifindex, tso, mtu, batch); + error = netdev_linux_sock_batch_send(netdev_, sock, ifindex, tso, mtu, + batch); } else { - error = netdev_linux_tap_batch_send(netdev_, tso, mtu, batch); + error = netdev_linux_tap_batch_send(netdev_, mtu, batch); } if (error) { if (error == ENOBUFS) { @@ -2156,16 +2210,16 @@ swap_uint64(uint64_t *a, uint64_t *b) * 'src' is allowed to be misaligned. */ static void netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst, - const struct ovs_vport_stats *src) -{ - dst->rx_packets = get_32aligned_u64(&src->rx_packets); - dst->tx_packets = get_32aligned_u64(&src->tx_packets); - dst->rx_bytes = get_32aligned_u64(&src->rx_bytes); - dst->tx_bytes = get_32aligned_u64(&src->tx_bytes); - dst->rx_errors = get_32aligned_u64(&src->rx_errors); - dst->tx_errors = get_32aligned_u64(&src->tx_errors); - dst->rx_dropped = get_32aligned_u64(&src->rx_dropped); - dst->tx_dropped = get_32aligned_u64(&src->tx_dropped); + const struct dpif_netlink_vport *vport) +{ + dst->rx_packets = get_32aligned_u64(&vport->stats->rx_packets); + dst->tx_packets = get_32aligned_u64(&vport->stats->tx_packets); + dst->rx_bytes = get_32aligned_u64(&vport->stats->rx_bytes); + dst->tx_bytes = get_32aligned_u64(&vport->stats->tx_bytes); + dst->rx_errors = get_32aligned_u64(&vport->stats->rx_errors); + dst->tx_errors = get_32aligned_u64(&vport->stats->tx_errors); + dst->rx_dropped = get_32aligned_u64(&vport->stats->rx_dropped); + dst->tx_dropped = get_32aligned_u64(&vport->stats->tx_dropped); dst->multicast = 0; dst->collisions = 0; dst->rx_length_errors = 0; @@ -2179,6 +2233,8 @@ netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst, dst->tx_fifo_errors = 0; dst->tx_heartbeat_errors = 0; dst->tx_window_errors = 0; + 
dst->upcall_packets = vport->upcall_success; + dst->upcall_errors = vport->upcall_fail; } static int @@ -2196,7 +2252,7 @@ get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats) return EOPNOTSUPP; } - netdev_stats_from_ovs_vport_stats(stats, reply.stats); + netdev_stats_from_ovs_vport_stats(stats, &reply); ofpbuf_delete(buf); @@ -2342,11 +2398,151 @@ netdev_internal_get_stats(const struct netdev *netdev_, return error; } +static int +netdev_linux_read_stringset_info(struct netdev_linux *netdev, uint32_t *len) +{ + union { + struct ethtool_cmd ecmd; + struct ethtool_sset_info hdr; + struct { + uint64_t pad[2]; + uint32_t sset_len[1]; + }; + } sset_info; + int error; + + sset_info.hdr.cmd = ETHTOOL_GSSET_INFO; + sset_info.hdr.reserved = 0; + sset_info.hdr.sset_mask = 1ULL << ETH_SS_FEATURES; + + error = netdev_linux_do_ethtool(netdev_get_name(&netdev->up), + (struct ethtool_cmd *) &sset_info, + ETHTOOL_GSSET_INFO, "ETHTOOL_GSSET_INFO"); + if (error) { + return error; + } + if (sset_info.hdr.sset_mask & (1ULL << ETH_SS_FEATURES)) { + *len = sset_info.sset_len[0]; + return 0; + } else { + /* ETH_SS_FEATURES is not supported. 
*/ + return -EOPNOTSUPP; + } +} + + +static int +netdev_linux_read_definitions(struct netdev_linux *netdev, + struct ethtool_gstrings **pstrings) +{ + struct ethtool_gstrings *strings = NULL; + uint32_t len = 0; + int error = 0; + + error = netdev_linux_read_stringset_info(netdev, &len); + if (error) { + return error; + } else if (!len) { + return -EOPNOTSUPP; + } + + strings = xzalloc(sizeof *strings + len * ETH_GSTRING_LEN); + + strings->cmd = ETHTOOL_GSTRINGS; + strings->string_set = ETH_SS_FEATURES; + strings->len = len; + error = netdev_linux_do_ethtool(netdev_get_name(&netdev->up), + (struct ethtool_cmd *) strings, + ETHTOOL_GSTRINGS, "ETHTOOL_GSTRINGS"); + if (error) { + goto out; + } + + for (int i = 0; i < len; i++) { + strings->data[(i + 1) * ETH_GSTRING_LEN - 1] = 0; + } + + *pstrings = strings; + + return 0; +out: + *pstrings = NULL; + free(strings); + return error; +} + +static void +netdev_linux_set_ol(struct netdev *netdev_) +{ + struct netdev_linux *netdev = netdev_linux_cast(netdev_); + struct ethtool_gfeatures *features = NULL; + struct ethtool_gstrings *names = NULL; + int error; + + COVERAGE_INC(netdev_get_ethtool); + + error = netdev_linux_read_definitions(netdev, &names); + if (error) { + return; + } + + features = xzalloc(sizeof *features + + DIV_ROUND_UP(names->len, 32) * + sizeof features->features[0]); + + features->cmd = ETHTOOL_GFEATURES; + features->size = DIV_ROUND_UP(names->len, 32); + error = netdev_linux_do_ethtool(netdev_get_name(netdev_), + (struct ethtool_cmd *) features, + ETHTOOL_GFEATURES, "ETHTOOL_GFEATURES"); + + if (error) { + goto out; + } + +#define FEATURE_WORD(blocks, index, field) ((blocks)[(index) / 32U].field) +#define FEATURE_FIELD_FLAG(index) (1U << (index) % 32U) +#define FEATURE_BIT_IS_SET(blocks, index, field) \ + (FEATURE_WORD(blocks, index, field) & FEATURE_FIELD_FLAG(index)) + + netdev->up.ol_flags = 0; + static const struct { + char *string; + uint32_t value; + } t_list[] = { + {"tx-checksum-ipv4", 
NETDEV_TX_OFFLOAD_IPV4_CKSUM | + NETDEV_TX_OFFLOAD_TCP_CKSUM | + NETDEV_TX_OFFLOAD_UDP_CKSUM}, + {"tx-checksum-ipv6", NETDEV_TX_OFFLOAD_TCP_CKSUM | + NETDEV_TX_OFFLOAD_UDP_CKSUM}, + {"tx-checksum-ip-generic", NETDEV_TX_OFFLOAD_IPV4_CKSUM | + NETDEV_TX_OFFLOAD_TCP_CKSUM | + NETDEV_TX_OFFLOAD_UDP_CKSUM}, + {"tx-checksum-sctp", NETDEV_TX_OFFLOAD_SCTP_CKSUM}, + {"tx-tcp-segmentation", NETDEV_TX_OFFLOAD_TCP_TSO}, + }; + + for (int j = 0; j < ARRAY_SIZE(t_list); j++) { + for (int i = 0; i < names->len; i++) { + char *name = (char *) names->data + i * ETH_GSTRING_LEN; + if (strcmp(t_list[j].string, name) == 0) { + if (FEATURE_BIT_IS_SET(features->features, i, active)) { + netdev_->ol_flags |= t_list[j].value; + } + break; + } + } + } + +out: + free(names); + free(features); +} + static void netdev_linux_read_features(struct netdev_linux *netdev) { struct ethtool_cmd ecmd; - uint32_t speed; int error; if (netdev->cache_valid & VALID_FEATURES) { @@ -2460,20 +2656,20 @@ netdev_linux_read_features(struct netdev_linux *netdev) } /* Current settings. */ - speed = ethtool_cmd_speed(&ecmd); - if (speed == SPEED_10) { + netdev->current_speed = ethtool_cmd_speed(&ecmd); + if (netdev->current_speed == SPEED_10) { netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD; - } else if (speed == SPEED_100) { + } else if (netdev->current_speed == SPEED_100) { netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD; - } else if (speed == SPEED_1000) { + } else if (netdev->current_speed == SPEED_1000) { netdev->current = ecmd.duplex ? 
NETDEV_F_1GB_FD : NETDEV_F_1GB_HD; - } else if (speed == SPEED_10000) { + } else if (netdev->current_speed == SPEED_10000) { netdev->current = NETDEV_F_10GB_FD; - } else if (speed == 40000) { + } else if (netdev->current_speed == 40000) { netdev->current = NETDEV_F_40GB_FD; - } else if (speed == 100000) { + } else if (netdev->current_speed == 100000) { netdev->current = NETDEV_F_100GB_FD; - } else if (speed == 1000000) { + } else if (netdev->current_speed == 1000000) { netdev->current = NETDEV_F_1TB_FD; } else { netdev->current = 0; @@ -2527,6 +2723,40 @@ netdev_linux_get_features(const struct netdev *netdev_, return error; } +static int +netdev_linux_get_speed_locked(struct netdev_linux *netdev, + uint32_t *current, uint32_t *max) +{ + if (netdev_linux_netnsid_is_remote(netdev)) { + *current = *max = 0; + return EOPNOTSUPP; + } + + netdev_linux_read_features(netdev); + if (!netdev->get_features_error) { + *current = netdev->current_speed == SPEED_UNKNOWN + ? 0 : netdev->current_speed; + *max = MIN(UINT32_MAX, + netdev_features_to_bps(netdev->supported, 0) / 1000000ULL); + } else { + *current = *max = 0; + } + return netdev->get_features_error; +} + +static int +netdev_linux_get_speed(const struct netdev *netdev_, uint32_t *current, + uint32_t *max) +{ + struct netdev_linux *netdev = netdev_linux_cast(netdev_); + int error; + + ovs_mutex_lock(&netdev->mutex); + error = netdev_linux_get_speed_locked(netdev, current, max); + ovs_mutex_unlock(&netdev->mutex); + return error; +} + /* Set the features advertised by 'netdev' to 'advertise'. 
*/ static int netdev_linux_set_advertisements(struct netdev *netdev_, @@ -2598,35 +2828,19 @@ netdev_linux_set_advertisements(struct netdev *netdev_, return error; } -static struct tc_police -tc_matchall_fill_police(uint32_t kbits_rate, uint32_t kbits_burst) -{ - unsigned int bsize = MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8; - unsigned int bps = ((uint64_t) kbits_rate * 1000) / 8; - struct tc_police police; - struct tc_ratespec rate; - int mtu = 65535; - - memset(&rate, 0, sizeof rate); - rate.rate = bps; - rate.cell_log = tc_calc_cell_log(mtu); - rate.mpu = ETH_TOTAL_MIN; - - memset(&police, 0, sizeof police); - police.burst = tc_bytes_to_ticks(bps, bsize); - police.action = TC_POLICE_SHOT; - police.rate = rate; - police.mtu = mtu; - - return police; -} - static void nl_msg_act_police_start_nest(struct ofpbuf *request, uint32_t prio, - size_t *offset, size_t *act_offset) + size_t *offset, size_t *act_offset, + bool single_action) { *act_offset = nl_msg_start_nested(request, prio); nl_msg_put_string(request, TCA_ACT_KIND, "police"); + + /* If police action is added independently from filter, we need to + * add action flag according to tc-policy. 
*/ + if (single_action) { + nl_msg_put_act_tc_policy_flag(request); + } *offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS); } @@ -2640,21 +2854,33 @@ nl_msg_act_police_end_nest(struct ofpbuf *request, size_t offset, } static void -nl_msg_put_act_police(struct ofpbuf *request, struct tc_police *police, +nl_msg_put_act_police(struct ofpbuf *request, uint32_t index, + uint64_t kbits_rate, uint64_t kbits_burst, uint64_t pkts_rate, uint64_t pkts_burst, - uint32_t notexceed_act) + uint32_t notexceed_act, bool single_action) { + uint64_t bytes_rate = kbits_rate / 8 * 1000; size_t offset, act_offset; + struct tc_police police; uint32_t prio = 0; - if (!police->rate.rate && !pkts_rate) { + if (!kbits_rate && !pkts_rate) { return; } - nl_msg_act_police_start_nest(request, ++prio, &offset, &act_offset); - if (police->rate.rate) { - tc_put_rtab(request, TCA_POLICE_RATE, &police->rate); + tc_policer_init(&police, kbits_rate, kbits_burst); + police.index = index; + + nl_msg_act_police_start_nest(request, ++prio, &offset, &act_offset, + single_action); + if (police.rate.rate) { + tc_put_rtab(request, TCA_POLICE_RATE, &police.rate, bytes_rate); + } +#ifdef HAVE_TCA_POLICE_PKTRATE64 + if (bytes_rate > UINT32_MAX) { + nl_msg_put_u64(request, TCA_POLICE_RATE64, bytes_rate); } +#endif if (pkts_rate) { uint64_t pkt_burst_ticks; /* Here tc_bytes_to_ticks is used to convert packets rather than bytes @@ -2663,12 +2889,12 @@ nl_msg_put_act_police(struct ofpbuf *request, struct tc_police *police, nl_msg_put_u64(request, TCA_POLICE_PKTRATE64, pkts_rate); nl_msg_put_u64(request, TCA_POLICE_PKTBURST64, pkt_burst_ticks); } - nl_msg_put_unspec(request, TCA_POLICE_TBF, police, sizeof *police); + nl_msg_put_unspec(request, TCA_POLICE_TBF, &police, sizeof police); nl_msg_act_police_end_nest(request, offset, act_offset, notexceed_act); } static int -tc_add_matchall_policer(struct netdev *netdev, uint32_t kbits_rate, +tc_add_matchall_policer(struct netdev *netdev, uint64_t kbits_rate, uint32_t 
kbits_burst, uint32_t kpkts_rate, uint32_t kpkts_burst) { @@ -2676,7 +2902,6 @@ tc_add_matchall_policer(struct netdev *netdev, uint32_t kbits_rate, size_t basic_offset, action_offset; uint16_t prio = TC_RESERVED_PRIORITY_POLICE; int ifindex, err = 0; - struct tc_police pol_act; struct ofpbuf request; struct ofpbuf *reply; struct tcmsg *tcmsg; @@ -2693,19 +2918,27 @@ tc_add_matchall_policer(struct netdev *netdev, uint32_t kbits_rate, tcmsg->tcm_info = tc_make_handle(prio, eth_type); tcmsg->tcm_handle = handle; - pol_act = tc_matchall_fill_police(kbits_rate, kbits_burst); nl_msg_put_string(&request, TCA_KIND, "matchall"); basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS); action_offset = nl_msg_start_nested(&request, TCA_MATCHALL_ACT); - nl_msg_put_act_police(&request, &pol_act, kpkts_rate * 1000, - kpkts_burst * 1000, TC_ACT_UNSPEC); + nl_msg_put_act_police(&request, 0, kbits_rate, kbits_burst, + kpkts_rate * 1000ULL, kpkts_burst * 1000ULL, + TC_ACT_UNSPEC, false); nl_msg_end_nested(&request, action_offset); nl_msg_end_nested(&request, basic_offset); err = tc_transact(&request, &reply); if (!err) { - struct tcmsg *tc = - ofpbuf_at_assert(reply, NLMSG_HDRLEN, sizeof *tc); + struct ofpbuf b = ofpbuf_const_initializer(reply->data, reply->size); + struct nlmsghdr *nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg); + struct tcmsg *tc = ofpbuf_try_pull(&b, sizeof *tc); + + if (!nlmsg || !tc) { + VLOG_ERR_RL(&rl, + "Failed to add match all policer, malformed reply"); + ofpbuf_delete(reply); + return EPROTO; + } ofpbuf_delete(reply); } @@ -2727,7 +2960,7 @@ tc_del_matchall_policer(struct netdev *netdev) } id = tc_make_tcf_id(ifindex, block_id, prio, TC_INGRESS); - err = tc_del_filter(&id); + err = tc_del_filter(&id, "matchall"); if (err) { return err; } @@ -2976,12 +3209,18 @@ netdev_linux_set_qos(struct netdev *netdev_, /* Delete existing qdisc. 
*/ error = tc_del_qdisc(netdev_); if (error) { + VLOG_WARN_RL(&rl, "%s: Failed to delete existing qdisc: %s", + netdev_get_name(netdev_), ovs_strerror(error)); goto exit; } ovs_assert(netdev->tc == NULL); /* Install new qdisc. */ error = new_ops->tc_install(netdev_, details); + if (error) { + VLOG_WARN_RL(&rl, "%s: Failed to install new qdisc: %s", + netdev_get_name(netdev_), ovs_strerror(error)); + } ovs_assert((error == 0) == (netdev->tc != NULL)); } @@ -3419,7 +3658,7 @@ netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop, return ENXIO; } -static int +int netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap) { struct netdev_linux *netdev = netdev_linux_cast(netdev_); @@ -3470,8 +3709,9 @@ netdev_linux_get_block_id(struct netdev *netdev_) netdev_linux_update_via_netlink(netdev); } - /* Only assigning block ids to linux netdevs that are LAG masters. */ - if (netdev->is_lag_master) { + /* Only assigning block ids to linux netdevs that are + * LAG primary members. 
*/ + if (netdev->is_lag_primary) { block_id = netdev->ifindex; } ovs_mutex_unlock(&netdev->mutex); @@ -3639,6 +3879,7 @@ const struct netdev_class netdev_linux_class = { .destruct = netdev_linux_destruct, .get_stats = netdev_linux_get_stats, .get_features = netdev_linux_get_features, + .get_speed = netdev_linux_get_speed, .get_status = netdev_linux_get_status, .get_block_id = netdev_linux_get_block_id, .send = netdev_linux_send, @@ -3655,6 +3896,7 @@ const struct netdev_class netdev_tap_class = { .destruct = netdev_linux_destruct, .get_stats = netdev_tap_get_stats, .get_features = netdev_linux_get_features, + .get_speed = netdev_linux_get_speed, .get_status = netdev_linux_get_status, .send = netdev_linux_send, .rxq_construct = netdev_linux_rxq_construct, @@ -3678,12 +3920,11 @@ const struct netdev_class netdev_internal_class = { #ifdef HAVE_AF_XDP #define NETDEV_AFXDP_CLASS_COMMON \ - .init = netdev_afxdp_init, \ .construct = netdev_afxdp_construct, \ .destruct = netdev_afxdp_destruct, \ .get_stats = netdev_afxdp_get_stats, \ .get_custom_stats = netdev_afxdp_get_custom_stats, \ - .get_status = netdev_linux_get_status, \ + .get_status = netdev_afxdp_get_status, \ .set_config = netdev_afxdp_set_config, \ .get_config = netdev_afxdp_get_config, \ .reconfigure = netdev_afxdp_reconfigure, \ @@ -4331,6 +4572,7 @@ struct netem { uint32_t latency; uint32_t limit; uint32_t loss; + uint32_t jitter; }; static struct netem * @@ -4342,7 +4584,7 @@ netem_get__(const struct netdev *netdev_) static void netem_install__(struct netdev *netdev_, uint32_t latency, - uint32_t limit, uint32_t loss) + uint32_t limit, uint32_t loss, uint32_t jitter) { struct netdev_linux *netdev = netdev_linux_cast(netdev_); struct netem *netem; @@ -4352,13 +4594,14 @@ netem_install__(struct netdev *netdev_, uint32_t latency, netem->latency = latency; netem->limit = limit; netem->loss = loss; + netem->jitter = jitter; netdev->tc = &netem->tc; } static int netem_setup_qdisc__(struct netdev *netdev, uint32_t 
latency, - uint32_t limit, uint32_t loss) + uint32_t limit, uint32_t loss, uint32_t jitter) { struct tc_netem_qopt opt; struct ofpbuf request; @@ -4394,6 +4637,7 @@ netem_setup_qdisc__(struct netdev *netdev, uint32_t latency, } opt.latency = tc_time_to_ticks(latency); + opt.jitter = tc_time_to_ticks(jitter); nl_msg_put_string(&request, TCA_KIND, "netem"); nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt); @@ -4401,9 +4645,10 @@ netem_setup_qdisc__(struct netdev *netdev, uint32_t latency, error = tc_transact(&request, NULL); if (error) { VLOG_WARN_RL(&rl, "failed to replace %s qdisc, " - "latency %u, limit %u, loss %u error %d(%s)", + "latency %u, limit %u, loss %u, jitter %u " + "error %d(%s)", netdev_get_name(netdev), - opt.latency, opt.limit, opt.loss, + opt.latency, opt.limit, opt.loss, opt.jitter, error, ovs_strerror(error)); } return error; @@ -4416,6 +4661,7 @@ netem_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED, netem->latency = smap_get_ullong(details, "latency", 0); netem->limit = smap_get_ullong(details, "limit", 0); netem->loss = smap_get_ullong(details, "loss", 0); + netem->jitter = smap_get_ullong(details, "jitter", 0); if (!netem->limit) { netem->limit = 1000; @@ -4430,9 +4676,10 @@ netem_tc_install(struct netdev *netdev, const struct smap *details) netem_parse_qdisc_details__(netdev, details, &netem); error = netem_setup_qdisc__(netdev, netem.latency, - netem.limit, netem.loss); + netem.limit, netem.loss, netem.jitter); if (!error) { - netem_install__(netdev, netem.latency, netem.limit, netem.loss); + netem_install__(netdev, netem.latency, + netem.limit, netem.loss, netem.jitter); } return error; } @@ -4448,7 +4695,8 @@ netem_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg) error = tc_parse_qdisc(nlmsg, &kind, &nlattr); if (error == 0) { netem = nl_attr_get(nlattr); - netem_install__(netdev, netem->latency, netem->limit, netem->loss); + netem_install__(netdev, netem->latency, + netem->limit, netem->loss, netem->jitter); return 
0; } @@ -4470,6 +4718,7 @@ netem_qdisc_get(const struct netdev *netdev, struct smap *details) smap_add_format(details, "latency", "%u", netem->latency); smap_add_format(details, "limit", "%u", netem->limit); smap_add_format(details, "loss", "%u", netem->loss); + smap_add_format(details, "jitter", "%u", netem->jitter); return 0; } @@ -4479,10 +4728,12 @@ netem_qdisc_set(struct netdev *netdev, const struct smap *details) struct netem netem; netem_parse_qdisc_details__(netdev, details, &netem); - netem_install__(netdev, netem.latency, netem.limit, netem.loss); + netem_install__(netdev, netem.latency, + netem.limit, netem.loss, netem.jitter); netem_get__(netdev)->latency = netem.latency; netem_get__(netdev)->limit = netem.limit; netem_get__(netdev)->loss = netem.loss; + netem_get__(netdev)->jitter = netem.jitter; return 0; } @@ -4504,13 +4755,13 @@ static const struct tc_ops tc_ops_netem = { struct htb { struct tc tc; - unsigned int max_rate; /* In bytes/s. */ + uint64_t max_rate; /* In bytes/s. */ }; struct htb_class { struct tc_queue tc_queue; - unsigned int min_rate; /* In bytes/s. */ - unsigned int max_rate; /* In bytes/s. */ + uint64_t min_rate; /* In bytes/s. */ + uint64_t max_rate; /* In bytes/s. */ unsigned int burst; /* In bytes. */ unsigned int priority; /* Lower values are higher priorities. 
*/ }; @@ -4598,8 +4849,8 @@ htb_setup_class__(struct netdev *netdev, unsigned int handle, if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) { opt.quantum = mtu; } - opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst); - opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst); + opt.buffer = tc_calc_buffer(class->min_rate, mtu, class->burst); + opt.cbuffer = tc_calc_buffer(class->max_rate, mtu, class->burst); opt.prio = class->priority; tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE, @@ -4612,15 +4863,26 @@ htb_setup_class__(struct netdev *netdev, unsigned int handle, nl_msg_put_string(&request, TCA_KIND, "htb"); opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS); + +#ifdef HAVE_TCA_HTB_RATE64 + if (class->min_rate > UINT32_MAX) { + nl_msg_put_u64(&request, TCA_HTB_RATE64, class->min_rate); + } + if (class->max_rate > UINT32_MAX) { + nl_msg_put_u64(&request, TCA_HTB_CEIL64, class->max_rate); + } +#endif nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt); - tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate); - tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil); + + tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate, class->min_rate); + tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil, class->max_rate); nl_msg_end_nested(&request, opt_offset); error = tc_transact(&request, NULL); if (error) { VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, " - "min_rate=%u max_rate=%u burst=%u prio=%u (%s)", + "min_rate=%"PRIu64" max_rate=%"PRIu64" burst=%u prio=%u " + "(%s)", netdev_get_name(netdev), tc_get_major(handle), tc_get_minor(handle), tc_get_major(parent), tc_get_minor(parent), @@ -4640,6 +4902,10 @@ htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class) static const struct nl_policy tca_htb_policy[] = { [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false, .min_len = sizeof(struct tc_htb_opt) }, +#ifdef HAVE_TCA_HTB_RATE64 + [TCA_HTB_RATE64] = { .type = NL_A_U64, .optional = true }, + 
[TCA_HTB_CEIL64] = { .type = NL_A_U64, .optional = true }, +#endif }; struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)]; @@ -4654,7 +4920,15 @@ htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class) htb = nl_attr_get(attrs[TCA_HTB_PARMS]); class->min_rate = htb->rate.rate; class->max_rate = htb->ceil.rate; - class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer); +#ifdef HAVE_TCA_HTB_RATE64 + if (attrs[TCA_HTB_RATE64]) { + class->min_rate = nl_attr_get_u64(attrs[TCA_HTB_RATE64]); + } + if (attrs[TCA_HTB_CEIL64]) { + class->max_rate = nl_attr_get_u64(attrs[TCA_HTB_CEIL64]); + } +#endif + class->burst = tc_ticks_to_bytes(class->min_rate, htb->buffer); class->priority = htb->prio; return 0; } @@ -4685,18 +4959,18 @@ htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id, } static void -htb_parse_qdisc_details__(struct netdev *netdev_, - const struct smap *details, struct htb_class *hc) +htb_parse_qdisc_details__(struct netdev *netdev, const struct smap *details, + struct htb_class *hc) { - struct netdev_linux *netdev = netdev_linux_cast(netdev_); - hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8; if (!hc->max_rate) { - enum netdev_features current; + uint32_t current_speed; + uint32_t max_speed OVS_UNUSED; - netdev_linux_read_features(netdev); - current = !netdev->get_features_error ? netdev->current : 0; - hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8; + netdev_linux_get_speed_locked(netdev_linux_cast(netdev), + &current_speed, &max_speed); + hc->max_rate = current_speed ? 
current_speed / 8 * 1000000ULL + : NETDEV_DEFAULT_BPS / 8; } hc->min_rate = hc->max_rate; hc->burst = 0; @@ -5157,18 +5431,18 @@ hfsc_query_class__(const struct netdev *netdev, unsigned int handle, } static void -hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details, +hfsc_parse_qdisc_details__(struct netdev *netdev, const struct smap *details, struct hfsc_class *class) { - struct netdev_linux *netdev = netdev_linux_cast(netdev_); - uint32_t max_rate = smap_get_ullong(details, "max-rate", 0) / 8; if (!max_rate) { - enum netdev_features current; + uint32_t current_speed; + uint32_t max_speed OVS_UNUSED; - netdev_linux_read_features(netdev); - current = !netdev->get_features_error ? netdev->current : 0; - max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8; + netdev_linux_get_speed_locked(netdev_linux_cast(netdev), + &current_speed, &max_speed); + max_rate = current_speed ? current_speed / 8 * 1000000ULL + : NETDEV_DEFAULT_BPS / 8; } class->min_rate = max_rate; @@ -5643,12 +5917,10 @@ tc_policer_init(struct tc_police *tc_police, uint64_t kbits_rate, * Returns 0 if successful, otherwise a positive errno value. 
*/ static int -tc_add_policer(struct netdev *netdev, uint32_t kbits_rate, - uint32_t kbits_burst, uint32_t kpkts_rate, - uint32_t kpkts_burst) +tc_add_policer(struct netdev *netdev, uint64_t kbits_rate, + uint32_t kbits_burst, uint32_t kpkts_rate, uint32_t kpkts_burst) { size_t basic_offset, police_offset; - struct tc_police tc_police; struct ofpbuf request; struct tcmsg *tcmsg; int error; @@ -5665,9 +5937,9 @@ tc_add_policer(struct netdev *netdev, uint32_t kbits_rate, basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS); police_offset = nl_msg_start_nested(&request, TCA_BASIC_ACT); - tc_policer_init(&tc_police, kbits_rate, kbits_burst); - nl_msg_put_act_police(&request, &tc_police, kpkts_rate * 1000ULL, - kpkts_burst * 1000ULL, TC_ACT_UNSPEC); + nl_msg_put_act_police(&request, 0, kbits_rate, kbits_burst, + kpkts_rate * 1000ULL, kpkts_burst * 1000ULL, + TC_ACT_UNSPEC, false); nl_msg_end_nested(&request, police_offset); nl_msg_end_nested(&request, basic_offset); @@ -5680,20 +5952,16 @@ tc_add_policer(struct netdev *netdev, uint32_t kbits_rate, } int -tc_add_policer_action(uint32_t index, uint32_t kbits_rate, +tc_add_policer_action(uint32_t index, uint64_t kbits_rate, uint32_t kbits_burst, uint32_t pkts_rate, uint32_t pkts_burst, bool update) { - struct tc_police tc_police; struct ofpbuf request; struct tcamsg *tcamsg; size_t offset; int flags; int error; - tc_policer_init(&tc_police, kbits_rate, kbits_burst); - tc_police.index = index; - flags = (update ? 
NLM_F_REPLACE : NLM_F_EXCL) | NLM_F_CREATE; tcamsg = tc_make_action_request(RTM_NEWACTION, flags, &request); if (!tcamsg) { @@ -5701,8 +5969,8 @@ tc_add_policer_action(uint32_t index, uint32_t kbits_rate, } offset = nl_msg_start_nested(&request, TCA_ACT_TAB); - nl_msg_put_act_police(&request, &tc_police, pkts_rate, pkts_burst, - TC_ACT_PIPE); + nl_msg_put_act_police(&request, index, kbits_rate, kbits_burst, pkts_rate, + pkts_burst, TC_ACT_PIPE, true); nl_msg_end_nested(&request, offset); error = tc_transact(&request, NULL); @@ -5718,26 +5986,27 @@ static int tc_update_policer_action_stats(struct ofpbuf *msg, struct ofputil_meter_stats *stats) { + struct ofpbuf b = ofpbuf_const_initializer(msg->data, msg->size); + struct nlmsghdr *nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg); + struct tcamsg *tca = ofpbuf_try_pull(&b, sizeof *tca); struct ovs_flow_stats stats_dropped; struct ovs_flow_stats stats_hw; struct ovs_flow_stats stats_sw; const struct nlattr *act; struct nlattr *prio; - struct tcamsg *tca; int error = 0; if (!stats) { goto exit; } - if (NLMSG_HDRLEN + sizeof *tca > msg->size) { + if (!nlmsg || !tca) { VLOG_ERR_RL(&rl, "Failed to get action stats, size error"); error = EPROTO; goto exit; } - tca = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tca); - act = nl_attr_find(msg, NLMSG_HDRLEN + sizeof *tca, TCA_ACT_TAB); + act = nl_attr_find(&b, 0, TCA_ACT_TAB); if (!act) { VLOG_ERR_RL(&rl, "Failed to get action stats, can't find attribute"); error = EPROTO; @@ -5917,7 +6186,7 @@ read_psched(void) /* Returns the number of bytes that can be transmitted in 'ticks' ticks at a * rate of 'rate' bytes per second. 
*/ static unsigned int -tc_ticks_to_bytes(unsigned int rate, unsigned int ticks) +tc_ticks_to_bytes(uint64_t rate, unsigned int ticks) { read_psched(); return (rate * ticks) / ticks_per_s; @@ -5926,7 +6195,7 @@ tc_ticks_to_bytes(unsigned int rate, unsigned int ticks) /* Returns the number of ticks that it would take to transmit 'size' bytes at a * rate of 'rate' bytes per second. */ static unsigned int -tc_bytes_to_ticks(unsigned int rate, unsigned int size) +tc_bytes_to_ticks(uint64_t rate, unsigned int size) { read_psched(); return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0; @@ -5935,7 +6204,7 @@ tc_bytes_to_ticks(unsigned int rate, unsigned int size) /* Returns the number of bytes that need to be reserved for qdisc buffering at * a transmission rate of 'rate' bytes per second. */ static unsigned int -tc_buffer_per_jiffy(unsigned int rate) +tc_buffer_per_jiffy(uint64_t rate) { read_psched(); return rate / buffer_hz; @@ -6002,20 +6271,26 @@ static int tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep, struct nlattr **options, struct netdev_queue_stats *stats) { + struct ofpbuf b = ofpbuf_const_initializer(msg->data, msg->size); + struct nlmsghdr *nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg); + struct tcmsg *tc = ofpbuf_try_pull(&b, sizeof *tc); static const struct nl_policy tca_policy[] = { [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false }, [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false }, }; struct nlattr *ta[ARRAY_SIZE(tca_policy)]; - if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg), - tca_policy, ta, ARRAY_SIZE(ta))) { + if (!nlmsg || !tc) { + VLOG_ERR_RL(&rl, "failed to parse class message, malformed reply"); + goto error; + } + + if (!nl_policy_parse(&b, 0, tca_policy, ta, ARRAY_SIZE(ta))) { VLOG_WARN_RL(&rl, "failed to parse class message"); goto error; } if (handlep) { - struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc); *handlep = tc->tcm_handle; } @@ -6135,13 +6410,12 @@ 
tc_del_qdisc(struct netdev *netdev_) if (!tcmsg) { return ENODEV; } - tcmsg->tcm_handle = tc_make_handle(1, 0); tcmsg->tcm_parent = TC_H_ROOT; error = tc_transact(&request, NULL); - if (error == EINVAL) { - /* EINVAL probably means that the default qdisc was in use, in which - * case we've accomplished our purpose. */ + if (error == EINVAL || error == ENOENT) { + /* EINVAL or ENOENT probably means that the default qdisc was in use, + * in which case we've accomplished our purpose. */ error = 0; } if (!error && netdev->tc) { @@ -6160,18 +6434,10 @@ getqdisc_is_safe(void) static bool safe = false; if (ovsthread_once_start(&once)) { - struct utsname utsname; - int major, minor; - - if (uname(&utsname) == -1) { - VLOG_WARN("uname failed (%s)", ovs_strerror(errno)); - } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) { - VLOG_WARN("uname reported bad OS release (%s)", utsname.release); - } else if (major < 2 || (major == 2 && minor < 35)) { - VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s", - utsname.release); - } else { + if (ovs_kernel_is_version_or_newer(2, 35)) { safe = true; + } else { + VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel"); } ovsthread_once_done(&once); } @@ -6293,15 +6559,19 @@ tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu) /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */ /* rate->cell_align = 0; */ /* distro headers. */ rate->mpu = ETH_TOTAL_MIN; - rate->rate = Bps; + rate->rate = MIN(UINT32_MAX, Bps); } /* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink * attribute of the specified "type". * + * A 64-bit rate can be provided via 'rate64' in bps. + * If zero, the rate in 'rate' will be used. + * * See tc_calc_cell_log() above for a description of "rtab"s. 
*/ void -tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate) +tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate, + uint64_t rate64) { uint32_t *rtab; unsigned int i; @@ -6312,7 +6582,7 @@ tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate) if (packet_size < rate->mpu) { packet_size = rate->mpu; } - rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size); + rtab[i] = tc_bytes_to_ticks(rate64 ? rate64 : rate->rate, packet_size); } } @@ -6321,7 +6591,7 @@ tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate) * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of * 0 is fine.) */ static int -tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes) +tc_calc_buffer(uint64_t Bps, int mtu, uint64_t burst_bytes) { unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu; return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst)); @@ -6632,7 +6902,7 @@ netdev_linux_update_via_netlink(struct netdev_linux *netdev) changed = true; } if (change->primary && netdev_linux_kind_is_lag(change->primary)) { - netdev->is_lag_master = true; + netdev->is_lag_primary = true; } if (changed) { netdev_change_seq_changed(&netdev->up); @@ -6806,86 +7076,230 @@ netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto) return 0; } +/* Initializes packet 'b' with features enabled in the prepended + * struct virtio_net_hdr. Returns 0 if successful, otherwise a + * positive errno value. 
*/ static int netdev_linux_parse_vnet_hdr(struct dp_packet *b) { struct virtio_net_hdr *vnet = dp_packet_pull(b, sizeof *vnet); - uint16_t l4proto = 0; if (OVS_UNLIKELY(!vnet)) { - return -EINVAL; + return EINVAL; } if (vnet->flags == 0 && vnet->gso_type == VIRTIO_NET_HDR_GSO_NONE) { return 0; } - if (netdev_linux_parse_l2(b, &l4proto)) { - return -EINVAL; - } - if (vnet->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) { - if (l4proto == IPPROTO_TCP) { - dp_packet_hwol_set_csum_tcp(b); - } else if (l4proto == IPPROTO_UDP) { + uint16_t l4proto = 0; + + if (netdev_linux_parse_l2(b, &l4proto)) { + return EINVAL; + } + + if (l4proto == IPPROTO_UDP) { dp_packet_hwol_set_csum_udp(b); - } else if (l4proto == IPPROTO_SCTP) { - dp_packet_hwol_set_csum_sctp(b); } + /* The packet has offloaded checksum. However, there is no + * additional information like the protocol used, so it would + * require to parse the packet here. The checksum starting point + * and offset are going to be verified when the packet headers + * are parsed during miniflow extraction. */ + b->csum_start = (OVS_FORCE uint16_t) vnet->csum_start; + b->csum_offset = (OVS_FORCE uint16_t) vnet->csum_offset; + } else { + b->csum_start = 0; + b->csum_offset = 0; } - if (l4proto && vnet->gso_type != VIRTIO_NET_HDR_GSO_NONE) { - uint8_t allowed_mask = VIRTIO_NET_HDR_GSO_TCPV4 - | VIRTIO_NET_HDR_GSO_TCPV6 - | VIRTIO_NET_HDR_GSO_UDP; - uint8_t type = vnet->gso_type & allowed_mask; + int ret = 0; + switch (vnet->gso_type) { + case VIRTIO_NET_HDR_GSO_TCPV4: + case VIRTIO_NET_HDR_GSO_TCPV6: + dp_packet_set_tso_segsz(b, (OVS_FORCE uint16_t) vnet->gso_size); + dp_packet_hwol_set_tcp_seg(b); + break; - if (type == VIRTIO_NET_HDR_GSO_TCPV4 - || type == VIRTIO_NET_HDR_GSO_TCPV6) { - dp_packet_hwol_set_tcp_seg(b); - } + case VIRTIO_NET_HDR_GSO_UDP: + /* UFO is not supported. 
*/ + VLOG_WARN_RL(&rl, "Received an unsupported packet with UFO enabled."); + ret = ENOTSUP; + break; + + case VIRTIO_NET_HDR_GSO_NONE: + break; + + default: + ret = ENOTSUP; + VLOG_WARN_RL(&rl, "Received an unsupported packet with GSO type: 0x%x", + vnet->gso_type); } - return 0; + return ret; } -static void +/* Prepends struct virtio_net_hdr to packet 'b'. + * Returns 0 if successful, otherwise a positive errno value. + * Returns EMSGSIZE if the packet 'b' cannot be sent over MTU 'mtu'. */ +static int netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu) { - struct virtio_net_hdr *vnet = dp_packet_push_zeros(b, sizeof *vnet); + struct virtio_net_hdr v; + struct virtio_net_hdr *vnet = &v; if (dp_packet_hwol_is_tso(b)) { - uint16_t hdr_len = ((char *)dp_packet_l4(b) - (char *)dp_packet_eth(b)) - + TCP_HEADER_LEN; + uint16_t tso_segsz = dp_packet_get_tso_segsz(b); + struct tcp_header *tcp = dp_packet_l4(b); + struct tcp_header *inner_tcp = dp_packet_inner_l4(b); + if (inner_tcp) { + tcp = inner_tcp; + } + int tcp_hdr_len = TCP_OFFSET(tcp->tcp_ctl) * 4; + int hdr_len = ((char *) tcp - (char *) dp_packet_eth(b)) + + tcp_hdr_len; + int max_packet_len = mtu + ETH_HEADER_LEN + VLAN_HEADER_LEN; + + if (OVS_UNLIKELY((hdr_len + tso_segsz) > max_packet_len)) { + VLOG_WARN_RL(&rl, "Oversized TSO packet. hdr_len: %"PRIu32", " + "gso: %"PRIu16", max length: %"PRIu32".", hdr_len, + tso_segsz, max_packet_len); + return EMSGSIZE; + } vnet->hdr_len = (OVS_FORCE __virtio16)hdr_len; - vnet->gso_size = (OVS_FORCE __virtio16)(mtu - hdr_len); + vnet->gso_size = (OVS_FORCE __virtio16)(tso_segsz); if (dp_packet_hwol_is_ipv4(b)) { vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; - } else { + } else if (dp_packet_hwol_tx_ipv6(b)) { vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; + } else { + VLOG_ERR_RL(&rl, "Unknown gso_type for TSO packet. 
" + "Flags: %#"PRIx64, + (uint64_t) *dp_packet_ol_flags_ptr(b)); + return EINVAL; } - } else { - vnet->flags = VIRTIO_NET_HDR_GSO_NONE; - } + vnet->hdr_len = 0; + vnet->gso_size = 0; + vnet->gso_type = VIRTIO_NET_HDR_GSO_NONE; + } + + bool l4_is_good = dp_packet_l4_checksum_good(b); + + if ((dp_packet_hwol_is_tunnel_vxlan(b) || + dp_packet_hwol_is_tunnel_geneve(b)) && + dp_packet_hwol_tx_l4_checksum(b)) { + /* This condition is needed because dp-packet doesn't currently track + * outer and inner checksum statuses separately. In the case of these + * two tunnel types we can end up setting outer l4 as good but still + * need to complete the inner l4. */ + l4_is_good = !(dp_packet_hwol_l4_is_tcp(b) || + dp_packet_hwol_l4_is_udp(b)); + } + + if (l4_is_good) { + /* The packet has good L4 checksum. No need to validate again. */ + vnet->csum_start = vnet->csum_offset = (OVS_FORCE __virtio16) 0; + vnet->flags = VIRTIO_NET_HDR_F_DATA_VALID; + + /* It is possible that L4 is good but the IPv4 checksum isn't + * complete. For example in the case of UDP encapsulation of an ARP + * packet where the UDP checksum is 0. */ + if (dp_packet_hwol_l3_csum_ipv4_ol(b)) { + dp_packet_ip_set_header_csum(b, false); + } + } else if (dp_packet_hwol_tx_l4_checksum(b)) { + /* The csum calculation is offloaded. */ + if (dp_packet_hwol_l4_is_tcp(b)) { + /* Virtual I/O Device (VIRTIO) Version 1.1 + * 5.1.6.2 Packet Transmission + * If the driver negotiated VIRTIO_NET_F_CSUM, it can skip + * checksumming the packet: + * - flags has the VIRTIO_NET_HDR_F_NEEDS_CSUM set, + * - csum_start is set to the offset within the packet + * to begin checksumming, and + * - csum_offset indicates how many bytes after the + * csum_start the new (16 bit ones complement) checksum + * is placed by the device. + * The TCP checksum field in the packet is set to the sum of + * the TCP pseudo header, so that replacing it by the ones + * complement checksum of the TCP header and body will give + * the correct result. 
*/ + void *l3_off = dp_packet_inner_l3(b); + void *l4_off = dp_packet_inner_l4(b); + + if (!l3_off || !l4_off) { + l3_off = dp_packet_l3(b); + l4_off = dp_packet_l4(b); + } - if (dp_packet_hwol_l4_mask(b)) { - vnet->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; - vnet->csum_start = (OVS_FORCE __virtio16)((char *)dp_packet_l4(b) - - (char *)dp_packet_eth(b)); + struct tcp_header *tcp_hdr = l4_off; + ovs_be16 csum = 0; + if (dp_packet_hwol_is_ipv4(b)) { + const struct ip_header *ip_hdr = l3_off; + csum = ~csum_finish(packet_csum_pseudoheader(ip_hdr)); + } else if (dp_packet_hwol_tx_ipv6(b)) { + const struct ovs_16aligned_ip6_hdr *ip6_hdr = l3_off; + csum = ~csum_finish(packet_csum_pseudoheader6(ip6_hdr)); + } - if (dp_packet_hwol_l4_is_tcp(b)) { + tcp_hdr->tcp_csum = csum; + vnet->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; + vnet->csum_start = (OVS_FORCE __virtio16) ((char *) l4_off - + (char *) dp_packet_data(b)); vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof( struct tcp_header, tcp_csum); } else if (dp_packet_hwol_l4_is_udp(b)) { + /* Favour the inner packet when indicating checksum offsets. 
*/ + void *l3_off = dp_packet_inner_l3(b); + void *l4_off = dp_packet_inner_l4(b); + + if (!l3_off || !l4_off) { + l3_off = dp_packet_l3(b); + l4_off = dp_packet_l4(b); + } + struct udp_header *udp_hdr = l4_off; + + ovs_be16 csum = 0; + + if (dp_packet_hwol_is_ipv4(b)) { + const struct ip_header *ip_hdr = l3_off; + csum = ~csum_finish(packet_csum_pseudoheader(ip_hdr)); + } else if (dp_packet_hwol_tx_ipv6(b)) { + const struct ovs_16aligned_ip6_hdr *ip6_hdr = l4_off; + csum = ~csum_finish(packet_csum_pseudoheader6(ip6_hdr)); + } + + udp_hdr->udp_csum = csum; + vnet->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; + vnet->csum_start = (OVS_FORCE __virtio16) ((char *) udp_hdr - + (char *) dp_packet_data(b));; vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof( struct udp_header, udp_csum); } else if (dp_packet_hwol_l4_is_sctp(b)) { - vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof( - struct sctp_header, sctp_csum); + /* The Linux kernel networking stack only supports csum_start + * and csum_offset when SCTP GSO is enabled. See kernel's + * skb_csum_hwoffload_help(). Currently there is no SCTP + * segmentation offload support in OVS. */ + vnet->csum_start = vnet->csum_offset = (OVS_FORCE __virtio16) 0; + vnet->flags = 0; } else { - VLOG_WARN_RL(&rl, "Unsupported L4 protocol"); + /* This should only happen when DP_PACKET_OL_TX_L4_MASK includes + * a new flag that is not covered in above checks. */ + VLOG_WARN_RL(&rl, "Unsupported L4 checksum offload. " + "Flags: %"PRIu64, + (uint64_t)*dp_packet_ol_flags_ptr(b)); + vnet->csum_start = vnet->csum_offset = (OVS_FORCE __virtio16) 0; + vnet->flags = 0; } + } else { + /* Packet L4 csum is unknown. 
*/ + vnet->csum_start = vnet->csum_offset = (OVS_FORCE __virtio16) 0; + vnet->flags = 0; } + + dp_packet_push(b, vnet, sizeof *vnet); + return 0; } diff --git a/lib/netdev-linux.h b/lib/netdev-linux.h index 9a416ce505c..ec19b0dedc4 100644 --- a/lib/netdev-linux.h +++ b/lib/netdev-linux.h @@ -29,7 +29,7 @@ struct netdev; int netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag, const char *flag_name, bool enable); int linux_get_ifindex(const char *netdev_name); -int tc_add_policer_action(uint32_t index, uint32_t kbits_rate, +int tc_add_policer_action(uint32_t index, uint64_t kbits_rate, uint32_t kbits_burst, uint32_t pkts_rate, uint32_t pkts_burst, bool update); int tc_del_policer_action(uint32_t index, struct ofputil_meter_stats *stats); diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c index b89dfdd52a8..16c56608d87 100644 --- a/lib/netdev-native-tnl.c +++ b/lib/netdev-native-tnl.c @@ -43,6 +43,7 @@ #include "seq.h" #include "unaligned.h" #include "unixctl.h" +#include "util.h" #include "openvswitch/vlog.h" VLOG_DEFINE_THIS_MODULE(native_tnl); @@ -88,7 +89,9 @@ netdev_tnl_ip_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl, ovs_be32 ip_src, ip_dst; - if (OVS_UNLIKELY(!dp_packet_ip_checksum_valid(packet))) { + /* A packet coming from a network device might have the + * csum already checked. In this case, skip the check. */ + if (OVS_UNLIKELY(!dp_packet_hwol_l3_csum_ipv4_ol(packet))) { if (csum(ip, IP_IHL(ip->ip_ihl_ver) * 4)) { VLOG_WARN_RL(&err_rl, "ip packet has invalid checksum"); return NULL; @@ -142,12 +145,13 @@ netdev_tnl_ip_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl, * * This function sets the IP header's ip_tot_len field (which should be zeroed * as part of 'header') and puts its value into '*ip_tot_size' as well. Also - * updates IP header checksum, as well as the l3 and l4 offsets in 'packet'. + * updates IP header checksum if not offloaded, as well as the l3 and l4 + * offsets in the 'packet'. 
* * Return pointer to the L4 header added to 'packet'. */ void * -netdev_tnl_push_ip_header(struct dp_packet *packet, - const void *header, int size, int *ip_tot_size) +netdev_tnl_push_ip_header(struct dp_packet *packet, const void *header, + int size, int *ip_tot_size, ovs_be32 ipv6_label) { struct eth_header *eth; struct ip_header *ip; @@ -166,12 +170,32 @@ netdev_tnl_push_ip_header(struct dp_packet *packet, ip6 = netdev_tnl_ipv6_hdr(eth); *ip_tot_size -= IPV6_HEADER_LEN; ip6->ip6_plen = htons(*ip_tot_size); + packet_set_ipv6_flow_label(&ip6->ip6_flow, ipv6_label); packet->l4_ofs = dp_packet_size(packet) - *ip_tot_size; + + if (dp_packet_hwol_is_tunnel_geneve(packet) || + dp_packet_hwol_is_tunnel_vxlan(packet)) { + dp_packet_hwol_set_tx_outer_ipv6(packet); + } else { + dp_packet_hwol_set_tx_ipv6(packet); + } + + dp_packet_ol_reset_ip_csum_good(packet); return ip6 + 1; } else { ip = netdev_tnl_ip_hdr(eth); ip->ip_tot_len = htons(*ip_tot_size); - ip->ip_csum = recalc_csum16(ip->ip_csum, 0, ip->ip_tot_len); + /* Postpone checksum to when the packet is pushed to the port. 
*/ + if (dp_packet_hwol_is_tunnel_geneve(packet) || + dp_packet_hwol_is_tunnel_vxlan(packet)) { + dp_packet_hwol_set_tx_outer_ipv4(packet); + dp_packet_hwol_set_tx_outer_ipv4_csum(packet); + } else { + dp_packet_hwol_set_tx_ipv4(packet); + dp_packet_hwol_set_tx_ip_csum(packet); + } + + dp_packet_ol_reset_ip_csum_good(packet); *ip_tot_size -= IP_HEADER_LEN; packet->l4_ofs = dp_packet_size(packet) - *ip_tot_size; return ip + 1; @@ -190,7 +214,8 @@ udp_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl, } if (udp->udp_csum) { - if (OVS_UNLIKELY(!dp_packet_l4_checksum_valid(packet))) { + if (OVS_LIKELY(!dp_packet_ol_l4_csum_partial(packet)) && + OVS_UNLIKELY(!dp_packet_l4_checksum_good(packet))) { uint32_t csum; if (netdev_tnl_is_header_ipv6(dp_packet_data(packet))) { csum = packet_csum_pseudoheader6(dp_packet_l3(packet)); @@ -216,24 +241,30 @@ udp_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl, } static void -netdev_tnl_calc_udp_csum(struct udp_header *udp, struct dp_packet *packet, - int ip_tot_size) +dp_packet_tnl_ol_process(struct dp_packet *packet, + const struct ovs_action_push_tnl *data) { - uint32_t csum; + struct ip_header *ip = NULL; - if (netdev_tnl_is_header_ipv6(dp_packet_data(packet))) { - csum = packet_csum_pseudoheader6(netdev_tnl_ipv6_hdr( - dp_packet_data(packet))); - } else { - csum = packet_csum_pseudoheader(netdev_tnl_ip_hdr( - dp_packet_data(packet))); - } + if (dp_packet_hwol_l4_mask(packet)) { + ip = dp_packet_l3(packet); - csum = csum_continue(csum, udp, ip_tot_size); - udp->udp_csum = csum_finish(csum); + if (data->tnl_type == OVS_VPORT_TYPE_GENEVE || + data->tnl_type == OVS_VPORT_TYPE_VXLAN) { - if (!udp->udp_csum) { - udp->udp_csum = htons(0xffff); + if (IP_VER(ip->ip_ihl_ver) == 4) { + dp_packet_hwol_set_tx_ipv4(packet); + dp_packet_hwol_tx_ip_csum(packet); + } else if (IP_VER(ip->ip_ihl_ver) == 6) { + dp_packet_hwol_set_tx_ipv6(packet); + } + } + } + + if (data->tnl_type == OVS_VPORT_TYPE_GENEVE) { + 
dp_packet_hwol_set_tunnel_geneve(packet); + } else if (data->tnl_type == OVS_VPORT_TYPE_VXLAN) { + dp_packet_hwol_set_tunnel_vxlan(packet); } } @@ -242,17 +273,40 @@ netdev_tnl_push_udp_header(const struct netdev *netdev OVS_UNUSED, struct dp_packet *packet, const struct ovs_action_push_tnl *data) { + uint16_t l3_ofs = packet->l3_ofs; + uint16_t l4_ofs = packet->l4_ofs; struct udp_header *udp; int ip_tot_size; - udp = netdev_tnl_push_ip_header(packet, data->header, data->header_len, &ip_tot_size); + dp_packet_tnl_ol_process(packet, data); + udp = netdev_tnl_push_ip_header(packet, data->header, data->header_len, + &ip_tot_size, 0); /* set udp src port */ udp->udp_src = netdev_tnl_get_src_port(packet); udp->udp_len = htons(ip_tot_size); if (udp->udp_csum) { - netdev_tnl_calc_udp_csum(udp, packet, ip_tot_size); + dp_packet_ol_reset_l4_csum_good(packet); + if (dp_packet_hwol_is_tunnel_geneve(packet) || + dp_packet_hwol_is_tunnel_vxlan(packet)) { + dp_packet_hwol_set_outer_udp_csum(packet); + } else { + dp_packet_hwol_set_csum_udp(packet); + } + } + + if (packet->csum_start && packet->csum_offset) { + dp_packet_ol_set_l4_csum_partial(packet); + } else if (!udp->udp_csum) { + dp_packet_ol_set_l4_csum_good(packet); + } + + if (l3_ofs != UINT16_MAX) { + packet->inner_l3_ofs = l3_ofs + data->header_len; + } + if (l4_ofs != UINT16_MAX) { + packet->inner_l4_ofs = l4_ofs + data->header_len; } } @@ -276,7 +330,7 @@ eth_build_header(struct ovs_action_push_tnl *data, void * netdev_tnl_ip_build_header(struct ovs_action_push_tnl *data, const struct netdev_tnl_build_header_params *params, - uint8_t next_proto) + uint8_t next_proto, ovs_be32 ipv6_label) { void *l3; @@ -297,8 +351,8 @@ netdev_tnl_ip_build_header(struct ovs_action_push_tnl *data, ip->ip_frag_off = (params->flow->tunnel.flags & FLOW_TNL_F_DONT_FRAGMENT) ? htons(IP_DF) : 0; - /* Checksum has already been zeroed by eth_build_header. 
*/ - ip->ip_csum = csum(ip, sizeof *ip); + /* The checksum will be calculated when the headers are pushed + * to the packet if offloading is not enabled. */ data->header_len += IP_HEADER_LEN; return ip + 1; @@ -308,7 +362,8 @@ netdev_tnl_ip_build_header(struct ovs_action_push_tnl *data, ip6 = (struct ovs_16aligned_ip6_hdr *) l3; put_16aligned_be32(&ip6->ip6_flow, htonl(6 << 28) | - htonl(params->flow->tunnel.ip_tos << 20)); + htonl(params->flow->tunnel.ip_tos << 20) | + (ipv6_label & htonl(IPV6_LABEL_MASK))); ip6->ip6_hlim = params->flow->tunnel.ip_ttl; ip6->ip6_nxt = next_proto; memcpy(&ip6->ip6_src, params->s_ip, sizeof(ovs_be32[4])); @@ -320,16 +375,16 @@ netdev_tnl_ip_build_header(struct ovs_action_push_tnl *data, } static void * -udp_build_header(struct netdev_tunnel_config *tnl_cfg, +udp_build_header(const struct netdev_tunnel_config *tnl_cfg, struct ovs_action_push_tnl *data, const struct netdev_tnl_build_header_params *params) { struct udp_header *udp; - udp = netdev_tnl_ip_build_header(data, params, IPPROTO_UDP); + udp = netdev_tnl_ip_build_header(data, params, IPPROTO_UDP, 0); udp->udp_dst = tnl_cfg->dst_port; - if (params->is_ipv6 || params->flow->tunnel.flags & FLOW_TNL_F_CSUM) { + if (params->flow->tunnel.flags & FLOW_TNL_F_CSUM) { /* Write a value in now to mark that we should compute the checksum * later. 0xffff is handy because it is transparent to the * calculation. */ @@ -421,11 +476,14 @@ parse_gre_header(struct dp_packet *packet, struct dp_packet * netdev_gre_pop_header(struct dp_packet *packet) { + const void *data_dp = dp_packet_data(packet); struct pkt_metadata *md = &packet->md; struct flow_tnl *tnl = &md->tunnel; int hlen = sizeof(struct eth_header) + 4; - hlen += netdev_tnl_is_header_ipv6(dp_packet_data(packet)) ? + ovs_assert(data_dp); + + hlen += netdev_tnl_is_header_ipv6(data_dp) ? 
IPV6_HEADER_LEN : IP_HEADER_LEN; pkt_metadata_init_tnl(md); @@ -452,11 +510,11 @@ netdev_gre_push_header(const struct netdev *netdev, const struct ovs_action_push_tnl *data) { struct netdev_vport *dev = netdev_vport_cast(netdev); - struct netdev_tunnel_config *tnl_cfg; struct gre_base_hdr *greh; int ip_tot_size; - greh = netdev_tnl_push_ip_header(packet, data->header, data->header_len, &ip_tot_size); + greh = netdev_tnl_push_ip_header(packet, data->header, data->header_len, + &ip_tot_size, 0); if (greh->flags & htons(GRE_CSUM)) { ovs_be16 *csum_opt = (ovs_be16 *) (greh + 1); @@ -468,8 +526,7 @@ netdev_gre_push_header(const struct netdev *netdev, int seq_ofs = gre_header_len(greh->flags) - 4; ovs_16aligned_be32 *seq_opt = ALIGNED_CAST(ovs_16aligned_be32 *, (char *)greh + seq_ofs); - tnl_cfg = &dev->tnl_cfg; - put_16aligned_be32(seq_opt, htonl(tnl_cfg->seqno++)); + put_16aligned_be32(seq_opt, htonl(atomic_count_inc(&dev->gre_seqno))); } } @@ -478,25 +535,19 @@ netdev_gre_build_header(const struct netdev *netdev, struct ovs_action_push_tnl *data, const struct netdev_tnl_build_header_params *params) { - struct netdev_vport *dev = netdev_vport_cast(netdev); - struct netdev_tunnel_config *tnl_cfg; + const struct netdev_tunnel_config *tnl_cfg; struct gre_base_hdr *greh; ovs_16aligned_be32 *options; unsigned int hlen; - /* XXX: RCUfy tnl_cfg. 
*/ - ovs_mutex_lock(&dev->mutex); - tnl_cfg = &dev->tnl_cfg; - - greh = netdev_tnl_ip_build_header(data, params, IPPROTO_GRE); + greh = netdev_tnl_ip_build_header(data, params, IPPROTO_GRE, 0); if (params->flow->packet_type == htonl(PT_ETH)) { greh->protocol = htons(ETH_TYPE_TEB); } else if (pt_ns(params->flow->packet_type) == OFPHTN_ETHERTYPE) { greh->protocol = pt_ns_type_be(params->flow->packet_type); } else { - ovs_mutex_unlock(&dev->mutex); - return 1; + return EINVAL; } greh->flags = 0; @@ -507,6 +558,8 @@ netdev_gre_build_header(const struct netdev *netdev, options++; } + tnl_cfg = netdev_get_tunnel_config(netdev); + if (tnl_cfg->out_key_present) { greh->flags |= htons(GRE_KEY); put_16aligned_be32(options, be64_to_be32(params->flow->tunnel.tun_id)); @@ -519,8 +572,6 @@ netdev_gre_build_header(const struct netdev *netdev, options++; } - ovs_mutex_unlock(&dev->mutex); - hlen = (uint8_t *) options - (uint8_t *) greh; data->header_len += hlen; @@ -605,19 +656,17 @@ netdev_erspan_push_header(const struct netdev *netdev, const struct ovs_action_push_tnl *data) { struct netdev_vport *dev = netdev_vport_cast(netdev); - struct netdev_tunnel_config *tnl_cfg; struct erspan_base_hdr *ersh; struct gre_base_hdr *greh; struct erspan_md2 *md2; int ip_tot_size; - greh = netdev_tnl_push_ip_header(packet, data->header, - data->header_len, &ip_tot_size); + greh = netdev_tnl_push_ip_header(packet, data->header, data->header_len, + &ip_tot_size, 0); /* update GRE seqno */ - tnl_cfg = &dev->tnl_cfg; ovs_16aligned_be32 *seqno = (ovs_16aligned_be32 *) (greh + 1); - put_16aligned_be32(seqno, htonl(tnl_cfg->seqno++)); + put_16aligned_be32(seqno, htonl(atomic_count_inc(&dev->gre_seqno))); /* update v2 timestamp */ if (greh->protocol == htons(ETH_TYPE_ERSPAN2)) { @@ -632,8 +681,7 @@ netdev_erspan_build_header(const struct netdev *netdev, struct ovs_action_push_tnl *data, const struct netdev_tnl_build_header_params *params) { - struct netdev_vport *dev = netdev_vport_cast(netdev); - 
struct netdev_tunnel_config *tnl_cfg; + const struct netdev_tunnel_config *tnl_cfg; struct gre_base_hdr *greh; struct erspan_base_hdr *ersh; unsigned int hlen; @@ -641,21 +689,19 @@ netdev_erspan_build_header(const struct netdev *netdev, int erspan_ver; uint16_t sid; - /* XXX: RCUfy tnl_cfg. */ - ovs_mutex_lock(&dev->mutex); - tnl_cfg = &dev->tnl_cfg; - greh = netdev_tnl_ip_build_header(data, params, IPPROTO_GRE); + greh = netdev_tnl_ip_build_header(data, params, IPPROTO_GRE, 0); ersh = ERSPAN_HDR(greh); tun_id = ntohl(be64_to_be32(params->flow->tunnel.tun_id)); /* ERSPAN only has 10-bit session ID */ if (tun_id & ~ERSPAN_SID_MASK) { - ovs_mutex_unlock(&dev->mutex); - return 1; + return EINVAL; } else { sid = (uint16_t) tun_id; } + tnl_cfg = netdev_get_tunnel_config(netdev); + if (tnl_cfg->erspan_ver_flow) { erspan_ver = params->flow->tunnel.erspan_ver; } else { @@ -702,12 +748,9 @@ netdev_erspan_build_header(const struct netdev *netdev, hlen = ERSPAN_GREHDR_LEN + sizeof *ersh + ERSPAN_V2_MDSIZE; } else { VLOG_WARN_RL(&err_rl, "ERSPAN version error %d", tnl_cfg->erspan_ver); - ovs_mutex_unlock(&dev->mutex); - return 1; + return EINVAL; } - ovs_mutex_unlock(&dev->mutex); - data->header_len += hlen; if (params->is_ipv6) { @@ -786,25 +829,25 @@ netdev_gtpu_push_header(const struct netdev *netdev, const struct ovs_action_push_tnl *data) { struct netdev_vport *dev = netdev_vport_cast(netdev); - struct netdev_tunnel_config *tnl_cfg; struct udp_header *udp; struct gtpuhdr *gtpuh; int ip_tot_size; unsigned int payload_len; payload_len = dp_packet_size(packet); - udp = netdev_tnl_push_ip_header(packet, data->header, - data->header_len, &ip_tot_size); + udp = netdev_tnl_push_ip_header(packet, data->header, data->header_len, + &ip_tot_size, 0); udp->udp_src = netdev_tnl_get_src_port(packet); udp->udp_len = htons(ip_tot_size); - netdev_tnl_calc_udp_csum(udp, packet, ip_tot_size); + /* Postpone checksum to the egress netdev. 
*/ + dp_packet_hwol_set_csum_udp(packet); + dp_packet_ol_reset_l4_csum_good(packet); gtpuh = ALIGNED_CAST(struct gtpuhdr *, udp + 1); - tnl_cfg = &dev->tnl_cfg; - if (tnl_cfg->set_seq) { + if (gtpuh->md.flags & GTPU_S_MASK) { ovs_be16 *seqno = ALIGNED_CAST(ovs_be16 *, gtpuh + 1); - *seqno = htons(tnl_cfg->seqno++); + *seqno = htons(atomic_count_inc(&dev->gre_seqno)); payload_len += sizeof(struct gtpuhdr_opt); } gtpuh->len = htons(payload_len); @@ -815,13 +858,12 @@ netdev_gtpu_build_header(const struct netdev *netdev, struct ovs_action_push_tnl *data, const struct netdev_tnl_build_header_params *params) { - struct netdev_vport *dev = netdev_vport_cast(netdev); - struct netdev_tunnel_config *tnl_cfg; + const struct netdev_tunnel_config *tnl_cfg; struct gtpuhdr *gtph; unsigned int gtpu_hlen; - ovs_mutex_lock(&dev->mutex); - tnl_cfg = &dev->tnl_cfg; + tnl_cfg = netdev_get_tunnel_config(netdev); + gtph = udp_build_header(tnl_cfg, data, params); /* Set to default if not set in flow. */ @@ -837,7 +879,6 @@ netdev_gtpu_build_header(const struct netdev *netdev, gtph->md.flags |= GTPU_S_MASK; gtpu_hlen += sizeof(struct gtpuhdr_opt); } - ovs_mutex_unlock(&dev->mutex); data->header_len += gtpu_hlen; data->tnl_type = OVS_VPORT_TYPE_GTPU; @@ -845,6 +886,159 @@ netdev_gtpu_build_header(const struct netdev *netdev, return 0; } +int +netdev_srv6_build_header(const struct netdev *netdev, + struct ovs_action_push_tnl *data, + const struct netdev_tnl_build_header_params *params) +{ + const struct netdev_tunnel_config *tnl_cfg; + union ovs_16aligned_in6_addr *s; + const struct in6_addr *segs; + struct srv6_base_hdr *srh; + ovs_be16 dl_type; + int nr_segs; + int i; + + tnl_cfg = netdev_get_tunnel_config(netdev); + if (tnl_cfg->srv6_num_segs) { + nr_segs = tnl_cfg->srv6_num_segs; + segs = tnl_cfg->srv6_segs; + } else { + /* + * If explicit segment list setting is omitted, tunnel destination + * is considered to be the first segment list. 
+ */ + nr_segs = 1; + segs = &params->flow->tunnel.ipv6_dst; + } + + if (!ipv6_addr_equals(&segs[0], &params->flow->tunnel.ipv6_dst)) { + return EINVAL; + } + + /* Writes the netdev_srv6_flowlabel enum value to the ipv6 + * flowlabel field. It must later be replaced by a valid value + * in the header push. */ + srh = netdev_tnl_ip_build_header(data, params, IPPROTO_ROUTING, + htonl(tnl_cfg->srv6_flowlabel)); + + srh->rt_hdr.segments_left = nr_segs - 1; + srh->rt_hdr.type = IPV6_SRCRT_TYPE_4; + srh->rt_hdr.hdrlen = 2 * nr_segs; + srh->last_entry = nr_segs - 1; + srh->flags = 0; + srh->tag = 0; + + dl_type = params->flow->dl_type; + if (dl_type == htons(ETH_TYPE_IP)) { + srh->rt_hdr.nexthdr = IPPROTO_IPIP; + } else if (dl_type == htons(ETH_TYPE_IPV6)) { + srh->rt_hdr.nexthdr = IPPROTO_IPV6; + } else { + return EOPNOTSUPP; + } + + s = (union ovs_16aligned_in6_addr *) (srh + 1); + for (i = 0; i < nr_segs; i++) { + /* Segment list is written to the header in reverse order. */ + memcpy(s, &segs[nr_segs - i - 1], sizeof *s); + s++; + } + + data->header_len += sizeof *srh + 8 * srh->rt_hdr.hdrlen; + data->tnl_type = OVS_VPORT_TYPE_SRV6; + + return 0; +} + +void +netdev_srv6_push_header(const struct netdev *netdev OVS_UNUSED, + struct dp_packet *packet, + const struct ovs_action_push_tnl *data) +{ + struct ovs_16aligned_ip6_hdr *inner_ip6, *outer_ip6; + enum netdev_srv6_flowlabel srv6_flowlabel; + ovs_be32 ipv6_label = 0; + int ip_tot_size; + uint32_t flow; + + inner_ip6 = dp_packet_l3(packet); + outer_ip6 = netdev_tnl_ipv6_hdr((void *) data->header); + srv6_flowlabel = ntohl(get_16aligned_be32(&outer_ip6->ip6_flow)) & + IPV6_LABEL_MASK; + + switch (srv6_flowlabel) { + case SRV6_FLOWLABEL_COPY: + flow = ntohl(get_16aligned_be32(&inner_ip6->ip6_flow)); + ipv6_label = (flow >> 28) == 6 ?
htonl(flow & IPV6_LABEL_MASK) : 0; + break; + + case SRV6_FLOWLABEL_ZERO: + ipv6_label = 0; + break; + + case SRV6_FLOWLABEL_COMPUTE: + ipv6_label = htonl(dp_packet_get_rss_hash(packet) & IPV6_LABEL_MASK); + break; + } + + netdev_tnl_push_ip_header(packet, data->header, + data->header_len, &ip_tot_size, ipv6_label); +} + +struct dp_packet * +netdev_srv6_pop_header(struct dp_packet *packet) +{ + const struct ovs_16aligned_ip6_hdr *nh = dp_packet_l3(packet); + size_t size = dp_packet_l3_size(packet) - IPV6_HEADER_LEN; + struct pkt_metadata *md = &packet->md; + struct flow_tnl *tnl = &md->tunnel; + const struct ip6_rt_hdr *rt_hdr; + uint8_t nw_proto = nh->ip6_nxt; + const void *data = nh + 1; + uint8_t nw_frag = 0; + unsigned int hlen; + + /* + * Verifies that the routing header is present in the IPv6 + * extension headers and that its type is SRv6. + */ + if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag, + NULL, &rt_hdr)) { + goto err; + } + + if (!rt_hdr || rt_hdr->type != IPV6_SRCRT_TYPE_4) { + goto err; + } + + if (rt_hdr->segments_left > 0) { + VLOG_WARN_RL(&err_rl, "invalid srv6 segments_left=%d\n", + rt_hdr->segments_left); + goto err; + } + + if (rt_hdr->nexthdr == IPPROTO_IPIP) { + packet->packet_type = htonl(PT_IPV4); + } else if (rt_hdr->nexthdr == IPPROTO_IPV6) { + packet->packet_type = htonl(PT_IPV6); + } else { + goto err; + } + + pkt_metadata_init_tnl(md); + if (!netdev_tnl_ip_extract_tnl_md(packet, tnl, &hlen)) { + goto err; + } + + dp_packet_reset_packet(packet, hlen); + + return packet; +err: + dp_packet_delete(packet); + return NULL; +} + struct dp_packet * netdev_vxlan_pop_header(struct dp_packet *packet) { @@ -920,13 +1114,10 @@ netdev_vxlan_build_header(const struct netdev *netdev, struct ovs_action_push_tnl *data, const struct netdev_tnl_build_header_params *params) { - struct netdev_vport *dev = netdev_vport_cast(netdev); - struct netdev_tunnel_config *tnl_cfg; + const struct netdev_tunnel_config *tnl_cfg; struct vxlanhdr *vxh; - /* 
XXX: RCUfy tnl_cfg. */ - ovs_mutex_lock(&dev->mutex); - tnl_cfg = &dev->tnl_cfg; + tnl_cfg = netdev_get_tunnel_config(netdev); vxh = udp_build_header(tnl_cfg, data, params); @@ -951,10 +1142,10 @@ netdev_vxlan_build_header(const struct netdev *netdev, vxh->vx_gpe.next_protocol = VXLAN_GPE_NP_ETHERNET; break; default: - goto drop; + return EINVAL; } } else { - goto drop; + return EINVAL; } } else { put_16aligned_be32(&vxh->vx_flags, htonl(VXLAN_FLAGS)); @@ -962,14 +1153,9 @@ netdev_vxlan_build_header(const struct netdev *netdev, htonl(ntohll(params->flow->tunnel.tun_id) << 8)); } - ovs_mutex_unlock(&dev->mutex); data->header_len += sizeof *vxh; data->tnl_type = OVS_VPORT_TYPE_VXLAN; return 0; - -drop: - ovs_mutex_unlock(&dev->mutex); - return 1; } struct dp_packet * @@ -1033,22 +1219,14 @@ netdev_geneve_build_header(const struct netdev *netdev, struct ovs_action_push_tnl *data, const struct netdev_tnl_build_header_params *params) { - struct netdev_vport *dev = netdev_vport_cast(netdev); - struct netdev_tunnel_config *tnl_cfg; struct genevehdr *gnh; int opt_len; bool crit_opt; - /* XXX: RCUfy tnl_cfg. 
*/ - ovs_mutex_lock(&dev->mutex); - tnl_cfg = &dev->tnl_cfg; - - gnh = udp_build_header(tnl_cfg, data, params); + gnh = udp_build_header(netdev_get_tunnel_config(netdev), data, params); put_16aligned_be32(&gnh->vni, htonl(ntohll(params->flow->tunnel.tun_id) << 8)); - ovs_mutex_unlock(&dev->mutex); - opt_len = tun_metadata_to_geneve_header(&params->flow->tunnel, gnh->options, &crit_opt); diff --git a/lib/netdev-native-tnl.h b/lib/netdev-native-tnl.h index 22ae2ce5369..eb55dd0417a 100644 --- a/lib/netdev-native-tnl.h +++ b/lib/netdev-native-tnl.h @@ -65,6 +65,16 @@ netdev_gtpu_build_header(const struct netdev *netdev, struct ovs_action_push_tnl *data, const struct netdev_tnl_build_header_params *p); +struct dp_packet *netdev_srv6_pop_header(struct dp_packet *); + +void netdev_srv6_push_header(const struct netdev *, + struct dp_packet *, + const struct ovs_action_push_tnl *); + +int netdev_srv6_build_header(const struct netdev *, + struct ovs_action_push_tnl *, + const struct netdev_tnl_build_header_params *); + void netdev_tnl_push_udp_header(const struct netdev *netdev, struct dp_packet *packet, @@ -108,7 +118,7 @@ netdev_tnl_ipv6_hdr(void *eth) void * netdev_tnl_ip_build_header(struct ovs_action_push_tnl *data, const struct netdev_tnl_build_header_params *params, - uint8_t next_proto); + uint8_t next_proto, ovs_be32 ipv6_label); extern uint16_t tnl_udp_port_min; extern uint16_t tnl_udp_port_max; @@ -128,8 +138,8 @@ void * netdev_tnl_ip_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl, unsigned int *hlen); void * -netdev_tnl_push_ip_header(struct dp_packet *packet, - const void *header, int size, int *ip_tot_size); +netdev_tnl_push_ip_header(struct dp_packet *packet, const void *header, + int size, int *ip_tot_size, ovs_be32 ipv6_label); void netdev_tnl_egress_port_range(struct unixctl_conn *conn, int argc, const char *argv[], void *aux OVS_UNUSED); diff --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c index cceefbc5075..623005b1cb9 100644 ---
a/lib/netdev-offload-dpdk.c +++ b/lib/netdev-offload-dpdk.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -734,14 +735,15 @@ dump_flow_action(struct ds *s, struct ds *s_extra, ds_put_cstr(s, "rss / "); } else if (actions->type == RTE_FLOW_ACTION_TYPE_COUNT) { ds_put_cstr(s, "count / "); - } else if (actions->type == RTE_FLOW_ACTION_TYPE_PORT_ID) { - const struct rte_flow_action_port_id *port_id = actions->conf; + } else if (actions->type == RTE_FLOW_ACTION_TYPE_REPRESENTED_PORT) { + const struct rte_flow_action_ethdev *ethdev = actions->conf; - ds_put_cstr(s, "port_id "); - if (port_id) { - ds_put_format(s, "original %d id %d ", - port_id->original, port_id->id); + ds_put_cstr(s, "represented_port "); + + if (ethdev) { + ds_put_format(s, "ethdev_port_id %d ", ethdev->port_id); } + ds_put_cstr(s, "/ "); } else if (actions->type == RTE_FLOW_ACTION_TYPE_DROP) { ds_put_cstr(s, "drop / "); @@ -1098,12 +1100,18 @@ vport_to_rte_tunnel(struct netdev *vport, const struct netdev_tunnel_config *tnl_cfg; memset(tunnel, 0, sizeof *tunnel); + + tnl_cfg = netdev_get_tunnel_config(vport); + if (!tnl_cfg) { + return -1; + } + + if (!IN6_IS_ADDR_V4MAPPED(&tnl_cfg->ipv6_dst)) { + tunnel->is_ipv6 = true; + } + if (!strcmp(netdev_get_type(vport), "vxlan")) { tunnel->type = RTE_FLOW_ITEM_TYPE_VXLAN; - tnl_cfg = netdev_get_tunnel_config(vport); - if (!tnl_cfg) { - return -1; - } tunnel->tp_dst = tnl_cfg->dst_port; if (!VLOG_DROP_DBG(&rl)) { ds_put_format(s_tnl, "flow tunnel create %d type vxlan; ", @@ -1769,19 +1777,22 @@ add_count_action(struct flow_actions *actions) } static int -add_port_id_action(struct flow_actions *actions, - struct netdev *outdev) +add_represented_port_action(struct flow_actions *actions, + struct netdev *outdev) { - struct rte_flow_action_port_id *port_id; + struct rte_flow_action_ethdev *ethdev; int outdev_id; outdev_id = netdev_dpdk_get_port_id(outdev); if (outdev_id < 0) { return -1; } - port_id = xzalloc(sizeof *port_id); - 
port_id->id = outdev_id; - add_flow_action(actions, RTE_FLOW_ACTION_TYPE_PORT_ID, port_id); + + ethdev = xzalloc(sizeof *ethdev); + ethdev->port_id = outdev_id; + + add_flow_action(actions, RTE_FLOW_ACTION_TYPE_REPRESENTED_PORT, ethdev); + return 0; } @@ -1801,7 +1812,7 @@ add_output_action(struct netdev *netdev, return -1; } if (!netdev_flow_api_equals(netdev, outdev) || - add_port_id_action(actions, outdev)) { + add_represented_port_action(actions, outdev)) { VLOG_DBG_RL(&rl, "%s: Output to port \'%s\' cannot be offloaded.", netdev_get_name(netdev), netdev_get_name(outdev)); ret = -1; @@ -2235,7 +2246,7 @@ netdev_offload_dpdk_actions(struct netdev *netdev, struct nlattr *nl_actions, size_t actions_len) { - const struct rte_flow_attr flow_attr = { .ingress = 1, .transfer = 1 }; + const struct rte_flow_attr flow_attr = { .transfer = 1, }; struct flow_actions actions = { .actions = NULL, .cnt = 0, @@ -2338,13 +2349,13 @@ netdev_offload_dpdk_flow_destroy(struct ufid_to_rte_flow_data *rte_flow_data) ovsrcu_get(void *, &netdev->hw_info.offload_data); data->rte_flow_counters[tid]--; - ufid_to_rte_flow_disassociate(rte_flow_data); VLOG_DBG_RL(&rl, "%s/%s: rte_flow 0x%"PRIxPTR " flow destroy %d ufid " UUID_FMT, netdev_get_name(netdev), netdev_get_name(physdev), (intptr_t) rte_flow, netdev_dpdk_get_port_id(physdev), UUID_ARGS((struct uuid *) ufid)); + ufid_to_rte_flow_disassociate(rte_flow_data); } else { VLOG_ERR("Failed flow: %s/%s: flow destroy %d ufid " UUID_FMT, netdev_get_name(netdev), netdev_get_name(physdev), @@ -2530,15 +2541,15 @@ netdev_offload_dpdk_flow_get(struct netdev *netdev, return ret; } -static int -netdev_offload_dpdk_flow_flush(struct netdev *netdev) +static void +flush_netdev_flows_in_related(struct netdev *netdev, struct netdev *related) { - struct cmap *map = offload_data_map(netdev); - struct ufid_to_rte_flow_data *data; unsigned int tid = netdev_offload_thread_id(); + struct cmap *map = offload_data_map(related); + struct ufid_to_rte_flow_data 
*data; if (!map) { - return -1; + return; } CMAP_FOR_EACH (data, node, map) { @@ -2549,6 +2560,31 @@ netdev_offload_dpdk_flow_flush(struct netdev *netdev) netdev_offload_dpdk_flow_destroy(data); } } +} + +static bool +flush_in_vport_cb(struct netdev *vport, + odp_port_t odp_port OVS_UNUSED, + void *aux) +{ + struct netdev *netdev = aux; + + /* Only vports are related to physical devices. */ + if (netdev_vport_is_vport_class(vport->netdev_class)) { + flush_netdev_flows_in_related(netdev, vport); + } + + return false; +} + +static int +netdev_offload_dpdk_flow_flush(struct netdev *netdev) +{ + flush_netdev_flows_in_related(netdev, netdev); + + if (!netdev_vport_is_vport_class(netdev->netdev_class)) { + netdev_ports_traverse(netdev->dpif_type, flush_in_vport_cb, netdev); + } return 0; } @@ -2665,7 +2701,7 @@ netdev_offload_dpdk_hw_miss_packet_recover(struct netdev *netdev, if (rte_restore_info.flags & RTE_FLOW_RESTORE_INFO_ENCAPSULATED) { if (!vport_netdev->netdev_class || !vport_netdev->netdev_class->pop_header) { - VLOG_ERR_RL(&rl, "vport nedtdev=%s with no pop_header method", + VLOG_ERR_RL(&rl, "vport netdev=%s with no pop_header method", netdev_get_name(vport_netdev)); ret = EOPNOTSUPP; goto close_vport_netdev; diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c index f6f90a741fd..3be1c08d24f 100644 --- a/lib/netdev-offload-tc.c +++ b/lib/netdev-offload-tc.c @@ -52,6 +52,7 @@ static struct hmap tc_to_ufid = HMAP_INITIALIZER(&tc_to_ufid); static bool multi_mask_per_prio = false; static bool block_support = false; static uint16_t ct_state_support; +static bool vxlan_gbp_support = false; struct netlink_field { int offset; @@ -97,6 +98,12 @@ static int netdev_tc_parse_nl_actions(struct netdev *netdev, bool *recirc_act, bool more_actions, struct tc_action **need_jump_update); +static void parse_tc_flower_to_stats(struct tc_flower *flower, + struct dpif_flow_stats *stats); + +static int get_ufid_adjust_stats(const ovs_u128 *ufid, + struct dpif_flow_stats 
*stats); + static bool is_internal_port(const char *type) { @@ -193,6 +200,9 @@ static struct ovs_mutex ufid_lock = OVS_MUTEX_INITIALIZER; * @ufid: ufid assigned to the flow * @id: tc filter id (tcf_id) * @netdev: netdev associated with the tc rule + * @adjust_stats: When flow gets updated with new actions, we need to adjust + * the reported stats to include previous values as the hardware + * rule is removed and re-added. This stats copy is used for it. */ struct ufid_tc_data { struct hmap_node ufid_to_tc_node; @@ -200,6 +210,7 @@ struct ufid_tc_data { ovs_u128 ufid; struct tcf_id id; struct netdev *netdev; + struct dpif_flow_stats adjust_stats; }; static void @@ -233,15 +244,42 @@ del_ufid_tc_mapping(const ovs_u128 *ufid) ovs_mutex_unlock(&ufid_lock); } +static void +netdev_tc_adjust_stats(struct dpif_flow_stats *stats, + const struct dpif_flow_stats *adjust_stats) +{ + /* Do not try to restore the stats->used, as in terse mode dumps TC doesn't + * report TCA_ACT_OPTIONS, so the 'lastused' value is not available, hence + * we report used as 0. + * tcp_flags is not collected by tc, so no need to update it. 
*/ + stats->n_bytes += adjust_stats->n_bytes; + stats->n_packets += adjust_stats->n_packets; +} + /* Wrapper function to delete filter and ufid tc mapping */ static int -del_filter_and_ufid_mapping(struct tcf_id *id, const ovs_u128 *ufid) +del_filter_and_ufid_mapping(struct tcf_id *id, const ovs_u128 *ufid, + struct dpif_flow_stats *stats) { + struct tc_flower flower; int err; - err = tc_del_filter(id); - if (!err) { + if (stats) { + memset(stats, 0, sizeof *stats); + if (!tc_get_flower(id, &flower)) { + struct dpif_flow_stats adjust_stats; + + parse_tc_flower_to_stats(&flower, stats); + if (!get_ufid_adjust_stats(ufid, &adjust_stats)) { + netdev_tc_adjust_stats(stats, &adjust_stats); + } + } + } + + err = tc_del_flower_filter(id); + if (!err || err == ENODEV) { del_ufid_tc_mapping(ufid); + return 0; } return err; } @@ -249,7 +287,7 @@ del_filter_and_ufid_mapping(struct tcf_id *id, const ovs_u128 *ufid) /* Add ufid entry to ufid_to_tc hashmap. */ static void add_ufid_tc_mapping(struct netdev *netdev, const ovs_u128 *ufid, - struct tcf_id *id) + struct tcf_id *id, struct dpif_flow_stats *stats) { struct ufid_tc_data *new_data = xzalloc(sizeof *new_data); size_t ufid_hash = hash_bytes(ufid, sizeof *ufid, 0); @@ -261,6 +299,9 @@ add_ufid_tc_mapping(struct netdev *netdev, const ovs_u128 *ufid, new_data->ufid = *ufid; new_data->id = *id; new_data->netdev = netdev_ref(netdev); + if (stats) { + new_data->adjust_stats = *stats; + } ovs_mutex_lock(&ufid_lock); hmap_insert(&ufid_to_tc, &new_data->ufid_to_tc_node, ufid_hash); @@ -292,6 +333,30 @@ get_ufid_tc_mapping(const ovs_u128 *ufid, struct tcf_id *id) return ENOENT; } +/* Get adjust_stats from ufid_to_tc hashmap. + * + * Returns 0 if successful and fills stats with adjust_stats. + * Otherwise returns the error. 
+*/ +static int +get_ufid_adjust_stats(const ovs_u128 *ufid, struct dpif_flow_stats *stats) +{ + size_t ufid_hash = hash_bytes(ufid, sizeof *ufid, 0); + struct ufid_tc_data *data; + + ovs_mutex_lock(&ufid_lock); + HMAP_FOR_EACH_WITH_HASH (data, ufid_to_tc_node, ufid_hash, &ufid_to_tc) { + if (ovs_u128_equals(*ufid, data->ufid)) { + *stats = data->adjust_stats; + ovs_mutex_unlock(&ufid_lock); + return 0; + } + } + ovs_mutex_unlock(&ufid_lock); + + return ENOENT; +} + /* Find ufid entry in ufid_to_tc hashmap using tcf_id id. * The result is saved in ufid. * @@ -325,6 +390,30 @@ struct prio_map_data { uint16_t prio; }; +static uint16_t +get_next_available_prio(ovs_be16 protocol) +{ + static uint16_t last_prio = TC_RESERVED_PRIORITY_MAX; + + if (multi_mask_per_prio) { + if (protocol == htons(ETH_P_IP)) { + return TC_RESERVED_PRIORITY_IPV4; + } else if (protocol == htons(ETH_P_IPV6)) { + return TC_RESERVED_PRIORITY_IPV6; + } else if (protocol == htons(ETH_P_8021Q)) { + return TC_RESERVED_PRIORITY_VLAN; + } + } + + /* last_prio can overflow if there will be many different kinds of + * flows which shouldn't happen organically. */ + if (last_prio == UINT16_MAX) { + return TC_RESERVED_PRIORITY_NONE; + } + + return ++last_prio; +} + /* Get free prio for tc flower * If prio is already allocated for mask/eth_type combination then return it. * If not assign new prio. 
@@ -336,11 +425,11 @@ get_prio_for_tc_flower(struct tc_flower *flower) { static struct hmap prios = HMAP_INITIALIZER(&prios); static struct ovs_mutex prios_lock = OVS_MUTEX_INITIALIZER; - static uint16_t last_prio = TC_RESERVED_PRIORITY_MAX; size_t key_len = sizeof(struct tc_flower_key); size_t hash = hash_int((OVS_FORCE uint32_t) flower->key.eth_type, 0); struct prio_map_data *data; struct prio_map_data *new_data; + uint16_t prio; if (!multi_mask_per_prio) { hash = hash_bytes(&flower->mask, key_len, hash); @@ -359,21 +448,20 @@ get_prio_for_tc_flower(struct tc_flower *flower) } } - if (last_prio == UINT16_MAX) { - /* last_prio can overflow if there will be many different kinds of - * flows which shouldn't happen organically. */ + prio = get_next_available_prio(flower->key.eth_type); + if (prio == TC_RESERVED_PRIORITY_NONE) { ovs_mutex_unlock(&prios_lock); - return 0; + return prio; } new_data = xzalloc(sizeof *new_data); memcpy(&new_data->mask, &flower->mask, key_len); - new_data->prio = ++last_prio; + new_data->prio = prio; new_data->protocol = flower->key.eth_type; hmap_insert(&prios, &new_data->node, hash); ovs_mutex_unlock(&prios_lock); - return new_data->prio; + return prio; } static uint32_t @@ -440,7 +528,11 @@ delete_chains_from_netdev(struct netdev *netdev, struct tcf_id *id) */ HMAP_FOR_EACH_POP (chain_node, node, &map) { id->chain = chain_node->chain; - tc_del_filter(id); + /* Delete empty chain doesn't seem to work with + * tc_del_flower_filter() so use tc_del_filter() + * without specifying TCA_KIND. 
+ */ + tc_del_filter(id, NULL); free(chain_node); } } @@ -461,7 +553,7 @@ netdev_tc_flow_flush(struct netdev *netdev) continue; } - err = tc_del_filter(&data->id); + err = tc_del_flower_filter(&data->id); if (!err) { del_ufid_tc_mapping_unlocked(&data->ufid); } @@ -579,6 +671,27 @@ static void parse_tc_flower_geneve_opts(struct tc_action *action, nl_msg_end_nested(buf, geneve_off); } +static int +parse_tc_flower_vxlan_tun_opts(struct tc_action *action, struct ofpbuf *buf) +{ + size_t gbp_off; + uint32_t gbp_raw; + + if (!action->encap.gbp.id_present) { + return 0; + } + if (!vxlan_gbp_support) { + return -EOPNOTSUPP; + } + + gbp_off = nl_msg_start_nested(buf, OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS); + gbp_raw = odp_encode_gbp_raw(action->encap.gbp.flags, + action->encap.gbp.id); + nl_msg_put_u32(buf, OVS_VXLAN_EXT_GBP, gbp_raw); + nl_msg_end_nested(buf, gbp_off); + return 0; +} + static void flower_tun_opt_to_match(struct match *match, struct tc_flower *flower) { @@ -739,6 +852,7 @@ parse_tc_flower_to_actions__(struct tc_flower *flower, struct ofpbuf *buf, size_t set_offset = nl_msg_start_nested(buf, OVS_ACTION_ATTR_SET); size_t tunnel_offset = nl_msg_start_nested(buf, OVS_KEY_ATTR_TUNNEL); + int ret; if (action->encap.id_present) { nl_msg_put_be64(buf, OVS_TUNNEL_KEY_ATTR_ID, action->encap.id); @@ -774,7 +888,10 @@ parse_tc_flower_to_actions__(struct tc_flower *flower, struct ofpbuf *buf, if (!action->encap.no_csum) { nl_msg_put_flag(buf, OVS_TUNNEL_KEY_ATTR_CSUM); } - + ret = parse_tc_flower_vxlan_tun_opts(action, buf); + if (ret) { + return ret; + } parse_tc_flower_geneve_opts(action, buf); nl_msg_end_nested(buf, tunnel_offset); nl_msg_end_nested(buf, set_offset); @@ -787,7 +904,7 @@ parse_tc_flower_to_actions__(struct tc_flower *flower, struct ofpbuf *buf, outport = netdev_ifindex_to_odp_port(action->out.ifindex_out); if (!outport) { - return ENOENT; + return -ENOENT; } } nl_msg_put_u32(buf, OVS_ACTION_ATTR_OUTPUT, odp_to_u32(outport)); @@ -804,7 +921,11 @@ 
parse_tc_flower_to_actions__(struct tc_flower *flower, struct ofpbuf *buf, ct_offset = nl_msg_start_nested(buf, OVS_ACTION_ATTR_CT); if (action->ct.commit) { - nl_msg_put_flag(buf, OVS_CT_ATTR_COMMIT); + if (action->ct.force) { + nl_msg_put_flag(buf, OVS_CT_ATTR_FORCE_COMMIT); + } else { + nl_msg_put_flag(buf, OVS_CT_ATTR_COMMIT); + } } if (action->ct.zone) { @@ -822,13 +943,13 @@ parse_tc_flower_to_actions__(struct tc_flower *flower, struct ofpbuf *buf, struct { ovs_u128 key; ovs_u128 mask; - } *ct_label; + } ct_label = { + .key = action->ct.label, + .mask = action->ct.label_mask, + }; - ct_label = nl_msg_put_unspec_uninit(buf, - OVS_CT_ATTR_LABELS, - sizeof *ct_label); - ct_label->key = action->ct.label; - ct_label->mask = action->ct.label_mask; + nl_msg_put_unspec(buf, OVS_CT_ATTR_LABELS, + &ct_label, sizeof ct_label); } if (action->ct.nat_type) { @@ -876,7 +997,7 @@ parse_tc_flower_to_actions__(struct tc_flower *flower, struct ofpbuf *buf, uint32_t meter_id; if (police_idx_lookup(action->police.index, &meter_id)) { - return ENOENT; + return -ENOENT; } nl_msg_put_u32(buf, OVS_ACTION_ATTR_METER, meter_id); } @@ -895,6 +1016,9 @@ parse_tc_flower_to_actions__(struct tc_flower *flower, struct ofpbuf *buf, buf, OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER); i = parse_tc_flower_to_actions__(flower, buf, i + 1, action->police.result_jump); + if (i < 0) { + return i; + } nl_msg_end_nested(buf, act_offset); act_offset = nl_msg_start_nested( @@ -906,6 +1030,9 @@ parse_tc_flower_to_actions__(struct tc_flower *flower, struct ofpbuf *buf, } if (jump != 0) { i = parse_tc_flower_to_actions__(flower, buf, i, jump); + if (i < 0) { + return i; + } } nl_msg_end_nested(buf, act_offset); @@ -925,11 +1052,11 @@ parse_tc_flower_to_actions__(struct tc_flower *flower, struct ofpbuf *buf, return i; } -static void +static int parse_tc_flower_to_actions(struct tc_flower *flower, struct ofpbuf *buf) { - parse_tc_flower_to_actions__(flower, buf, 0, 0); + return 
parse_tc_flower_to_actions__(flower, buf, 0, 0); } static int @@ -942,9 +1069,10 @@ parse_tc_flower_to_match(const struct netdev *netdev, struct ofpbuf *buf, bool terse) { - size_t act_off; struct tc_flower_key *key = &flower->key; struct tc_flower_key *mask = &flower->mask; + size_t act_off; + int err; if (terse) { return parse_tc_flower_terse_to_match(flower, match, stats, attrs); @@ -1134,6 +1262,15 @@ parse_tc_flower_to_match(const struct netdev *netdev, match_set_tun_tp_dst_masked(match, flower->key.tunnel.tp_dst, flower->mask.tunnel.tp_dst); } + if (flower->mask.tunnel.gbp.id) { + match_set_tun_gbp_id_masked(match, flower->key.tunnel.gbp.id, + flower->mask.tunnel.gbp.id); + } + if (flower->mask.tunnel.gbp.flags) { + match_set_tun_gbp_flags_masked(match, + flower->key.tunnel.gbp.flags, + flower->mask.tunnel.gbp.flags); + } if (!strcmp(netdev_get_type(netdev), "geneve")) { flower_tun_opt_to_match(match, flower); @@ -1141,7 +1278,10 @@ parse_tc_flower_to_match(const struct netdev *netdev, } act_off = nl_msg_start_nested(buf, OVS_FLOW_ATTR_ACTIONS); - parse_tc_flower_to_actions(flower, buf); + err = parse_tc_flower_to_actions(flower, buf); + if (err < 0) { + return -err; + } nl_msg_end_nested(buf, act_off); *actions = ofpbuf_at_assert(buf, act_off, sizeof(struct nlattr)); @@ -1172,6 +1312,7 @@ netdev_tc_flow_dump_next(struct netdev_flow_dump *dump, get_tc_qdisc_hook(netdev)); while (nl_dump_next(dump->nl_dump, &nl_flow, rbuffer)) { + struct dpif_flow_stats adjust_stats; struct tc_flower flower; if (parse_netlink_to_tc_flower(&nl_flow, &id, &flower, dump->terse)) { @@ -1183,12 +1324,16 @@ netdev_tc_flow_dump_next(struct netdev_flow_dump *dump, continue; } - if (flower.act_cookie.len) { - *ufid = *((ovs_u128 *) flower.act_cookie.data); + if (flower.act_cookie.len >= sizeof *ufid) { + *ufid = get_32aligned_u128(flower.act_cookie.data); } else if (!find_ufid(netdev, &id, ufid)) { continue; } + if (!get_ufid_adjust_stats(ufid, &adjust_stats)) { + 
netdev_tc_adjust_stats(stats, &adjust_stats); + } + match->wc.masks.in_port.odp_port = u32_to_odp(UINT32_MAX); match->flow.in_port.odp_port = dump->port; match_set_recirc_id(match, id.chain); @@ -1288,7 +1433,12 @@ parse_put_flow_ct_action(struct tc_flower *flower, NL_ATTR_FOR_EACH_UNSAFE (ct_attr, ct_left, ct, ct_len) { switch (nl_attr_type(ct_attr)) { case OVS_CT_ATTR_COMMIT: { - action->ct.commit = true; + action->ct.commit = true; + } + break; + case OVS_CT_ATTR_FORCE_COMMIT: { + action->ct.commit = true; + action->ct.force = true; } break; case OVS_CT_ATTR_ZONE: { @@ -1318,15 +1468,20 @@ parse_put_flow_ct_action(struct tc_flower *flower, break; case OVS_CT_ATTR_LABELS: { const struct { - ovs_u128 key; - ovs_u128 mask; + ovs_32aligned_u128 key; + ovs_32aligned_u128 mask; } *ct_label; ct_label = nl_attr_get_unspec(ct_attr, sizeof *ct_label); - action->ct.label = ct_label->key; - action->ct.label_mask = ct_label->mask; + action->ct.label = get_32aligned_u128(&ct_label->key); + action->ct.label_mask = + get_32aligned_u128(&ct_label->mask); } break; + /* The following option we do not support in tc-ct, and should + * not be ignored for proper operation. */ + case OVS_CT_ATTR_HELPER: + return EOPNOTSUPP; } } @@ -1425,6 +1580,7 @@ parse_put_flow_set_action(struct tc_flower *flower, struct tc_action *action, action->type = TC_ACT_ENCAP; action->encap.id_present = false; + action->encap.gbp.id_present = false; action->encap.no_csum = 1; flower->action_count++; NL_ATTR_FOR_EACH_UNSAFE(tun_attr, tun_left, tunnel, tunnel_len) { @@ -1473,7 +1629,9 @@ parse_put_flow_set_action(struct tc_flower *flower, struct tc_action *action, } break; case OVS_TUNNEL_KEY_ATTR_TP_SRC: { - action->encap.tp_src = nl_attr_get_be16(tun_attr); + /* There is no corresponding attribute in TC. 
*/ + VLOG_DBG_RL(&rl, "unsupported tunnel key attribute TP_SRC"); + return EOPNOTSUPP; } break; case OVS_TUNNEL_KEY_ATTR_TP_DST: { @@ -1486,6 +1644,19 @@ parse_put_flow_set_action(struct tc_flower *flower, struct tc_action *action, action->encap.data.present.len = nl_attr_get_size(tun_attr); } break; + case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: { + if (!vxlan_gbp_support) { + return EOPNOTSUPP; + } + if (odp_vxlan_tun_opts_from_attr(tun_attr, + &action->encap.gbp.id, + &action->encap.gbp.flags, + &action->encap.gbp.id_present)) { + VLOG_ERR_RL(&rl, "error parsing VXLAN options"); + return EINVAL; + } + } + break; default: VLOG_DBG_RL(&rl, "unsupported tunnel key attribute %d", nl_attr_type(tun_attr)); @@ -1616,12 +1787,12 @@ test_key_and_mask(struct match *match) return 0; } -static void +static int flower_match_to_tun_opt(struct tc_flower *flower, const struct flow_tnl *tnl, struct flow_tnl *tnl_mask) { struct geneve_opt *opt, *opt_mask; - int len, cnt = 0; + int tot_opt_len, len, cnt = 0; /* 'flower' always has an exact match on tunnel metadata length, so having * it in a wrong format is not acceptable unless it is empty. */ @@ -1637,7 +1808,7 @@ flower_match_to_tun_opt(struct tc_flower *flower, const struct flow_tnl *tnl, memset(&tnl_mask->metadata.present.map, 0, sizeof tnl_mask->metadata.present.map); } - return; + return 0; } tnl_mask->flags &= ~FLOW_TNL_F_UDPIF; @@ -1651,7 +1822,7 @@ flower_match_to_tun_opt(struct tc_flower *flower, const struct flow_tnl *tnl, sizeof tnl_mask->metadata.present.len); if (!tnl->metadata.present.len) { - return; + return 0; } memcpy(flower->key.tunnel.metadata.opts.gnv, tnl->metadata.opts.gnv, @@ -1665,7 +1836,16 @@ flower_match_to_tun_opt(struct tc_flower *flower, const struct flow_tnl *tnl, * also not masks, but actual lengths in the 'flower' structure. 
*/ len = flower->key.tunnel.metadata.present.len; while (len) { + if (len < sizeof *opt) { + return EOPNOTSUPP; + } + opt = &flower->key.tunnel.metadata.opts.gnv[cnt]; + tot_opt_len = sizeof *opt + opt->length * 4; + if (len < tot_opt_len) { + return EOPNOTSUPP; + } + opt_mask = &flower->mask.tunnel.metadata.opts.gnv[cnt]; opt_mask->length = opt->length; @@ -1673,6 +1853,8 @@ flower_match_to_tun_opt(struct tc_flower *flower, const struct flow_tnl *tnl, cnt += sizeof(struct geneve_opt) / 4 + opt->length; len -= sizeof(struct geneve_opt) + opt->length * 4; } + + return 0; } static void @@ -2037,6 +2219,7 @@ netdev_tc_flow_put(struct netdev *netdev, struct match *match, struct flow *mask = &match->wc.masks; const struct flow_tnl *tnl = &match->flow.tunnel; struct flow_tnl *tnl_mask = &mask->tunnel; + struct dpif_flow_stats adjust_stats; bool recirc_act = false; uint32_t block_id = 0; struct tcf_id id; @@ -2074,6 +2257,9 @@ netdev_tc_flow_put(struct netdev *netdev, struct match *match, flower.key.tunnel.ttl = tnl->ip_ttl; flower.key.tunnel.tp_src = tnl->tp_src; flower.key.tunnel.tp_dst = tnl->tp_dst; + flower.key.tunnel.gbp.id = tnl->gbp_id; + flower.key.tunnel.gbp.flags = tnl->gbp_flags; + flower.key.tunnel.gbp.id_present = !!tnl_mask->gbp_id; flower.mask.tunnel.ipv4.ipv4_src = tnl_mask->ip_src; flower.mask.tunnel.ipv4.ipv4_dst = tnl_mask->ip_dst; @@ -2088,6 +2274,9 @@ netdev_tc_flow_put(struct netdev *netdev, struct match *match, * Degrading the flow down to exact match for now as a workaround. */ flower.mask.tunnel.tp_dst = OVS_BE16_MAX; flower.mask.tunnel.id = (tnl->flags & FLOW_TNL_F_KEY) ? 
tnl_mask->tun_id : 0; + flower.mask.tunnel.gbp.id = tnl_mask->gbp_id; + flower.mask.tunnel.gbp.flags = tnl_mask->gbp_flags; + flower.mask.tunnel.gbp.id_present = !!tnl_mask->gbp_id; memset(&tnl_mask->ip_src, 0, sizeof tnl_mask->ip_src); memset(&tnl_mask->ip_dst, 0, sizeof tnl_mask->ip_dst); @@ -2099,6 +2288,8 @@ netdev_tc_flow_put(struct netdev *netdev, struct match *match, memset(&tnl_mask->tp_dst, 0, sizeof tnl_mask->tp_dst); memset(&tnl_mask->tun_id, 0, sizeof tnl_mask->tun_id); + memset(&tnl_mask->gbp_id, 0, sizeof tnl_mask->gbp_id); + memset(&tnl_mask->gbp_flags, 0, sizeof tnl_mask->gbp_flags); tnl_mask->flags &= ~FLOW_TNL_F_KEY; /* XXX: This is wrong! We're ignoring DF and CSUM flags configuration @@ -2109,7 +2300,11 @@ netdev_tc_flow_put(struct netdev *netdev, struct match *match, tnl_mask->flags &= ~(FLOW_TNL_F_DONT_FRAGMENT | FLOW_TNL_F_CSUM); if (!strcmp(netdev_get_type(netdev), "geneve")) { - flower_match_to_tun_opt(&flower, tnl, tnl_mask); + err = flower_match_to_tun_opt(&flower, tnl, tnl_mask); + if (err) { + VLOG_WARN_RL(&warn_rl, "Unable to parse geneve options"); + return err; + } } flower.tunnel = true; } else { @@ -2326,14 +2521,16 @@ netdev_tc_flow_put(struct netdev *netdev, struct match *match, } if ((chain || recirc_act) && !info->recirc_id_shared_with_tc) { - VLOG_ERR_RL(&error_rl, "flow_put: recirc_id sharing not supported"); + VLOG_DBG_RL(&rl, "flow_put: recirc_id sharing not supported"); return EOPNOTSUPP; } + memset(&adjust_stats, 0, sizeof adjust_stats); if (get_ufid_tc_mapping(ufid, &id) == 0) { VLOG_DBG_RL(&rl, "updating old handle: %d prio: %d", id.handle, id.prio); - info->tc_modify_flow_deleted = !del_filter_and_ufid_mapping(&id, ufid); + info->tc_modify_flow_deleted = !del_filter_and_ufid_mapping( + &id, ufid, &adjust_stats); } prio = get_prio_for_tc_flower(&flower); @@ -2351,8 +2548,9 @@ netdev_tc_flow_put(struct netdev *netdev, struct match *match, if (!err) { if (stats) { memset(stats, 0, sizeof *stats); + 
netdev_tc_adjust_stats(stats, &adjust_stats); } - add_ufid_tc_mapping(netdev, ufid, &id); + add_ufid_tc_mapping(netdev, ufid, &id, &adjust_stats); } return err; @@ -2383,16 +2581,31 @@ netdev_tc_flow_get(struct netdev *netdev, err = tc_get_flower(&id, &flower); if (err) { - VLOG_ERR_RL(&error_rl, "flow get failed (dev %s prio %d handle %d): %s", + VLOG_ERR_RL(&error_rl, + "flow get failed (dev %s prio %d handle %d): %s", netdev_get_name(netdev), id.prio, id.handle, ovs_strerror(err)); return err; } in_port = netdev_ifindex_to_odp_port(id.ifindex); - parse_tc_flower_to_match(netdev, &flower, match, actions, - stats, attrs, buf, false); + err = parse_tc_flower_to_match(netdev, &flower, match, actions, + stats, attrs, buf, false); + if (err) { + VLOG_ERR_RL(&error_rl, + "flow get parse failed (dev %s prio %d handle %d): %s", + netdev_get_name(netdev), id.prio, id.handle, + ovs_strerror(err)); + return err; + } + + if (stats) { + struct dpif_flow_stats adjust_stats; + if (!get_ufid_adjust_stats(ufid, &adjust_stats)) { + netdev_tc_adjust_stats(stats, &adjust_stats); + } + } match->wc.masks.in_port.odp_port = u32_to_odp(UINT32_MAX); match->flow.in_port.odp_port = in_port; match_set_recirc_id(match, id.chain); @@ -2405,7 +2618,6 @@ netdev_tc_flow_del(struct netdev *netdev OVS_UNUSED, const ovs_u128 *ufid, struct dpif_flow_stats *stats) { - struct tc_flower flower; struct tcf_id id; int error; @@ -2414,16 +2626,7 @@ netdev_tc_flow_del(struct netdev *netdev OVS_UNUSED, return error; } - if (stats) { - memset(stats, 0, sizeof *stats); - if (!tc_get_flower(&id, &flower)) { - parse_tc_flower_to_stats(&flower, stats); - } - } - - error = del_filter_and_ufid_mapping(&id, ufid); - - return error; + return del_filter_and_ufid_mapping(&id, ufid, stats); } static int @@ -2477,13 +2680,13 @@ probe_multi_mask_per_prio(int ifindex) id2 = tc_make_tcf_id(ifindex, block_id, prio, TC_INGRESS); error = tc_replace_flower(&id2, &flower); - tc_del_filter(&id1); + tc_del_flower_filter(&id1); if 
(error) { goto out; } - tc_del_filter(&id2); + tc_del_flower_filter(&id2); multi_mask_per_prio = true; VLOG_INFO("probe tc: multiple masks on single tc prio is supported."); @@ -2535,7 +2738,7 @@ probe_ct_state_support(int ifindex) goto out_del; } - tc_del_filter(&id); + tc_del_flower_filter(&id); ct_state_support = OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | OVS_CS_F_TRACKED | @@ -2549,7 +2752,7 @@ probe_ct_state_support(int ifindex) goto out_del; } - tc_del_filter(&id); + tc_del_flower_filter(&id); /* Test for ct_state INVALID support */ memset(&flower, 0, sizeof flower); @@ -2560,7 +2763,7 @@ probe_ct_state_support(int ifindex) goto out; } - tc_del_filter(&id); + tc_del_flower_filter(&id); ct_state_support |= OVS_CS_F_INVALID; /* Test for ct_state REPLY support */ @@ -2576,7 +2779,7 @@ probe_ct_state_support(int ifindex) ct_state_support |= OVS_CS_F_REPLY_DIR; out_del: - tc_del_filter(&id); + tc_del_flower_filter(&id); out: tc_add_del_qdisc(ifindex, false, 0, TC_INGRESS); VLOG_INFO("probe tc: supported ovs ct_state bits: 0x%x", ct_state_support); @@ -2615,6 +2818,51 @@ probe_tc_block_support(int ifindex) } } +static void +probe_vxlan_gbp_support(int ifindex) +{ + struct tc_flower flower; + struct tcf_id id; + int block_id = 0; + int prio = 1; + int error; + + error = tc_add_del_qdisc(ifindex, true, block_id, TC_INGRESS); + if (error) { + return; + } + + memset(&flower, 0, sizeof flower); + + flower.tc_policy = TC_POLICY_SKIP_HW; + flower.key.eth_type = htons(ETH_P_IP); + flower.mask.eth_type = OVS_BE16_MAX; + flower.tunnel = true; + flower.mask.tunnel.id = OVS_BE64_MAX; + flower.mask.tunnel.ipv4.ipv4_src = OVS_BE32_MAX; + flower.mask.tunnel.ipv4.ipv4_dst = OVS_BE32_MAX; + flower.mask.tunnel.tp_dst = OVS_BE16_MAX; + flower.mask.tunnel.gbp.id = OVS_BE16_MAX; + flower.key.tunnel.ipv4.ipv4_src = htonl(0x01010101); + flower.key.tunnel.ipv4.ipv4_dst = htonl(0x01010102); + flower.key.tunnel.tp_dst = htons(46354); + flower.key.tunnel.gbp.id = htons(512); + + id = 
tc_make_tcf_id(ifindex, block_id, prio, TC_INGRESS); + error = tc_replace_flower(&id, &flower); + if (error) { + goto out; + } + + tc_del_flower_filter(&id); + + vxlan_gbp_support = true; + VLOG_INFO("probe tc: vxlan gbp is supported."); + +out: + tc_add_del_qdisc(ifindex, false, block_id, TC_INGRESS); +} + static int tc_get_policer_action_ids(struct hmap *map) { @@ -2729,7 +2977,7 @@ netdev_tc_init_flow_api(struct netdev *netdev) /* fallback here if delete chains fail */ if (!get_chain_supported) { - tc_del_filter(&id); + tc_del_flower_filter(&id); } /* make sure there is no ingress/egress qdisc */ @@ -2742,6 +2990,7 @@ netdev_tc_init_flow_api(struct netdev *netdev) probe_multi_mask_per_prio(ifindex); probe_ct_state_support(ifindex); + probe_vxlan_gbp_support(ifindex); ovs_mutex_lock(&meter_police_ids_mutex); meter_police_ids = id_pool_create(METER_POLICE_IDS_BASE, @@ -2756,8 +3005,9 @@ netdev_tc_init_flow_api(struct netdev *netdev) error = tc_add_del_qdisc(ifindex, true, block_id, hook); if (error && error != EEXIST) { - VLOG_INFO("failed adding ingress qdisc required for offloading: %s", - ovs_strerror(error)); + VLOG_INFO("failed adding ingress qdisc required for offloading " + "on %s: %s", + netdev_get_name(netdev), ovs_strerror(error)); return error; } diff --git a/lib/netdev-offload.c b/lib/netdev-offload.c index 9fde5f7a95f..8a9d3655592 100644 --- a/lib/netdev-offload.c +++ b/lib/netdev-offload.c @@ -183,6 +183,7 @@ netdev_assign_flow_api(struct netdev *netdev) CMAP_FOR_EACH (rfa, cmap_node, &netdev_flow_apis) { if (!rfa->flow_api->init_flow_api(netdev)) { ovs_refcount_ref(&rfa->refcnt); + atomic_store_relaxed(&netdev->hw_info.miss_api_supported, true); ovsrcu_set(&netdev->flow_api, rfa->flow_api); VLOG_INFO("%s: Assigned flow API '%s'.", netdev_get_name(netdev), rfa->flow_api->type); @@ -191,6 +192,7 @@ netdev_assign_flow_api(struct netdev *netdev) VLOG_DBG("%s: flow API '%s' is not suitable.", netdev_get_name(netdev), rfa->flow_api->type); } + 
atomic_store_relaxed(&netdev->hw_info.miss_api_supported, false); VLOG_INFO("%s: No suitable flow API found.", netdev_get_name(netdev)); return -1; @@ -322,12 +324,28 @@ int netdev_hw_miss_packet_recover(struct netdev *netdev, struct dp_packet *packet) { - const struct netdev_flow_api *flow_api = - ovsrcu_get(const struct netdev_flow_api *, &netdev->flow_api); + const struct netdev_flow_api *flow_api; + bool miss_api_supported; + int rv; + + atomic_read_relaxed(&netdev->hw_info.miss_api_supported, + &miss_api_supported); + if (!miss_api_supported) { + return EOPNOTSUPP; + } + + flow_api = ovsrcu_get(const struct netdev_flow_api *, &netdev->flow_api); + if (!flow_api || !flow_api->hw_miss_packet_recover) { + return EOPNOTSUPP; + } + + rv = flow_api->hw_miss_packet_recover(netdev, packet); + if (rv == EOPNOTSUPP) { + /* API unsupported by the port; avoid subsequent calls. */ + atomic_store_relaxed(&netdev->hw_info.miss_api_supported, false); + } - return (flow_api && flow_api->hw_miss_packet_recover) - ? flow_api->hw_miss_packet_recover(netdev, packet) - : EOPNOTSUPP; + return rv; } int @@ -467,11 +485,13 @@ netdev_set_hw_info(struct netdev *netdev, int type, int val) } /* Protects below port hashmaps. 
*/ -static struct ovs_rwlock netdev_hmap_rwlock = OVS_RWLOCK_INITIALIZER; +static struct ovs_rwlock ifindex_to_port_rwlock = OVS_RWLOCK_INITIALIZER; +static struct ovs_rwlock port_to_netdev_rwlock + OVS_ACQ_BEFORE(ifindex_to_port_rwlock) = OVS_RWLOCK_INITIALIZER; -static struct hmap port_to_netdev OVS_GUARDED_BY(netdev_hmap_rwlock) +static struct hmap port_to_netdev OVS_GUARDED_BY(port_to_netdev_rwlock) = HMAP_INITIALIZER(&port_to_netdev); -static struct hmap ifindex_to_port OVS_GUARDED_BY(netdev_hmap_rwlock) +static struct hmap ifindex_to_port OVS_GUARDED_BY(ifindex_to_port_rwlock) = HMAP_INITIALIZER(&ifindex_to_port); struct port_to_netdev_data { @@ -488,12 +508,12 @@ struct port_to_netdev_data { */ bool netdev_any_oor(void) - OVS_EXCLUDED(netdev_hmap_rwlock) + OVS_EXCLUDED(port_to_netdev_rwlock) { struct port_to_netdev_data *data; bool oor = false; - ovs_rwlock_rdlock(&netdev_hmap_rwlock); + ovs_rwlock_rdlock(&port_to_netdev_rwlock); HMAP_FOR_EACH (data, portno_node, &port_to_netdev) { struct netdev *dev = data->netdev; @@ -502,7 +522,7 @@ netdev_any_oor(void) break; } } - ovs_rwlock_unlock(&netdev_hmap_rwlock); + ovs_rwlock_unlock(&port_to_netdev_rwlock); return oor; } @@ -576,13 +596,13 @@ netdev_ports_flow_flush(const char *dpif_type) { struct port_to_netdev_data *data; - ovs_rwlock_rdlock(&netdev_hmap_rwlock); + ovs_rwlock_rdlock(&port_to_netdev_rwlock); HMAP_FOR_EACH (data, portno_node, &port_to_netdev) { if (netdev_get_dpif_type(data->netdev) == dpif_type) { netdev_flow_flush(data->netdev); } } - ovs_rwlock_unlock(&netdev_hmap_rwlock); + ovs_rwlock_unlock(&port_to_netdev_rwlock); } void @@ -592,7 +612,7 @@ netdev_ports_traverse(const char *dpif_type, { struct port_to_netdev_data *data; - ovs_rwlock_rdlock(&netdev_hmap_rwlock); + ovs_rwlock_rdlock(&port_to_netdev_rwlock); HMAP_FOR_EACH (data, portno_node, &port_to_netdev) { if (netdev_get_dpif_type(data->netdev) == dpif_type) { if (cb(data->netdev, data->dpif_port.port_no, aux)) { @@ -600,25 +620,29 @@ 
netdev_ports_traverse(const char *dpif_type, } } } - ovs_rwlock_unlock(&netdev_hmap_rwlock); + ovs_rwlock_unlock(&port_to_netdev_rwlock); } struct netdev_flow_dump ** netdev_ports_flow_dump_create(const char *dpif_type, int *ports, bool terse) { + struct netdev_flow_dump **dumps = NULL; struct port_to_netdev_data *data; - struct netdev_flow_dump **dumps; int count = 0; int i = 0; - ovs_rwlock_rdlock(&netdev_hmap_rwlock); + ovs_rwlock_rdlock(&port_to_netdev_rwlock); HMAP_FOR_EACH (data, portno_node, &port_to_netdev) { if (netdev_get_dpif_type(data->netdev) == dpif_type) { count++; } } - dumps = count ? xzalloc(sizeof *dumps * count) : NULL; + if (!count) { + goto unlock; + } + + dumps = xzalloc(sizeof *dumps * count); HMAP_FOR_EACH (data, portno_node, &port_to_netdev) { if (netdev_get_dpif_type(data->netdev) == dpif_type) { @@ -630,7 +654,9 @@ netdev_ports_flow_dump_create(const char *dpif_type, int *ports, bool terse) i++; } } - ovs_rwlock_unlock(&netdev_hmap_rwlock); + +unlock: + ovs_rwlock_unlock(&port_to_netdev_rwlock); *ports = i; return dumps; @@ -642,15 +668,15 @@ netdev_ports_flow_del(const char *dpif_type, const ovs_u128 *ufid, { struct port_to_netdev_data *data; - ovs_rwlock_rdlock(&netdev_hmap_rwlock); + ovs_rwlock_rdlock(&port_to_netdev_rwlock); HMAP_FOR_EACH (data, portno_node, &port_to_netdev) { if (netdev_get_dpif_type(data->netdev) == dpif_type && !netdev_flow_del(data->netdev, ufid, stats)) { - ovs_rwlock_unlock(&netdev_hmap_rwlock); + ovs_rwlock_unlock(&port_to_netdev_rwlock); return 0; } } - ovs_rwlock_unlock(&netdev_hmap_rwlock); + ovs_rwlock_unlock(&port_to_netdev_rwlock); return ENOENT; } @@ -663,16 +689,16 @@ netdev_ports_flow_get(const char *dpif_type, struct match *match, { struct port_to_netdev_data *data; - ovs_rwlock_rdlock(&netdev_hmap_rwlock); + ovs_rwlock_rdlock(&port_to_netdev_rwlock); HMAP_FOR_EACH (data, portno_node, &port_to_netdev) { if (netdev_get_dpif_type(data->netdev) == dpif_type && !netdev_flow_get(data->netdev, match, 
actions, ufid, stats, attrs, buf)) { - ovs_rwlock_unlock(&netdev_hmap_rwlock); + ovs_rwlock_unlock(&port_to_netdev_rwlock); return 0; } } - ovs_rwlock_unlock(&netdev_hmap_rwlock); + ovs_rwlock_unlock(&port_to_netdev_rwlock); return ENOENT; } @@ -684,7 +710,7 @@ netdev_ports_hash(odp_port_t port, const char *dpif_type) static struct port_to_netdev_data * netdev_ports_lookup(odp_port_t port_no, const char *dpif_type) - OVS_REQ_RDLOCK(netdev_hmap_rwlock) + OVS_REQ_RDLOCK(port_to_netdev_rwlock) { struct port_to_netdev_data *data; @@ -708,9 +734,9 @@ netdev_ports_insert(struct netdev *netdev, struct dpif_port *dpif_port) ovs_assert(dpif_type); - ovs_rwlock_wrlock(&netdev_hmap_rwlock); + ovs_rwlock_wrlock(&port_to_netdev_rwlock); if (netdev_ports_lookup(dpif_port->port_no, dpif_type)) { - ovs_rwlock_unlock(&netdev_hmap_rwlock); + ovs_rwlock_unlock(&port_to_netdev_rwlock); return EEXIST; } @@ -720,14 +746,16 @@ netdev_ports_insert(struct netdev *netdev, struct dpif_port *dpif_port) if (ifindex >= 0) { data->ifindex = ifindex; + ovs_rwlock_wrlock(&ifindex_to_port_rwlock); hmap_insert(&ifindex_to_port, &data->ifindex_node, ifindex); + ovs_rwlock_unlock(&ifindex_to_port_rwlock); } else { data->ifindex = -1; } hmap_insert(&port_to_netdev, &data->portno_node, netdev_ports_hash(dpif_port->port_no, dpif_type)); - ovs_rwlock_unlock(&netdev_hmap_rwlock); + ovs_rwlock_unlock(&port_to_netdev_rwlock); netdev_init_flow_api(netdev); @@ -740,12 +768,12 @@ netdev_ports_get(odp_port_t port_no, const char *dpif_type) struct port_to_netdev_data *data; struct netdev *ret = NULL; - ovs_rwlock_rdlock(&netdev_hmap_rwlock); + ovs_rwlock_rdlock(&port_to_netdev_rwlock); data = netdev_ports_lookup(port_no, dpif_type); if (data) { ret = netdev_ref(data->netdev); } - ovs_rwlock_unlock(&netdev_hmap_rwlock); + ovs_rwlock_unlock(&port_to_netdev_rwlock); return ret; } @@ -756,19 +784,21 @@ netdev_ports_remove(odp_port_t port_no, const char *dpif_type) struct port_to_netdev_data *data; int ret = ENOENT; - 
ovs_rwlock_wrlock(&netdev_hmap_rwlock); + ovs_rwlock_wrlock(&port_to_netdev_rwlock); data = netdev_ports_lookup(port_no, dpif_type); if (data) { dpif_port_destroy(&data->dpif_port); netdev_close(data->netdev); /* unref and possibly close */ hmap_remove(&port_to_netdev, &data->portno_node); if (data->ifindex >= 0) { + ovs_rwlock_wrlock(&ifindex_to_port_rwlock); hmap_remove(&ifindex_to_port, &data->ifindex_node); + ovs_rwlock_unlock(&ifindex_to_port_rwlock); } free(data); ret = 0; } - ovs_rwlock_unlock(&netdev_hmap_rwlock); + ovs_rwlock_unlock(&port_to_netdev_rwlock); return ret; } @@ -780,7 +810,7 @@ netdev_ports_get_n_flows(const char *dpif_type, odp_port_t port_no, struct port_to_netdev_data *data; int ret = EOPNOTSUPP; - ovs_rwlock_rdlock(&netdev_hmap_rwlock); + ovs_rwlock_rdlock(&port_to_netdev_rwlock); data = netdev_ports_lookup(port_no, dpif_type); if (data) { uint64_t thread_n_flows[MAX_OFFLOAD_THREAD_NB] = {0}; @@ -794,7 +824,7 @@ netdev_ports_get_n_flows(const char *dpif_type, odp_port_t port_no, } } } - ovs_rwlock_unlock(&netdev_hmap_rwlock); + ovs_rwlock_unlock(&port_to_netdev_rwlock); return ret; } @@ -804,14 +834,14 @@ netdev_ifindex_to_odp_port(int ifindex) struct port_to_netdev_data *data; odp_port_t ret = 0; - ovs_rwlock_rdlock(&netdev_hmap_rwlock); + ovs_rwlock_rdlock(&ifindex_to_port_rwlock); HMAP_FOR_EACH_WITH_HASH (data, ifindex_node, ifindex, &ifindex_to_port) { if (data->ifindex == ifindex) { ret = data->dpif_port.port_no; break; } } - ovs_rwlock_unlock(&netdev_hmap_rwlock); + ovs_rwlock_unlock(&ifindex_to_port_rwlock); return ret; } @@ -829,11 +859,11 @@ netdev_ports_flow_init(void) { struct port_to_netdev_data *data; - ovs_rwlock_rdlock(&netdev_hmap_rwlock); + ovs_rwlock_rdlock(&port_to_netdev_rwlock); HMAP_FOR_EACH (data, portno_node, &port_to_netdev) { netdev_init_flow_api(data->netdev); } - ovs_rwlock_unlock(&netdev_hmap_rwlock); + ovs_rwlock_unlock(&port_to_netdev_rwlock); } void @@ -848,7 +878,8 @@ netdev_set_flow_api_enabled(const 
struct smap *ovs_other_config) offload_thread_nb = smap_get_ullong(ovs_other_config, "n-offload-threads", DEFAULT_OFFLOAD_THREAD_NB); - if (offload_thread_nb > MAX_OFFLOAD_THREAD_NB) { + if (offload_thread_nb == 0 || + offload_thread_nb > MAX_OFFLOAD_THREAD_NB) { VLOG_WARN("netdev: Invalid number of threads requested: %u", offload_thread_nb); offload_thread_nb = DEFAULT_OFFLOAD_THREAD_NB; diff --git a/lib/netdev-offload.h b/lib/netdev-offload.h index 180d3f95f06..47f8e6f48b7 100644 --- a/lib/netdev-offload.h +++ b/lib/netdev-offload.h @@ -20,6 +20,7 @@ #include "openvswitch/netdev.h" #include "openvswitch/types.h" +#include "ovs-atomic.h" #include "ovs-rcu.h" #include "ovs-thread.h" #include "openvswitch/ofp-meter.h" @@ -46,6 +47,7 @@ struct ovs_action_push_tnl; /* Offload-capable (HW) netdev information */ struct netdev_hw_info { bool oor; /* Out of Offload Resources ? */ + atomic_bool miss_api_supported; /* hw_miss_packet_recover() supported.*/ int offload_count; /* Pending (non-offloaded) flow count */ int pending_count; /* Offloaded flow count */ OVSRCU_TYPE(void *) offload_data; /* Offload metadata. */ @@ -70,7 +72,7 @@ struct offload_info { * sync with datapath recirc ids. */ /* - * The flow mark id assigened to the flow. If any pkts hit the flow, + * The flow mark id assigned to the flow. If any pkts hit the flow, * it will be in the pkt meta data. */ uint32_t flow_mark; diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h index b5420947d0c..22840a058b7 100644 --- a/lib/netdev-provider.h +++ b/lib/netdev-provider.h @@ -43,6 +43,10 @@ enum netdev_ol_flags { NETDEV_TX_OFFLOAD_UDP_CKSUM = 1 << 2, NETDEV_TX_OFFLOAD_SCTP_CKSUM = 1 << 3, NETDEV_TX_OFFLOAD_TCP_TSO = 1 << 4, + NETDEV_TX_VXLAN_TNL_TSO = 1 << 5, + NETDEV_TX_GENEVE_TNL_TSO = 1 << 6, + NETDEV_TX_OFFLOAD_OUTER_IP_CKSUM = 1 << 7, + NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM = 1 << 8, }; /* A network device (e.g. an Ethernet device). 
@@ -500,6 +504,15 @@ struct netdev_class { enum netdev_features *supported, enum netdev_features *peer); + /* Stores the current and maximum supported link speed by 'netdev' into + * each of '*current' and '*max'. Each value represents the speed in Mbps. + * If any of the speeds is unknown, a zero value must be stored. + * + * This function may be set to null if it would always return EOPNOTSUPP. + */ + int (*get_speed)(const struct netdev *netdev, uint32_t *current, + uint32_t *max); + /* Set the features advertised by 'netdev' to 'advertise', which is a * set of NETDEV_F_* bits. * diff --git a/lib/netdev-vport-private.h b/lib/netdev-vport-private.h index d89a28c66c6..586231057c6 100644 --- a/lib/netdev-vport-private.h +++ b/lib/netdev-vport-private.h @@ -22,11 +22,17 @@ #include "compiler.h" #include "netdev.h" #include "netdev-provider.h" +#include "ovs-atomic.h" #include "ovs-thread.h" struct netdev_vport { struct netdev up; + OVSRCU_TYPE(const struct netdev_tunnel_config *) tnl_cfg; + + /* Sequence number for outgoing GRE packets. */ + atomic_count gre_seqno; + /* Protects all members below. */ struct ovs_mutex mutex; @@ -34,7 +40,6 @@ struct netdev_vport { struct netdev_stats stats; /* Tunnels. 
*/ - struct netdev_tunnel_config tnl_cfg; char egress_iface[IFNAMSIZ]; bool carrier_status; diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c index 6d370a82b88..1e11c63190c 100644 --- a/lib/netdev-vport.c +++ b/lib/netdev-vport.c @@ -38,6 +38,7 @@ #include "netdev-provider.h" #include "netdev-vport-private.h" #include "openvswitch/dynamic-string.h" +#include "ovs-atomic.h" #include "ovs-router.h" #include "packets.h" #include "openvswitch/poll-loop.h" @@ -69,8 +70,8 @@ static int get_patch_config(const struct netdev *netdev, struct smap *args); static int get_tunnel_config(const struct netdev *, struct smap *args); static bool tunnel_check_status_change__(struct netdev_vport *); static void update_vxlan_global_cfg(struct netdev *, - struct netdev_tunnel_config *, - struct netdev_tunnel_config *); + const struct netdev_tunnel_config *, + const struct netdev_tunnel_config *); struct vport_class { const char *dpif_port; @@ -91,10 +92,16 @@ vport_class_cast(const struct netdev_class *class) return CONTAINER_OF(class, struct vport_class, netdev_class); } +static const struct netdev_tunnel_config * +vport_tunnel_config(struct netdev_vport *netdev) +{ + return ovsrcu_get(const struct netdev_tunnel_config *, &netdev->tnl_cfg); +} + static const struct netdev_tunnel_config * get_netdev_tunnel_config(const struct netdev *netdev) { - return &netdev_vport_cast(netdev)->tnl_cfg; + return vport_tunnel_config(netdev_vport_cast(netdev)); } bool @@ -135,8 +142,6 @@ netdev_vport_get_dpif_port(const struct netdev *netdev, } if (netdev_vport_needs_dst_port(netdev)) { - const struct netdev_vport *vport = netdev_vport_cast(netdev); - /* * Note: IFNAMSIZ is 16 bytes long. 
Implementations should choose * a dpif port name that is short enough to fit including any @@ -145,7 +150,7 @@ netdev_vport_get_dpif_port(const struct netdev *netdev, BUILD_ASSERT(NETDEV_VPORT_NAME_BUFSIZE >= IFNAMSIZ); ovs_assert(strlen(dpif_port) + 6 < IFNAMSIZ); snprintf(namebuf, bufsize, "%s_%d", dpif_port, - ntohs(vport->tnl_cfg.dst_port)); + ntohs(netdev_get_tunnel_config(netdev)->dst_port)); return namebuf; } else { return dpif_port; @@ -163,12 +168,14 @@ netdev_vport_route_changed(void) vports = netdev_get_vports(&n_vports); for (i = 0; i < n_vports; i++) { + const struct netdev_tunnel_config *tnl_cfg; struct netdev *netdev_ = vports[i]; struct netdev_vport *netdev = netdev_vport_cast(netdev_); ovs_mutex_lock(&netdev->mutex); /* Finds all tunnel vports. */ - if (ipv6_addr_is_set(&netdev->tnl_cfg.ipv6_dst)) { + tnl_cfg = netdev_get_tunnel_config(netdev_); + if (tnl_cfg && ipv6_addr_is_set(&tnl_cfg->ipv6_dst)) { if (tunnel_check_status_change__(netdev)) { netdev_change_seq_changed(netdev_); } @@ -199,6 +206,7 @@ netdev_vport_construct(struct netdev *netdev_) uint16_t port = 0; ovs_mutex_init(&dev->mutex); + atomic_count_init(&dev->gre_seqno, 0); eth_addr_random(&dev->etheraddr); if (name && dpif_port && (strlen(name) > strlen(dpif_port) + 1) && @@ -207,26 +215,31 @@ netdev_vport_construct(struct netdev *netdev_) port = atoi(p); } + struct netdev_tunnel_config *tnl_cfg = xzalloc(sizeof *tnl_cfg); + /* If a destination port for tunnel ports is specified in the netdev * name, use it instead of the default one. Otherwise, use the default * destination port */ if (!strcmp(type, "geneve")) { - dev->tnl_cfg.dst_port = port ? htons(port) : htons(GENEVE_DST_PORT); + tnl_cfg->dst_port = port ? htons(port) : htons(GENEVE_DST_PORT); } else if (!strcmp(type, "vxlan")) { - dev->tnl_cfg.dst_port = port ? htons(port) : htons(VXLAN_DST_PORT); - update_vxlan_global_cfg(netdev_, NULL, &dev->tnl_cfg); + tnl_cfg->dst_port = port ? 
htons(port) : htons(VXLAN_DST_PORT); + update_vxlan_global_cfg(netdev_, NULL, tnl_cfg); } else if (!strcmp(type, "lisp")) { - dev->tnl_cfg.dst_port = port ? htons(port) : htons(LISP_DST_PORT); + tnl_cfg->dst_port = port ? htons(port) : htons(LISP_DST_PORT); } else if (!strcmp(type, "stt")) { - dev->tnl_cfg.dst_port = port ? htons(port) : htons(STT_DST_PORT); + tnl_cfg->dst_port = port ? htons(port) : htons(STT_DST_PORT); } else if (!strcmp(type, "gtpu")) { - dev->tnl_cfg.dst_port = port ? htons(port) : htons(GTPU_DST_PORT); + tnl_cfg->dst_port = port ? htons(port) : htons(GTPU_DST_PORT); } else if (!strcmp(type, "bareudp")) { - dev->tnl_cfg.dst_port = htons(port); + tnl_cfg->dst_port = htons(port); } - dev->tnl_cfg.dont_fragment = true; - dev->tnl_cfg.ttl = DEFAULT_TTL; + tnl_cfg->dont_fragment = true; + tnl_cfg->ttl = DEFAULT_TTL; + + ovsrcu_set(&dev->tnl_cfg, tnl_cfg); + return 0; } @@ -234,12 +247,15 @@ static void netdev_vport_destruct(struct netdev *netdev_) { struct netdev_vport *netdev = netdev_vport_cast(netdev_); + const struct netdev_tunnel_config *tnl_cfg = vport_tunnel_config(netdev); const char *type = netdev_get_type(netdev_); if (!strcmp(type, "vxlan")) { - update_vxlan_global_cfg(netdev_, &netdev->tnl_cfg, NULL); + update_vxlan_global_cfg(netdev_, tnl_cfg, NULL); } + ovsrcu_set(&netdev->tnl_cfg, NULL); + ovsrcu_postpone(free, CONST_CAST(struct netdev_tunnel_config *, tnl_cfg)); free(netdev->peer); ovs_mutex_destroy(&netdev->mutex); } @@ -282,15 +298,16 @@ static bool tunnel_check_status_change__(struct netdev_vport *netdev) OVS_REQUIRES(netdev->mutex) { + const struct netdev_tunnel_config *tnl_cfg = vport_tunnel_config(netdev); + const struct in6_addr *route; char iface[IFNAMSIZ]; bool status = false; - struct in6_addr *route; struct in6_addr gw; uint32_t mark; iface[0] = '\0'; - route = &netdev->tnl_cfg.ipv6_dst; - mark = netdev->tnl_cfg.egress_pkt_mark; + route = &tnl_cfg->ipv6_dst; + mark = tnl_cfg->egress_pkt_mark; if (ovs_router_lookup(mark, 
route, iface, NULL, &gw)) { struct netdev *egress_netdev; @@ -425,6 +442,35 @@ parse_tunnel_ip(const char *value, bool accept_mcast, bool *flow, return 0; } +static int +parse_srv6_segs(char *s, struct in6_addr *segs, uint8_t *num_segs) +{ + char *save_ptr = NULL; + char *token; + + if (!s) { + return EINVAL; + } + + *num_segs = 0; + + while ((token = strtok_r(s, ",", &save_ptr)) != NULL) { + if (*num_segs == SRV6_MAX_SEGS) { + return EINVAL; + } + + if (inet_pton(AF_INET6, token, segs) != 1) { + return EINVAL; + } + + segs++; + (*num_segs)++; + s = NULL; + } + + return 0; +} + enum tunnel_layers { TNL_L2 = 1 << 0, /* 1 if a tunnel type can carry Ethernet traffic. */ TNL_L3 = 1 << 1 /* 1 if a tunnel type can carry L3 traffic. */ @@ -444,6 +490,8 @@ tunnel_supported_layers(const char *type, return TNL_L3; } else if (!strcmp(type, "bareudp")) { return TNL_L3; + } else if (!strcmp(type, "srv6")) { + return TNL_L3; } else { return TNL_L2; } @@ -466,8 +514,8 @@ vxlan_get_port_ext_gbp_str(uint16_t port, bool gbp, static void update_vxlan_global_cfg(struct netdev *netdev, - struct netdev_tunnel_config *old_cfg, - struct netdev_tunnel_config *new_cfg) + const struct netdev_tunnel_config *old_cfg, + const struct netdev_tunnel_config *new_cfg) { unsigned int count; char namebuf[20]; @@ -511,19 +559,20 @@ static bool is_concomitant_vxlan_tunnel_present(struct netdev_vport *dev, const struct netdev_tunnel_config *tnl_cfg) { - char namebuf[20]; - const char *type = netdev_get_type(&dev->up); + const struct netdev_tunnel_config *dev_tnl_cfg = vport_tunnel_config(dev); struct vport_class *vclass = vport_class_cast(netdev_get_class(&dev->up)); + const char *type = netdev_get_type(&dev->up); + char namebuf[20]; if (strcmp(type, "vxlan")) { return false; } - if (dev->tnl_cfg.dst_port == tnl_cfg->dst_port && - (dev->tnl_cfg.exts & (1 << OVS_VXLAN_EXT_GBP)) == + if (dev_tnl_cfg->dst_port == tnl_cfg->dst_port && + (dev_tnl_cfg->exts & (1 << OVS_VXLAN_EXT_GBP)) == (tnl_cfg->exts & (1 << 
OVS_VXLAN_EXT_GBP))) { - if (ntohs(dev->tnl_cfg.dst_port) == VXLAN_DST_PORT) { + if (ntohs(dev_tnl_cfg->dst_port) == VXLAN_DST_PORT) { /* Special case where we kept the default port/gbp, only ok if the opposite of the default does not exits */ vxlan_get_port_ext_gbp_str(ntohs(tnl_cfg->dst_port), @@ -539,9 +588,9 @@ is_concomitant_vxlan_tunnel_present(struct netdev_vport *dev, } /* Same port: ok if no one is left with the previous configuration */ - if (dev->tnl_cfg.dst_port == tnl_cfg->dst_port) { - vxlan_get_port_ext_gbp_str(ntohs(dev->tnl_cfg.dst_port), - dev->tnl_cfg.exts & + if (dev_tnl_cfg->dst_port == tnl_cfg->dst_port) { + vxlan_get_port_ext_gbp_str(ntohs(dev_tnl_cfg->dst_port), + dev_tnl_cfg->exts & (1 << OVS_VXLAN_EXT_GBP), namebuf, sizeof(namebuf)); @@ -569,6 +618,7 @@ static int set_tunnel_config(struct netdev *dev_, const struct smap *args, char **errp) { struct netdev_vport *dev = netdev_vport_cast(dev_); + const struct netdev_tunnel_config *curr_tnl_cfg; const char *name = netdev_get_name(dev_); const char *type = netdev_get_type(dev_); struct ds errors = DS_EMPTY_INITIALIZER; @@ -653,7 +703,9 @@ set_tunnel_config(struct netdev *dev_, const struct smap *args, char **errp) tnl_cfg.dst_port = htons(atoi(node->value)); } else if (!strcmp(node->key, "csum") && has_csum) { if (!strcmp(node->value, "true")) { - tnl_cfg.csum = true; + tnl_cfg.csum = NETDEV_TNL_CSUM_ENABLED; + } else if (!strcmp(node->value, "false")) { + tnl_cfg.csum = NETDEV_TNL_CSUM_DISABLED; } } else if (!strcmp(node->key, "seq") && has_seq) { if (!strcmp(node->value, "true")) { @@ -757,6 +809,25 @@ set_tunnel_config(struct netdev *dev_, const struct smap *args, char **errp) goto out; } } + } else if (!strcmp(node->key, "srv6_segs")) { + err = parse_srv6_segs(node->value, + tnl_cfg.srv6_segs, + &tnl_cfg.srv6_num_segs); + + switch (err) { + case EINVAL: + ds_put_format(&errors, "%s: bad %s 'srv6_segs'\n", + name, node->value); + break; + } + } else if (!strcmp(node->key, "srv6_flowlabel")) 
{ + if (!strcmp(node->value, "zero")) { + tnl_cfg.srv6_flowlabel = SRV6_FLOWLABEL_ZERO; + } else if (!strcmp(node->value, "compute")) { + tnl_cfg.srv6_flowlabel = SRV6_FLOWLABEL_COMPUTE; + } else { + tnl_cfg.srv6_flowlabel = SRV6_FLOWLABEL_COPY; + } } else if (!strcmp(node->key, "payload_type")) { if (!strcmp(node->value, "mpls")) { tnl_cfg.payload_ethertype = htons(ETH_TYPE_MPLS); @@ -788,6 +859,15 @@ set_tunnel_config(struct netdev *dev_, const struct smap *args, char **errp) } } + /* The default csum state for GRE is special as it does have an optional + * checksum but the default configuration isn't correlated with IP version + * like UDP tunnels are. Likewise, tunnels with no checksum at all must be + * in this state. */ + if (tnl_cfg.csum == NETDEV_TNL_CSUM_DEFAULT && + (!has_csum || strstr(type, "gre"))) { + tnl_cfg.csum = NETDEV_TNL_DEFAULT_NO_CSUM; + } + enum tunnel_layers layers = tunnel_supported_layers(type, &tnl_cfg); const char *full_type = (strcmp(type, "vxlan") ? type : (tnl_cfg.exts & (1 << OVS_VXLAN_EXT_GPE) @@ -865,11 +945,16 @@ set_tunnel_config(struct netdev *dev_, const struct smap *args, char **errp) err = EEXIST; goto out; } - update_vxlan_global_cfg(dev_, &dev->tnl_cfg, &tnl_cfg); ovs_mutex_lock(&dev->mutex); - if (memcmp(&dev->tnl_cfg, &tnl_cfg, sizeof tnl_cfg)) { - dev->tnl_cfg = tnl_cfg; + + curr_tnl_cfg = vport_tunnel_config(dev); + update_vxlan_global_cfg(dev_, curr_tnl_cfg, &tnl_cfg); + + if (memcmp(curr_tnl_cfg, &tnl_cfg, sizeof tnl_cfg)) { + ovsrcu_set(&dev->tnl_cfg, xmemdup(&tnl_cfg, sizeof tnl_cfg)); + ovsrcu_postpone(free, CONST_CAST(struct netdev_tunnel_config *, + curr_tnl_cfg)); tunnel_check_status_change__(dev); netdev_change_seq_changed(dev_); } @@ -894,61 +979,60 @@ set_tunnel_config(struct netdev *dev_, const struct smap *args, char **errp) static int get_tunnel_config(const struct netdev *dev, struct smap *args) { - struct netdev_vport *netdev = netdev_vport_cast(dev); + const struct netdev_tunnel_config *tnl_cfg = 
netdev_get_tunnel_config(dev); const char *type = netdev_get_type(dev); - struct netdev_tunnel_config tnl_cfg; - ovs_mutex_lock(&netdev->mutex); - tnl_cfg = netdev->tnl_cfg; - ovs_mutex_unlock(&netdev->mutex); + if (!tnl_cfg) { + return 0; + } - if (ipv6_addr_is_set(&tnl_cfg.ipv6_dst)) { - smap_add_ipv6(args, "remote_ip", &tnl_cfg.ipv6_dst); - } else if (tnl_cfg.ip_dst_flow) { + if (ipv6_addr_is_set(&tnl_cfg->ipv6_dst)) { + smap_add_ipv6(args, "remote_ip", &tnl_cfg->ipv6_dst); + } else if (tnl_cfg->ip_dst_flow) { smap_add(args, "remote_ip", "flow"); } - if (ipv6_addr_is_set(&tnl_cfg.ipv6_src)) { - smap_add_ipv6(args, "local_ip", &tnl_cfg.ipv6_src); - } else if (tnl_cfg.ip_src_flow) { + if (ipv6_addr_is_set(&tnl_cfg->ipv6_src)) { + smap_add_ipv6(args, "local_ip", &tnl_cfg->ipv6_src); + } else if (tnl_cfg->ip_src_flow) { smap_add(args, "local_ip", "flow"); } - if (tnl_cfg.in_key_flow && tnl_cfg.out_key_flow) { + if (tnl_cfg->in_key_flow && tnl_cfg->out_key_flow) { smap_add(args, "key", "flow"); - } else if (tnl_cfg.in_key_present && tnl_cfg.out_key_present - && tnl_cfg.in_key == tnl_cfg.out_key) { - smap_add_format(args, "key", "%"PRIu64, ntohll(tnl_cfg.in_key)); + } else if (tnl_cfg->in_key_present && tnl_cfg->out_key_present + && tnl_cfg->in_key == tnl_cfg->out_key) { + smap_add_format(args, "key", "%"PRIu64, ntohll(tnl_cfg->in_key)); } else { - if (tnl_cfg.in_key_flow) { + if (tnl_cfg->in_key_flow) { smap_add(args, "in_key", "flow"); - } else if (tnl_cfg.in_key_present) { + } else if (tnl_cfg->in_key_present) { smap_add_format(args, "in_key", "%"PRIu64, - ntohll(tnl_cfg.in_key)); + ntohll(tnl_cfg->in_key)); } - if (tnl_cfg.out_key_flow) { + if (tnl_cfg->out_key_flow) { smap_add(args, "out_key", "flow"); - } else if (tnl_cfg.out_key_present) { + } else if (tnl_cfg->out_key_present) { smap_add_format(args, "out_key", "%"PRIu64, - ntohll(tnl_cfg.out_key)); + ntohll(tnl_cfg->out_key)); } } - if (tnl_cfg.ttl_inherit) { + if (tnl_cfg->ttl_inherit) { smap_add(args, 
"ttl", "inherit"); - } else if (tnl_cfg.ttl != DEFAULT_TTL) { - smap_add_format(args, "ttl", "%"PRIu8, tnl_cfg.ttl); + } else if (tnl_cfg->ttl != DEFAULT_TTL) { + smap_add_format(args, "ttl", "%"PRIu8, tnl_cfg->ttl); } - if (tnl_cfg.tos_inherit) { + if (tnl_cfg->tos_inherit) { smap_add(args, "tos", "inherit"); - } else if (tnl_cfg.tos) { - smap_add_format(args, "tos", "0x%x", tnl_cfg.tos); + } else if (tnl_cfg->tos) { + smap_add_format(args, "tos", "0x%x", tnl_cfg->tos); } - if (tnl_cfg.dst_port) { - uint16_t dst_port = ntohs(tnl_cfg.dst_port); + if (tnl_cfg->dst_port) { + uint16_t dst_port = ntohs(tnl_cfg->dst_port); if ((!strcmp("geneve", type) && dst_port != GENEVE_DST_PORT) || (!strcmp("vxlan", type) && dst_port != VXLAN_DST_PORT) || @@ -960,33 +1044,35 @@ get_tunnel_config(const struct netdev *dev, struct smap *args) } } - if (tnl_cfg.csum) { + if (tnl_cfg->csum == NETDEV_TNL_CSUM_ENABLED) { smap_add(args, "csum", "true"); + } else if (tnl_cfg->csum == NETDEV_TNL_CSUM_DISABLED) { + smap_add(args, "csum", "false"); } - if (tnl_cfg.set_seq) { + if (tnl_cfg->set_seq) { smap_add(args, "seq", "true"); } - enum tunnel_layers layers = tunnel_supported_layers(type, &tnl_cfg); - if (tnl_cfg.pt_mode != default_pt_mode(layers)) { + enum tunnel_layers layers = tunnel_supported_layers(type, tnl_cfg); + if (tnl_cfg->pt_mode != default_pt_mode(layers)) { smap_add(args, "packet_type", - tnl_cfg.pt_mode == NETDEV_PT_LEGACY_L2 ? "legacy_l2" - : tnl_cfg.pt_mode == NETDEV_PT_LEGACY_L3 ? "legacy_l3" + tnl_cfg->pt_mode == NETDEV_PT_LEGACY_L2 ? "legacy_l2" + : tnl_cfg->pt_mode == NETDEV_PT_LEGACY_L3 ? 
"legacy_l3" : "ptap"); } - if (!tnl_cfg.dont_fragment) { + if (!tnl_cfg->dont_fragment) { smap_add(args, "df_default", "false"); } - if (tnl_cfg.set_egress_pkt_mark) { + if (tnl_cfg->set_egress_pkt_mark) { smap_add_format(args, "egress_pkt_mark", - "%"PRIu32, tnl_cfg.egress_pkt_mark); + "%"PRIu32, tnl_cfg->egress_pkt_mark); } if (!strcmp("erspan", type) || !strcmp("ip6erspan", type)) { - if (tnl_cfg.erspan_ver_flow) { + if (tnl_cfg->erspan_ver_flow) { /* since version number is not determined, * assume print all other as flow */ @@ -995,27 +1081,27 @@ get_tunnel_config(const struct netdev *dev, struct smap *args) smap_add(args, "erspan_dir", "flow"); smap_add(args, "erspan_hwid", "flow"); } else { - smap_add_format(args, "erspan_ver", "%d", tnl_cfg.erspan_ver); + smap_add_format(args, "erspan_ver", "%d", tnl_cfg->erspan_ver); - if (tnl_cfg.erspan_ver == 1) { - if (tnl_cfg.erspan_idx_flow) { + if (tnl_cfg->erspan_ver == 1) { + if (tnl_cfg->erspan_idx_flow) { smap_add(args, "erspan_idx", "flow"); } else { smap_add_format(args, "erspan_idx", "0x%x", - tnl_cfg.erspan_idx); + tnl_cfg->erspan_idx); } - } else if (tnl_cfg.erspan_ver == 2) { - if (tnl_cfg.erspan_dir_flow) { + } else if (tnl_cfg->erspan_ver == 2) { + if (tnl_cfg->erspan_dir_flow) { smap_add(args, "erspan_dir", "flow"); } else { smap_add_format(args, "erspan_dir", "%d", - tnl_cfg.erspan_dir); + tnl_cfg->erspan_dir); } - if (tnl_cfg.erspan_hwid_flow) { + if (tnl_cfg->erspan_hwid_flow) { smap_add(args, "erspan_hwid", "flow"); } else { smap_add_format(args, "erspan_hwid", "0x%x", - tnl_cfg.erspan_hwid); + tnl_cfg->erspan_hwid); } } } @@ -1145,9 +1231,11 @@ netdev_vport_get_stats(const struct netdev *netdev, struct netdev_stats *stats) static enum netdev_pt_mode netdev_vport_get_pt_mode(const struct netdev *netdev) { - struct netdev_vport *dev = netdev_vport_cast(netdev); + const struct netdev_tunnel_config *tnl_cfg; - return dev->tnl_cfg.pt_mode; + tnl_cfg = netdev_get_tunnel_config(netdev); + + return tnl_cfg 
? tnl_cfg->pt_mode : NETDEV_PT_UNKNOWN; } @@ -1297,6 +1385,17 @@ netdev_vport_tunnel_register(void) }, {{NULL, NULL, 0, 0}} }, + { "srv6_sys", + { + TUNNEL_FUNCTIONS_COMMON, + .type = "srv6", + .build_header = netdev_srv6_build_header, + .push_header = netdev_srv6_push_header, + .pop_header = netdev_srv6_pop_header, + .get_ifindex = NETDEV_VPORT_GET_IFINDEX, + }, + {{NULL, NULL, 0, 0}} + }, }; static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; diff --git a/lib/netdev-windows.c b/lib/netdev-windows.c index 4ad45ffa1b2..3fad501e3ee 100644 --- a/lib/netdev-windows.c +++ b/lib/netdev-windows.c @@ -156,6 +156,7 @@ netdev_windows_system_construct(struct netdev *netdev_) struct netdev_windows_netdev_info info; struct ofpbuf *buf; int ret; + const char *type = NULL; /* Query the attributes and runtime status of the netdev. */ ret = query_netdev(netdev_get_name(&netdev->up), &info, &buf); @@ -167,6 +168,16 @@ netdev_windows_system_construct(struct netdev *netdev_) } ofpbuf_delete(buf); + /* Don't create netdev if ovs-type is "internal" + * but the type of netdev->up is "system". 
*/ + type = netdev_get_type(&netdev->up); + if (type && !strcmp(type, "system") && + (info.ovs_type == OVS_VPORT_TYPE_INTERNAL)) { + VLOG_DBG("construct device %s, ovs_type: %u failed", + netdev_get_name(&netdev->up), info.ovs_type); + return 1; + } + netdev->change_seq = 1; netdev->dev_type = info.ovs_type; netdev->port_no = info.port_no; diff --git a/lib/netdev.c b/lib/netdev.c index ce0d4117ac0..f2d921ed633 100644 --- a/lib/netdev.c +++ b/lib/netdev.c @@ -35,6 +35,7 @@ #include "coverage.h" #include "dpif.h" #include "dp-packet.h" +#include "dp-packet-gso.h" #include "openvswitch/dynamic-string.h" #include "fatal-signal.h" #include "hash.h" @@ -43,6 +44,7 @@ #include "netdev-provider.h" #include "netdev-vport.h" #include "odp-netlink.h" +#include "openvswitch/json.h" #include "openflow/openflow.h" #include "packets.h" #include "openvswitch/ofp-print.h" @@ -55,6 +57,7 @@ #include "svec.h" #include "openvswitch/vlog.h" #include "flow.h" +#include "userspace-tso.h" #include "util.h" #ifdef __linux__ #include "tc.h" @@ -66,8 +69,11 @@ COVERAGE_DEFINE(netdev_received); COVERAGE_DEFINE(netdev_sent); COVERAGE_DEFINE(netdev_add_router); COVERAGE_DEFINE(netdev_get_stats); -COVERAGE_DEFINE(netdev_send_prepare_drops); +COVERAGE_DEFINE(netdev_vxlan_tso_drops); +COVERAGE_DEFINE(netdev_geneve_tso_drops); COVERAGE_DEFINE(netdev_push_header_drops); +COVERAGE_DEFINE(netdev_soft_seg_good); +COVERAGE_DEFINE(netdev_soft_seg_drops); struct netdev_saved_flags { struct netdev *netdev; @@ -431,6 +437,7 @@ netdev_open(const char *name, const char *type, struct netdev **netdevp) seq_read(netdev->reconfigure_seq); ovsrcu_set(&netdev->flow_api, NULL); netdev->hw_info.oor = false; + atomic_init(&netdev->hw_info.miss_api_supported, false); netdev->node = shash_add(&netdev_shash, name, netdev); /* By default enable one tx and rx queue per netdev. 
*/ @@ -790,74 +797,84 @@ netdev_get_pt_mode(const struct netdev *netdev) : NETDEV_PT_LEGACY_L2); } -/* Check if a 'packet' is compatible with 'netdev_flags'. - * If a packet is incompatible, return 'false' with the 'errormsg' - * pointing to a reason. */ -static bool -netdev_send_prepare_packet(const uint64_t netdev_flags, - struct dp_packet *packet, char **errormsg) +/* Attempts to segment GSO flagged packets and send them as multiple bundles. + * This function is only used if at least one packet in the current batch is + * flagged for TSO and the netdev does not support this. + * + * The return value is 0 if all batches sent successfully, and an error code + * from netdev_class->send() if at least one batch failed to send. */ +static int +netdev_send_tso(struct netdev *netdev, int qid, + struct dp_packet_batch *batch, bool concurrent_txq) { - uint64_t l4_mask; + struct dp_packet_batch *batches; + struct dp_packet *packet; + int retval = 0; + int n_packets; + int n_batches; + int error; - if (dp_packet_hwol_is_tso(packet) - && !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) { - /* Fall back to GSO in software. */ - VLOG_ERR_BUF(errormsg, "No TSO support"); - return false; + /* Calculate the total number of packets in the batch after + * the segmentation. */ + n_packets = 0; + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { + if (dp_packet_hwol_is_tso(packet)) { + n_packets += dp_packet_gso_nr_segs(packet); + } else { + n_packets++; + } } - l4_mask = dp_packet_hwol_l4_mask(packet); - if (l4_mask) { - if (dp_packet_hwol_l4_is_tcp(packet)) { - if (!(netdev_flags & NETDEV_TX_OFFLOAD_TCP_CKSUM)) { - /* Fall back to TCP csum in software. */ - VLOG_ERR_BUF(errormsg, "No TCP checksum support"); - return false; - } - } else if (dp_packet_hwol_l4_is_udp(packet)) { - if (!(netdev_flags & NETDEV_TX_OFFLOAD_UDP_CKSUM)) { - /* Fall back to UDP csum in software. 
*/ - VLOG_ERR_BUF(errormsg, "No UDP checksum support"); - return false; - } - } else if (dp_packet_hwol_l4_is_sctp(packet)) { - if (!(netdev_flags & NETDEV_TX_OFFLOAD_SCTP_CKSUM)) { - /* Fall back to SCTP csum in software. */ - VLOG_ERR_BUF(errormsg, "No SCTP checksum support"); - return false; + if (!n_packets) { + return 0; + } + + /* Allocate enough batches to store all the packets in order. */ + n_batches = DIV_ROUND_UP(n_packets, NETDEV_MAX_BURST); + batches = xmalloc(n_batches * sizeof *batches); + + struct dp_packet_batch *curr_batch = batches; + struct dp_packet_batch *last_batch = &batches[n_batches - 1]; + for (curr_batch = batches; curr_batch <= last_batch; curr_batch++) { + dp_packet_batch_init(curr_batch); + } + + /* Do the packet segmentation if TSO is flagged. */ + size_t size = dp_packet_batch_size(batch); + size_t k; + curr_batch = batches; + DP_PACKET_BATCH_REFILL_FOR_EACH (k, size, packet, batch) { + if (dp_packet_hwol_is_tso(packet)) { + if (dp_packet_gso(packet, &curr_batch)) { + COVERAGE_INC(netdev_soft_seg_good); + } else { + COVERAGE_INC(netdev_soft_seg_drops); } + dp_packet_delete(packet); } else { - VLOG_ERR_BUF(errormsg, "No L4 checksum support: mask: %"PRIu64, - l4_mask); - return false; + if (dp_packet_batch_is_full(curr_batch)) { + curr_batch++; + } + + dp_packet_batch_add(curr_batch, packet); } } - return true; -} - -/* Check if each packet in 'batch' is compatible with 'netdev' features, - * otherwise either fall back to software implementation or drop it. 
*/ -static void -netdev_send_prepare_batch(const struct netdev *netdev, - struct dp_packet_batch *batch) -{ - struct dp_packet *packet; - size_t i, size = dp_packet_batch_size(batch); - - DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, batch) { - char *errormsg = NULL; + for (curr_batch = batches; curr_batch <= last_batch; curr_batch++) { + DP_PACKET_BATCH_FOR_EACH (i, packet, curr_batch) { + dp_packet_ol_send_prepare(packet, netdev->ol_flags); + } - if (netdev_send_prepare_packet(netdev->ol_flags, packet, &errormsg)) { - dp_packet_batch_refill(batch, packet, i); + error = netdev->netdev_class->send(netdev, qid, curr_batch, + concurrent_txq); + if (!error) { + COVERAGE_INC(netdev_sent); } else { - dp_packet_delete(packet); - COVERAGE_INC(netdev_send_prepare_drops); - VLOG_WARN_RL(&rl, "%s: Packet dropped: %s", - netdev_get_name(netdev), errormsg); - free(errormsg); + retval = error; } } + free(batches); + return retval; } /* Sends 'batch' on 'netdev'. Returns 0 if successful (for every packet), @@ -889,11 +906,38 @@ int netdev_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch, bool concurrent_txq) { + const uint64_t netdev_flags = netdev->ol_flags; + struct dp_packet *packet; int error; - netdev_send_prepare_batch(netdev, batch); - if (OVS_UNLIKELY(dp_packet_batch_is_empty(batch))) { - return 0; + if (userspace_tso_enabled() && + !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) { + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { + if (dp_packet_hwol_is_tso(packet)) { + if (dp_packet_hwol_is_tunnel_vxlan(packet) + && !(netdev_flags & NETDEV_TX_VXLAN_TNL_TSO)) { + VLOG_WARN_RL(&rl, "%s: No VXLAN TSO support", + netdev_get_name(netdev)); + COVERAGE_INC(netdev_vxlan_tso_drops); + dp_packet_delete_batch(batch, true); + return false; + } + + if (dp_packet_hwol_is_tunnel_geneve(packet) + && !(netdev_flags & NETDEV_TX_GENEVE_TNL_TSO)) { + VLOG_WARN_RL(&rl, "%s: No GENEVE TSO support", + netdev_get_name(netdev)); + COVERAGE_INC(netdev_geneve_tso_drops); + 
dp_packet_delete_batch(batch, true); + return false; + } + return netdev_send_tso(netdev, qid, batch, concurrent_txq); + } + } + } + + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { + dp_packet_ol_send_prepare(packet, netdev_flags); } error = netdev->netdev_class->send(netdev, qid, batch, concurrent_txq); @@ -965,15 +1009,33 @@ netdev_push_header(const struct netdev *netdev, size_t i, size = dp_packet_batch_size(batch); DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, batch) { - if (OVS_UNLIKELY(dp_packet_hwol_is_tso(packet) - || dp_packet_hwol_l4_mask(packet))) { + if (OVS_UNLIKELY(data->tnl_type != OVS_VPORT_TYPE_GENEVE && + data->tnl_type != OVS_VPORT_TYPE_VXLAN && + dp_packet_hwol_is_tso(packet))) { COVERAGE_INC(netdev_push_header_drops); dp_packet_delete(packet); - VLOG_WARN_RL(&rl, "%s: Tunneling packets with HW offload flags is " - "not supported: packet dropped", - netdev_get_name(netdev)); + VLOG_WARN_RL(&rl, "%s: Tunneling packets with TSO is not " + "supported for %s tunnels: packet dropped", + netdev_get_name(netdev), netdev_get_type(netdev)); } else { + if (data->tnl_type != OVS_VPORT_TYPE_GENEVE && + data->tnl_type != OVS_VPORT_TYPE_VXLAN) { + dp_packet_ol_send_prepare(packet, 0); + } else if (dp_packet_hwol_is_tunnel_geneve(packet) || + dp_packet_hwol_is_tunnel_vxlan(packet)) { + if (dp_packet_hwol_is_tso(packet)) { + COVERAGE_INC(netdev_push_header_drops); + dp_packet_delete(packet); + VLOG_WARN_RL(&rl, "%s: Tunneling packets with TSO is not " + "supported with multiple levels of " + "VXLAN or GENEVE encapsulation.", + netdev_get_name(netdev)); + continue; + } + dp_packet_ol_send_prepare(packet, 0); + } netdev->netdev_class->push_header(netdev, packet, data); + pkt_metadata_init(&packet->md, data->out_port); dp_packet_batch_refill(batch, packet, i); } @@ -1166,6 +1228,36 @@ netdev_get_features(const struct netdev *netdev, return error; } +int +netdev_get_speed(const struct netdev *netdev, uint32_t *current, uint32_t *max) +{ + uint32_t 
current_dummy, max_dummy; + int error; + + if (!current) { + current = &current_dummy; + } + if (!max) { + max = &max_dummy; + } + + error = netdev->netdev_class->get_speed + ? netdev->netdev_class->get_speed(netdev, current, max) + : EOPNOTSUPP; + + if (error == EOPNOTSUPP) { + enum netdev_features current_f, supported_f; + + error = netdev_get_features(netdev, &current_f, NULL, + &supported_f, NULL); + *current = netdev_features_to_bps(current_f, 0) / 1000000; + *max = netdev_features_to_bps(supported_f, 0) / 1000000; + } else if (error) { + *current = *max = 0; + } + return error; +} + /* Returns the maximum speed of a network connection that has the NETDEV_F_* * bits in 'features', in bits per second. If no bits that indicate a speed * are set in 'features', returns 'default_bps'. */ @@ -1372,9 +1464,35 @@ netdev_get_next_hop(const struct netdev *netdev, int netdev_get_status(const struct netdev *netdev, struct smap *smap) { - return (netdev->netdev_class->get_status - ? netdev->netdev_class->get_status(netdev, smap) - : EOPNOTSUPP); + int err = EOPNOTSUPP; + + /* Set offload status only if relevant. */ + if (netdev_get_dpif_type(netdev) && + strcmp(netdev_get_dpif_type(netdev), "system")) { + +#define OL_ADD_STAT(name, bit) \ + smap_add(smap, "tx_" name "_offload", \ + netdev->ol_flags & bit ? 
"true" : "false"); + + OL_ADD_STAT("ip_csum", NETDEV_TX_OFFLOAD_IPV4_CKSUM); + OL_ADD_STAT("tcp_csum", NETDEV_TX_OFFLOAD_TCP_CKSUM); + OL_ADD_STAT("udp_csum", NETDEV_TX_OFFLOAD_UDP_CKSUM); + OL_ADD_STAT("sctp_csum", NETDEV_TX_OFFLOAD_SCTP_CKSUM); + OL_ADD_STAT("tcp_seg", NETDEV_TX_OFFLOAD_TCP_TSO); + OL_ADD_STAT("vxlan_tso", NETDEV_TX_VXLAN_TNL_TSO); + OL_ADD_STAT("geneve_tso", NETDEV_TX_GENEVE_TNL_TSO); + OL_ADD_STAT("out_ip_csum", NETDEV_TX_OFFLOAD_OUTER_IP_CKSUM); + OL_ADD_STAT("out_udp_csum", NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM); +#undef OL_ADD_STAT + + err = 0; + } + + if (!netdev->netdev_class->get_status) { + return err; + } + + return netdev->netdev_class->get_status(netdev, smap); } /* Returns all assigned IP address to 'netdev' and returns 0. diff --git a/lib/netdev.h b/lib/netdev.h index 59f067dd710..2f2ec1e18b0 100644 --- a/lib/netdev.h +++ b/lib/netdev.h @@ -73,6 +73,9 @@ struct sset; struct ovs_action_push_tnl; enum netdev_pt_mode { + /* Unknown mode. The netdev is not configured yet. */ + NETDEV_PT_UNKNOWN = 0, + /* The netdev is packet type aware. It can potentially carry any kind of * packet. This "modern" mode is appropriate for both netdevs that handle * only a single kind of packet (such as a virtual or physical Ethernet @@ -98,6 +101,33 @@ enum netdev_pt_mode { NETDEV_PT_LEGACY_L3, }; +enum netdev_srv6_flowlabel { + /* Copy the flowlabel of inner packet. */ + SRV6_FLOWLABEL_COPY, + + /* Simply set flowlabel to 0. */ + SRV6_FLOWLABEL_ZERO, + + /* Set flowlabel to a hash over L3/L4 fields of the inner packet. */ + SRV6_FLOWLABEL_COMPUTE, +}; + +enum netdev_tnl_csum { + /* Default value for UDP tunnels if no configurations is present. Enforce + * checksum calculation in IPv6 tunnels, disable in IPv4 tunnels. */ + NETDEV_TNL_CSUM_DEFAULT = 0, + + /* Checksum explicitly to be calculated. */ + NETDEV_TNL_CSUM_ENABLED, + + /* Checksum calculation explicitly disabled. 
*/ + NETDEV_TNL_CSUM_DISABLED, + + /* A value for when there is no checksum or the default value is no + * checksum regardless of IP version. */ + NETDEV_TNL_DEFAULT_NO_CSUM, +}; + /* Configuration specific to tunnels. */ struct netdev_tunnel_config { ovs_be64 in_key; @@ -126,12 +156,11 @@ struct netdev_tunnel_config { uint8_t tos; bool tos_inherit; - bool csum; + enum netdev_tnl_csum csum; bool dont_fragment; enum netdev_pt_mode pt_mode; bool set_seq; - uint32_t seqno; uint32_t erspan_idx; uint8_t erspan_ver; uint8_t erspan_dir; @@ -142,6 +171,11 @@ struct netdev_tunnel_config { bool erspan_dir_flow; bool erspan_hwid_flow; + uint8_t srv6_num_segs; + #define SRV6_MAX_SEGS 6 + struct in6_addr srv6_segs[SRV6_MAX_SEGS]; + enum netdev_srv6_flowlabel srv6_flowlabel; + #if defined(P4OVS) uint32_t vni; #endif diff --git a/lib/netlink-conntrack.c b/lib/netlink-conntrack.c index 4fcde9ba1e3..0b3a8adf590 100644 --- a/lib/netlink-conntrack.c +++ b/lib/netlink-conntrack.c @@ -141,6 +141,9 @@ nl_ct_dump_start(struct nl_ct_dump_state **statep, const uint16_t *zone, nl_msg_put_nfgenmsg(&state->buf, 0, AF_UNSPEC, NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_GET, NLM_F_REQUEST); + if (zone) { + nl_msg_put_be16(&state->buf, CTA_ZONE, htons(*zone)); + } nl_dump_start(&state->dump, NETLINK_NETFILTER, &state->buf); ofpbuf_clear(&state->buf); @@ -263,11 +266,9 @@ nl_ct_flush_tuple(const struct ct_dpif_tuple *tuple, uint16_t zone) return err; } -#ifdef _WIN32 -int -nl_ct_flush_zone(uint16_t flush_zone) +static int +nl_ct_flush_zone_with_cta_zone(uint16_t flush_zone) { - /* Windows can flush a specific zone */ struct ofpbuf buf; int err; @@ -282,24 +283,63 @@ nl_ct_flush_zone(uint16_t flush_zone) return err; } + +#ifdef _WIN32 +int +nl_ct_flush_zone(uint16_t flush_zone) +{ + return nl_ct_flush_zone_with_cta_zone(flush_zone); +} #else + +static bool +netlink_flush_supports_zone(void) +{ + static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; + static bool supported = false; + + if 
(ovsthread_once_start(&once)) { + if (ovs_kernel_is_version_or_newer(6, 8)) { + supported = true; + } else { + VLOG_INFO("Disabling conntrack flush by zone. " + "Not supported in Linux kernel."); + } + ovsthread_once_done(&once); + } + return supported; +} + int nl_ct_flush_zone(uint16_t flush_zone) { - /* Apparently, there's no netlink interface to flush a specific zone. + /* In older kernels, there was no netlink interface to flush a specific + * conntrack zone. * This code dumps every connection, checks the zone and eventually * delete the entry. + * In newer kernels there is the option to specify a zone for filtering + * during dumps. Older kernels ignore this option. We set it here in the + * hope we only get relevant entries back, but fall back to filtering here + * to keep compatibility. + * + * This is race-prone, but it is better than using shell scripts. * - * This is race-prone, but it is better than using shell scripts. */ + * Additionally newer kernels also support flushing a zone without listing + * it first. 
*/ struct nl_dump dump; struct ofpbuf buf, reply, delete; + if (netlink_flush_supports_zone()) { + return nl_ct_flush_zone_with_cta_zone(flush_zone); + } + ofpbuf_init(&buf, NL_DUMP_BUFSIZE); ofpbuf_init(&delete, NL_DUMP_BUFSIZE); nl_msg_put_nfgenmsg(&buf, 0, AF_UNSPEC, NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_GET, NLM_F_REQUEST); + nl_msg_put_be16(&buf, CTA_ZONE, htons(flush_zone)); nl_dump_start(&dump, NETLINK_NETFILTER, &buf); ofpbuf_clear(&buf); @@ -579,7 +619,8 @@ nl_ct_put_tuple_proto(struct ofpbuf *buf, const struct ct_dpif_tuple *tuple) nl_msg_put_u8(buf, CTA_PROTO_ICMPV6_TYPE, tuple->icmp_type); nl_msg_put_u8(buf, CTA_PROTO_ICMPV6_CODE, tuple->icmp_code); } else if (tuple->ip_proto == IPPROTO_TCP || - tuple->ip_proto == IPPROTO_UDP) { + tuple->ip_proto == IPPROTO_UDP || + tuple->ip_proto == IPPROTO_SCTP) { nl_msg_put_be16(buf, CTA_PROTO_SRC_PORT, tuple->src_port); nl_msg_put_be16(buf, CTA_PROTO_DST_PORT, tuple->dst_port); } else { diff --git a/lib/netlink-notifier.c b/lib/netlink-notifier.c index dfecb97789f..7ea5a418182 100644 --- a/lib/netlink-notifier.c +++ b/lib/netlink-notifier.c @@ -223,7 +223,7 @@ nln_wait(struct nln *nln) } } -void +void OVS_NO_SANITIZE_FUNCTION nln_report(const struct nln *nln, void *change, int group) { struct nln_notifier *notifier; diff --git a/lib/netlink-protocol.h b/lib/netlink-protocol.h index 6eaa7035a4b..e4bb28ac9f6 100644 --- a/lib/netlink-protocol.h +++ b/lib/netlink-protocol.h @@ -155,6 +155,11 @@ enum { #define NLA_TYPE_MASK ~(NLA_F_NESTED | NLA_F_NET_BYTEORDER) #endif +/* Introduced in v4.4. */ +#ifndef NLM_F_DUMP_FILTERED +#define NLM_F_DUMP_FILTERED 0x20 +#endif + /* These were introduced all together in 2.6.14. (We want our programs to * support the newer kernel features even if compiled with older headers.) */ #ifndef NETLINK_ADD_MEMBERSHIP @@ -168,6 +173,11 @@ enum { #define NETLINK_LISTEN_ALL_NSID 8 #endif +/* Strict checking of netlink arguments introduced in Linux kernel v4.20. 
*/ +#ifndef NETLINK_GET_STRICT_CHK +#define NETLINK_GET_STRICT_CHK 12 +#endif + /* These were introduced all together in 2.6.23. (We want our programs to * support the newer kernel features even if compiled with older headers.) */ #ifndef CTRL_ATTR_MCAST_GRP_MAX diff --git a/lib/netlink-socket.c b/lib/netlink-socket.c index 80da20d9f05..5cb1fc89aed 100644 --- a/lib/netlink-socket.c +++ b/lib/netlink-socket.c @@ -205,6 +205,15 @@ nl_sock_create(int protocol, struct nl_sock **sockp) } } + /* Strict checking only supported for NETLINK_ROUTE. */ + if (protocol == NETLINK_ROUTE + && setsockopt(sock->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, + &one, sizeof one) < 0) { + VLOG_RL(&rl, errno == ENOPROTOOPT ? VLL_DBG : VLL_WARN, + "netlink: could not enable strict checking (%s)", + ovs_strerror(errno)); + } + retval = get_socket_rcvbuf(sock->fd); if (retval < 0) { retval = -retval; diff --git a/lib/netlink.c b/lib/netlink.c index 6215282d6fb..1e8d5a8ec57 100644 --- a/lib/netlink.c +++ b/lib/netlink.c @@ -523,6 +523,15 @@ nl_msg_start_nested(struct ofpbuf *msg, uint16_t type) return offset; } +/* Adds the header for nested Netlink attributes to 'msg', with the specified + * 'type', and returns the header's offset within 'msg'. It's similar to + * nl_msg_start_nested() and uses NLA_F_NESTED flag mandatorily. */ +size_t +nl_msg_start_nested_with_flag(struct ofpbuf *msg, uint16_t type) +{ + return nl_msg_start_nested(msg, type | NLA_F_NESTED); +} + /* Finalizes a nested Netlink attribute in 'msg'. 'offset' should be the value * returned by nl_msg_start_nested(). 
*/ void diff --git a/lib/netlink.h b/lib/netlink.h index e9050c31bac..008604aa60d 100644 --- a/lib/netlink.h +++ b/lib/netlink.h @@ -81,6 +81,7 @@ void nl_msg_put_string__(struct ofpbuf *, uint16_t type, const char *value, void nl_msg_put_string(struct ofpbuf *, uint16_t type, const char *value); size_t nl_msg_start_nested(struct ofpbuf *, uint16_t type); +size_t nl_msg_start_nested_with_flag(struct ofpbuf *, uint16_t type); void nl_msg_end_nested(struct ofpbuf *, size_t offset); void nl_msg_cancel_nested(struct ofpbuf *, size_t offset); bool nl_msg_end_non_empty_nested(struct ofpbuf *, size_t offset); diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c index 6c77132516a..09eb685cbac 100644 --- a/lib/odp-execute-avx512.c +++ b/lib/odp-execute-avx512.c @@ -20,6 +20,9 @@ #include #include +#include +#include +#include #include "csum.h" #include "dp-packet.h" @@ -28,13 +31,14 @@ #include "odp-execute-private.h" #include "odp-netlink.h" #include "openvswitch/vlog.h" +#include "packets.h" VLOG_DEFINE_THIS_MODULE(odp_execute_avx512); -/* The below three build asserts make sure that l2_5_ofs, l3_ofs, and l4_ofs - * fields remain in the same order and offset to l2_padd_size. This is needed - * as the avx512_dp_packet_resize_l2() function will manipulate those fields at - * a fixed memory index based on the l2_padd_size offset. */ +/* The below build asserts make sure that the below fields remain in the same + * order and offset to l2_pad_size. This is needed as the + * avx512_dp_packet_resize_l2() function will manipulate those fields at a + * fixed memory index based on the l2_pad_size offset. 
*/ BUILD_ASSERT_DECL(offsetof(struct dp_packet, l2_pad_size) + MEMBER_SIZEOF(struct dp_packet, l2_pad_size) == offsetof(struct dp_packet, l2_5_ofs)); @@ -47,6 +51,14 @@ BUILD_ASSERT_DECL(offsetof(struct dp_packet, l3_ofs) + MEMBER_SIZEOF(struct dp_packet, l3_ofs) == offsetof(struct dp_packet, l4_ofs)); +BUILD_ASSERT_DECL(offsetof(struct dp_packet, l4_ofs) + + MEMBER_SIZEOF(struct dp_packet, l4_ofs) == + offsetof(struct dp_packet, inner_l3_ofs)); + +BUILD_ASSERT_DECL(offsetof(struct dp_packet, inner_l3_ofs) + + MEMBER_SIZEOF(struct dp_packet, inner_l3_ofs) == + offsetof(struct dp_packet, inner_l4_ofs)); + /* The below build assert makes sure it's safe to read/write 128-bits starting * at the l2_pad_size location. */ BUILD_ASSERT_DECL(sizeof(struct dp_packet) - @@ -75,6 +87,26 @@ BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_tos) + MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_tos) == offsetof(struct ovs_key_ipv4, ipv4_ttl)); +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_src) + + MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_src) == + offsetof(struct ovs_key_ipv6, ipv6_dst)); + +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_dst) + + MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_dst) == + offsetof(struct ovs_key_ipv6, ipv6_label)); + +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_label) + + MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_label) == + offsetof(struct ovs_key_ipv6, ipv6_proto)); + +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_proto) + + MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_proto) == + offsetof(struct ovs_key_ipv6, ipv6_tclass)); + +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_tclass) + + MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_tclass) == + offsetof(struct ovs_key_ipv6, ipv6_hlimit)); + /* Array of callback functions, one for each masked operation. 
*/ odp_execute_action_cb impl_set_masked_funcs[__OVS_KEY_ATTR_MAX]; @@ -88,7 +120,7 @@ avx512_dp_packet_resize_l2(struct dp_packet *b, int resize_by_bytes) dp_packet_pull(b, -resize_by_bytes); } - /* The next step is to update the l2_5_ofs, l3_ofs and l4_ofs fields which + /* The next step is to update the l2_5_ofs to inner_l4_ofs fields which * the scalar implementation does with the dp_packet_adjust_layer_offset() * function. */ @@ -98,13 +130,14 @@ avx512_dp_packet_resize_l2(struct dp_packet *b, int resize_by_bytes) /* Set the v_u16_max register to all one's. */ const __m128i v_u16_max = _mm_cmpeq_epi16(v_zeros, v_zeros); - /* Each lane represents 16 bits in a 12-bit register. In this case the - * first three 16-bit values, which will map to the l2_5_ofs, l3_ofs and - * l4_ofs fields. */ - const uint8_t k_lanes = 0b1110; + /* Each lane represents 16 bits in a 128-bit register. Here the bitmask + * starts at l2_5_ofs with a value of 0 indicating it is not modified. Then + * five 1's to indicate modification of all fields from l2_5_ofs to + * inner_l4_ofs. */ + const uint8_t k_lanes = 0b111110; /* Set all 16-bit words in the 128-bits v_offset register to the value we - * need to add/substract from the l2_5_ofs, l3_ofs, and l4_ofs fields. */ + * need to add/subtract from the l2_5_ofs to inner_l4_ofs fields. */ __m128i v_offset = _mm_set1_epi16(abs(resize_by_bytes)); /* Load 128 bits from the dp_packet structure starting at the l2_pad_size @@ -123,7 +156,7 @@ avx512_dp_packet_resize_l2(struct dp_packet *b, int resize_by_bytes) /* Based on the bytes adjust (positive, or negative) it will do the actual * add or subtraction.
These functions will only operate on the lanes * (fields) requested based on k_cmp, i.e: - * k_cmp = [l2_5_ofs, l3_ofs, l4_ofs] + * k_cmp = [l2_5_ofs, ..., inner_l4_ofs] * for field in kcmp * v_adjust_src[field] = v_adjust_src[field] + v_offset */ @@ -333,6 +366,8 @@ avx512_get_delta(__m256i old_header, __m256i new_header) 0xF, 0xF, 0xF, 0xF); v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta); + v_delta = _mm256_hadd_epi32(v_delta, v_zeros); + v_delta = _mm256_shuffle_epi8(v_delta, v_swap16a); v_delta = _mm256_hadd_epi32(v_delta, v_zeros); v_delta = _mm256_hadd_epi16(v_delta, v_zeros); @@ -426,7 +461,6 @@ action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch, DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { struct ip_header *nh = dp_packet_l3(packet); - ovs_be16 old_csum = ~nh->ip_csum; /* Load the 20 bytes of the IPv4 header. Without options, which is the * most common case it's 20 bytes, but can be up to 60 bytes. */ @@ -439,13 +473,20 @@ action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch, * (v_pkt_masked). */ __m256i v_new_hdr = _mm256_or_si256(v_key_shuf, v_pkt_masked); - /* Update the IP checksum based on updated IP values. */ - uint16_t delta = avx512_ipv4_hdr_csum_delta(v_packet, v_new_hdr); - uint32_t new_csum = old_csum + delta; - delta = csum_finish(new_csum); + if (dp_packet_hwol_l3_ipv4(packet)) { + dp_packet_ol_reset_ip_csum_good(packet); + } else { + ovs_be16 old_csum = ~nh->ip_csum; + + /* Update the IP checksum based on updated IP values. */ + uint16_t delta = avx512_ipv4_hdr_csum_delta(v_packet, v_new_hdr); + uint32_t new_csum = old_csum + delta; - /* Insert new checksum. */ - v_new_hdr = _mm256_insert_epi16(v_new_hdr, delta, 5); + delta = csum_finish(new_csum); + + /* Insert new checksum. */ + v_new_hdr = _mm256_insert_epi16(v_new_hdr, delta, 5); + } /* If ip_src or ip_dst has been modified, L4 checksum needs to * be updated too. 
*/ @@ -453,11 +494,14 @@ action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch, uint16_t delta_checksum = avx512_ipv4_addr_csum_delta(v_packet, v_new_hdr); + size_t l4_size = dp_packet_l4_size(packet); - if (nh->ip_proto == IPPROTO_UDP) { - /* New UDP checksum. */ + if (nh->ip_proto == IPPROTO_UDP && l4_size >= UDP_HEADER_LEN) { struct udp_header *uh = dp_packet_l4(packet); - if (uh->udp_csum) { + if (dp_packet_hwol_l4_is_udp(packet)) { + dp_packet_ol_reset_l4_csum_good(packet); + } else if (uh->udp_csum) { + /* New UDP checksum. */ uint16_t old_udp_checksum = ~uh->udp_csum; uint32_t udp_checksum = old_udp_checksum + delta_checksum; udp_checksum = csum_finish(udp_checksum); @@ -468,21 +512,247 @@ action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch, /* Insert new udp checksum. */ uh->udp_csum = udp_checksum; } - } else if (nh->ip_proto == IPPROTO_TCP) { - /* New TCP checksum. */ - struct tcp_header *th = dp_packet_l4(packet); - uint16_t old_tcp_checksum = ~th->tcp_csum; - uint32_t tcp_checksum = old_tcp_checksum + delta_checksum; - tcp_checksum = csum_finish(tcp_checksum); - - th->tcp_csum = tcp_checksum; + } else if (nh->ip_proto == IPPROTO_TCP && + l4_size >= TCP_HEADER_LEN) { + if (dp_packet_hwol_l4_is_tcp(packet)) { + dp_packet_ol_reset_l4_csum_good(packet); + } else { + /* New TCP checksum. */ + struct tcp_header *th = dp_packet_l4(packet); + uint16_t old_tcp_checksum = ~th->tcp_csum; + uint32_t tcp_checksum = old_tcp_checksum + delta_checksum; + tcp_checksum = csum_finish(tcp_checksum); + + th->tcp_csum = tcp_checksum; + } } + + pkt_metadata_init_conn(&packet->md); } /* Write back the modified IPv4 addresses. 
*/ _mm256_mask_storeu_epi32((void *) nh, 0x1F, v_new_hdr); } } +#if HAVE_AVX512VBMI +static inline uint16_t ALWAYS_INLINE +__attribute__((__target__("avx512vbmi"))) +avx512_ipv6_sum_header(__m512i ip6_header) +{ + __m256i v_zeros = _mm256_setzero_si256(); + __m512i v_shuf_src_dst = _mm512_setr_epi64(0x01, 0x02, 0x03, 0x04, + 0xFF, 0xFF, 0xFF, 0xFF); + + /* Shuffle ip6 src and dst to beginning of register. */ + __m512i v_ip6_hdr_shuf = _mm512_permutexvar_epi64(v_shuf_src_dst, + ip6_header); + + /* Extract ip6 src and dst into smaller 256-bit wide register. */ + __m256i v_ip6_src_dst = _mm512_extracti64x4_epi64(v_ip6_hdr_shuf, 0); + + /* These two shuffle masks, v_swap16a and v_swap16b, are to shuffle the + * src and dst fields and add padding after each 16-bit value for the + * following carry over addition. */ + __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xFFFF, 0x0302, 0xFFFF, + 0x0504, 0xFFFF, 0x0706, 0xFFFF, + 0x0100, 0xFFFF, 0x0302, 0xFFFF, + 0x0504, 0xFFFF, 0x0706, 0xFFFF); + __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xFFFF, 0x0B0A, 0xFFFF, + 0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF, + 0x0908, 0xFFFF, 0x0B0A, 0xFFFF, + 0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF); + __m256i v_shuf_old1 = _mm256_shuffle_epi8(v_ip6_src_dst, v_swap16a); + __m256i v_shuf_old2 = _mm256_shuffle_epi8(v_ip6_src_dst, v_swap16b); + + /* Add each part of the old and new headers together. */ + __m256i v_delta = _mm256_add_epi32(v_shuf_old1, v_shuf_old2); + + /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */ + v_delta = _mm256_hadd_epi32(v_delta, v_zeros); + v_delta = _mm256_hadd_epi32(v_delta, v_zeros); + + /* Shuffle 32-bit value from 3rd lane into first lane for final + * horizontal add. 
*/ + __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF, + 0xF, 0xF, 0xF, 0xF); + + v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta); + + v_delta = _mm256_hadd_epi32(v_delta, v_zeros); + v_delta = _mm256_shuffle_epi8(v_delta, v_swap16a); + v_delta = _mm256_hadd_epi32(v_delta, v_zeros); + v_delta = _mm256_hadd_epi16(v_delta, v_zeros); + + /* Extract delta value. */ + return _mm256_extract_epi16(v_delta, 0); +} + +static inline uint16_t ALWAYS_INLINE +__attribute__((__target__("avx512vbmi"))) +avx512_ipv6_addr_csum_delta(__m512i v_packet, __m512i v_new_hdr, + bool rh_present) +{ + __m512i v_new_hdr_for_cksum = v_new_hdr; + uint32_t csum_delta; + uint16_t old_delta; + uint16_t new_delta; + + if (rh_present) { + v_new_hdr_for_cksum = _mm512_mask_blend_epi64(0x18, v_new_hdr, + v_packet); + } + + old_delta = avx512_ipv6_sum_header(v_packet); + new_delta = avx512_ipv6_sum_header(v_new_hdr_for_cksum); + csum_delta = ((uint16_t) ~old_delta) + new_delta; + + return ~csum_finish(csum_delta); +} + +/* This function performs the same operation on each packet in the batch as + * the scalar odp_set_ipv6() function. */ +static void +__attribute__((__target__("avx512vbmi"))) +action_avx512_set_ipv6(struct dp_packet_batch *batch, const struct nlattr *a) +{ + const struct ovs_key_ipv6 *key, *mask; + struct dp_packet *packet; + + a = nl_attr_get(a); + key = nl_attr_get(a); + mask = odp_get_key_mask(a, struct ovs_key_ipv6); + + /* Read the content of the key and mask in the respective registers. We + * only load the size of the actual structure, which is only 40 bytes. */ + __m512i v_key = _mm512_maskz_loadu_epi64(0x1F, (void *) key); + __m512i v_mask = _mm512_maskz_loadu_epi64(0x1F, (void *) mask); + + /* This shuffle mask v_shuffle, is to shuffle key and mask to match the + * ip6_hdr structure layout. 
*/ + static const uint8_t ip_shuffle_mask[64] = { + 0x20, 0x21, 0x22, 0x23, 0xFF, 0xFF, 0x24, 0x26, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF + }; + + __m512i v_shuffle = _mm512_loadu_si512((void *) ip_shuffle_mask); + + /* This shuffle is required for key and mask to match the layout of the + * ip6_hdr struct. */ + __m512i v_key_shuf = _mm512_permutexvar_epi8(v_shuffle, v_key); + __m512i v_mask_shuf = _mm512_permutexvar_epi8(v_shuffle, v_mask); + + /* Set the v_zero register to all zero's. */ + const __m128i v_zeros = _mm_setzero_si128(); + + /* Set the v_all_ones register to all one's. */ + const __m128i v_all_ones = _mm_cmpeq_epi16(v_zeros, v_zeros); + + /* Load ip6 src and dst masks respectively into 128-bit wide registers. */ + __m128i v_src = _mm_loadu_si128((void *) &mask->ipv6_src); + __m128i v_dst = _mm_loadu_si128((void *) &mask->ipv6_dst); + + /* Perform a bitwise OR between src and dst registers. */ + __m128i v_or = _mm_or_si128(v_src, v_dst); + + /* Will return true if any bit has been set in v_or, else it will return + * false. */ + bool do_checksum = !_mm_test_all_zeros(v_or, v_all_ones); + + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { + struct ovs_16aligned_ip6_hdr *nh = dp_packet_l3(packet); + + /* Load the 40 bytes of the IPv6 header. */ + __m512i v_packet = _mm512_maskz_loadu_epi64(0x1F, (void *) nh); + + /* AND the v_pkt_mask to the packet data (v_packet). */ + __m512i v_pkt_masked = _mm512_andnot_si512(v_mask_shuf, v_packet); + + /* OR the new addresses (v_key_shuf) with the masked packet addresses + * (v_pkt_masked). 
*/ + __m512i v_new_hdr = _mm512_or_si512(v_key_shuf, v_pkt_masked); + + /* If ip6_src or ip6_dst has been modified, L4 checksum needs to be + * updated. */ + uint8_t proto = 0; + bool rh_present; + bool do_csum = do_checksum; + + rh_present = packet_rh_present(packet, &proto, &do_csum); + + if (do_csum) { + size_t l4_size = dp_packet_l4_size(packet); + uint16_t delta_checksum; + + if (proto == IPPROTO_UDP && l4_size >= UDP_HEADER_LEN) { + struct udp_header *uh = dp_packet_l4(packet); + if (dp_packet_hwol_l4_is_udp(packet)) { + dp_packet_ol_reset_l4_csum_good(packet); + } else if (uh->udp_csum) { + delta_checksum = avx512_ipv6_addr_csum_delta(v_packet, + v_new_hdr, + rh_present); + uint16_t old_udp_checksum = ~uh->udp_csum; + uint32_t udp_checksum = old_udp_checksum + + delta_checksum; + + udp_checksum = csum_finish(udp_checksum); + + if (!udp_checksum) { + udp_checksum = htons(0xffff); + } + + uh->udp_csum = udp_checksum; + } + + } else if (proto == IPPROTO_TCP && l4_size >= TCP_HEADER_LEN) { + if (dp_packet_hwol_l4_is_tcp(packet)) { + dp_packet_ol_reset_l4_csum_good(packet); + } else { + delta_checksum = avx512_ipv6_addr_csum_delta(v_packet, + v_new_hdr, + rh_present); + struct tcp_header *th = dp_packet_l4(packet); + uint16_t old_tcp_checksum = ~th->tcp_csum; + uint32_t tcp_checksum = old_tcp_checksum + delta_checksum; + + tcp_checksum = csum_finish(tcp_checksum); + th->tcp_csum = tcp_checksum; + } + } else if (proto == IPPROTO_ICMPV6 && + l4_size >= sizeof(struct icmp6_header)) { + delta_checksum = avx512_ipv6_addr_csum_delta(v_packet, + v_new_hdr, + rh_present); + struct icmp6_header *icmp = dp_packet_l4(packet); + uint16_t old_icmp6_checksum = ~icmp->icmp6_cksum; + uint32_t icmp6_checksum = old_icmp6_checksum + delta_checksum; + + icmp6_checksum = csum_finish(icmp6_checksum); + icmp->icmp6_cksum = icmp6_checksum; + } + + pkt_metadata_init_conn(&packet->md); + } + /* Write back the modified IPv6 addresses. 
*/ + _mm512_mask_storeu_epi64((void *) nh, 0x1F, v_new_hdr); + + /* Scalar method for setting IPv6 tclass field. */ + if (key->ipv6_tclass) { + uint8_t old_tc = ntohl(get_16aligned_be32(&nh->ip6_flow)) >> 20; + uint8_t key_tc = key->ipv6_tclass | (old_tc & ~mask->ipv6_tclass); + + packet_set_ipv6_tc(&nh->ip6_flow, key_tc); + } + } +} +#endif /* HAVE_AVX512VBMI */ + static void action_avx512_set_masked(struct dp_packet_batch *batch, const struct nlattr *a) { @@ -514,6 +784,12 @@ action_avx512_init(struct odp_execute_action_impl *self OVS_UNUSED) impl_set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = action_avx512_eth_set_addrs; impl_set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_avx512_ipv4_set_addrs; +#if HAVE_AVX512VBMI + if (action_avx512vbmi_isa_probe()) { + impl_set_masked_funcs[OVS_KEY_ATTR_IPV6] = action_avx512_set_ipv6; + } +#endif + return 0; } diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c index f80ae5a239c..8b7a6b4ab0e 100644 --- a/lib/odp-execute-private.c +++ b/lib/odp-execute-private.c @@ -60,6 +60,20 @@ action_avx512_isa_probe(void) #endif +#if ACTION_IMPL_AVX512_CHECK && HAVE_AVX512VBMI +bool +action_avx512vbmi_isa_probe(void) +{ + return cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI); +} +#else +bool +action_avx512vbmi_isa_probe(void) +{ + return false; +} +#endif + static struct odp_execute_action_impl action_impls[] = { [ACTION_IMPL_AUTOVALIDATOR] = { .available = false, @@ -229,6 +243,18 @@ action_autoval_generic(struct dp_packet_batch *batch, const struct nlattr *a) } } + /* Compare packet metadata. 
*/ + if (memcmp(&good_pkt->md, &test_pkt->md, sizeof good_pkt->md)) { + ds_put_format(&log_msg, "Autovalidation metadata failed\n"); + ds_put_format(&log_msg, "Good packet metadata:\n"); + ds_put_sparse_hex_dump(&log_msg, &good_pkt->md, + sizeof good_pkt->md, 0, false); + ds_put_format(&log_msg, "Test packet metadata:\n"); + ds_put_sparse_hex_dump(&log_msg, &test_pkt->md, + sizeof test_pkt->md, 0, false); + failed = true; + } + if (failed) { VLOG_ERR("Autovalidation of %s failed. Details:\n%s", action_impls[impl].name, ds_cstr(&log_msg)); diff --git a/lib/odp-execute-private.h b/lib/odp-execute-private.h index 940180c99f9..643f41c2a61 100644 --- a/lib/odp-execute-private.h +++ b/lib/odp-execute-private.h @@ -78,6 +78,7 @@ BUILD_ASSERT_DECL(ACTION_IMPL_AUTOVALIDATOR == 1); #define ACTION_IMPL_BEGIN (ACTION_IMPL_AUTOVALIDATOR + 1) bool action_avx512_isa_probe(void); +bool action_avx512vbmi_isa_probe(void); /* Odp execute init handles setting up the state of the actions functions at * initialization time. 
It cannot return errors, as it must always succeed in diff --git a/lib/odp-execute.c b/lib/odp-execute.c index 5cf6fbec09a..15577d5394f 100644 --- a/lib/odp-execute.c +++ b/lib/odp-execute.c @@ -147,6 +147,8 @@ odp_set_ipv4(struct dp_packet *packet, const struct ovs_key_ipv4 *key, uint8_t new_tos; uint8_t new_ttl; + ovs_assert(nh); + if (mask->ipv4_src) { ip_src_nh = get_16aligned_be32(&nh->ip_src); new_ip_src = key->ipv4_src | (ip_src_nh & ~mask->ipv4_src); @@ -169,9 +171,14 @@ odp_set_ipv4(struct dp_packet *packet, const struct ovs_key_ipv4 *key, new_tos = key->ipv4_tos | (nh->ip_tos & ~mask->ipv4_tos); if (nh->ip_tos != new_tos) { - nh->ip_csum = recalc_csum16(nh->ip_csum, - htons((uint16_t) nh->ip_tos), - htons((uint16_t) new_tos)); + if (dp_packet_hwol_tx_ip_csum(packet)) { + dp_packet_ol_reset_ip_csum_good(packet); + } else { + nh->ip_csum = recalc_csum16(nh->ip_csum, + htons((uint16_t) nh->ip_tos), + htons((uint16_t) new_tos)); + } + nh->ip_tos = new_tos; } } @@ -180,8 +187,14 @@ odp_set_ipv4(struct dp_packet *packet, const struct ovs_key_ipv4 *key, new_ttl = key->ipv4_ttl | (nh->ip_ttl & ~mask->ipv4_ttl); if (OVS_LIKELY(nh->ip_ttl != new_ttl)) { - nh->ip_csum = recalc_csum16(nh->ip_csum, htons(nh->ip_ttl << 8), - htons(new_ttl << 8)); + if (dp_packet_hwol_tx_ip_csum(packet)) { + dp_packet_ol_reset_ip_csum_good(packet); + } else { + nh->ip_csum = recalc_csum16(nh->ip_csum, + htons(nh->ip_ttl << 8), + htons(new_ttl << 8)); + } + nh->ip_ttl = new_ttl; } } @@ -276,6 +289,8 @@ set_arp(struct dp_packet *packet, const struct ovs_key_arp *key, { struct arp_eth_header *arp = dp_packet_l3(packet); + ovs_assert(arp); + if (!mask) { arp->ar_op = key->arp_op; arp->ar_sha = key->arp_sha; @@ -803,13 +818,13 @@ requires_datapath_assistance(const struct nlattr *a) case OVS_ACTION_ATTR_RECIRC: case OVS_ACTION_ATTR_CT: case OVS_ACTION_ATTR_METER: + case OVS_ACTION_ATTR_PSAMPLE: return true; case OVS_ACTION_ATTR_SET: case OVS_ACTION_ATTR_SET_MASKED: case 
OVS_ACTION_ATTR_PUSH_VLAN: case OVS_ACTION_ATTR_POP_VLAN: - case OVS_ACTION_ATTR_SAMPLE: case OVS_ACTION_ATTR_HASH: case OVS_ACTION_ATTR_PUSH_MPLS: case OVS_ACTION_ATTR_POP_MPLS: @@ -822,9 +837,32 @@ requires_datapath_assistance(const struct nlattr *a) case OVS_ACTION_ATTR_CT_CLEAR: case OVS_ACTION_ATTR_CHECK_PKT_LEN: case OVS_ACTION_ATTR_ADD_MPLS: + case OVS_ACTION_ATTR_DEC_TTL: case OVS_ACTION_ATTR_DROP: return false; + case OVS_ACTION_ATTR_SAMPLE: { + /* Nested "psample" actions rely on the datapath executing the + * parent "sample", storing the probability and making it available + * when the nested "psample" is run. */ + const struct nlattr *attr; + unsigned int left; + + NL_NESTED_FOR_EACH (attr, left, a) { + if (nl_attr_type(attr) == OVS_SAMPLE_ATTR_ACTIONS) { + const struct nlattr *act; + unsigned int act_left; + + NL_NESTED_FOR_EACH (act, act_left, attr) { + if (nl_attr_type(act) == OVS_ACTION_ATTR_PSAMPLE) { + return true; + } + } + } + } + return false; + } + case OVS_ACTION_ATTR_UNSPEC: case __OVS_ACTION_ATTR_MAX: OVS_NOT_REACHED(); @@ -1212,6 +1250,8 @@ odp_execute_actions(void *dp, struct dp_packet_batch *batch, bool steal, case OVS_ACTION_ATTR_RECIRC: case OVS_ACTION_ATTR_CT: case OVS_ACTION_ATTR_UNSPEC: + case OVS_ACTION_ATTR_DEC_TTL: + case OVS_ACTION_ATTR_PSAMPLE: case __OVS_ACTION_ATTR_MAX: /* The following actions are handled by the scalar implementation. 
*/ case OVS_ACTION_ATTR_POP_VLAN: diff --git a/lib/odp-util.c b/lib/odp-util.c index ba5be4bb355..d3245223dd6 100644 --- a/lib/odp-util.c +++ b/lib/odp-util.c @@ -143,7 +143,9 @@ odp_action_len(uint16_t type) case OVS_ACTION_ATTR_POP_NSH: return 0; case OVS_ACTION_ATTR_CHECK_PKT_LEN: return ATTR_LEN_VARIABLE; case OVS_ACTION_ATTR_ADD_MPLS: return sizeof(struct ovs_action_add_mpls); + case OVS_ACTION_ATTR_DEC_TTL: return ATTR_LEN_VARIABLE; case OVS_ACTION_ATTR_DROP: return sizeof(uint32_t); + case OVS_ACTION_ATTR_PSAMPLE: return ATTR_LEN_VARIABLE; case OVS_ACTION_ATTR_UNSPEC: case __OVS_ACTION_ATTR_MAX: @@ -715,6 +717,24 @@ format_odp_tnl_push_header(struct ds *ds, struct ovs_action_push_tnl *data) } ds_put_char(ds, ')'); + } else if (data->tnl_type == OVS_VPORT_TYPE_SRV6) { + const struct srv6_base_hdr *srh; + struct in6_addr *segs; + int nr_segs; + int i; + + srh = (const struct srv6_base_hdr *) l4; + segs = ALIGNED_CAST(struct in6_addr *, srh + 1); + nr_segs = srh->last_entry + 1; + + ds_put_format(ds, "srv6("); + ds_put_format(ds, "segments_left=%d", srh->rt_hdr.segments_left); + ds_put_format(ds, ",segs("); + for (i = 0; i < nr_segs; i++) { + ds_put_format(ds, i > 0 ? 
"," : ""); + ipv6_format_addr(&segs[nr_segs - i - 1], ds); + } + ds_put_format(ds, "))"); } else if (data->tnl_type == OVS_VPORT_TYPE_GRE || data->tnl_type == OVS_VPORT_TYPE_IP6GRE) { const struct gre_base_hdr *greh; @@ -1004,7 +1024,7 @@ format_odp_conntrack_action(struct ds *ds, const struct nlattr *attr) ds_put_format(ds, "helper=%s,", helper); } if (timeout) { - ds_put_format(ds, "timeout=%s", timeout); + ds_put_format(ds, "timeout=%s,", timeout); } if (nat) { format_odp_ct_nat(ds, nat); @@ -1112,6 +1132,47 @@ format_odp_check_pkt_len_action(struct ds *ds, const struct nlattr *attr, ds_put_cstr(ds, "))"); } +static void +format_dec_ttl_action(struct ds *ds, const struct nlattr *attr, + const struct hmap *portno_names) +{ + const struct nlattr *a; + unsigned int left; + + ds_put_cstr(ds,"dec_ttl(le_1("); + NL_ATTR_FOR_EACH (a, left, + nl_attr_get(attr), nl_attr_get_size(attr)) { + if (nl_attr_type(a) == OVS_DEC_TTL_ATTR_ACTION) { + format_odp_actions(ds, nl_attr_get(a), + nl_attr_get_size(a), portno_names); + break; + } + } + ds_put_format(ds, "))"); +} + +static void +format_odp_psample_action(struct ds *ds, const struct nlattr *attr) +{ + const struct nlattr *a; + unsigned int left; + + ds_put_cstr(ds, "psample("); + NL_NESTED_FOR_EACH (a, left, attr) { + switch (a->nla_type) { + case OVS_PSAMPLE_ATTR_GROUP: + ds_put_format(ds, "group=%"PRIu32",", nl_attr_get_u32(a)); + break; + case OVS_PSAMPLE_ATTR_COOKIE: + ds_put_cstr(ds, "cookie="); + ds_put_hex(ds, nl_attr_get(a), nl_attr_get_size(a)); + break; + } + } + ds_chomp(ds, ','); + ds_put_char(ds, ')'); +} + static void format_odp_action(struct ds *ds, const struct nlattr *a, const struct hmap *portno_names) @@ -1265,9 +1326,15 @@ format_odp_action(struct ds *ds, const struct nlattr *a, ntohs(mpls->mpls_ethertype)); break; } + case OVS_ACTION_ATTR_DEC_TTL: + format_dec_ttl_action(ds, a, portno_names); + break; case OVS_ACTION_ATTR_DROP: ds_put_cstr(ds, "drop"); break; + case OVS_ACTION_ATTR_PSAMPLE: + 
format_odp_psample_action(ds, a); + break; case OVS_ACTION_ATTR_UNSPEC: case __OVS_ACTION_ATTR_MAX: default: @@ -1534,6 +1601,7 @@ ovs_parse_tnl_push(const char *s, struct ovs_action_push_tnl *data) uint8_t hwid, dir; uint32_t teid; uint8_t gtpu_flags, gtpu_msgtype; + uint8_t segments_left; if (!ovs_scan_len(s, &n, "tnl_push(tnl_port(%"SCNi32"),", &data->tnl_port)) { return -EINVAL; @@ -1775,6 +1843,57 @@ ovs_parse_tnl_push(const char *s, struct ovs_action_push_tnl *data) tnl_type = OVS_VPORT_TYPE_GTPU; header_len = sizeof *eth + ip_len + sizeof *udp + sizeof *gtph; + } else if (ovs_scan_len(s, &n, "srv6(segments_left=%"SCNu8, + &segments_left)) { + struct srv6_base_hdr *srh = (struct srv6_base_hdr *) (ip6 + 1); + union ovs_16aligned_in6_addr *segs; + char seg_s[IPV6_SCAN_LEN + 1]; + struct in6_addr seg; + uint8_t n_segs = 0; + + if (segments_left + 1 > SRV6_MAX_SEGS) { + return -EINVAL; + } + + ip6->ip6_nxt = IPPROTO_ROUTING; + + srh->rt_hdr.hdrlen = 2 * (segments_left + 1); + srh->rt_hdr.segments_left = segments_left; + srh->rt_hdr.type = IPV6_SRCRT_TYPE_4; + srh->last_entry = segments_left; + + tnl_type = OVS_VPORT_TYPE_SRV6; + header_len = sizeof *eth + ip_len + + sizeof *srh + 8 * srh->rt_hdr.hdrlen; + /* Parse segment list. 
*/ + if (!ovs_scan_len(s, &n, ",segs(")) { + return -EINVAL; + } + + segs = (union ovs_16aligned_in6_addr *) (srh + 1); + segs += segments_left; + + while (ovs_scan_len(s, &n, IPV6_SCAN_FMT, seg_s) + && inet_pton(AF_INET6, seg_s, &seg) == 1) { + if (n_segs == segments_left + 1) { + return -EINVAL; + } + + memcpy(segs--, &seg, sizeof *segs); + n_segs++; + + if (s[n] == ',') { + n++; + } + } + + if (!ovs_scan_len(s, &n, ")))")) { + return -EINVAL; + } + + if (n_segs != segments_left + 1) { + return -EINVAL; + } } else { return -EINVAL; } @@ -2265,6 +2384,50 @@ parse_odp_push_nsh_action(const char *s, struct ofpbuf *actions) return ret; } +static int +parse_odp_psample_action(const char *s, struct ofpbuf *actions) +{ + char buf[2 * OVS_PSAMPLE_COOKIE_MAX_SIZE + 1]; + uint8_t cookie[OVS_PSAMPLE_COOKIE_MAX_SIZE]; + bool has_group = false; + size_t cookie_len = 0; + uint32_t group; + int n = 0; + + if (!ovs_scan_len(s, &n, "psample(")) { + return -EINVAL; + } + + while (s[n] != ')') { + n += strspn(s + n, delimiters); + + if (!has_group && ovs_scan_len(s, &n, "group=%"SCNi32, &group)) { + has_group = true; + continue; + } + + if (!cookie_len && + ovs_scan_len(s, &n, "cookie=0x%32[0-9a-fA-F]", buf) && n > 7) { + struct ofpbuf b; + + ofpbuf_use_stub(&b, cookie, OVS_PSAMPLE_COOKIE_MAX_SIZE); + ofpbuf_put_hex(&b, buf, &cookie_len); + ofpbuf_uninit(&b); + continue; + } + return -EINVAL; + } + n++; + + if (!has_group) { + return -EINVAL; + } + + odp_put_psample_action(actions, group, cookie_len ? 
cookie : NULL, + cookie_len); + return n; +} + static int parse_action_list(struct parse_odp_context *context, const char *s, struct ofpbuf *actions) @@ -2626,6 +2789,10 @@ parse_odp_action__(struct parse_odp_context *context, const char *s, } } + if (!strncmp(s, "psample(", 8)) { + return parse_odp_psample_action(s, actions); + } + { struct ovs_action_push_tnl data; int n; @@ -3079,23 +3246,12 @@ odp_tun_key_from_attr__(const struct nlattr *attr, bool is_mask, tun->flags |= FLOW_TNL_F_OAM; break; case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: { - static const struct nl_policy vxlan_opts_policy[] = { - [OVS_VXLAN_EXT_GBP] = { .type = NL_A_U32 }, - }; - struct nlattr *ext[ARRAY_SIZE(vxlan_opts_policy)]; - - if (!nl_parse_nested(a, vxlan_opts_policy, ext, ARRAY_SIZE(ext))) { + if (odp_vxlan_tun_opts_from_attr(a, &tun->gbp_id, + &tun->gbp_flags, + NULL)) { odp_parse_error(&rl, errorp, "error parsing VXLAN options"); return ODP_FIT_ERROR; } - - if (ext[OVS_VXLAN_EXT_GBP]) { - uint32_t gbp = nl_attr_get_u32(ext[OVS_VXLAN_EXT_GBP]); - - tun->gbp_id = htons(gbp & 0xFFFF); - tun->gbp_flags = (gbp >> 16) & 0xFF; - } - break; } case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS: @@ -3209,10 +3365,11 @@ tun_key_to_attr(struct ofpbuf *a, const struct flow_tnl *tun_key, if ((!tnl_type || !strcmp(tnl_type, "vxlan")) && (tun_key->gbp_flags || tun_key->gbp_id)) { size_t vxlan_opts_ofs; + uint32_t gbp_raw; vxlan_opts_ofs = nl_msg_start_nested(a, OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS); - nl_msg_put_u32(a, OVS_VXLAN_EXT_GBP, - (tun_key->gbp_flags << 16) | ntohs(tun_key->gbp_id)); + gbp_raw = odp_encode_gbp_raw(tun_key->gbp_flags, tun_key->gbp_id); + nl_msg_put_u32(a, OVS_VXLAN_EXT_GBP, gbp_raw); nl_msg_end_nested(a, vxlan_opts_ofs); } @@ -3594,9 +3751,16 @@ static bool check_attr_len(struct ds *ds, const struct nlattr *a, const struct nlattr *ma, const struct attr_len_tbl tbl[], int max_type, bool need_key) { + uint16_t type = nl_attr_type(a); int expected_len; - expected_len = odp_key_attr_len(tbl, max_type, 
nl_attr_type(a)); + if (type > max_type) { + /* Unknown attribute, can't check the length. */ + return true; + } + + expected_len = odp_key_attr_len(tbl, max_type, type); + if (expected_len != ATTR_LEN_VARIABLE && expected_len != ATTR_LEN_NESTED) { @@ -3605,7 +3769,7 @@ check_attr_len(struct ds *ds, const struct nlattr *a, const struct nlattr *ma, if (bad_key_len || bad_mask_len) { if (need_key) { - ds_put_format(ds, "key%u", nl_attr_type(a)); + ds_put_format(ds, "key%u", type); } if (bad_key_len) { ds_put_format(ds, "(bad key length %"PRIuSIZE", expected %d)(", @@ -3676,12 +3840,10 @@ format_odp_tun_vxlan_opt(const struct nlattr *attr, ovs_be16 id, id_mask; uint8_t flags, flags_mask = 0; - id = htons(key & 0xFFFF); - flags = (key >> 16) & 0xFF; + odp_decode_gbp_raw(key, &id, &flags); if (ma) { uint32_t mask = nl_attr_get_u32(ma); - id_mask = htons(mask & 0xFFFF); - flags_mask = (mask >> 16) & 0xFF; + odp_decode_gbp_raw(mask, &id_mask, &flags_mask); } ds_put_cstr(ds, "gbp("); @@ -6197,6 +6359,11 @@ odp_flow_key_from_flow__(const struct odp_flow_key_parms *parms, const struct flow *mask = parms->mask; const struct flow *data = export_mask ? mask : flow; + if (parms->support.recirc) { + nl_msg_put_u32(buf, OVS_KEY_ATTR_RECIRC_ID, data->recirc_id); + nl_msg_put_u32(buf, OVS_KEY_ATTR_DP_HASH, data->dp_hash); + } + nl_msg_put_u32(buf, OVS_KEY_ATTR_PRIORITY, data->skb_priority); if (flow_tnl_dst_is_set(&flow->tunnel) || @@ -6205,6 +6372,12 @@ odp_flow_key_from_flow__(const struct odp_flow_key_parms *parms, parms->key_buf, NULL); } + /* Add an ingress port attribute if this is a mask or 'in_port.odp_port' + * is not the magical value "ODPP_NONE". 
*/ + if (export_mask || flow->in_port.odp_port != ODPP_NONE) { + nl_msg_put_odp_port(buf, OVS_KEY_ATTR_IN_PORT, data->in_port.odp_port); + } + nl_msg_put_u32(buf, OVS_KEY_ATTR_SKB_MARK, data->pkt_mark); if (parms->support.ct_state) { @@ -6248,16 +6421,6 @@ odp_flow_key_from_flow__(const struct odp_flow_key_parms *parms, ct->ipv6_proto = data->ct_nw_proto; } } - if (parms->support.recirc) { - nl_msg_put_u32(buf, OVS_KEY_ATTR_RECIRC_ID, data->recirc_id); - nl_msg_put_u32(buf, OVS_KEY_ATTR_DP_HASH, data->dp_hash); - } - - /* Add an ingress port attribute if this is a mask or 'in_port.odp_port' - * is not the magical value "ODPP_NONE". */ - if (export_mask || flow->in_port.odp_port != ODPP_NONE) { - nl_msg_put_odp_port(buf, OVS_KEY_ATTR_IN_PORT, data->in_port.odp_port); - } nl_msg_put_be32(buf, OVS_KEY_ATTR_PACKET_TYPE, data->packet_type); @@ -6398,12 +6561,10 @@ odp_flow_key_from_flow__(const struct odp_flow_key_parms *parms, icmpv6_key->icmpv6_code = ntohs(data->tp_dst); if (is_nd(flow, NULL) - /* Even though 'tp_src' and 'tp_dst' are 16 bits wide, ICMP - * type and code are 8 bits wide. Therefore, an exact match - * looks like htons(0xff), not htons(0xffff). See - * xlate_wc_finish() for details. */ - && (!export_mask || (data->tp_src == htons(0xff) - && data->tp_dst == htons(0xff)))) { + /* Even though 'tp_src' is 16 bits wide, ICMP type is 8 bits + * wide. Therefore, an exact match looks like htons(0xff), + * not htons(0xffff). See xlate_wc_finish() for details. */ + && (!export_mask || data->tp_src == htons(0xff))) { struct ovs_key_nd *nd_key; nd_key = nl_msg_put_unspec_uninit(buf, OVS_KEY_ATTR_ND, sizeof *nd_key); @@ -7119,20 +7280,17 @@ parse_l2_5_onward(const struct nlattr *attrs[OVS_KEY_ATTR_MAX + 1], flow->arp_sha = nd_key->nd_sll; flow->arp_tha = nd_key->nd_tll; if (is_mask) { - /* Even though 'tp_src' and 'tp_dst' are 16 bits wide, - * ICMP type and code are 8 bits wide. Therefore, an - * exact match looks like htons(0xff), not - * htons(0xffff). 
See xlate_wc_finish() for details. - * */ + /* Even though 'tp_src' is 16 bits wide, ICMP type + * is 8 bits wide. Therefore, an exact match looks + * like htons(0xff), not htons(0xffff). See + * xlate_wc_finish() for details. */ if (!is_all_zeros(nd_key, sizeof *nd_key) && - (flow->tp_src != htons(0xff) || - flow->tp_dst != htons(0xff))) { + flow->tp_src != htons(0xff)) { odp_parse_error(&rl, errorp, - "ICMP (src,dst) masks should be " - "(0xff,0xff) but are actually " - "(%#"PRIx16",%#"PRIx16")", - ntohs(flow->tp_src), - ntohs(flow->tp_dst)); + "ICMP src mask should be " + "(0xff) but is actually " + "(%#"PRIx16")", + ntohs(flow->tp_src)); return ODP_FIT_ERROR; } else { *expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_ND; @@ -7744,6 +7902,23 @@ odp_put_tnl_push_action(struct ofpbuf *odp_actions, nl_msg_put_unspec(odp_actions, OVS_ACTION_ATTR_TUNNEL_PUSH, data, size); } +void +odp_put_psample_action(struct ofpbuf *odp_actions, uint32_t group_id, + uint8_t *cookie, size_t cookie_len) +{ + size_t offset = nl_msg_start_nested_with_flag(odp_actions, + OVS_ACTION_ATTR_PSAMPLE); + + nl_msg_put_u32(odp_actions, OVS_PSAMPLE_ATTR_GROUP, group_id); + if (cookie && cookie_len) { + ovs_assert(cookie_len <= OVS_PSAMPLE_COOKIE_MAX_SIZE); + nl_msg_put_unspec(odp_actions, OVS_PSAMPLE_ATTR_COOKIE, cookie, + cookie_len); + } + + nl_msg_end_nested(odp_actions, offset); +} + /* The commit_odp_actions() function and its helpers. */ @@ -8768,3 +8943,29 @@ commit_odp_actions(const struct flow *flow, struct flow *base, return slow1 ? 
slow1 : slow2; } + +int +odp_vxlan_tun_opts_from_attr(const struct nlattr *tun_attr, ovs_be16 *id, + uint8_t *flags, bool *id_present) +{ + static const struct nl_policy vxlan_opts_policy[] = { + [OVS_VXLAN_EXT_GBP] = { .type = NL_A_U32 }, + }; + struct nlattr *ext[ARRAY_SIZE(vxlan_opts_policy)]; + + if (!nl_parse_nested(tun_attr, vxlan_opts_policy, ext, ARRAY_SIZE(ext))) { + return EINVAL; + } + + if (ext[OVS_VXLAN_EXT_GBP]) { + uint32_t gbp_raw = nl_attr_get_u32(ext[OVS_VXLAN_EXT_GBP]); + + odp_decode_gbp_raw(gbp_raw, id, flags); + } + + if (id_present) { + *id_present = !!ext[OVS_VXLAN_EXT_GBP]; + } + + return 0; +} diff --git a/lib/odp-util.h b/lib/odp-util.h index a1d0d0fba5d..e454dbfcdb5 100644 --- a/lib/odp-util.h +++ b/lib/odp-util.h @@ -292,6 +292,9 @@ enum slow_path_reason commit_odp_actions(const struct flow *, bool pending_decap, struct ofpbuf *encap_data); +int odp_vxlan_tun_opts_from_attr(const struct nlattr *tun_attr, ovs_be16 *id, + uint8_t *flags, bool *id_present); + /* ofproto-dpif interface. * * The following types and functions are logically part of ofproto-dpif. @@ -373,6 +376,22 @@ void odp_put_pop_eth_action(struct ofpbuf *odp_actions); void odp_put_push_eth_action(struct ofpbuf *odp_actions, const struct eth_addr *eth_src, const struct eth_addr *eth_dst); +void odp_put_psample_action(struct ofpbuf *odp_actions, + uint32_t group_id, uint8_t *cookie, + size_t cookie_len); + +static inline void odp_decode_gbp_raw(uint32_t gbp_raw, + ovs_be16 *id, + uint8_t *flags) +{ + *id = htons(gbp_raw & 0xFFFF); + *flags = (gbp_raw >> 16) & 0xFF; +} + +static inline uint32_t odp_encode_gbp_raw(uint8_t flags, ovs_be16 id) +{ + return (flags << 16) | ntohs(id); +} struct attr_len_tbl { int len; diff --git a/lib/ofp-actions.c b/lib/ofp-actions.c index d7e5f542a04..fe6a17b6dad 100644 --- a/lib/ofp-actions.c +++ b/lib/ofp-actions.c @@ -330,6 +330,8 @@ enum ofp_raw_action_type { NXAST_RAW_SAMPLE2, /* NX1.0+(41): struct nx_action_sample2. 
*/ NXAST_RAW_SAMPLE3, + /* NX1.0+(51): struct nx_action_sample4. VLMFF */ + NXAST_RAW_SAMPLE4, /* NX1.0+(34): struct nx_action_conjunction. */ NXAST_RAW_CONJUNCTION, @@ -4230,10 +4232,12 @@ encode_DELETE_FIELD(const struct ofpact_delete_field *delete_field, enum ofp_version ofp_version OVS_UNUSED, struct ofpbuf *out) { - struct nx_action_delete_field *nadf = put_NXAST_DELETE_FIELD(out); - size_t size = out->size; + size_t size; - out->size = size - sizeof nadf->pad; + put_NXAST_DELETE_FIELD(out); + size = out->size; + + out->size = size - MEMBER_SIZEOF(struct nx_action_delete_field, pad); nx_put_mff_header(out, delete_field->field, 0, false); out->size = size; } @@ -6186,6 +6190,34 @@ struct nx_action_sample2 { }; OFP_ASSERT(sizeof(struct nx_action_sample2) == 32); +/* Action structure for NXAST_SAMPLE4 + * + * NXAST_SAMPLE4 was added in Open vSwitch 3.4.0. Compared to NXAST_SAMPLE3, + * it adds support for using field specifiers for observation_domain_id and + * observation_point_id. */ +struct nx_action_sample4 { + ovs_be16 type; /* OFPAT_VENDOR. */ + ovs_be16 len; /* Length is 40. */ + ovs_be32 vendor; /* NX_VENDOR_ID. */ + ovs_be16 subtype; /* NXAST_SAMPLE4. */ + ovs_be16 probability; /* Fraction of packets to sample. */ + ovs_be32 collector_set_id; /* ID of collector set in OVSDB. */ + ovs_be32 obs_domain_src; /* The observation_domain_id source. */ + union { + ovs_be16 obs_domain_ofs_nbits; /* Range to use from source field. */ + ovs_be32 obs_domain_imm; /* Immediate value for domain id. */ + }; + ovs_be32 obs_point_src; /* The observation_point_id source. */ + union { + ovs_be16 obs_point_ofs_nbits; /* Range to use from source field. */ + ovs_be32 obs_point_imm; /* Immediate value for point id. */ + }; + ovs_be16 sampling_port; /* Sampling port. */ + uint8_t direction; /* Sampling direction. 
*/ + uint8_t zeros[5]; /* Pad to a multiple of 8 bytes */ + }; + OFP_ASSERT(sizeof(struct nx_action_sample4) == 40); + static enum ofperr decode_NXAST_RAW_SAMPLE(const struct nx_action_sample *nas, enum ofp_version ofp_version OVS_UNUSED, @@ -6197,11 +6229,14 @@ decode_NXAST_RAW_SAMPLE(const struct nx_action_sample *nas, sample->ofpact.raw = NXAST_RAW_SAMPLE; sample->probability = ntohs(nas->probability); sample->collector_set_id = ntohl(nas->collector_set_id); - sample->obs_domain_id = ntohl(nas->obs_domain_id); - sample->obs_point_id = ntohl(nas->obs_point_id); + sample->obs_domain_imm = ntohl(nas->obs_domain_id); + sample->obs_domain_src.field = NULL; + sample->obs_point_imm = ntohl(nas->obs_point_id); + sample->obs_point_src.field = NULL; sample->sampling_port = OFPP_NONE; sample->direction = NX_ACTION_SAMPLE_DEFAULT; - + sample->obs_domain_src.field = NULL; + sample->obs_point_src.field = NULL; if (sample->probability == 0) { return OFPERR_OFPBAC_BAD_ARGUMENT; } @@ -6218,8 +6253,10 @@ decode_SAMPLE2(const struct nx_action_sample2 *nas, sample->ofpact.raw = raw; sample->probability = ntohs(nas->probability); sample->collector_set_id = ntohl(nas->collector_set_id); - sample->obs_domain_id = ntohl(nas->obs_domain_id); - sample->obs_point_id = ntohl(nas->obs_point_id); + sample->obs_domain_imm = ntohl(nas->obs_domain_id); + sample->obs_domain_src.field = NULL; + sample->obs_point_imm = ntohl(nas->obs_point_id); + sample->obs_point_src.field = NULL; sample->sampling_port = u16_to_ofp(ntohs(nas->sampling_port)); sample->direction = direction; @@ -6239,41 +6276,170 @@ decode_NXAST_RAW_SAMPLE2(const struct nx_action_sample2 *nas, ofpact_put_SAMPLE(out)); } +static int +check_sample_direction(enum nx_action_sample_direction direction) +{ + if (direction != NX_ACTION_SAMPLE_DEFAULT && + direction != NX_ACTION_SAMPLE_INGRESS && + direction != NX_ACTION_SAMPLE_EGRESS) { + VLOG_WARN_RL(&rl, "invalid sample direction %"PRIu8, direction); + return OFPERR_OFPBAC_BAD_ARGUMENT; 
+ } + return 0; +} + static enum ofperr decode_NXAST_RAW_SAMPLE3(const struct nx_action_sample2 *nas, enum ofp_version ofp_version OVS_UNUSED, struct ofpbuf *out) { struct ofpact_sample *sample = ofpact_put_SAMPLE(out); + int err; + if (!is_all_zeros(nas->zeros, sizeof nas->zeros)) { return OFPERR_NXBRC_MUST_BE_ZERO; } - if (nas->direction != NX_ACTION_SAMPLE_DEFAULT && - nas->direction != NX_ACTION_SAMPLE_INGRESS && - nas->direction != NX_ACTION_SAMPLE_EGRESS) { - VLOG_WARN_RL(&rl, "invalid sample direction %"PRIu8, nas->direction); - return OFPERR_OFPBAC_BAD_ARGUMENT; + err = check_sample_direction(nas->direction); + if (err) { + return err; } return decode_SAMPLE2(nas, NXAST_RAW_SAMPLE3, nas->direction, sample); } +static int +decode_sample_obs_id(ovs_be32 src, ovs_be16 ofs_nbits, ovs_be32 imm, + const struct vl_mff_map *vl_mff_map, uint64_t *tlv_bitmap, + struct mf_subfield *src_out, uint32_t *imm_out) +{ + if (src) { + enum ofperr error; + + src_out->ofs = nxm_decode_ofs(ofs_nbits); + src_out->n_bits = nxm_decode_n_bits(ofs_nbits); + error = mf_vl_mff_mf_from_nxm_header(ntohl(src), + vl_mff_map, &src_out->field, + tlv_bitmap); + if (error) { + return error; + } + + error = mf_check_src(src_out, NULL); + if (error) { + return error; + } + + if (src_out->n_bits > 32) { + VLOG_WARN_RL(&rl, "size of field used in observation_id (%d) " + "exceeds maximum (32)", src_out->n_bits); + return OFPERR_OFPBAC_BAD_ARGUMENT; + } + } else { + src_out->field = NULL; + *imm_out = ntohl(imm); + } + + return 0; +} + +static enum ofperr +decode_NXAST_RAW_SAMPLE4(const struct nx_action_sample4 *nas, + enum ofp_version ofp_version OVS_UNUSED, + const struct vl_mff_map *vl_mff_map, + uint64_t *tlv_bitmap, + struct ofpbuf *out) +{ + struct ofpact_sample *sample = ofpact_put_SAMPLE(out); + int err; + + if (!is_all_zeros(nas->zeros, sizeof nas->zeros)) { + return OFPERR_NXBRC_MUST_BE_ZERO; + } + + err = check_sample_direction(nas->direction); + if (err) { + return err; + } + + 
sample->ofpact.raw = NXAST_RAW_SAMPLE4; + sample->probability = ntohs(nas->probability); + sample->collector_set_id = ntohl(nas->collector_set_id); + sample->sampling_port = u16_to_ofp(ntohs(nas->sampling_port)); + sample->direction = nas->direction; + + if (sample->probability == 0) { + return OFPERR_OFPBAC_BAD_ARGUMENT; + } + + err = decode_sample_obs_id(nas->obs_domain_src, + nas->obs_domain_ofs_nbits, + nas->obs_domain_imm, + vl_mff_map, tlv_bitmap, + &sample->obs_domain_src, + &sample->obs_domain_imm); + if (err) { + return err; + } + + return decode_sample_obs_id(nas->obs_point_src, + nas->obs_point_ofs_nbits, + nas->obs_point_imm, + vl_mff_map, tlv_bitmap, + &sample->obs_point_src, + &sample->obs_point_imm); +} + static void encode_SAMPLE2(const struct ofpact_sample *sample, struct nx_action_sample2 *nas) { nas->probability = htons(sample->probability); nas->collector_set_id = htonl(sample->collector_set_id); - nas->obs_domain_id = htonl(sample->obs_domain_id); - nas->obs_point_id = htonl(sample->obs_point_id); + nas->obs_domain_id = htonl(sample->obs_domain_imm); + nas->obs_point_id = htonl(sample->obs_point_imm); + nas->sampling_port = htons(ofp_to_u16(sample->sampling_port)); + nas->direction = sample->direction; +} + +static void +encode_SAMPLE4(const struct ofpact_sample *sample, + struct nx_action_sample4 *nas) +{ + nas->probability = htons(sample->probability); + nas->collector_set_id = htonl(sample->collector_set_id); nas->sampling_port = htons(ofp_to_u16(sample->sampling_port)); nas->direction = sample->direction; + + if (sample->obs_domain_src.field) { + nas->obs_domain_src = + htonl(nxm_header_from_mff(sample->obs_domain_src.field)); + nas->obs_domain_ofs_nbits = + nxm_encode_ofs_nbits(sample->obs_domain_src.ofs, + sample->obs_domain_src.n_bits); + } else { + nas->obs_domain_src = htonl(0); + nas->obs_domain_imm = htonl(sample->obs_domain_imm); + } + if (sample->obs_point_src.field) { + nas->obs_point_src = + 
htonl(nxm_header_from_mff(sample->obs_point_src.field)); + nas->obs_point_ofs_nbits = + nxm_encode_ofs_nbits(sample->obs_point_src.ofs, + sample->obs_point_src.n_bits); + } else { + nas->obs_point_src = htonl(0); + nas->obs_point_imm = htonl(sample->obs_point_imm); + } } static void encode_SAMPLE(const struct ofpact_sample *sample, enum ofp_version ofp_version OVS_UNUSED, struct ofpbuf *out) { - if (sample->ofpact.raw == NXAST_RAW_SAMPLE3 + if (sample->ofpact.raw == NXAST_RAW_SAMPLE4 || + sample->obs_domain_src.field || + sample->obs_point_src.field) { + encode_SAMPLE4(sample, put_NXAST_SAMPLE4(out)); + } else if (sample->ofpact.raw == NXAST_RAW_SAMPLE3 || sample->direction != NX_ACTION_SAMPLE_DEFAULT) { encode_SAMPLE2(sample, put_NXAST_SAMPLE3(out)); } else if (sample->ofpact.raw == NXAST_RAW_SAMPLE2 @@ -6283,8 +6449,8 @@ encode_SAMPLE(const struct ofpact_sample *sample, struct nx_action_sample *nas = put_NXAST_SAMPLE(out); nas->probability = htons(sample->probability); nas->collector_set_id = htonl(sample->collector_set_id); - nas->obs_domain_id = htonl(sample->obs_domain_id); - nas->obs_point_id = htonl(sample->obs_point_id); + nas->obs_domain_id = htonl(sample->obs_domain_imm); + nas->obs_point_id = htonl(sample->obs_point_imm); } } @@ -6312,9 +6478,35 @@ parse_SAMPLE(char *arg, const struct ofpact_parse_params *pp) } else if (!strcmp(key, "collector_set_id")) { error = str_to_u32(value, &os->collector_set_id); } else if (!strcmp(key, "obs_domain_id")) { - error = str_to_u32(value, &os->obs_domain_id); + error = str_to_u32(value, &os->obs_domain_imm); + + if (error) { + free(error); + error = mf_parse_subfield(&os->obs_domain_src, value); + if (error) { + return error; + } + if (os->obs_domain_src.n_bits > 32) { + return xasprintf("size of obs_domain_id field (%d) " + "exceeds maximum (32)", + os->obs_domain_src.n_bits); + } + } } else if (!strcmp(key, "obs_point_id")) { - error = str_to_u32(value, &os->obs_point_id); + error = str_to_u32(value, 
&os->obs_point_imm); + + if (error) { + free(error); + error = mf_parse_subfield(&os->obs_point_src, value); + if (error) { + return error; + } + if (os->obs_point_src.n_bits > 32) { + return xasprintf("size of obs_point_id field (%d) " + "exceeds maximum (32)", + os->obs_point_src.n_bits); + } + } } else if (!strcmp(key, "sampling_port")) { if (!ofputil_port_from_string(value, pp->port_map, &os->sampling_port)) { @@ -6344,14 +6536,23 @@ format_SAMPLE(const struct ofpact_sample *a, const struct ofpact_format_params *fp) { ds_put_format(fp->s, "%ssample(%s%sprobability=%s%"PRIu16 - ",%scollector_set_id=%s%"PRIu32 - ",%sobs_domain_id=%s%"PRIu32 - ",%sobs_point_id=%s%"PRIu32, + ",%scollector_set_id=%s%"PRIu32, colors.paren, colors.end, colors.param, colors.end, a->probability, - colors.param, colors.end, a->collector_set_id, - colors.param, colors.end, a->obs_domain_id, - colors.param, colors.end, a->obs_point_id); + colors.param, colors.end, a->collector_set_id); + + ds_put_format(fp->s, ",%sobs_domain_id=%s", colors.param, colors.end); + if (a->obs_domain_src.field) { + mf_format_subfield(&a->obs_domain_src, fp->s); + } else { + ds_put_format(fp->s, "%"PRIu32, a->obs_domain_imm); + } + ds_put_format(fp->s, ",%sobs_point_id=%s", colors.param, colors.end); + if (a->obs_point_src.field) { + mf_format_subfield(&a->obs_point_src, fp->s); + } else { + ds_put_format(fp->s, "%"PRIu32, a->obs_point_imm); + } if (a->sampling_port != OFPP_NONE) { ds_put_format(fp->s, ",%ssampling_port=%s", colors.param, colors.end); ofputil_format_port(a->sampling_port, fp->port_map, fp->s); diff --git a/lib/ofp-bundle.c b/lib/ofp-bundle.c index 0161c2bc615..941a8370e08 100644 --- a/lib/ofp-bundle.c +++ b/lib/ofp-bundle.c @@ -292,6 +292,7 @@ ofputil_is_bundlable(enum ofptype type) case OFPTYPE_IPFIX_FLOW_STATS_REQUEST: case OFPTYPE_IPFIX_FLOW_STATS_REPLY: case OFPTYPE_CT_FLUSH_ZONE: + case OFPTYPE_CT_FLUSH: break; } diff --git a/lib/ofp-ct.c b/lib/ofp-ct.c new file mode 100644 index 
00000000000..2e12790b434 --- /dev/null +++ b/lib/ofp-ct.c @@ -0,0 +1,594 @@ +/* + * Copyright (c) 2023, Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +#include "ct-dpif.h" +#include "openvswitch/ofp-ct.h" +#include "openflow/nicira-ext.h" +#include "openvswitch/dynamic-string.h" +#include "openvswitch/ofp-msgs.h" +#include "openvswitch/ofp-parse.h" +#include "openvswitch/ofp-errors.h" +#include "openvswitch/ofp-prop.h" +#include "openvswitch/ofp-util.h" +#include "openvswitch/packets.h" +#include "openvswitch/vlog.h" + +VLOG_DEFINE_THIS_MODULE(ofp_ct); + +static void +ofp_ct_tuple_format(struct ds *ds, const struct ofp_ct_tuple *tuple, + uint8_t ip_proto, uint16_t l3_type) +{ + ds_put_cstr(ds, l3_type == AF_INET ? "ct_nw_src=": "ct_ipv6_src="); + ipv6_format_mapped(&tuple->src, ds); + ds_put_cstr(ds, l3_type == AF_INET ? 
",ct_nw_dst=": ",ct_ipv6_dst="); + ipv6_format_mapped(&tuple->dst, ds); + if (ip_proto == IPPROTO_ICMP || ip_proto == IPPROTO_ICMPV6) { + ds_put_format(ds, ",icmp_id=%u,icmp_type=%u,icmp_code=%u", + ntohs(tuple->icmp_id), tuple->icmp_type, + tuple->icmp_code); + } else { + ds_put_format(ds, ",ct_tp_src=%u,ct_tp_dst=%u", ntohs(tuple->src_port), + ntohs(tuple->dst_port)); + } +} + +static bool +ofp_ct_tuple_is_zero(const struct ofp_ct_tuple *tuple, uint8_t ip_proto) +{ + bool is_zero = ipv6_is_zero(&tuple->src) && ipv6_is_zero(&tuple->dst); + + if (!(ip_proto == IPPROTO_ICMP || ip_proto == IPPROTO_ICMPV6)) { + is_zero = is_zero && !tuple->src_port && !tuple->dst_port; + } + + return is_zero; +} + +static bool +ofp_ct_tuple_is_five_tuple(const struct ofp_ct_tuple *tuple, uint8_t ip_proto) +{ + /* First check if we have address. */ + bool five_tuple = !ipv6_is_zero(&tuple->src) && !ipv6_is_zero(&tuple->dst); + + if (!(ip_proto == IPPROTO_ICMP || ip_proto == IPPROTO_ICMPV6)) { + five_tuple = five_tuple && tuple->src_port && tuple->dst_port; + } + + return five_tuple; +} + +bool +ofp_ct_match_is_five_tuple(const struct ofp_ct_match *match) +{ + return ofp_ct_tuple_is_five_tuple(&match->tuple_orig, match->ip_proto) && + ofp_ct_tuple_is_zero(&match->tuple_reply, match->ip_proto) && + !match->mark_mask && ovs_u128_is_zero(match->labels_mask); +} + +bool +ofp_ct_match_is_zero(const struct ofp_ct_match *match) +{ + return !match->ip_proto && !match->l3_type && + ofp_ct_tuple_is_zero(&match->tuple_orig, match->ip_proto) && + ofp_ct_tuple_is_zero(&match->tuple_reply, match->ip_proto) && + !match->mark_mask && ovs_u128_is_zero(match->labels_mask); +} + +void +ofp_ct_match_format(struct ds *ds, const struct ofp_ct_match *match) +{ + if (match->mark_mask) { + ds_put_format(ds, "mark=%#"PRIx32, match->mark); + if (match->mark_mask != UINT32_MAX) { + ds_put_format(ds, "/%#"PRIx32, match->mark_mask); + } + ds_put_char(ds, ' '); + } + + if (!ovs_u128_is_zero(match->labels_mask)) { + 
ovs_be128 be_value = hton128(match->labels); + ovs_be128 be_mask = hton128(match->labels_mask); + + ds_put_cstr(ds, "labels="); + ds_put_hex(ds, &be_value, sizeof be_value); + + if (!ovs_u128_is_ones(match->labels_mask)) { + ds_put_char(ds, '/'); + ds_put_hex(ds, &be_mask, sizeof be_mask); + } + ds_put_char(ds, ' '); + } + + ds_put_cstr(ds, "'"); + ofp_ct_tuple_format(ds, &match->tuple_orig, match->ip_proto, + match->l3_type); + ds_put_format(ds, ",ct_nw_proto=%u' '", match->ip_proto); + ofp_ct_tuple_format(ds, &match->tuple_reply, match->ip_proto, + match->l3_type); + ds_put_cstr(ds, "'"); +} + +static inline bool +ofp_ct_masked_parse(const char *s, uint8_t *val, size_t val_len, + uint8_t *mask, size_t mask_len) +{ + char *tail; + if (!parse_int_string(s, val, val_len, &tail)) { + if (*tail != '/' || parse_int_string(tail + 1, mask, + mask_len, &tail)) { + memset(mask, UINT8_MAX, mask_len); + } + + return true; + } + + return false; +} + +/* Parses a specification of a conntrack 5-tuple from 's' into 'tuple'. + * Returns true on success. Otherwise, returns false and puts the error + * message in 'ds'. */ +static bool +ofp_ct_tuple_parse(struct ofp_ct_tuple *tuple, const char *s, + struct ds *ds, uint8_t *ip_proto, uint16_t *l3_type) +{ + char *pos, *key, *value, *copy; + + pos = copy = xstrdup(s); + while (ofputil_parse_key_value(&pos, &key, &value)) { + if (!*value) { + ds_put_format(ds, "field %s missing value", key); + goto error; + } + + if (!strcmp(key, "ct_nw_src") || !strcmp(key, "ct_nw_dst")) { + struct in6_addr *addr = key[6] == 's' ? 
&tuple->src : &tuple->dst; + + if (*l3_type && *l3_type != AF_INET) { + ds_put_format(ds ,"the L3 protocol does not match %s", value); + goto error; + } + + if (!ipv6_is_zero(addr)) { + ds_put_format(ds, "%s is set multiple times", key); + goto error; + } + + ovs_be32 ip = 0; + if (!ip_parse(value, &ip)) { + goto error_with_msg; + } + + *l3_type = AF_INET; + *addr = in6_addr_mapped_ipv4(ip); + } else if (!strcmp(key, "ct_ipv6_src") || + !strcmp(key, "ct_ipv6_dst")) { + struct in6_addr *addr = key[8] == 's' ? &tuple->src : &tuple->dst; + + if (*l3_type && *l3_type != AF_INET6) { + ds_put_format(ds, "the L3 protocol does not match %s", value); + goto error; + } + + if (!ipv6_is_zero(addr)) { + ds_put_format(ds, "%s is set multiple times", key); + goto error; + } + + if (!ipv6_parse(value, addr)) { + goto error_with_msg; + } + + *l3_type = AF_INET6; + } else if (!strcmp(key, "ct_nw_proto")) { + if (*ip_proto) { + ds_put_format(ds, "%s is set multiple times", key); + goto error; + } + char *err = str_to_u8(value, key, ip_proto); + + if (err) { + free(err); + goto error_with_msg; + } + } else if (!strcmp(key, "ct_tp_src") || !strcmp(key, "ct_tp_dst")) { + uint16_t port; + char *err = str_to_u16(value, key, &port); + + if (err) { + free(err); + goto error_with_msg; + } + if (key[6] == 's') { + tuple->src_port = htons(port); + } else { + tuple->dst_port = htons(port); + } + } else if (!strcmp(key, "icmp_type") || !strcmp(key, "icmp_code") || + !strcmp(key, "icmp_id")) { + if (*ip_proto != IPPROTO_ICMP && *ip_proto != IPPROTO_ICMPV6) { + ds_put_cstr(ds, "invalid L4 fields"); + goto error; + } + uint16_t icmp_id = 0; + char *err; + + if (key[5] == 't') { + err = str_to_u8(value, key, &tuple->icmp_type); + } else if (key[5] == 'c') { + err = str_to_u8(value, key, &tuple->icmp_code); + } else { + err = str_to_u16(value, key, &icmp_id); + tuple->icmp_id = htons(icmp_id); + } + if (err) { + free(err); + goto error_with_msg; + } + } else { + ds_put_format(ds, "invalid conntrack tuple field:
%s", key); + goto error; + } + } + + if (!*ip_proto && (tuple->src_port || tuple->dst_port)) { + ds_put_cstr(ds, "port is set without protocol"); + goto error; + } + + free(copy); + return true; + +error_with_msg: + ds_put_format(ds, "failed to parse field %s", key); +error: + free(copy); + return false; +} + +/* Parses a specification of a conntrack match from 'argv' into 'match'. + * Returns true on success. Otherwise, returns false and puts the error + * message in 'ds'. */ +bool +ofp_ct_match_parse(const char **argv, int argc, struct ds *ds, + struct ofp_ct_match *match, bool *with_zone, + uint16_t *zone_id) +{ + int args = argc; + + /* Parse zone. */ + if (args && !strncmp(argv[argc - args], "zone=", 5)) { + if (!ovs_scan(argv[argc - args], "zone=%"SCNu16, zone_id)) { + ds_put_cstr(ds, "failed to parse zone"); + return false; + } + *with_zone = true; + args--; + } + + /* Parse mark. */ + if (args && !strncmp(argv[argc - args], "mark=", 5)) { + const char *s = argv[argc - args] + 5; + ovs_be32 mark_be; + ovs_be32 mask_be; + + if (ofp_ct_masked_parse(s, (uint8_t *) &mark_be, sizeof mark_be, + (uint8_t *) &mask_be, sizeof mask_be)) { + match->mark = ntohl(mark_be); + match->mark_mask = ntohl(mask_be); + } else { + ds_put_cstr(ds, "failed to parse mark"); + return false; + } + args--; + } + + /* Parse labels. */ + if (args && !strncmp(argv[argc - args], "labels=", 7)) { + const char *s = argv[argc - args] + 7; + ovs_be128 labels_be; + ovs_be128 mask_be; + + if (ofp_ct_masked_parse(s, (uint8_t *) &labels_be, sizeof labels_be, + (uint8_t *) &mask_be, sizeof mask_be)) { + match->labels = ntoh128(labels_be); + match->labels_mask = ntoh128(mask_be); + } else { + ds_put_cstr(ds, "failed to parse labels"); + return false; + } + args--; + } + + /* Parse ct tuples. */ + for (int i = 0; i < 2; i++) { + if (!args) { + break; + } + + struct ofp_ct_tuple *tuple = + i ? 
&match->tuple_reply : &match->tuple_orig; + const char *arg = argv[argc - args]; + + if (arg[0] && !ofp_ct_tuple_parse(tuple, arg, ds, &match->ip_proto, + &match->l3_type)) { + return false; + } + args--; + } + + if (args > 0) { + ds_put_cstr(ds, "invalid arguments"); + return false; + } + + return true; +} + +static enum ofperr +ofpprop_pull_ipv6(struct ofpbuf *property, struct in6_addr *addr, + uint16_t *l3_type) +{ + if (ofpbuf_msgsize(property) < sizeof *addr) { + return OFPERR_OFPBPC_BAD_LEN; + } + + memcpy(addr, property->msg, sizeof *addr); + + uint16_t l3 = 0; + if (!ipv6_is_zero(addr)) { + l3 = IN6_IS_ADDR_V4MAPPED(addr) ? AF_INET : AF_INET6; + } + + if (*l3_type && l3 && *l3_type != l3) { + return OFPERR_OFPBPC_BAD_VALUE; + } + + *l3_type = l3; + + return 0; +} + +static enum ofperr +ofp_ct_tuple_decode_nested(struct ofpbuf *property, struct ofp_ct_tuple *tuple, + uint16_t *l3_type) +{ + struct ofpbuf nested; + enum ofperr error = ofpprop_parse_nested(property, &nested); + if (error) { + return error; + } + + while (nested.size) { + struct ofpbuf inner; + uint64_t type; + + error = ofpprop_pull(&nested, &inner, &type); + if (error) { + return error; + } + switch (type) { + case NXT_CT_TUPLE_SRC: + error = ofpprop_pull_ipv6(&inner, &tuple->src, l3_type); + break; + + case NXT_CT_TUPLE_DST: + error = ofpprop_pull_ipv6(&inner, &tuple->dst, l3_type); + break; + + case NXT_CT_TUPLE_SRC_PORT: + error = ofpprop_parse_be16(&inner, &tuple->src_port); + break; + + case NXT_CT_TUPLE_DST_PORT: + error = ofpprop_parse_be16(&inner, &tuple->dst_port); + break; + + case NXT_CT_TUPLE_ICMP_ID: + error = ofpprop_parse_be16(&inner, &tuple->icmp_id); + break; + + case NXT_CT_TUPLE_ICMP_TYPE: + error = ofpprop_parse_u8(&inner, &tuple->icmp_type); + break; + + case NXT_CT_TUPLE_ICMP_CODE: + error = ofpprop_parse_u8(&inner, &tuple->icmp_code); + break; + + default: + error = OFPPROP_UNKNOWN(false, "NXT_CT_TUPLE", type); + break; + } + + if (error) { + return error; + } + } + + 
return 0; +} + +static void +ofp_ct_tuple_encode(const struct ofp_ct_tuple *tuple, struct ofpbuf *buf, + enum nx_ct_flush_tlv_type type, uint8_t ip_proto) +{ + /* 128 B is enough to hold the whole tuple. */ + uint8_t stub[128]; + struct ofpbuf nested = OFPBUF_STUB_INITIALIZER(stub); + + if (!ipv6_is_zero(&tuple->src)) { + ofpprop_put(&nested, NXT_CT_TUPLE_SRC, &tuple->src, sizeof tuple->src); + } + + if (!ipv6_is_zero(&tuple->dst)) { + ofpprop_put(&nested, NXT_CT_TUPLE_DST, &tuple->dst, sizeof tuple->dst); + } + + if (ip_proto == IPPROTO_ICMP || ip_proto == IPPROTO_ICMPV6) { + ofpprop_put_be16(&nested, NXT_CT_TUPLE_ICMP_ID, tuple->icmp_id); + ofpprop_put_u8(&nested, NXT_CT_TUPLE_ICMP_TYPE, tuple->icmp_type); + ofpprop_put_u8(&nested, NXT_CT_TUPLE_ICMP_CODE, tuple->icmp_code); + } else { + if (tuple->src_port) { + ofpprop_put_be16(&nested, NXT_CT_TUPLE_SRC_PORT, tuple->src_port); + } + + if (tuple->dst_port) { + ofpprop_put_be16(&nested, NXT_CT_TUPLE_DST_PORT, tuple->dst_port); + } + } + + if (nested.size) { + ofpprop_put_nested(buf, type, &nested); + } + + ofpbuf_uninit(&nested); +} + +enum ofperr +ofp_ct_match_decode(struct ofp_ct_match *match, bool *with_zone, + uint16_t *zone_id, const struct ofp_header *oh) +{ + uint32_t tlv_flags = 0; + struct ofpbuf msg = ofpbuf_const_initializer(oh, ntohs(oh->length)); + ofpraw_pull_assert(&msg); + + const struct nx_ct_flush *nx_flush = ofpbuf_pull(&msg, sizeof *nx_flush); + + if (!is_all_zeros(nx_flush->pad, sizeof nx_flush->pad)) { + return OFPERR_NXBRC_MUST_BE_ZERO; + } + + match->ip_proto = nx_flush->ip_proto; + + struct ofp_ct_tuple *orig = &match->tuple_orig; + struct ofp_ct_tuple *reply = &match->tuple_reply; + + while (msg.size) { + struct ofpbuf property; + uint64_t type; + + enum ofperr error = ofpprop_pull(&msg, &property, &type); + if (error) { + return error; + } + + switch (type) { + case NXT_CT_ORIG_TUPLE: + error = ofp_ct_tuple_decode_nested(&property, orig, + &match->l3_type); + break; + + case 
NXT_CT_REPLY_TUPLE: + error = ofp_ct_tuple_decode_nested(&property, reply, + &match->l3_type); + break; + + case NXT_CT_ZONE_ID: + if (with_zone) { + *with_zone = true; + } + error = ofpprop_parse_u16(&property, zone_id); + break; + + case NXT_CT_MARK: + error = ofpprop_parse_u32(&property, &match->mark); + break; + + case NXT_CT_MARK_MASK: + error = ofpprop_parse_u32(&property, &match->mark_mask); + break; + + case NXT_CT_LABELS: + error = ofpprop_parse_u128(&property, &match->labels); + break; + + case NXT_CT_LABELS_MASK: + error = ofpprop_parse_u128(&property, &match->labels_mask); + break; + + default: + error = OFPPROP_UNKNOWN(false, "NXT_CT_FLUSH", type); + break; + } + + if (error) { + return error; + } + + if (type < (sizeof tlv_flags * CHAR_BIT)) { + tlv_flags |= (UINT32_C(1) << type); + } + } + + /* Consider the mask being all ones if it's not present but the value + * is specified. */ + if (tlv_flags & (UINT32_C(1) << NXT_CT_MARK) && + !(tlv_flags & (UINT32_C(1) << NXT_CT_MARK_MASK))) { + match->mark_mask = UINT32_MAX; + } + + if (tlv_flags & (UINT32_C(1) << NXT_CT_LABELS) && + !(tlv_flags & (UINT32_C(1) << NXT_CT_LABELS_MASK))) { + match->labels_mask = OVS_U128_MAX; + } + + return 0; +} + +struct ofpbuf * +ofp_ct_match_encode(const struct ofp_ct_match *match, uint16_t *zone_id, + enum ofp_version version) +{ + struct ofpbuf *msg = ofpraw_alloc(OFPRAW_NXT_CT_FLUSH, version, 0); + struct nx_ct_flush *nx_flush = ofpbuf_put_zeros(msg, sizeof *nx_flush); + const struct ofp_ct_tuple *orig = &match->tuple_orig; + const struct ofp_ct_tuple *reply = &match->tuple_reply; + + nx_flush->ip_proto = match->ip_proto; + + ofp_ct_tuple_encode(orig, msg, NXT_CT_ORIG_TUPLE,match->ip_proto); + ofp_ct_tuple_encode(reply, msg, NXT_CT_REPLY_TUPLE, match->ip_proto); + + if (zone_id) { + ofpprop_put_u16(msg, NXT_CT_ZONE_ID, *zone_id); + } + + if (match->mark_mask) { + ofpprop_put_u32(msg, NXT_CT_MARK, match->mark); + if (match->mark_mask != UINT32_MAX) { + ofpprop_put_u32(msg, 
NXT_CT_MARK_MASK, match->mark_mask); + } + } + + if (!ovs_u128_is_zero(match->labels_mask)) { + ofpprop_put_u128(msg, NXT_CT_LABELS, match->labels); + if (!ovs_u128_is_ones(match->labels_mask)) { + ofpprop_put_u128(msg, NXT_CT_LABELS_MASK, match->labels_mask); + } + } + + return msg; +} diff --git a/lib/ofp-group.c b/lib/ofp-group.c index 737f48047b1..3edf1b01b37 100644 --- a/lib/ofp-group.c +++ b/lib/ofp-group.c @@ -1526,6 +1526,31 @@ ofputil_group_properties_destroy(struct ofputil_group_props *gp) free(gp->fields.values); } +void +ofputil_group_properties_format(const struct ofputil_group_props *gp, + struct ds *ds) +{ + if (!gp->selection_method[0]) { + return; + } + + ds_put_format(ds, ",selection_method=%s", gp->selection_method); + if (gp->selection_method_param) { + ds_put_format(ds, ",selection_method_param=%"PRIu64, + gp->selection_method_param); + } + + size_t n = bitmap_count1(gp->fields.used.bm, MFF_N_IDS); + if (n == 1) { + ds_put_cstr(ds, ",fields="); + oxm_format_field_array(ds, &gp->fields); + } else if (n > 1) { + ds_put_cstr(ds, ",fields("); + oxm_format_field_array(ds, &gp->fields); + ds_put_char(ds, ')'); + } +} + static enum ofperr parse_group_prop_ntr_selection_method(struct ofpbuf *payload, enum ofp11_group_type group_type, @@ -1813,16 +1838,45 @@ ofp_print_bucket_id(struct ds *s, const char *label, uint32_t bucket_id, ds_put_char(s, ','); } -static void -ofp_print_group(struct ds *s, uint32_t group_id, uint8_t type, - const struct ovs_list *p_buckets, - const struct ofputil_group_props *props, - enum ofp_version ofp_version, bool suppress_type, - const struct ofputil_port_map *port_map, - const struct ofputil_table_map *table_map) +void +ofputil_bucket_format(struct ds * s, const struct ofputil_bucket *bucket, + enum ofp11_group_type type, enum ofp_version ofp_version, + const struct ofputil_port_map *port_map, + const struct ofputil_table_map *table_map) { - struct ofputil_bucket *bucket; + ds_put_cstr(s, "bucket="); + + 
ofp_print_bucket_id(s, "bucket_id:", bucket->bucket_id, ofp_version); + if (bucket->weight != (type == OFPGT11_SELECT ? 1 : 0)) { + ds_put_format(s, "weight:%"PRIu16",", bucket->weight); + } + if (bucket->watch_port != OFPP_NONE) { + ds_put_cstr(s, "watch_port:"); + ofputil_format_port(bucket->watch_port, port_map, s); + ds_put_char(s, ','); + } + if (bucket->watch_group != OFPG_ANY) { + ds_put_format(s, "watch_group:%"PRIu32",", bucket->watch_group); + } + ds_put_cstr(s, "actions="); + struct ofpact_format_params fp = { + .port_map = port_map, + .table_map = table_map, + .s = s, + }; + ofpacts_format(bucket->ofpacts, bucket->ofpacts_len, &fp); +} + +void +ofputil_group_format(struct ds *s, uint32_t group_id, uint8_t type, + const struct ofputil_bucket *bucket, + const struct ovs_list *p_buckets, + const struct ofputil_group_props *props, + enum ofp_version ofp_version, bool suppress_type, + const struct ofputil_port_map *port_map, + const struct ofputil_table_map *table_map) +{ ds_put_format(s, "group_id=%"PRIu32, group_id); if (!suppress_type) { @@ -1831,57 +1885,24 @@ ofp_print_group(struct ds *s, uint32_t group_id, uint8_t type, ds_put_format(s, ",type=%s", type_str[type > 4 ? 
4 : type]); } - if (props->selection_method[0]) { - ds_put_format(s, ",selection_method=%s", props->selection_method); - if (props->selection_method_param) { - ds_put_format(s, ",selection_method_param=%"PRIu64, - props->selection_method_param); - } - - size_t n = bitmap_count1(props->fields.used.bm, MFF_N_IDS); - if (n == 1) { - ds_put_cstr(s, ",fields="); - oxm_format_field_array(s, &props->fields); - } else if (n > 1) { - ds_put_cstr(s, ",fields("); - oxm_format_field_array(s, &props->fields); - ds_put_char(s, ')'); - } - } + ofputil_group_properties_format(props, s); - if (!p_buckets) { + if (!bucket && !p_buckets) { return; } ds_put_char(s, ','); - LIST_FOR_EACH (bucket, list_node, p_buckets) { - ds_put_cstr(s, "bucket="); - - ofp_print_bucket_id(s, "bucket_id:", bucket->bucket_id, ofp_version); - if (bucket->weight != (type == OFPGT11_SELECT ? 1 : 0)) { - ds_put_format(s, "weight:%"PRIu16",", bucket->weight); - } - if (bucket->watch_port != OFPP_NONE) { - ds_put_cstr(s, "watch_port:"); - ofputil_format_port(bucket->watch_port, port_map, s); + if (bucket) { + ofputil_bucket_format(s, bucket, type, ofp_version, NULL, NULL); + } else { + LIST_FOR_EACH (bucket, list_node, p_buckets) { + ofputil_bucket_format(s, bucket, type, ofp_version, + port_map, table_map); ds_put_char(s, ','); } - if (bucket->watch_group != OFPG_ANY) { - ds_put_format(s, "watch_group:%"PRIu32",", bucket->watch_group); - } - - ds_put_cstr(s, "actions="); - struct ofpact_format_params fp = { - .port_map = port_map, - .table_map = table_map, - .s = s, - }; - ofpacts_format(bucket->ofpacts, bucket->ofpacts_len, &fp); - ds_put_char(s, ','); + ds_chomp(s, ','); } - - ds_chomp(s, ','); } enum ofperr @@ -1901,8 +1922,9 @@ ofputil_group_desc_format(struct ds *s, const struct ofp_header *oh, ds_put_char(s, '\n'); ds_put_char(s, ' '); - ofp_print_group(s, gd.group_id, gd.type, &gd.buckets, &gd.props, - oh->version, false, port_map, table_map); + ofputil_group_format(s, gd.group_id, gd.type, NULL, 
&gd.buckets, + &gd.props, oh->version, false, + port_map, table_map); ofputil_uninit_group_desc(&gd); } } @@ -2368,8 +2390,9 @@ ofputil_group_mod_format__(struct ds *s, enum ofp_version ofp_version, gm->command_bucket_id, ofp_version); } - ofp_print_group(s, gm->group_id, gm->type, &gm->buckets, &gm->props, - ofp_version, bucket_command, port_map, table_map); + ofputil_group_format(s, gm->group_id, gm->type, NULL, &gm->buckets, + &gm->props, ofp_version, bucket_command, + port_map, table_map); } enum ofperr diff --git a/lib/ofp-monitor.c b/lib/ofp-monitor.c index c27733a5264..29b0c5965c7 100644 --- a/lib/ofp-monitor.c +++ b/lib/ofp-monitor.c @@ -962,7 +962,7 @@ ofputil_decode_flow_update(struct ofputil_flow_update *update, return 0; } else if (update->event == OFPFME_PAUSED || update->event == OFPFME_RESUMED) { - struct ofp_flow_update_paused *ofup; + struct ofp_flow_update_paused *ofup OVS_UNUSED; if (length != sizeof *ofup) { goto bad_len; diff --git a/lib/ofp-msgs.c b/lib/ofp-msgs.c index 93aa812978e..fdb89806480 100644 --- a/lib/ofp-msgs.c +++ b/lib/ofp-msgs.c @@ -148,7 +148,7 @@ struct raw_instance { /* Information about a particular 'enum ofpraw'. */ struct raw_info { /* All possible instantiations of this OFPRAW_* into OpenFlow headers. */ - struct raw_instance *instances; /* min_version - max_version + 1 elems. */ + struct raw_instance *instances; /* max_version - min_version + 1 elems. 
*/ uint8_t min_version; uint8_t max_version; diff --git a/lib/ofp-parse.c b/lib/ofp-parse.c index a90b926efb5..102b183a8fd 100644 --- a/lib/ofp-parse.c +++ b/lib/ofp-parse.c @@ -71,16 +71,13 @@ str_to_u16(const char *str, const char *name, uint16_t *valuep) char * OVS_WARN_UNUSED_RESULT str_to_u32(const char *str, uint32_t *valuep) { - char *tail; - uint32_t value; + unsigned long long value; if (!str[0]) { return xstrdup("missing required numeric argument"); } - errno = 0; - value = strtoul(str, &tail, 0); - if (errno == EINVAL || errno == ERANGE || *tail) { + if (!str_to_ullong(str, 0, &value) || value > UINT32_MAX) { return xasprintf("invalid numeric format %s", str); } *valuep = value; diff --git a/lib/ofp-print.c b/lib/ofp-print.c index bd37fa17a59..874079b84b4 100644 --- a/lib/ofp-print.c +++ b/lib/ofp-print.c @@ -45,6 +45,7 @@ #include "openvswitch/ofp-actions.h" #include "openvswitch/ofp-bundle.h" #include "openvswitch/ofp-connection.h" +#include "openvswitch/ofp-ct.h" #include "openvswitch/ofp-errors.h" #include "openvswitch/ofp-group.h" #include "openvswitch/ofp-ipfix.h" @@ -949,6 +950,23 @@ ofp_print_nxt_ct_flush_zone(struct ds *string, const struct nx_zone_id *nzi) return 0; } +static enum ofperr +ofp_print_nxt_ct_flush(struct ds *string, const struct ofp_header *oh) +{ + uint16_t zone_id = 0; + struct ofp_ct_match match = {0}; + + enum ofperr error = ofp_ct_match_decode(&match, NULL, &zone_id, oh); + if (error) { + return error; + } + + ds_put_format(string, " zone=%"PRIu16" ", zone_id); + ofp_ct_match_format(string, &match); + + return 0; +} + static enum ofperr ofp_to_string__(const struct ofp_header *oh, const struct ofputil_port_map *port_map, @@ -1184,6 +1202,8 @@ ofp_to_string__(const struct ofp_header *oh, case OFPTYPE_CT_FLUSH_ZONE: return ofp_print_nxt_ct_flush_zone(string, ofpmsg_body(oh)); + case OFPTYPE_CT_FLUSH: + return ofp_print_nxt_ct_flush(string, oh); } return 0; diff --git a/lib/ofp-prop.c b/lib/ofp-prop.c index 
8b2d8a85abe..0e54543bdd8 100644 --- a/lib/ofp-prop.c +++ b/lib/ofp-prop.c @@ -21,6 +21,7 @@ #include "openvswitch/ofp-errors.h" #include "openvswitch/ofp-prop.h" #include "openvswitch/vlog.h" +#include "unaligned.h" #include "util.h" #include "uuid.h" @@ -184,6 +185,21 @@ ofpprop_parse_be64(const struct ofpbuf *property, ovs_be64 *value) return 0; } +/* Attempts to parse 'property' as a property containing a 128-bit value. If + * successful, stores the value into '*value' and returns 0; otherwise returns + * an OpenFlow error. */ +enum ofperr +ofpprop_parse_be128(const struct ofpbuf *property, ovs_be128 *value) +{ + ovs_32aligned_be128 *p = property->msg; + + if (ofpbuf_msgsize(property) != sizeof *p) { + return OFPERR_OFPBPC_BAD_LEN; + } + *value = get_32aligned_be128(p); + return 0; +} + /* Attempts to parse 'property' as a property containing a 8-bit value. If * successful, stores the value into '*value' and returns 0; otherwise returns * an OpenFlow error. */ @@ -250,6 +266,21 @@ ofpprop_parse_u64(const struct ofpbuf *property, uint64_t *value) return 0; } +/* Attempts to parse 'property' as a property containing a 128-bit value. If + * successful, stores the value into '*value' and returns 0; otherwise returns + * an OpenFlow error. */ +enum ofperr +ofpprop_parse_u128(const struct ofpbuf *property, ovs_u128 *value) +{ + enum ofperr error = ofpprop_parse_be128(property, (ovs_be128 *) value); + + if (!error) { + *value = ntoh128(*(ovs_be128 *) value); + } + + return error; +} + /* Attempts to parse 'property' as a property containing a UUID. If * successful, stores the value into '*uuid' and returns 0; otherwise returns * an OpenFlow error. */ @@ -351,6 +382,13 @@ ofpprop_put_be64(struct ofpbuf *msg, uint64_t type, ovs_be64 value) ofpprop_end(msg, start); } +/* Adds a property with the given 'type' and 128-bit 'value' to 'msg'. 
*/ +void +ofpprop_put_be128(struct ofpbuf *msg, uint64_t type, ovs_be128 value) +{ + ofpprop_put(msg, type, &value, sizeof value); +} + /* Adds a property with the given 'type' and 8-bit 'value' to 'msg'. */ void ofpprop_put_u8(struct ofpbuf *msg, uint64_t type, uint8_t value) @@ -381,6 +419,13 @@ ofpprop_put_u64(struct ofpbuf *msg, uint64_t type, uint64_t value) ofpprop_put_be64(msg, type, htonll(value)); } +/* Adds a property with the given 'type' and 128-bit 'value' to 'msg'. */ +void +ofpprop_put_u128(struct ofpbuf *msg, uint64_t type, ovs_u128 value) +{ + ofpprop_put_be128(msg, type, hton128(value)); +} + /* Appends a property to 'msg' whose type is 'type' and whose contents is a * series of property headers, one for each 1-bit in 'bitmap'. */ void diff --git a/lib/ofp-table.c b/lib/ofp-table.c index a956754f2d5..f9bd3b7f9c8 100644 --- a/lib/ofp-table.c +++ b/lib/ofp-table.c @@ -1416,7 +1416,7 @@ count_common_prefix_run(const char *ids[], size_t n, if (!next) { break; } else if (next < extra_prefix_len) { - next = extra_prefix_len; + extra_prefix_len = next; } i++; } diff --git a/lib/ofpbuf.c b/lib/ofpbuf.c index d3d42b41482..232ebeb97ba 100644 --- a/lib/ofpbuf.c +++ b/lib/ofpbuf.c @@ -197,12 +197,12 @@ ofpbuf_clone_with_headroom(const struct ofpbuf *b, size_t headroom) struct ofpbuf *new_buffer; new_buffer = ofpbuf_clone_data_with_headroom(b->data, b->size, headroom); - if (b->header) { + if (new_buffer->data && b->header) { ptrdiff_t header_offset = (char *) b->header - (char *) b->data; new_buffer->header = (char *) new_buffer->data + header_offset; } - if (b->msg) { + if (new_buffer->data && b->msg) { ptrdiff_t msg_offset = (char *) b->msg - (char *) b->data; new_buffer->msg = (char *) new_buffer->data + msg_offset; diff --git a/lib/ovs-atomic-clang.h b/lib/ovs-atomic-clang.h index cdf02a512a9..0fc643c8a97 100644 --- a/lib/ovs-atomic-clang.h +++ b/lib/ovs-atomic-clang.h @@ -23,8 +23,6 @@ #define ATOMIC(TYPE) _Atomic(TYPE) -#define ATOMIC_VAR_INIT(VALUE) 
(VALUE) - #define atomic_init(OBJECT, VALUE) __c11_atomic_init(OBJECT, VALUE) /* Clang hard-codes these exact values internally but does not appear to diff --git a/lib/ovs-atomic-gcc4+.h b/lib/ovs-atomic-gcc4+.h index f9accde1a39..1917df69007 100644 --- a/lib/ovs-atomic-gcc4+.h +++ b/lib/ovs-atomic-gcc4+.h @@ -43,7 +43,6 @@ typedef enum { #define IS_LOCKLESS_ATOMIC(OBJECT) (sizeof(OBJECT) <= sizeof(void *)) -#define ATOMIC_VAR_INIT(VALUE) VALUE #define atomic_init(OBJECT, VALUE) (*(OBJECT) = (VALUE), (void) 0) static inline void diff --git a/lib/ovs-atomic-gcc4.7+.h b/lib/ovs-atomic-gcc4.7+.h index 846e0577520..9680e546fc1 100644 --- a/lib/ovs-atomic-gcc4.7+.h +++ b/lib/ovs-atomic-gcc4.7+.h @@ -30,7 +30,6 @@ typedef enum { memory_order_seq_cst = __ATOMIC_SEQ_CST } memory_order; -#define ATOMIC_VAR_INIT(VALUE) (VALUE) #define atomic_init(OBJECT, VALUE) (*(OBJECT) = (VALUE), (void) 0) #define atomic_thread_fence __atomic_thread_fence diff --git a/lib/ovs-atomic-i586.h b/lib/ovs-atomic-i586.h index 35a0959ffca..2b651865215 100644 --- a/lib/ovs-atomic-i586.h +++ b/lib/ovs-atomic-i586.h @@ -119,7 +119,6 @@ typedef enum { #define IS_LOCKLESS_ATOMIC(OBJECT) \ (sizeof(OBJECT) <= 8 && IS_POW2(sizeof(OBJECT))) -#define ATOMIC_VAR_INIT(VALUE) VALUE #define atomic_init(OBJECT, VALUE) (*(OBJECT) = (VALUE), (void) 0) /* diff --git a/lib/ovs-atomic-msvc.h b/lib/ovs-atomic-msvc.h index fb8cd03bd69..3a71f61aeec 100644 --- a/lib/ovs-atomic-msvc.h +++ b/lib/ovs-atomic-msvc.h @@ -59,7 +59,6 @@ typedef enum { #define IS_LOCKLESS_ATOMIC(OBJECT) \ (sizeof(OBJECT) <= 8 && IS_POW2(sizeof(OBJECT))) -#define ATOMIC_VAR_INIT(VALUE) (VALUE) #define atomic_init(OBJECT, VALUE) (*(OBJECT) = (VALUE), (void) 0) static inline void diff --git a/lib/ovs-atomic-pthreads.h b/lib/ovs-atomic-pthreads.h index 570a67fe4cb..0e4263fe288 100644 --- a/lib/ovs-atomic-pthreads.h +++ b/lib/ovs-atomic-pthreads.h @@ -42,7 +42,6 @@ typedef enum { memory_order_seq_cst } memory_order; -#define ATOMIC_VAR_INIT(VALUE) 
(VALUE) #define atomic_init(OBJECT, VALUE) (*(OBJECT) = (VALUE), (void) 0) static inline void diff --git a/lib/ovs-atomic-x86_64.h b/lib/ovs-atomic-x86_64.h index 3bdaf2f08e9..2f538699f18 100644 --- a/lib/ovs-atomic-x86_64.h +++ b/lib/ovs-atomic-x86_64.h @@ -120,7 +120,6 @@ typedef enum { #define IS_LOCKLESS_ATOMIC(OBJECT) \ (sizeof(OBJECT) <= 8 && IS_POW2(sizeof(OBJECT))) -#define ATOMIC_VAR_INIT(VALUE) VALUE #define atomic_init(OBJECT, VALUE) (*(OBJECT) = (VALUE), (void) 0) /* diff --git a/lib/ovs-atomic.h b/lib/ovs-atomic.h index 8fdce0cf804..f140d25feba 100644 --- a/lib/ovs-atomic.h +++ b/lib/ovs-atomic.h @@ -91,10 +91,9 @@ * Life Cycle * ========== * - * To initialize an atomic variable at its point of definition, use - * ATOMIC_VAR_INIT: + * To initialize an atomic variable at its point of definition, use: * - * static atomic_int ai = ATOMIC_VAR_INIT(123); + * static atomic_int ai = 123; * * To initialize an atomic variable in code, use atomic_init(): * @@ -329,7 +328,7 @@ #if __CHECKER__ /* sparse doesn't understand some GCC extensions we use. 
*/ #include "ovs-atomic-pthreads.h" - #elif __has_extension(c_atomic) + #elif __clang__ && __has_extension(c_atomic) #include "ovs-atomic-clang.h" #elif HAVE_ATOMIC && __cplusplus >= 201103L #include "ovs-atomic-c++.h" diff --git a/lib/ovs-rcu.c b/lib/ovs-rcu.c index 946aa04d18e..49afcc55c94 100644 --- a/lib/ovs-rcu.c +++ b/lib/ovs-rcu.c @@ -170,7 +170,7 @@ ovsrcu_try_quiesce(void) ovs_assert(!single_threaded()); perthread = ovsrcu_perthread_get(); if (!seq_try_lock()) { - perthread->seqno = seq_read_protected(global_seqno); + perthread->seqno = seq_read(global_seqno); if (perthread->cbset) { ovsrcu_flush_cbset__(perthread, true); } @@ -326,7 +326,7 @@ ovsrcu_postpone__(void (*function)(void *aux), void *aux) cb->aux = aux; } -static bool +static bool OVS_NO_SANITIZE_FUNCTION ovsrcu_call_postponed(void) { struct ovsrcu_cbset *cbset; diff --git a/lib/ovs-rcu.h b/lib/ovs-rcu.h index 8b397b7fb0c..a1c15c1266e 100644 --- a/lib/ovs-rcu.h +++ b/lib/ovs-rcu.h @@ -175,7 +175,7 @@ #if __GNUC__ #define OVSRCU_TYPE(TYPE) struct { ATOMIC(TYPE) p; } -#define OVSRCU_INITIALIZER(VALUE) { ATOMIC_VAR_INIT(VALUE) } +#define OVSRCU_INITIALIZER(VALUE) { VALUE } #define ovsrcu_get__(TYPE, VAR, ORDER) \ ({ \ TYPE value__; \ @@ -207,7 +207,7 @@ #else /* not GNU C */ struct ovsrcu_pointer { ATOMIC(void *) p; }; #define OVSRCU_TYPE(TYPE) struct ovsrcu_pointer -#define OVSRCU_INITIALIZER(VALUE) { ATOMIC_VAR_INIT(VALUE) } +#define OVSRCU_INITIALIZER(VALUE) { VALUE } static inline void * ovsrcu_get__(const struct ovsrcu_pointer *pointer, memory_order order) { diff --git a/lib/ovs-replay.c b/lib/ovs-replay.c index f386246c7ef..551c7f56d3b 100644 --- a/lib/ovs-replay.c +++ b/lib/ovs-replay.c @@ -34,7 +34,7 @@ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 25); static struct ovs_mutex replay_mutex = OVS_MUTEX_INITIALIZER; static int replay_seqno OVS_GUARDED_BY(replay_mutex) = 0; -static atomic_int replay_state = ATOMIC_VAR_INIT(OVS_REPLAY_NONE); +static atomic_int replay_state = 
OVS_REPLAY_NONE; static char *dirname = NULL; diff --git a/lib/ovs-router.c b/lib/ovs-router.c index 5d0fbd503e9..3d84c9a30a8 100644 --- a/lib/ovs-router.c +++ b/lib/ovs-router.c @@ -115,7 +115,8 @@ ovs_router_lookup(uint32_t mark, const struct in6_addr *ip6_dst, const struct cls_rule *cr_src; struct flow flow_src = {.ipv6_dst = *src, .pkt_mark = mark}; - cr_src = classifier_lookup(&cls, OVS_VERSION_MAX, &flow_src, NULL); + cr_src = classifier_lookup(&cls, OVS_VERSION_MAX, &flow_src, NULL, + NULL); if (cr_src) { struct ovs_router_entry *p_src = ovs_router_entry_cast(cr_src); if (!p_src->local) { @@ -126,7 +127,7 @@ ovs_router_lookup(uint32_t mark, const struct in6_addr *ip6_dst, } } - cr = classifier_lookup(&cls, OVS_VERSION_MAX, &flow, NULL); + cr = classifier_lookup(&cls, OVS_VERSION_MAX, &flow, NULL, NULL); if (cr) { struct ovs_router_entry *p = ovs_router_entry_cast(cr); @@ -164,6 +165,46 @@ static void rt_init_match(struct match *match, uint32_t mark, match->flow.pkt_mark = mark; } +static int +verify_prefsrc(const struct in6_addr *ip6_dst, + const char output_bridge[], + struct in6_addr *prefsrc) +{ + struct in6_addr *mask, *addr6; + struct netdev *dev; + int err, n_in6, i; + + err = netdev_open(output_bridge, NULL, &dev); + if (err) { + return err; + } + + err = netdev_get_addr_list(dev, &addr6, &mask, &n_in6); + if (err) { + goto out; + } + + for (i = 0; i < n_in6; i++) { + struct in6_addr a1, a2; + a1 = ipv6_addr_bitand(ip6_dst, &mask[i]); + a2 = ipv6_addr_bitand(prefsrc, &mask[i]); + + /* Check that the interface has "prefsrc" and + * it is in the same broadcast domain as "ip6_dst".
*/ + if (IN6_ARE_ADDR_EQUAL(prefsrc, &addr6[i]) && + IN6_ARE_ADDR_EQUAL(&a1, &a2)) { + goto out; + } + } + err = ENOENT; + +out: + free(addr6); + free(mask); + netdev_close(dev); + return err; +} + int ovs_router_get_netdev_source_address(const struct in6_addr *ip6_dst, const char output_bridge[], @@ -217,8 +258,12 @@ static int ovs_router_insert__(uint32_t mark, uint8_t priority, bool local, const struct in6_addr *ip6_dst, uint8_t plen, const char output_bridge[], - const struct in6_addr *gw) + const struct in6_addr *gw, + const struct in6_addr *ip6_src) { + int (*get_src_addr)(const struct in6_addr *ip6_dst, + const char output_bridge[], + struct in6_addr *prefsrc); const struct cls_rule *cr; struct ovs_router_entry *p; struct match match; @@ -236,11 +281,17 @@ ovs_router_insert__(uint32_t mark, uint8_t priority, bool local, p->plen = plen; p->local = local; p->priority = priority; - err = ovs_router_get_netdev_source_address(ip6_dst, output_bridge, - &p->src_addr); + + if (ipv6_addr_is_set(ip6_src)) { + p->src_addr = *ip6_src; + get_src_addr = verify_prefsrc; + } else { + get_src_addr = ovs_router_get_netdev_source_address; + } + + err = get_src_addr(ip6_dst, output_bridge, &p->src_addr); if (err && ipv6_addr_is_set(gw)) { - err = ovs_router_get_netdev_source_address(gw, output_bridge, - &p->src_addr); + err = get_src_addr(gw, output_bridge, &p->src_addr); } if (err) { struct ds ds = DS_EMPTY_INITIALIZER; @@ -269,15 +320,30 @@ ovs_router_insert__(uint32_t mark, uint8_t priority, bool local, void ovs_router_insert(uint32_t mark, const struct in6_addr *ip_dst, uint8_t plen, - bool local, const char output_bridge[], - const struct in6_addr *gw) + bool local, const char output_bridge[], + const struct in6_addr *gw, const struct in6_addr *prefsrc) { if (use_system_routing_table) { uint8_t priority = local ? 
plen + 64 : plen; - ovs_router_insert__(mark, priority, local, ip_dst, plen, output_bridge, gw); + ovs_router_insert__(mark, priority, local, ip_dst, plen, + output_bridge, gw, prefsrc); } } +/* The same as 'ovs_router_insert', but it adds the route even if updates + * from the system routing table are disabled. Used for unit tests. */ +void +ovs_router_force_insert(uint32_t mark, const struct in6_addr *ip_dst, + uint8_t plen, bool local, const char output_bridge[], + const struct in6_addr *gw, + const struct in6_addr *prefsrc) +{ + uint8_t priority = local ? plen + 64 : plen; + + ovs_router_insert__(mark, priority, local, ip_dst, plen, + output_bridge, gw, prefsrc); +} + static void rt_entry_delete__(const struct cls_rule *cr) { @@ -341,48 +407,68 @@ static void ovs_router_add(struct unixctl_conn *conn, int argc, const char *argv[], void *aux OVS_UNUSED) { + struct in6_addr src6 = in6addr_any; struct in6_addr gw6 = in6addr_any; + char src6_s[IPV6_SCAN_LEN + 1]; struct in6_addr ip6; uint32_t mark = 0; unsigned int plen; + ovs_be32 src = 0; + ovs_be32 gw = 0; + bool is_ipv6; ovs_be32 ip; int err; + int i; if (scan_ipv4_route(argv[1], &ip, &plen)) { - ovs_be32 gw = 0; - - if (argc > 3) { - if (!ovs_scan(argv[3], "pkt_mark=%"SCNi32, &mark) && - !ip_parse(argv[3], &gw)) { - unixctl_command_reply_error(conn, "Invalid pkt_mark or gateway"); - return; - } - } in6_addr_set_mapped_ipv4(&ip6, ip); - if (gw) { - in6_addr_set_mapped_ipv4(&gw6, gw); - } plen += 96; + is_ipv6 = false; } else if (scan_ipv6_route(argv[1], &ip6, &plen)) { - if (argc > 3) { - if (!ovs_scan(argv[3], "pkt_mark=%"SCNi32, &mark) && - !ipv6_parse(argv[3], &gw6)) { - unixctl_command_reply_error(conn, "Invalid pkt_mark or IPv6 gateway"); - return; - } - } + is_ipv6 = true; } else { - unixctl_command_reply_error(conn, "Invalid parameters"); + unixctl_command_reply_error(conn, + "Invalid 'ip/plen' parameter"); return; } - if (argc > 4) { - if (!ovs_scan(argv[4], "pkt_mark=%"SCNi32, &mark)) { - 
unixctl_command_reply_error(conn, "Invalid pkt_mark"); - return; + + /* Parse optional parameters. */ + for (i = 3; i < argc; i++) { + if (ovs_scan(argv[i], "pkt_mark=%"SCNi32, &mark)) { + continue; } + + if (is_ipv6) { + if (ovs_scan(argv[i], "src="IPV6_SCAN_FMT, src6_s) && + ipv6_parse(src6_s, &src6)) { + continue; + } + if (ipv6_parse(argv[i], &gw6)) { + continue; + } + } else { + if (ovs_scan(argv[i], "src="IP_SCAN_FMT, IP_SCAN_ARGS(&src))) { + continue; + } + if (ip_parse(argv[i], &gw)) { + continue; + } + } + + unixctl_command_reply_error(conn, + "Invalid pkt_mark, IP gateway or src_ip"); + return; + } + + if (gw) { + in6_addr_set_mapped_ipv4(&gw6, gw); + } + if (src) { + in6_addr_set_mapped_ipv4(&src6, src); } - err = ovs_router_insert__(mark, plen + 32, false, &ip6, plen, argv[2], &gw6); + err = ovs_router_insert__(mark, plen + 32, false, &ip6, plen, argv[2], + &gw6, &src6); if (err) { unixctl_command_reply_error(conn, "Error while inserting route."); } else { @@ -532,12 +618,12 @@ ovs_router_init(void) fatal_signal_add_hook(ovs_router_flush_handler, NULL, NULL, true); classifier_init(&cls, NULL); unixctl_command_register("ovs/route/add", - "ip_addr/prefix_len out_br_name [gw] " - "[pkt_mark=mark]", - 2, 4, ovs_router_add, NULL); + "ip/plen output_bridge [gw] " + "[pkt_mark=mark] [src=src_ip]", + 2, 5, ovs_router_add, NULL); unixctl_command_register("ovs/route/show", "", 0, 0, ovs_router_show, NULL); - unixctl_command_register("ovs/route/del", "ip_addr/prefix_len " + unixctl_command_register("ovs/route/del", "ip/plen " "[pkt_mark=mark]", 1, 2, ovs_router_del, NULL); unixctl_command_register("ovs/route/lookup", "ip_addr " diff --git a/lib/ovs-router.h b/lib/ovs-router.h index d8ce3c00ded..d7dc7e55f37 100644 --- a/lib/ovs-router.h +++ b/lib/ovs-router.h @@ -32,7 +32,13 @@ bool ovs_router_lookup(uint32_t mark, const struct in6_addr *ip_dst, void ovs_router_init(void); void ovs_router_insert(uint32_t mark, const struct in6_addr *ip_dst, uint8_t plen, bool 
local, - const char output_bridge[], const struct in6_addr *gw); + const char output_bridge[], const struct in6_addr *gw, + const struct in6_addr *prefsrc); +void ovs_router_force_insert(uint32_t mark, const struct in6_addr *ip_dst, + uint8_t plen, bool local, + const char output_bridge[], + const struct in6_addr *gw, + const struct in6_addr *prefsrc); void ovs_router_flush(void); void ovs_router_disable_system_routing_table(void); diff --git a/lib/ovs-thread.c b/lib/ovs-thread.c index 78ed3e9707e..f8000806156 100644 --- a/lib/ovs-thread.c +++ b/lib/ovs-thread.c @@ -31,6 +31,7 @@ #include "openvswitch/poll-loop.h" #include "seq.h" #include "socket-util.h" +#include "timeval.h" #include "util.h" #ifdef __CHECKER__ @@ -62,13 +63,14 @@ static bool multithreaded; \ /* Verify that 'l' was initialized. */ \ if (OVS_UNLIKELY(!l->where)) { \ - ovs_abort(0, "%s: %s() passed uninitialized ovs_"#TYPE, \ - where, __func__); \ + VLOG_ABORT("%s: %s() passed uninitialized ovs_"#TYPE, \ + where, __func__); \ } \ \ error = pthread_##TYPE##_##FUN(&l->lock); \ if (OVS_UNLIKELY(error)) { \ - ovs_abort(error, "%s: pthread_%s_%s failed", where, #TYPE, #FUN); \ + VLOG_ABORT("%s: pthread_%s_%s failed: %s", where, #TYPE, #FUN, \ + ovs_strerror(error)); \ } \ l->where = where; \ } @@ -90,13 +92,14 @@ LOCK_FUNCTION(spin, lock); \ /* Verify that 'l' was initialized. 
*/ \ if (OVS_UNLIKELY(!l->where)) { \ - ovs_abort(0, "%s: %s() passed uninitialized ovs_"#TYPE, \ - where, __func__); \ + VLOG_ABORT("%s: %s() passed uninitialized ovs_"#TYPE, \ + where, __func__); \ } \ \ error = pthread_##TYPE##_##FUN(&l->lock); \ if (OVS_UNLIKELY(error) && error != EBUSY) { \ - ovs_abort(error, "%s: pthread_%s_%s failed", where, #TYPE, #FUN); \ + VLOG_ABORT("%s: pthread_%s_%s failed: %s", where, #TYPE, #FUN, \ + ovs_strerror(error)); \ } \ if (!error) { \ l->where = where; \ @@ -124,7 +127,8 @@ TRY_LOCK_FUNCTION(spin, trylock); l->where = WHERE; \ error = pthread_##TYPE##_##FUN(&l->lock); \ if (OVS_UNLIKELY(error)) { \ - ovs_abort(error, "pthread_%s_%s failed", #TYPE, #FUN); \ + VLOG_ABORT("%s: pthread_%s_%s failed: %s", l->where, #TYPE, #FUN, \ + ovs_strerror(error)); \ } \ } UNLOCK_FUNCTION(mutex, unlock, ""); @@ -142,7 +146,8 @@ UNLOCK_FUNCTION(spin, destroy, NULL); { \ int error = FUNCTION(arg1); \ if (OVS_UNLIKELY(error)) { \ - ovs_abort(error, "%s failed", #FUNCTION); \ + VLOG_ABORT("%s failed: %s", #FUNCTION, \ + ovs_strerror(error)); \ } \ } #define XPTHREAD_FUNC2(FUNCTION, PARAM1, PARAM2) \ @@ -151,7 +156,8 @@ UNLOCK_FUNCTION(spin, destroy, NULL); { \ int error = FUNCTION(arg1, arg2); \ if (OVS_UNLIKELY(error)) { \ - ovs_abort(error, "%s failed", #FUNCTION); \ + VLOG_ABORT("%s failed: %s", #FUNCTION, \ + ovs_strerror(error)); \ } \ } #define XPTHREAD_FUNC3(FUNCTION, PARAM1, PARAM2, PARAM3)\ @@ -160,7 +166,8 @@ UNLOCK_FUNCTION(spin, destroy, NULL); { \ int error = FUNCTION(arg1, arg2, arg3); \ if (OVS_UNLIKELY(error)) { \ - ovs_abort(error, "%s failed", #FUNCTION); \ + VLOG_ABORT("%s failed: %s", #FUNCTION, \ + ovs_strerror(error)); \ } \ } @@ -203,7 +210,7 @@ ovs_mutex_init__(const struct ovs_mutex *l_, int type) xpthread_mutexattr_settype(&attr, type); error = pthread_mutex_init(&l->lock, &attr); if (OVS_UNLIKELY(error)) { - ovs_abort(error, "pthread_mutex_init failed"); + VLOG_ABORT("pthread_mutex_init failed: %s", 
ovs_strerror(error)); } xpthread_mutexattr_destroy(&attr); } @@ -256,7 +263,7 @@ ovs_rwlock_init(const struct ovs_rwlock *l_) #endif if (OVS_UNLIKELY(error)) { - ovs_abort(error, "pthread_rwlock_init failed"); + VLOG_ABORT("pthread_rwlock_init failed: %s", ovs_strerror(error)); } } @@ -274,7 +281,7 @@ ovs_mutex_cond_wait(pthread_cond_t *cond, const struct ovs_mutex *mutex_) error = pthread_cond_wait(cond, &mutex->lock); if (OVS_UNLIKELY(error)) { - ovs_abort(error, "pthread_cond_wait failed"); + VLOG_ABORT("pthread_cond_wait failed: %s", ovs_strerror(error)); } } @@ -288,7 +295,7 @@ ovs_spin_init__(const struct ovs_spin *l_, int pshared) l->where = ""; error = pthread_spin_init(&l->lock, pshared); if (OVS_UNLIKELY(error)) { - ovs_abort(error, "pthread_spin_init failed"); + VLOG_ABORT("pthread_spin_init failed: %s", ovs_strerror(error)); } } @@ -430,13 +437,15 @@ set_min_stack_size(pthread_attr_t *attr, size_t min_stacksize) error = pthread_attr_getstacksize(attr, &stacksize); if (error) { - ovs_abort(error, "pthread_attr_getstacksize failed"); + VLOG_ABORT("pthread_attr_getstacksize failed: %s", + ovs_strerror(error)); } if (stacksize < min_stacksize) { error = pthread_attr_setstacksize(attr, min_stacksize); if (error) { - ovs_abort(error, "pthread_attr_setstacksize failed"); + VLOG_ABORT("pthread_attr_setstacksize failed: %s", + ovs_strerror(error)); } } } @@ -485,7 +494,7 @@ ovs_thread_create(const char *name, void *(*start)(void *), void *arg) error = pthread_create(&thread, &attr, ovsthread_wrapper, aux); if (error) { - ovs_abort(error, "pthread_create failed"); + VLOG_ABORT("pthread_create failed: %s", ovs_strerror(error)); } pthread_attr_destroy(&attr); return thread; @@ -627,42 +636,60 @@ ovs_thread_stats_next_bucket(const struct ovsthread_stats *stats, size_t i) } -/* Returns the total number of cores available to this process, or 0 if the - * number cannot be determined. 
*/ -int -count_cpu_cores(void) +static int +count_cpu_cores__(void) { - static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; - static long int n_cores; + long int n_cores; - if (ovsthread_once_start(&once)) { #ifndef _WIN32 - n_cores = sysconf(_SC_NPROCESSORS_ONLN); + n_cores = sysconf(_SC_NPROCESSORS_ONLN); +#else + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + n_cores = sysinfo.dwNumberOfProcessors; +#endif #ifdef __linux__ - if (n_cores > 0) { - cpu_set_t *set = CPU_ALLOC(n_cores); + if (n_cores > 0) { + cpu_set_t *set = CPU_ALLOC(n_cores); - if (set) { - size_t size = CPU_ALLOC_SIZE(n_cores); + if (set) { + size_t size = CPU_ALLOC_SIZE(n_cores); - if (!sched_getaffinity(0, size, set)) { - n_cores = CPU_COUNT_S(size, set); - } - CPU_FREE(set); + if (!sched_getaffinity(0, size, set)) { + n_cores = CPU_COUNT_S(size, set); } + CPU_FREE(set); } -#endif -#else - SYSTEM_INFO sysinfo; - GetSystemInfo(&sysinfo); - n_cores = sysinfo.dwNumberOfProcessors; -#endif - ovsthread_once_done(&once); } - +#endif return n_cores > 0 ? n_cores : 0; } +/* It's unlikely that the available cpus change several times per second and + * even if it does, it's not needed (or desired) to react to such changes so + * quickly. */ +#define COUNT_CPU_UPDATE_TIME_MS 10000 + +static struct ovs_mutex cpu_cores_mutex = OVS_MUTEX_INITIALIZER; + +/* Returns the current total number of cores available to this process, or 0 + * if the number cannot be determined. */ +int +count_cpu_cores(void) +{ + static long long int last_updated = 0; + long long int now = time_msec(); + static int cpu_cores; + + ovs_mutex_lock(&cpu_cores_mutex); + if (!last_updated || now - last_updated >= COUNT_CPU_UPDATE_TIME_MS) { + last_updated = now; + cpu_cores = count_cpu_cores__(); + } + ovs_mutex_unlock(&cpu_cores_mutex); + return cpu_cores; +} + /* Returns the total number of cores on the system, or 0 if the * number cannot be determined. 
*/ int diff --git a/lib/ovs.tmac b/lib/ovs.tmac index 5f8f20afa4a..97b6fa3df76 100644 --- a/lib/ovs.tmac +++ b/lib/ovs.tmac @@ -175,7 +175,7 @@ . nr mE \\n(.f . nf . nh -. ft CW +. ft CR .. . . diff --git a/lib/ovsdb-cs.c b/lib/ovsdb-cs.c index a6fbd290c87..b5eda88adbd 100644 --- a/lib/ovsdb-cs.c +++ b/lib/ovsdb-cs.c @@ -219,6 +219,9 @@ struct ovsdb_cs { struct uuid cid; struct hmap server_rows; + /* Whether to send 'set_db_change_aware'. */ + bool set_db_change_aware; + /* Clustered servers. */ uint64_t min_index; /* Minimum allowed index, to avoid regression. */ bool leader_only; /* If true, do not connect to Raft followers. */ @@ -331,6 +334,7 @@ ovsdb_cs_create(const char *db_name, int max_version, cs->request_id = NULL; cs->leader_only = true; cs->shuffle_remotes = true; + cs->set_db_change_aware = true; hmap_init(&cs->server_rows); return cs; @@ -461,7 +465,7 @@ ovsdb_cs_process_response(struct ovsdb_cs *cs, struct jsonrpc_msg *msg) cs->server.monitor_version = cs->server.max_version; ovsdb_cs_db_parse_monitor_reply(&cs->server, msg->result, cs->server.monitor_version); - if (ovsdb_cs_check_server_db(cs)) { + if (ovsdb_cs_check_server_db(cs) && cs->set_db_change_aware) { ovsdb_cs_send_db_change_aware(cs); } } else { @@ -787,6 +791,16 @@ ovsdb_cs_get_last_error(const struct ovsdb_cs *cs) } } +/* Sets all the JSON-RPC session 'options' for 'cs''s current session. */ +void +ovsdb_cs_set_jsonrpc_options(const struct ovsdb_cs *cs, + const struct jsonrpc_session_options *options) +{ + if (cs->session) { + jsonrpc_session_set_options(cs->session, options); + } +} + /* Sets the "probe interval" for 'cs''s current session to 'probe_interval', in * milliseconds. 
*/ void @@ -892,7 +906,7 @@ ovsdb_cs_db_get_table(struct ovsdb_cs_db *db, const char *table) t = xzalloc(sizeof *t); t->name = xstrdup(table); - t->new_cond = json_array_create_1(json_boolean_create(true)); + t->ack_cond = json_array_create_1(json_boolean_create(true)); hmap_insert(&db->tables, &t->hmap_node, hash); return t; } @@ -1150,6 +1164,22 @@ ovsdb_cs_send_cond_change(struct ovsdb_cs *cs) } } +/* Database change awareness. */ + +/* By default, or if 'set_db_change_aware' is true, 'cs' will send + * 'set_db_change_aware' request to the server after receiving the _SERVER data + * (when the server supports it), which is useful for clients that intend to + * keep long connections to the server. Otherwise, 'cs' will not send the + * 'set_db_change_aware' request, which is more reasonable for short-lived + * connections to avoid unnecessary processing at the server side and possible + * error handling due to connections being closed by the clients before the + * responses are sent by the server. */ +void +ovsdb_cs_set_db_change_aware(struct ovsdb_cs *cs, bool set_db_change_aware) +{ + cs->set_db_change_aware = set_db_change_aware; +} + /* Clustered servers.
*/ /* By default, or if 'leader_only' is true, when 'cs' connects to a clustered diff --git a/lib/ovsdb-cs.h b/lib/ovsdb-cs.h index 5d5b58f0a0a..bcc3dcd7167 100644 --- a/lib/ovsdb-cs.h +++ b/lib/ovsdb-cs.h @@ -32,6 +32,7 @@ #include "openvswitch/uuid.h" struct json; +struct jsonrpc_session_options; struct ovsdb_cs; struct ovsdb_cs_ops { @@ -131,6 +132,8 @@ bool ovsdb_cs_is_alive(const struct ovsdb_cs *); bool ovsdb_cs_is_connected(const struct ovsdb_cs *); int ovsdb_cs_get_last_error(const struct ovsdb_cs *); +void ovsdb_cs_set_jsonrpc_options(const struct ovsdb_cs *, + const struct jsonrpc_session_options *); void ovsdb_cs_set_probe_interval(const struct ovsdb_cs *, int probe_interval); /* Conditional monitoring (specifying that only rows matching particular @@ -142,6 +145,9 @@ unsigned int ovsdb_cs_set_condition(struct ovsdb_cs *, const char *table, const struct json *condition); unsigned int ovsdb_cs_get_condition_seqno(const struct ovsdb_cs *); +/* Database change awareness. */ +void ovsdb_cs_set_db_change_aware(struct ovsdb_cs *, bool set_db_change_aware); + /* Clustered servers. */ void ovsdb_cs_set_leader_only(struct ovsdb_cs *, bool leader_only); void ovsdb_cs_set_shuffle_remotes(struct ovsdb_cs *, bool shuffle); diff --git a/lib/ovsdb-data.c b/lib/ovsdb-data.c index 183e752583a..abb923ad8fa 100644 --- a/lib/ovsdb-data.c +++ b/lib/ovsdb-data.c @@ -455,9 +455,15 @@ ovsdb_atom_from_json(union ovsdb_atom *atom, /* Converts 'atom', of the specified 'type', to JSON format, and returns the * JSON. The caller is responsible for freeing the returned JSON. * + * If 'allow_shallow_copies' is false, deep copy of the string JSON object + * will be used. Useful when the same string object is accessed by multiple + * threads as deep copy will not change the reference counter of the original + * JSON string. + * * Refer to RFC 7047 for the format of the JSON that this function produces. 
*/ -struct json * -ovsdb_atom_to_json(const union ovsdb_atom *atom, enum ovsdb_atomic_type type) +static struct json * +ovsdb_atom_to_json__(const union ovsdb_atom *atom, enum ovsdb_atomic_type type, + bool allow_shallow_copies) { switch (type) { case OVSDB_TYPE_VOID: @@ -473,7 +479,8 @@ ovsdb_atom_to_json(const union ovsdb_atom *atom, enum ovsdb_atomic_type type) return json_boolean_create(atom->boolean); case OVSDB_TYPE_STRING: - return json_clone(atom->s); + return allow_shallow_copies ? json_clone(atom->s) + : json_deep_clone(atom->s); case OVSDB_TYPE_UUID: return wrap_json("uuid", json_string_create_nocopy( @@ -485,6 +492,19 @@ ovsdb_atom_to_json(const union ovsdb_atom *atom, enum ovsdb_atomic_type type) } } +struct json * +ovsdb_atom_to_json(const union ovsdb_atom *atom, enum ovsdb_atomic_type type) +{ + return ovsdb_atom_to_json__(atom, type, true); +} + +static struct json * +ovsdb_atom_to_json_deep(const union ovsdb_atom *atom, + enum ovsdb_atomic_type type) +{ + return ovsdb_atom_to_json__(atom, type, false); +} + static char * ovsdb_atom_from_string__(union ovsdb_atom *atom, union ovsdb_atom **range_end_atom, @@ -1409,12 +1429,15 @@ ovsdb_unconstrained_datum_from_json(struct ovsdb_datum *datum, static struct json * ovsdb_base_to_json(const union ovsdb_atom *atom, const struct ovsdb_base_type *base, - bool use_row_names) + bool use_row_names, + bool allow_shallow_copies) { if (!use_row_names || base->type != OVSDB_TYPE_UUID || !base->uuid.refTableName) { - return ovsdb_atom_to_json(atom, base->type); + return allow_shallow_copies + ? 
ovsdb_atom_to_json(atom, base->type) + : ovsdb_atom_to_json_deep(atom, base->type); } else { return json_array_create_2( json_string_create("named-uuid"), @@ -1425,7 +1448,8 @@ ovsdb_base_to_json(const union ovsdb_atom *atom, static struct json * ovsdb_datum_to_json__(const struct ovsdb_datum *datum, const struct ovsdb_type *type, - bool use_row_names) + bool use_row_names, + bool allow_shallow_copies) { if (ovsdb_type_is_map(type)) { struct json **elems; @@ -1435,14 +1459,15 @@ ovsdb_datum_to_json__(const struct ovsdb_datum *datum, for (i = 0; i < datum->n; i++) { elems[i] = json_array_create_2( ovsdb_base_to_json(&datum->keys[i], &type->key, - use_row_names), + use_row_names, allow_shallow_copies), ovsdb_base_to_json(&datum->values[i], &type->value, - use_row_names)); + use_row_names, allow_shallow_copies)); } return wrap_json("map", json_array_create(elems, datum->n)); } else if (datum->n == 1) { - return ovsdb_base_to_json(&datum->keys[0], &type->key, use_row_names); + return ovsdb_base_to_json(&datum->keys[0], &type->key, + use_row_names, allow_shallow_copies); } else { struct json **elems; size_t i; @@ -1450,7 +1475,7 @@ ovsdb_datum_to_json__(const struct ovsdb_datum *datum, elems = xmalloc(datum->n * sizeof *elems); for (i = 0; i < datum->n; i++) { elems[i] = ovsdb_base_to_json(&datum->keys[i], &type->key, - use_row_names); + use_row_names, allow_shallow_copies); } return wrap_json("set", json_array_create(elems, datum->n)); @@ -1467,14 +1492,21 @@ struct json * ovsdb_datum_to_json(const struct ovsdb_datum *datum, const struct ovsdb_type *type) { - return ovsdb_datum_to_json__(datum, type, false); + return ovsdb_datum_to_json__(datum, type, false, true); +} + +struct json * +ovsdb_datum_to_json_deep(const struct ovsdb_datum *datum, + const struct ovsdb_type *type) +{ + return ovsdb_datum_to_json__(datum, type, false, false); } struct json * ovsdb_datum_to_json_with_row_names(const struct ovsdb_datum *datum, const struct ovsdb_type *type) { - return 
ovsdb_datum_to_json__(datum, type, true); + return ovsdb_datum_to_json__(datum, type, true, true); } static const char * @@ -2206,6 +2238,8 @@ ovsdb_symbol_table_insert(struct ovsdb_symbol_table *symtab, /* APIs for Generating and apply diffs. */ /* Find what needs to be added to and removed from 'old' to construct 'new'. + * If the optional 'diff' is provided, it can be used to speed up processing, + * in case it is smaller than the original 'old' and 'new'. * * The 'added' and 'removed' datums are always safe; the orders of keys are * maintained since they are added in order. */ @@ -2214,6 +2248,7 @@ ovsdb_datum_added_removed(struct ovsdb_datum *added, struct ovsdb_datum *removed, const struct ovsdb_datum *old, const struct ovsdb_datum *new, + const struct ovsdb_datum *diff, const struct ovsdb_type *type) { size_t oi, ni; @@ -2226,6 +2261,31 @@ ovsdb_datum_added_removed(struct ovsdb_datum *added, return; } + /* Use diff, if provided, unless it's comparable in size. With a large + * diff, the O(n log n) binary search of each element may be slower than + * a simple O(n) comparison between old and new. */ + if (diff && diff->n * 2 < old->n + new->n) { + unsigned int idx; + + for (size_t di = 0; di < diff->n; di++) { + bool found = ovsdb_datum_find_key(old, &diff->keys[di], + type->key.type, &idx); + + if (!found) { + ovsdb_datum_add_from_index_unsafe(added, diff, di, type); + } else { + if (type->value.type != OVSDB_TYPE_VOID + && !ovsdb_atom_equals(&diff->values[di], + &old->values[idx], + type->value.type)) { + ovsdb_datum_add_from_index_unsafe(added, diff, di, type); + } + ovsdb_datum_add_from_index_unsafe(removed, old, idx, type); + } + } + return; + } + /* Generate the diff in O(n) time. 
*/ for (oi = ni = 0; oi < old->n && ni < new->n;) { int c = ovsdb_atom_compare_3way(&old->keys[oi], &new->keys[ni], diff --git a/lib/ovsdb-data.h b/lib/ovsdb-data.h index dcb62051358..c0408ee49ca 100644 --- a/lib/ovsdb-data.h +++ b/lib/ovsdb-data.h @@ -195,6 +195,8 @@ ovsdb_unconstrained_datum_from_json(struct ovsdb_datum *, OVS_WARN_UNUSED_RESULT; struct json *ovsdb_datum_to_json(const struct ovsdb_datum *, const struct ovsdb_type *); +struct json *ovsdb_datum_to_json_deep(const struct ovsdb_datum *, + const struct ovsdb_type *); char *ovsdb_datum_from_string(struct ovsdb_datum *, const struct ovsdb_type *, const char *, @@ -254,6 +256,7 @@ void ovsdb_datum_added_removed(struct ovsdb_datum *added, struct ovsdb_datum *removed, const struct ovsdb_datum *old, const struct ovsdb_datum *new, + const struct ovsdb_datum *diff, const struct ovsdb_type *type); void ovsdb_datum_diff(struct ovsdb_datum *diff, diff --git a/lib/ovsdb-error.c b/lib/ovsdb-error.c index a75ad36b737..56512fc28dd 100644 --- a/lib/ovsdb-error.c +++ b/lib/ovsdb-error.c @@ -141,16 +141,12 @@ ovsdb_internal_error(struct ovsdb_error *inner_error, backtrace_capture(&backtrace); if (backtrace.n_frames) { - int i; - ds_put_cstr(&ds, " (backtrace:"); - for (i = 0; i < backtrace.n_frames; i++) { - ds_put_format(&ds, " 0x%08"PRIxPTR, backtrace.frames[i]); - } + backtrace_format(&ds, &backtrace, ", "); ds_put_char(&ds, ')'); } - ds_put_format(&ds, " (%s %s)", program_name, VERSION); + ds_put_format(&ds, " (%s %s)", program_name, VERSION VERSION_SUFFIX); if (inner_error) { char *s = ovsdb_error_to_string_free(inner_error); diff --git a/lib/ovsdb-idl-provider.h b/lib/ovsdb-idl-provider.h index 8797686f900..8d2b7d6b914 100644 --- a/lib/ovsdb-idl-provider.h +++ b/lib/ovsdb-idl-provider.h @@ -74,6 +74,7 @@ struct ovsdb_idl_row { struct ovs_list dst_arcs; /* Backward arcs (ovsdb_idl_arc.dst_node). */ struct ovsdb_idl_table *table; /* Containing table. 
*/ struct ovsdb_datum *old_datum; /* Committed data (null if orphaned). */ + bool persist_uuid; /* Persist 'uuid' during insert txn if set. */ bool parsed; /* Whether the row is parsed. */ struct ovs_list reparse_node; /* Rows that needs to be re-parsed due to * insertion of a referenced row. */ diff --git a/lib/ovsdb-idl.c b/lib/ovsdb-idl.c index 99b58422eca..ba720474b66 100644 --- a/lib/ovsdb-idl.c +++ b/lib/ovsdb-idl.c @@ -177,6 +177,7 @@ static void ovsdb_idl_row_mark_backrefs_for_reparsing(struct ovsdb_idl_row *); static void ovsdb_idl_row_track_change(struct ovsdb_idl_row *, enum ovsdb_idl_change); static void ovsdb_idl_row_untrack_change(struct ovsdb_idl_row *); +static void ovsdb_idl_row_clear_changeseqno(struct ovsdb_idl_row *); static void ovsdb_idl_txn_abort_all(struct ovsdb_idl *); static bool ovsdb_idl_txn_extract_mutations(struct ovsdb_idl_row *, @@ -321,6 +322,14 @@ ovsdb_idl_set_shuffle_remotes(struct ovsdb_idl *idl, bool shuffle) ovsdb_cs_set_shuffle_remotes(idl->cs, shuffle); } +/* Passes 'set_db_change_aware' to ovsdb_cs_set_db_change_aware(). See that + * function for documentation. */ +void +ovsdb_idl_set_db_change_aware(struct ovsdb_idl *idl, bool set_db_change_aware) +{ + ovsdb_cs_set_db_change_aware(idl->cs, set_db_change_aware); +} + /* Reset min_index to 0. This prevents a situation where the client * thinks all databases have stale data, when they actually have all * been destroyed and rebuilt from scratch. 
@@ -1366,6 +1375,7 @@ ovsdb_idl_track_clear__(struct ovsdb_idl *idl, bool flush_all) row->updated = NULL; } ovsdb_idl_row_untrack_change(row); + ovsdb_idl_row_clear_changeseqno(row); if (ovsdb_idl_row_is_orphan(row)) { ovsdb_idl_row_unparse(row); @@ -1624,6 +1634,7 @@ ovsdb_idl_process_update(struct ovsdb_idl_table *table, ru->columns); } else if (ovsdb_idl_row_is_orphan(row)) { ovsdb_idl_row_untrack_change(row); + ovsdb_idl_row_clear_changeseqno(row); ovsdb_idl_insert_row(row, ru->columns); } else { VLOG_ERR_RL(&semantic_rl, "cannot add existing row "UUID_FMT" to " @@ -2275,11 +2286,15 @@ ovsdb_idl_row_untrack_change(struct ovsdb_idl_row *row) return; } + ovs_list_remove(&row->track_node); + ovs_list_init(&row->track_node); +} + +static void ovsdb_idl_row_clear_changeseqno(struct ovsdb_idl_row *row) +{ row->change_seqno[OVSDB_IDL_CHANGE_INSERT] = row->change_seqno[OVSDB_IDL_CHANGE_MODIFY] = row->change_seqno[OVSDB_IDL_CHANGE_DELETE] = 0; - ovs_list_remove(&row->track_node); - ovs_list_init(&row->track_node); } static struct ovsdb_idl_row * @@ -2855,11 +2870,14 @@ substitute_uuids(struct json *json, const struct ovsdb_idl_txn *txn) row = ovsdb_idl_txn_get_row(txn, &uuid); if (row && !row->old_datum && row->new_datum) { - json_destroy(json); - - return json_array_create_2( - json_string_create("named-uuid"), - json_string_create_nocopy(ovsdb_data_row_name(&uuid))); + if (row->persist_uuid) { + return json; + } else { + json_destroy(json); + return json_array_create_2( + json_string_create("named-uuid"), + json_string_create_nocopy(ovsdb_data_row_name(&uuid))); + } } } @@ -3284,9 +3302,19 @@ ovsdb_idl_txn_commit(struct ovsdb_idl_txn *txn) any_updates = true; - json_object_put(op, "uuid-name", - json_string_create_nocopy( - ovsdb_data_row_name(&row->uuid))); + char *uuid_json; + struct json *value; + if (row->persist_uuid) { + uuid_json = "uuid"; + value = json_string_create_nocopy( + xasprintf(UUID_FMT, UUID_ARGS(&row->uuid))); + } else { + uuid_json = "uuid-name"; + 
value = json_string_create_nocopy( + ovsdb_data_row_name(&row->uuid)); + } + + json_object_put(op, uuid_json, value); insert = xmalloc(sizeof *insert); insert->dummy = row->uuid; @@ -3770,6 +3798,31 @@ ovsdb_idl_txn_delete(const struct ovsdb_idl_row *row_) row->new_datum = NULL; } +static const struct ovsdb_idl_row * +ovsdb_idl_txn_insert__(struct ovsdb_idl_txn *txn, + const struct ovsdb_idl_table_class *class, + const struct uuid *uuid, + bool persist_uuid) +{ + struct ovsdb_idl_row *row = ovsdb_idl_row_create__(class); + + ovs_assert(uuid || !persist_uuid); + if (uuid) { + ovs_assert(!ovsdb_idl_txn_get_row(txn, uuid)); + row->uuid = *uuid; + } else { + uuid_generate(&row->uuid); + } + row->persist_uuid = persist_uuid; + row->table = ovsdb_idl_table_from_class(txn->idl, class); + row->new_datum = xmalloc(class->n_columns * sizeof *row->new_datum); + hmap_insert(&row->table->rows, &row->hmap_node, uuid_hash(&row->uuid)); + hmap_insert(&txn->txn_rows, &row->txn_node, uuid_hash(&row->uuid)); + ovsdb_idl_add_to_indexes(row); + + return row; +} + /* Inserts and returns a new row in the table with the specified 'class' in the * database with open transaction 'txn'. 
* @@ -3787,22 +3840,23 @@ ovsdb_idl_txn_insert(struct ovsdb_idl_txn *txn, const struct ovsdb_idl_table_class *class, const struct uuid *uuid) { - struct ovsdb_idl_row *row = ovsdb_idl_row_create__(class); - - if (uuid) { - ovs_assert(!ovsdb_idl_txn_get_row(txn, uuid)); - row->uuid = *uuid; - } else { - uuid_generate(&row->uuid); - } - - row->table = ovsdb_idl_table_from_class(txn->idl, class); - row->new_datum = xmalloc(class->n_columns * sizeof *row->new_datum); - hmap_insert(&row->table->rows, &row->hmap_node, uuid_hash(&row->uuid)); - hmap_insert(&txn->txn_rows, &row->txn_node, uuid_hash(&row->uuid)); - ovsdb_idl_add_to_indexes(row); + return ovsdb_idl_txn_insert__(txn, class, uuid, false); +} - return row; +/* Inserts and returns a new row in the table with the specified 'class' in the + * database with open transaction 'txn'. + * + * The new row is assigned the specified UUID (which cannot be null). + * + * Usually this function is used indirectly through one of the + * "insert_persist_uuid" functions generated by ovsdb-idlc. 
*/ +const struct ovsdb_idl_row * +ovsdb_idl_txn_insert_persist_uuid(struct ovsdb_idl_txn *txn, + const struct ovsdb_idl_table_class *class, + const struct uuid *uuid) +{ + ovs_assert(uuid); + return ovsdb_idl_txn_insert__(txn, class, uuid, true); } static void diff --git a/lib/ovsdb-idl.h b/lib/ovsdb-idl.h index fbd9f671a20..86fd2bd36f2 100644 --- a/lib/ovsdb-idl.h +++ b/lib/ovsdb-idl.h @@ -66,6 +66,8 @@ struct ovsdb_idl *ovsdb_idl_create_unconnected( const struct ovsdb_idl_class *, bool monitor_everything_by_default); void ovsdb_idl_set_remote(struct ovsdb_idl *, const char *remote, bool retry); void ovsdb_idl_set_shuffle_remotes(struct ovsdb_idl *, bool shuffle); +void ovsdb_idl_set_db_change_aware(struct ovsdb_idl *, + bool set_db_change_aware); void ovsdb_idl_reset_min_index(struct ovsdb_idl *); void ovsdb_idl_destroy(struct ovsdb_idl *); @@ -375,6 +377,9 @@ void ovsdb_idl_txn_delete(const struct ovsdb_idl_row *); const struct ovsdb_idl_row *ovsdb_idl_txn_insert( struct ovsdb_idl_txn *, const struct ovsdb_idl_table_class *, const struct uuid *); +const struct ovsdb_idl_row *ovsdb_idl_txn_insert_persist_uuid( + struct ovsdb_idl_txn *txn, const struct ovsdb_idl_table_class *class, + const struct uuid *uuid); struct ovsdb_idl *ovsdb_idl_txn_get_idl (struct ovsdb_idl_txn *); void ovsdb_idl_get_initial_snapshot(struct ovsdb_idl *); diff --git a/lib/ovsdb-types.c b/lib/ovsdb-types.c index 61efe59cffa..197cee1c67a 100644 --- a/lib/ovsdb-types.c +++ b/lib/ovsdb-types.c @@ -275,6 +275,58 @@ ovsdb_base_type_is_valid(const struct ovsdb_base_type *base) } } +bool +ovsdb_base_type_equals(const struct ovsdb_base_type *a, + const struct ovsdb_base_type *b) +{ + if (a == b) { + return true; + } + + if (a->type != b->type) { + return false; + } + + if ((a->enum_ && !b->enum_) || (!a->enum_ && b->enum_)) { + return false; + } else if (a->enum_ && + !ovsdb_datum_equals(a->enum_, b->enum_, + ovsdb_base_type_get_enum_type(a->type))) { + return false; + } + + switch (a->type) { + 
case OVSDB_TYPE_VOID: + return true; + + case OVSDB_TYPE_INTEGER: + return a->integer.min == b->integer.min + && a->integer.max == b->integer.max; + + case OVSDB_TYPE_REAL: + return a->real.min == b->real.min && a->real.max == b->real.max; + + case OVSDB_TYPE_BOOLEAN: + return true; + + case OVSDB_TYPE_STRING: + return a->string.minLen == b->string.minLen + && a->string.maxLen == b->string.maxLen; + + case OVSDB_TYPE_UUID: + /* Not comparing the table pointer here, only the table name, as this + * function can be used to compare types from different databases, so + * pointers will be different. */ + return a->uuid.refType == b->uuid.refType + && nullable_string_is_equal(a->uuid.refTableName, + b->uuid.refTableName); + + case OVSDB_N_TYPES: + default: + OVS_NOT_REACHED(); + } +} + bool ovsdb_base_type_has_constraints(const struct ovsdb_base_type *base) { @@ -568,6 +620,15 @@ ovsdb_type_is_valid(const struct ovsdb_type *type) && type->n_max >= 1); } +bool +ovsdb_type_equals(const struct ovsdb_type *a, const struct ovsdb_type *b) +{ + return ovsdb_base_type_equals(&a->key, &b->key) + && ovsdb_base_type_equals(&a->value, &b->value) + && a->n_min == b->n_min + && a->n_max == b->n_max; +} + static struct ovsdb_error * n_from_json(const struct json *json, unsigned int *n) { diff --git a/lib/ovsdb-types.h b/lib/ovsdb-types.h index b9eb0928df6..688fe56337e 100644 --- a/lib/ovsdb-types.h +++ b/lib/ovsdb-types.h @@ -107,6 +107,8 @@ void ovsdb_base_type_clone(struct ovsdb_base_type *, void ovsdb_base_type_destroy(struct ovsdb_base_type *); bool ovsdb_base_type_is_valid(const struct ovsdb_base_type *); +bool ovsdb_base_type_equals(const struct ovsdb_base_type *, + const struct ovsdb_base_type *); bool ovsdb_base_type_has_constraints(const struct ovsdb_base_type *); void ovsdb_base_type_clear_constraints(struct ovsdb_base_type *); const struct ovsdb_type *ovsdb_base_type_get_enum_type(enum ovsdb_atomic_type); @@ -157,6 +159,7 @@ void ovsdb_type_clone(struct ovsdb_type *, const 
struct ovsdb_type *); void ovsdb_type_destroy(struct ovsdb_type *); bool ovsdb_type_is_valid(const struct ovsdb_type *); +bool ovsdb_type_equals(const struct ovsdb_type *, const struct ovsdb_type *); static inline bool ovsdb_type_is_scalar(const struct ovsdb_type *); static inline bool ovsdb_type_is_optional(const struct ovsdb_type *); @@ -235,6 +238,18 @@ static inline bool ovsdb_type_is_map(const struct ovsdb_type *type) return type->value.type != OVSDB_TYPE_VOID; } +static inline bool ovsdb_type_has_strong_refs(const struct ovsdb_type *type) +{ + return ovsdb_base_type_is_strong_ref(&type->key) + || ovsdb_base_type_is_strong_ref(&type->value); +} + +static inline bool ovsdb_type_has_weak_refs(const struct ovsdb_type *type) +{ + return ovsdb_base_type_is_weak_ref(&type->key) + || ovsdb_base_type_is_weak_ref(&type->value); +} + #ifdef __cplusplus } #endif diff --git a/lib/packets.c b/lib/packets.c index 1dcd4a6fcd2..91c28daf028 100644 --- a/lib/packets.c +++ b/lib/packets.c @@ -224,7 +224,6 @@ compose_rarp(struct dp_packet *b, const struct eth_addr eth_src) arp->ar_tha = eth_src; put_16aligned_be32(&arp->ar_tpa, htonl(0)); - dp_packet_reset_offsets(b); dp_packet_set_l3(b, arp); b->packet_type = htonl(PT_ETH); } @@ -427,7 +426,7 @@ add_mpls(struct dp_packet *packet, ovs_be16 ethtype, ovs_be32 lse, } if (!l3_encap) { - struct mpls_hdr *header = dp_packet_push_uninit(packet, MPLS_HLEN); + struct mpls_hdr *header = dp_packet_resize_l2(packet, MPLS_HLEN); put_16aligned_be32(&header->mpls_lse, lse); packet->l2_5_ofs = 0; @@ -513,7 +512,7 @@ push_nsh(struct dp_packet *packet, const struct nsh_hdr *nsh_hdr_src) OVS_NOT_REACHED(); } - nsh = (struct nsh_hdr *) dp_packet_push_uninit(packet, length); + nsh = (struct nsh_hdr *) dp_packet_resize_l2(packet, length); memcpy(nsh, nsh_hdr_src, length); nsh->next_proto = next_proto; packet->packet_type = htonl(PT_NSH); @@ -1114,7 +1113,6 @@ eth_compose(struct dp_packet *b, const struct eth_addr eth_dst, eth->eth_type = 
htons(eth_type); b->packet_type = htonl(PT_ETH); - dp_packet_reset_offsets(b); dp_packet_set_l3(b, data); return data; @@ -1131,20 +1129,31 @@ packet_set_ipv4_addr(struct dp_packet *packet, pkt_metadata_init_conn(&packet->md); if (nh->ip_proto == IPPROTO_TCP && l4_size >= TCP_HEADER_LEN) { - struct tcp_header *th = dp_packet_l4(packet); - - th->tcp_csum = recalc_csum32(th->tcp_csum, old_addr, new_addr); + if (dp_packet_hwol_l4_is_tcp(packet)) { + dp_packet_ol_reset_l4_csum_good(packet); + } else { + struct tcp_header *th = dp_packet_l4(packet); + th->tcp_csum = recalc_csum32(th->tcp_csum, old_addr, new_addr); + } } else if (nh->ip_proto == IPPROTO_UDP && l4_size >= UDP_HEADER_LEN ) { - struct udp_header *uh = dp_packet_l4(packet); - - if (uh->udp_csum) { - uh->udp_csum = recalc_csum32(uh->udp_csum, old_addr, new_addr); - if (!uh->udp_csum) { - uh->udp_csum = htons(0xffff); + if (dp_packet_hwol_l4_is_udp(packet)) { + dp_packet_ol_reset_l4_csum_good(packet); + } else { + struct udp_header *uh = dp_packet_l4(packet); + if (uh->udp_csum) { + uh->udp_csum = recalc_csum32(uh->udp_csum, old_addr, new_addr); + if (!uh->udp_csum) { + uh->udp_csum = htons(0xffff); + } } } } - nh->ip_csum = recalc_csum32(nh->ip_csum, old_addr, new_addr); + + if (dp_packet_hwol_l3_ipv4(packet)) { + dp_packet_ol_reset_ip_csum_good(packet); + } else { + nh->ip_csum = recalc_csum32(nh->ip_csum, old_addr, new_addr); + } put_16aligned_be32(addr, new_addr); } @@ -1152,7 +1161,7 @@ packet_set_ipv4_addr(struct dp_packet *packet, * segements_left > 0. * * This function assumes that L3 and L4 offsets are set in the packet. 
*/ -static bool +bool packet_rh_present(struct dp_packet *packet, uint8_t *nexthdr, bool *first_frag) { const struct ovs_16aligned_ip6_hdr *nh; @@ -1241,16 +1250,24 @@ packet_update_csum128(struct dp_packet *packet, uint8_t proto, size_t l4_size = dp_packet_l4_size(packet); if (proto == IPPROTO_TCP && l4_size >= TCP_HEADER_LEN) { - struct tcp_header *th = dp_packet_l4(packet); + if (dp_packet_hwol_l4_is_tcp(packet)) { + dp_packet_ol_reset_l4_csum_good(packet); + } else { + struct tcp_header *th = dp_packet_l4(packet); - th->tcp_csum = recalc_csum128(th->tcp_csum, addr, new_addr); + th->tcp_csum = recalc_csum128(th->tcp_csum, addr, new_addr); + } } else if (proto == IPPROTO_UDP && l4_size >= UDP_HEADER_LEN) { - struct udp_header *uh = dp_packet_l4(packet); + if (dp_packet_hwol_l4_is_udp(packet)) { + dp_packet_ol_reset_l4_csum_good(packet); + } else { + struct udp_header *uh = dp_packet_l4(packet); - if (uh->udp_csum) { - uh->udp_csum = recalc_csum128(uh->udp_csum, addr, new_addr); - if (!uh->udp_csum) { - uh->udp_csum = htons(0xffff); + if (uh->udp_csum) { + uh->udp_csum = recalc_csum128(uh->udp_csum, addr, new_addr); + if (!uh->udp_csum) { + uh->udp_csum = htons(0xffff); + } } } } else if (proto == IPPROTO_ICMPV6 && @@ -1274,7 +1291,7 @@ packet_set_ipv6_addr(struct dp_packet *packet, uint8_t proto, pkt_metadata_init_conn(&packet->md); } -static void +void packet_set_ipv6_flow_label(ovs_16aligned_be32 *flow_label, ovs_be32 flow_key) { ovs_be32 old_label = get_16aligned_be32(flow_label); @@ -1282,7 +1299,7 @@ packet_set_ipv6_flow_label(ovs_16aligned_be32 *flow_label, ovs_be32 flow_key) put_16aligned_be32(flow_label, new_label); } -static void +void packet_set_ipv6_tc(ovs_16aligned_be32 *flow_label, uint8_t tc) { ovs_be32 old_label = get_16aligned_be32(flow_label); @@ -1311,16 +1328,26 @@ packet_set_ipv4(struct dp_packet *packet, ovs_be32 src, ovs_be32 dst, if (nh->ip_tos != tos) { uint8_t *field = &nh->ip_tos; - nh->ip_csum = recalc_csum16(nh->ip_csum, 
htons((uint16_t) *field), - htons((uint16_t) tos)); + if (dp_packet_hwol_l3_ipv4(packet)) { + dp_packet_ol_reset_ip_csum_good(packet); + } else { + nh->ip_csum = recalc_csum16(nh->ip_csum, htons((uint16_t) *field), + htons((uint16_t) tos)); + } + *field = tos; } if (nh->ip_ttl != ttl) { uint8_t *field = &nh->ip_ttl; - nh->ip_csum = recalc_csum16(nh->ip_csum, htons(*field << 8), - htons(ttl << 8)); + if (dp_packet_hwol_l3_ipv4(packet)) { + dp_packet_ol_reset_ip_csum_good(packet); + } else { + nh->ip_csum = recalc_csum16(nh->ip_csum, htons(*field << 8), + htons(ttl << 8)); + } + *field = ttl; } } @@ -1360,7 +1387,9 @@ static void packet_set_port(ovs_be16 *port, ovs_be16 new_port, ovs_be16 *csum) { if (*port != new_port) { - *csum = recalc_csum16(*csum, *port, new_port); + if (csum) { + *csum = recalc_csum16(*csum, *port, new_port); + } *port = new_port; } } @@ -1372,9 +1401,16 @@ void packet_set_tcp_port(struct dp_packet *packet, ovs_be16 src, ovs_be16 dst) { struct tcp_header *th = dp_packet_l4(packet); + ovs_be16 *csum = NULL; - packet_set_port(&th->tcp_src, src, &th->tcp_csum); - packet_set_port(&th->tcp_dst, dst, &th->tcp_csum); + if (dp_packet_hwol_l4_is_tcp(packet)) { + dp_packet_ol_reset_l4_csum_good(packet); + } else { + csum = &th->tcp_csum; + } + + packet_set_port(&th->tcp_src, src, csum); + packet_set_port(&th->tcp_dst, dst, csum); pkt_metadata_init_conn(&packet->md); } @@ -1386,17 +1422,21 @@ packet_set_udp_port(struct dp_packet *packet, ovs_be16 src, ovs_be16 dst) { struct udp_header *uh = dp_packet_l4(packet); - if (uh->udp_csum) { - packet_set_port(&uh->udp_src, src, &uh->udp_csum); - packet_set_port(&uh->udp_dst, dst, &uh->udp_csum); + if (dp_packet_hwol_l4_is_udp(packet)) { + dp_packet_ol_reset_l4_csum_good(packet); + packet_set_port(&uh->udp_src, src, NULL); + packet_set_port(&uh->udp_dst, dst, NULL); + } else { + ovs_be16 *csum = uh->udp_csum ? 
&uh->udp_csum : NULL; - if (!uh->udp_csum) { + packet_set_port(&uh->udp_src, src, csum); + packet_set_port(&uh->udp_dst, dst, csum); + + if (csum && !uh->udp_csum) { uh->udp_csum = htons(0xffff); } - } else { - uh->udp_src = src; - uh->udp_dst = dst; } + pkt_metadata_init_conn(&packet->md); } @@ -1407,18 +1447,27 @@ void packet_set_sctp_port(struct dp_packet *packet, ovs_be16 src, ovs_be16 dst) { struct sctp_header *sh = dp_packet_l4(packet); - ovs_be32 old_csum, old_correct_csum, new_csum; - uint16_t tp_len = dp_packet_l4_size(packet); - old_csum = get_16aligned_be32(&sh->sctp_csum); - put_16aligned_be32(&sh->sctp_csum, 0); - old_correct_csum = crc32c((void *)sh, tp_len); + if (dp_packet_hwol_l4_is_sctp(packet)) { + dp_packet_ol_reset_l4_csum_good(packet); + sh->sctp_src = src; + sh->sctp_dst = dst; + } else { + ovs_be32 old_csum, old_correct_csum, new_csum; + uint16_t tp_len = dp_packet_l4_size(packet); - sh->sctp_src = src; - sh->sctp_dst = dst; + old_csum = get_16aligned_be32(&sh->sctp_csum); + put_16aligned_be32(&sh->sctp_csum, 0); + old_correct_csum = crc32c((void *) sh, tp_len); + + sh->sctp_src = src; + sh->sctp_dst = dst; + + new_csum = crc32c((void *) sh, tp_len); + put_16aligned_be32(&sh->sctp_csum, old_csum ^ old_correct_csum + ^ new_csum); + } - new_csum = crc32c((void *)sh, tp_len); - put_16aligned_be32(&sh->sctp_csum, old_csum ^ old_correct_csum ^ new_csum); pkt_metadata_init_conn(&packet->md); } @@ -1696,7 +1745,6 @@ compose_arp__(struct dp_packet *b) arp->ar_hln = sizeof arp->ar_sha; arp->ar_pln = sizeof arp->ar_spa; - dp_packet_reset_offsets(b); dp_packet_set_l3(b, arp); b->packet_type = htonl(PT_ETH); @@ -1931,9 +1979,145 @@ IP_ECN_set_ce(struct dp_packet *pkt, bool is_ipv6) tos |= IP_ECN_CE; if (nh->ip_tos != tos) { - nh->ip_csum = recalc_csum16(nh->ip_csum, htons(nh->ip_tos), - htons((uint16_t) tos)); + if (dp_packet_hwol_l3_ipv4(pkt)) { + dp_packet_ol_reset_ip_csum_good(pkt); + } else { + nh->ip_csum = recalc_csum16(nh->ip_csum, 
htons(nh->ip_tos), + htons((uint16_t) tos)); + } + nh->ip_tos = tos; } } } + +/* Set TCP checksum field in packet 'p' with complete checksum. + * The packet must have the L3 and L4 offsets. */ +void +packet_tcp_complete_csum(struct dp_packet *p, bool inner) +{ + struct tcp_header *tcp; + size_t tcp_sz; + void *ip_hdr; + bool is_v4; + + if (inner) { + tcp = dp_packet_inner_l4(p); + ip_hdr = dp_packet_inner_l3(p); + tcp_sz = dp_packet_inner_l4_size(p); + } else { + tcp = dp_packet_l4(p); + ip_hdr = dp_packet_l3(p); + tcp_sz = dp_packet_l4_size(p); + } + + ovs_assert(tcp); + ovs_assert(ip_hdr); + + if (!inner && dp_packet_hwol_is_outer_ipv6(p)) { + is_v4 = false; + } else if (!inner && dp_packet_hwol_is_outer_ipv4(p)) { + is_v4 = true; + } else if (dp_packet_hwol_is_ipv4(p)) { + is_v4 = true; + } else if (dp_packet_hwol_tx_ipv6(p)) { + is_v4 = false; + } else { + OVS_NOT_REACHED(); + } + + tcp->tcp_csum = 0; + if (is_v4) { + struct ip_header *ip = ip_hdr; + + tcp->tcp_csum = csum_finish(csum_continue(packet_csum_pseudoheader(ip), + tcp, tcp_sz)); + } else { + struct ovs_16aligned_ip6_hdr *ip6 = ip_hdr; + + tcp->tcp_csum = packet_csum_upperlayer6(ip6, tcp, ip6->ip6_nxt, + tcp_sz); + } +} + +/* Set UDP checksum field in packet 'p' with complete checksum. + * The packet must have the L3 and L4 offsets. */ +void +packet_udp_complete_csum(struct dp_packet *p, bool inner) +{ + struct udp_header *udp; + size_t udp_sz; + void *ip_hdr; + bool is_v4; + + if (inner) { + udp = dp_packet_inner_l4(p); + ip_hdr = dp_packet_inner_l3(p); + udp_sz = dp_packet_inner_l4_size(p); + } else { + udp = dp_packet_l4(p); + ip_hdr = dp_packet_l3(p); + udp_sz = dp_packet_l4_size(p); + } + + ovs_assert(udp); + ovs_assert(ip_hdr); + + /* Skip csum calculation if the udp_csum is zero. 
*/ + if (!udp->udp_csum) { + return; + } + + if (!inner && dp_packet_hwol_is_outer_ipv6(p)) { + is_v4 = false; + } else if (!inner && dp_packet_hwol_is_outer_ipv4(p)) { + is_v4 = true; + } else if (dp_packet_hwol_is_ipv4(p)) { + is_v4 = true; + } else if (dp_packet_hwol_tx_ipv6(p)) { + is_v4 = false; + } else { + OVS_NOT_REACHED(); + } + + udp->udp_csum = 0; + if (is_v4) { + struct ip_header *ip = ip_hdr; + + udp->udp_csum = csum_finish(csum_continue(packet_csum_pseudoheader(ip), + udp, udp_sz)); + } else { + struct ovs_16aligned_ip6_hdr *ip6 = ip_hdr; + + udp->udp_csum = packet_csum_upperlayer6(ip6, udp, ip6->ip6_nxt, + udp_sz); + } + + if (!udp->udp_csum) { + udp->udp_csum = htons(0xffff); + } +} + +/* Set SCTP checksum field in packet 'p' with complete checksum. + * The packet must have the L3 and L4 offsets. */ +void +packet_sctp_complete_csum(struct dp_packet *p, bool inner) +{ + struct sctp_header *sh; + uint16_t tp_len; + ovs_be32 csum; + + if (inner) { + sh = dp_packet_inner_l4(p); + tp_len = dp_packet_inner_l4_size(p); + } else { + sh = dp_packet_l4(p); + tp_len = dp_packet_l4_size(p); + } + + ovs_assert(sh); + + put_16aligned_be32(&sh->sctp_csum, 0); + csum = crc32c((void *) sh, tp_len); + put_16aligned_be32(&sh->sctp_csum, csum); +} diff --git a/lib/packets.h b/lib/packets.h index 5bdf6e4bbd9..a102f81634e 100644 --- a/lib/packets.h +++ b/lib/packets.h @@ -706,6 +706,10 @@ char *ip_parse_cidr_len(const char *s, int *n, ovs_be32 *ip, #define IPPROTO_IGMP 2 #endif +#ifndef IPPROTO_IPIP +#define IPPROTO_IPIP 4 +#endif + #ifndef IPPROTO_UDPLITE #define IPPROTO_UDPLITE 136 #endif @@ -850,6 +854,17 @@ struct sctp_header { }; BUILD_ASSERT_DECL(SCTP_HEADER_LEN == sizeof(struct sctp_header)); +#define SCTP_CHUNK_HEADER_LEN 4 +struct sctp_chunk_header { + uint8_t type; + uint8_t flags; + ovs_be16 length; +}; +BUILD_ASSERT_DECL(SCTP_CHUNK_HEADER_LEN == sizeof(struct sctp_chunk_header)); + +#define SCTP_NEXT_CHUNK(sh, off) \ + ALIGNED_CAST(struct sctp_chunk_header *, 
(uint8_t *) sh + off) + #define UDP_HEADER_LEN 8 struct udp_header { ovs_be16 udp_src; @@ -988,6 +1003,15 @@ struct ovs_16aligned_ip6_frag { ovs_16aligned_be32 ip6f_ident; }; +#define IP6_RT_HDR_LEN 4 +struct ip6_rt_hdr { + uint8_t nexthdr; + uint8_t hdrlen; + uint8_t type; + uint8_t segments_left; +}; +BUILD_ASSERT_DECL(IP6_RT_HDR_LEN == sizeof(struct ip6_rt_hdr)); + #define ICMP6_HEADER_LEN 4 struct icmp6_header { uint8_t icmp6_type; @@ -1514,6 +1538,17 @@ BUILD_ASSERT_DECL(sizeof(struct vxlanhdr) == 8); #define VXLAN_F_GPE 0x4000 #define VXLAN_HF_GPE 0x04000000 +/* SRv6 protocol header. */ +#define IPV6_SRCRT_TYPE_4 4 +#define SRV6_BASE_HDR_LEN 8 +struct srv6_base_hdr { + struct ip6_rt_hdr rt_hdr; + uint8_t last_entry; + uint8_t flags; + ovs_be16 tag; +}; +BUILD_ASSERT_DECL(sizeof(struct srv6_base_hdr) == SRV6_BASE_HDR_LEN); + /* Input values for PACKET_TYPE macros have to be in host byte order. * The _BE postfix indicates result is in network byte order. Otherwise result * is in host byte order. 
*/ @@ -1598,6 +1633,9 @@ void packet_set_ipv6_addr(struct dp_packet *packet, uint8_t proto, ovs_16aligned_be32 addr[4], const struct in6_addr *new_addr, bool recalculate_csum); +void packet_set_ipv6_flow_label(ovs_16aligned_be32 *flow_label, + ovs_be32 flow_key); +void packet_set_ipv6_tc(ovs_16aligned_be32 *flow_label, uint8_t tc); void packet_set_tcp_port(struct dp_packet *, ovs_be16 src, ovs_be16 dst); void packet_set_udp_port(struct dp_packet *, ovs_be16 src, ovs_be16 dst); void packet_set_sctp_port(struct dp_packet *, ovs_be16 src, ovs_be16 dst); @@ -1642,7 +1680,12 @@ void packet_put_ra_prefix_opt(struct dp_packet *, ovs_be32 preferred_lifetime, const ovs_be128 router_prefix); uint32_t packet_csum_pseudoheader(const struct ip_header *); +bool packet_rh_present(struct dp_packet *packet, uint8_t *nexthdr, + bool *first_frag); void IP_ECN_set_ce(struct dp_packet *pkt, bool is_ipv6); +void packet_tcp_complete_csum(struct dp_packet *, bool is_inner); +void packet_udp_complete_csum(struct dp_packet *, bool is_inner); +void packet_sctp_complete_csum(struct dp_packet *, bool is_inner); #define DNS_HEADER_LEN 12 struct dns_header { diff --git a/lib/pcap-file.c b/lib/pcap-file.c index 3ed7ea4880e..0c2fed77662 100644 --- a/lib/pcap-file.c +++ b/lib/pcap-file.c @@ -280,10 +280,12 @@ ovs_pcap_read(struct pcap_file *p_file, struct dp_packet **bufp, void ovs_pcap_write(struct pcap_file *p_file, struct dp_packet *buf) { + const void *data_dp = dp_packet_data(buf); struct pcaprec_hdr prh; struct timeval tv; ovs_assert(dp_packet_is_eth(buf)); + ovs_assert(data_dp); xgettimeofday(&tv); prh.ts_sec = tv.tv_sec; @@ -291,7 +293,7 @@ ovs_pcap_write(struct pcap_file *p_file, struct dp_packet *buf) prh.incl_len = dp_packet_size(buf); prh.orig_len = dp_packet_size(buf); ignore(fwrite(&prh, sizeof prh, 1, p_file->file)); - ignore(fwrite(dp_packet_data(buf), dp_packet_size(buf), 1, p_file->file)); + ignore(fwrite(data_dp, dp_packet_size(buf), 1, p_file->file)); fflush(p_file->file); } 
diff --git a/lib/rconn.c b/lib/rconn.c index a96b2eb8bf4..4afa2151540 100644 --- a/lib/rconn.c +++ b/lib/rconn.c @@ -1426,6 +1426,7 @@ is_admitted_msg(const struct ofpbuf *b) case OFPTYPE_IPFIX_FLOW_STATS_REQUEST: case OFPTYPE_IPFIX_FLOW_STATS_REPLY: case OFPTYPE_CT_FLUSH_ZONE: + case OFPTYPE_CT_FLUSH: default: return true; } diff --git a/lib/rculist.h b/lib/rculist.h index c0d77acf943..6df963eb2b8 100644 --- a/lib/rculist.h +++ b/lib/rculist.h @@ -378,20 +378,22 @@ rculist_is_singleton_protected(const struct rculist *list) UPDATE_MULTIVAR(ITER, rculist_next(ITER_VAR(ITER)))) #define RCULIST_FOR_EACH_REVERSE_PROTECTED(ITER, MEMBER, RCULIST) \ - for (INIT_MULTIVAR(ITER, MEMBER, (RCULIST)->prev, struct rculist); \ + for (INIT_MULTIVAR(ITER, MEMBER, rculist_back_protected(RCULIST), \ + struct rculist); \ CONDITION_MULTIVAR(ITER, MEMBER, ITER_VAR(ITER) != (RCULIST)); \ - UPDATE_MULTIVAR(ITER, ITER_VAR(VAR).prev)) + UPDATE_MULTIVAR(ITER, rculist_back_protected(ITER_VAR(ITER)))) #define RCULIST_FOR_EACH_REVERSE_PROTECTED_CONTINUE(ITER, MEMBER, RCULIST) \ - for (INIT_MULTIVAR(ITER, MEMBER, (ITER)->MEMBER.prev, struct rculist); \ + for (INIT_MULTIVAR(ITER, MEMBER, rculist_back_protected(ITER->MEMBER), \ + struct rculist); \ CONDITION_MULTIVAR(ITER, MEMBER, ITER_VAR(ITER) != (RCULIST)); \ - UPDATE_MULTIVAR(ITER, ITER_VAR(VAR).prev)) + UPDATE_MULTIVAR(ITER, ITER_VAR(ITER)->prev)) #define RCULIST_FOR_EACH_PROTECTED(ITER, MEMBER, RCULIST) \ for (INIT_MULTIVAR(ITER, MEMBER, rculist_next_protected(RCULIST), \ struct rculist); \ CONDITION_MULTIVAR(ITER, MEMBER, ITER_VAR(ITER) != (RCULIST)); \ - UPDATE_MULTIVAR(ITER, rculist_next_protected(ITER_VAR(ITER))) \ + UPDATE_MULTIVAR(ITER, rculist_next_protected(ITER_VAR(ITER)))) \ #define RCULIST_FOR_EACH_SAFE_SHORT_PROTECTED(ITER, MEMBER, RCULIST) \ for (INIT_MULTIVAR_SAFE_SHORT(ITER, MEMBER, \ @@ -399,18 +401,18 @@ rculist_is_singleton_protected(const struct rculist *list) struct rculist); \ CONDITION_MULTIVAR_SAFE_SHORT(ITER, MEMBER, 
\ ITER_VAR(ITER) != (RCULIST), \ - ITER_NEXT_VAR(ITER) = rculist_next_protected(ITER_VAR(VAR))); \ - UPDATE_MULTIVAR_SHORT(ITER)) + ITER_NEXT_VAR(ITER) = rculist_next_protected(ITER_VAR(ITER))); \ + UPDATE_MULTIVAR_SAFE_SHORT(ITER)) #define RCULIST_FOR_EACH_SAFE_LONG_PROTECTED(ITER, NEXT, MEMBER, RCULIST) \ for (INIT_MULTIVAR_SAFE_LONG(ITER, NEXT, MEMBER, \ - rculist_next_protected(RCULIST) \ + rculist_next_protected(RCULIST), \ struct rculist); \ - CONDITION_MULTIVAR_SAFE_LONG(VAR, NEXT, MEMBER \ + CONDITION_MULTIVAR_SAFE_LONG(ITER, NEXT, MEMBER, \ ITER_VAR(ITER) != (RCULIST), \ - ITER_VAR(NEXT) = rculist_next_protected(ITER_VAR(VAR)), \ + ITER_VAR(NEXT) = rculist_next_protected(ITER_VAR(ITER)), \ ITER_VAR(NEXT) != (RCULIST)); \ - UPDATE_MULTIVAR_LONG(ITER)) + UPDATE_MULTIVAR_SAFE_LONG(ITER, NEXT)) #define RCULIST_FOR_EACH_SAFE_PROTECTED(...) \ OVERLOAD_SAFE_MACRO(RCULIST_FOR_EACH_SAFE_LONG_PROTECTED, \ diff --git a/lib/route-table.c b/lib/route-table.c index ac82cf262f8..f1fe32714e8 100644 --- a/lib/route-table.c +++ b/lib/route-table.c @@ -26,6 +26,7 @@ #include #include +#include "coverage.h" #include "hash.h" #include "netdev.h" #include "netlink.h" @@ -44,6 +45,8 @@ VLOG_DEFINE_THIS_MODULE(route_table); +COVERAGE_DEFINE(route_table_dump); + struct route_data { /* Copied from struct rtmsg. */ unsigned char rtm_dst_len; @@ -51,6 +54,7 @@ struct route_data { /* Extracted from Netlink attributes. */ struct in6_addr rta_dst; /* 0 if missing. */ + struct in6_addr rta_prefsrc; /* 0 if missing. */ struct in6_addr rta_gw; char ifname[IFNAMSIZ]; /* Interface name. 
*/ uint32_t mark; @@ -79,7 +83,7 @@ static struct nln_notifier *name_notifier = NULL; static bool route_table_valid = false; -static int route_table_reset(void); +static void route_table_reset(void); static void route_table_handle_msg(const struct route_table_msg *); static int route_table_parse(struct ofpbuf *, struct route_table_msg *); static void route_table_change(const struct route_table_msg *, void *); @@ -152,26 +156,22 @@ route_table_wait(void) ovs_mutex_unlock(&route_table_mutex); } -static int -route_table_reset(void) +static bool +route_table_dump_one_table(unsigned char id) { - struct nl_dump dump; - struct rtgenmsg *rtgenmsg; uint64_t reply_stub[NL_DUMP_BUFSIZE / 8]; struct ofpbuf request, reply, buf; - - route_map_clear(); - netdev_get_addrs_list_flush(); - route_table_valid = true; - rt_change_seq++; + struct rtmsg *rq_msg; + bool filtered = true; + struct nl_dump dump; ofpbuf_init(&request, 0); - nl_msg_put_nlmsghdr(&request, sizeof *rtgenmsg, RTM_GETROUTE, - NLM_F_REQUEST); + nl_msg_put_nlmsghdr(&request, sizeof *rq_msg, RTM_GETROUTE, NLM_F_REQUEST); - rtgenmsg = ofpbuf_put_zeros(&request, sizeof *rtgenmsg); - rtgenmsg->rtgen_family = AF_UNSPEC; + rq_msg = ofpbuf_put_zeros(&request, sizeof *rq_msg); + rq_msg->rtm_family = AF_UNSPEC; + rq_msg->rtm_table = id; nl_dump_start(&dump, NETLINK_ROUTE, &request); ofpbuf_uninit(&request); @@ -181,12 +181,43 @@ route_table_reset(void) struct route_table_msg msg; if (route_table_parse(&reply, &msg)) { + struct nlmsghdr *nlmsghdr = nl_msg_nlmsghdr(&reply); + + /* Older kernels do not support filtering. 
*/ + if (!(nlmsghdr->nlmsg_flags & NLM_F_DUMP_FILTERED)) { + filtered = false; + } route_table_handle_msg(&msg); } } ofpbuf_uninit(&buf); + nl_dump_done(&dump); + + return filtered; +} + +static void +route_table_reset(void) +{ + unsigned char tables[] = { + RT_TABLE_DEFAULT, + RT_TABLE_MAIN, + RT_TABLE_LOCAL, + }; - return nl_dump_done(&dump); + route_map_clear(); + netdev_get_addrs_list_flush(); + route_table_valid = true; + rt_change_seq++; + + COVERAGE_INC(route_table_dump); + + for (size_t i = 0; i < ARRAY_SIZE(tables); i++) { + if (!route_table_dump_one_table(tables[i])) { + /* Got unfiltered reply, no need to dump further. */ + break; + } + } } /* Return RTNLGRP_IPV4_ROUTE or RTNLGRP_IPV6_ROUTE on success, 0 on parse @@ -201,6 +232,8 @@ route_table_parse(struct ofpbuf *buf, struct route_table_msg *change) [RTA_OIF] = { .type = NL_A_U32, .optional = true }, [RTA_GATEWAY] = { .type = NL_A_U32, .optional = true }, [RTA_MARK] = { .type = NL_A_U32, .optional = true }, + [RTA_PREFSRC] = { .type = NL_A_U32, .optional = true }, + [RTA_TABLE] = { .type = NL_A_U32, .optional = true }, }; static const struct nl_policy policy6[] = { @@ -208,6 +241,8 @@ route_table_parse(struct ofpbuf *buf, struct route_table_msg *change) [RTA_OIF] = { .type = NL_A_U32, .optional = true }, [RTA_MARK] = { .type = NL_A_U32, .optional = true }, [RTA_GATEWAY] = { .type = NL_A_IPV6, .optional = true }, + [RTA_PREFSRC] = { .type = NL_A_IPV6, .optional = true }, + [RTA_TABLE] = { .type = NL_A_U32, .optional = true }, }; struct nlattr *attrs[ARRAY_SIZE(policy)]; @@ -229,6 +264,7 @@ route_table_parse(struct ofpbuf *buf, struct route_table_msg *change) if (parsed) { const struct nlmsghdr *nlmsg; + uint32_t table_id; int rta_oif; /* Output interface index. 
*/ nlmsg = buf->data; @@ -244,6 +280,19 @@ route_table_parse(struct ofpbuf *buf, struct route_table_msg *change) rtm->rtm_type != RTN_LOCAL) { change->relevant = false; } + + table_id = rtm->rtm_table; + if (attrs[RTA_TABLE]) { + table_id = nl_attr_get_u32(attrs[RTA_TABLE]); + } + /* Do not consider changes in non-standard routing tables. */ + if (table_id + && table_id != RT_TABLE_DEFAULT + && table_id != RT_TABLE_MAIN + && table_id != RT_TABLE_LOCAL) { + change->relevant = false; + } + change->nlmsg_type = nlmsg->nlmsg_type; change->rd.rtm_dst_len = rtm->rtm_dst_len + (ipv4 ? 96 : 0); change->rd.local = rtm->rtm_type == RTN_LOCAL; @@ -274,6 +323,16 @@ route_table_parse(struct ofpbuf *buf, struct route_table_msg *change) } else if (ipv4) { in6_addr_set_mapped_ipv4(&change->rd.rta_dst, 0); } + if (attrs[RTA_PREFSRC]) { + if (ipv4) { + ovs_be32 prefsrc; + prefsrc = nl_attr_get_be32(attrs[RTA_PREFSRC]); + in6_addr_set_mapped_ipv4(&change->rd.rta_prefsrc, prefsrc); + } else { + change->rd.rta_prefsrc = + nl_attr_get_in6_addr(attrs[RTA_PREFSRC]); + } + } if (attrs[RTA_GATEWAY]) { if (ipv4) { ovs_be32 gw; @@ -299,7 +358,9 @@ static void route_table_change(const struct route_table_msg *change OVS_UNUSED, void *aux OVS_UNUSED) { - route_table_valid = false; + if (!change || change->relevant) { + route_table_valid = false; + } } static void @@ -309,7 +370,8 @@ route_table_handle_msg(const struct route_table_msg *change) const struct route_data *rd = &change->rd; ovs_router_insert(rd->mark, &rd->rta_dst, rd->rtm_dst_len, - rd->local, rd->ifname, &rd->rta_gw); + rd->local, rd->ifname, &rd->rta_gw, + &rd->rta_prefsrc); } } diff --git a/lib/rstp.c b/lib/rstp.c index 7e351bf32ff..90e80945997 100644 --- a/lib/rstp.c +++ b/lib/rstp.c @@ -50,7 +50,7 @@ VLOG_DEFINE_THIS_MODULE(rstp); -struct ovs_mutex rstp_mutex = OVS_MUTEX_INITIALIZER; +struct ovs_mutex rstp_mutex; static struct ovs_list all_rstps__ = OVS_LIST_INITIALIZER(&all_rstps__); static struct ovs_list *const all_rstps 
OVS_GUARDED_BY(rstp_mutex) = &all_rstps__; @@ -248,6 +248,10 @@ void rstp_init(void) OVS_EXCLUDED(rstp_mutex) { + /* We need a recursive mutex because rstp_send_bpdu() could loop back + * into the rstp module through a patch port. */ + ovs_mutex_init_recursive(&rstp_mutex); + unixctl_command_register("rstp/tcn", "[bridge]", 0, 1, rstp_unixctl_tcn, NULL); unixctl_command_register("rstp/show", "[bridge]", 0, 1, rstp_unixctl_show, @@ -784,7 +788,7 @@ rstp_convert_speed_to_cost(unsigned int speed) : speed >= 100 ? 200000 /* 100 Mb/s. */ : speed >= 10 ? 2000000 /* 10 Mb/s. */ : speed >= 1 ? 20000000 /* 1 Mb/s. */ - : RSTP_DEFAULT_PORT_PATH_COST; /* 100 Mb/s. */ + : RSTP_DEFAULT_PORT_PATH_COST; /* 10 Gb/s. */ return value; } diff --git a/lib/rstp.h b/lib/rstp.h index 39a13b58c1f..13af2019516 100644 --- a/lib/rstp.h +++ b/lib/rstp.h @@ -84,7 +84,7 @@ struct dp_packet; /* Port path cost [Table 17-3] */ #define RSTP_MIN_PORT_PATH_COST 1 #define RSTP_MAX_PORT_PATH_COST 200000000 -#define RSTP_DEFAULT_PORT_PATH_COST 200000 +#define RSTP_DEFAULT_PORT_PATH_COST 2000 /* RSTP Bridge identifier [9.2.5]. Top four most significant bits are a * priority value. 
The next most significant twelve bits are a locally diff --git a/lib/rtnetlink.c b/lib/rtnetlink.c index f67352603f7..37078d00e10 100644 --- a/lib/rtnetlink.c +++ b/lib/rtnetlink.c @@ -112,7 +112,7 @@ rtnetlink_parse(struct ofpbuf *buf, struct rtnetlink_change *change) if (parsed) { const struct ifinfomsg *ifinfo; - ifinfo = ofpbuf_at(buf, NLMSG_HDRLEN, sizeof *ifinfo); + ifinfo = ofpbuf_at_assert(buf, NLMSG_HDRLEN, sizeof *ifinfo); /* Wireless events can be spammy and cause a * lot of unnecessary churn and CPU load in @@ -175,7 +175,7 @@ rtnetlink_parse(struct ofpbuf *buf, struct rtnetlink_change *change) if (parsed) { const struct ifaddrmsg *ifaddr; - ifaddr = ofpbuf_at(buf, NLMSG_HDRLEN, sizeof *ifaddr); + ifaddr = ofpbuf_at_assert(buf, NLMSG_HDRLEN, sizeof *ifaddr); change->nlmsg_type = nlmsg->nlmsg_type; change->if_index = ifaddr->ifa_index; diff --git a/lib/seq.c b/lib/seq.c index 99e5bf8bd10..7c2fa0c69f3 100644 --- a/lib/seq.c +++ b/lib/seq.c @@ -32,7 +32,7 @@ COVERAGE_DEFINE(seq_change); /* A sequence number object. */ struct seq { - uint64_t value OVS_GUARDED; + atomic_uint64_t value; struct hmap waiters OVS_GUARDED; /* Contains 'struct seq_waiter's. 
*/ }; @@ -72,6 +72,7 @@ static void seq_wake_waiters(struct seq *) OVS_REQUIRES(seq_mutex); struct seq * OVS_EXCLUDED(seq_mutex) seq_create(void) { + uint64_t seq_value; struct seq *seq; seq_init(); @@ -81,7 +82,8 @@ seq_create(void) COVERAGE_INC(seq_change); ovs_mutex_lock(&seq_mutex); - seq->value = seq_next++; + seq_value = seq_next++; + atomic_store_relaxed(&seq->value, seq_value); hmap_init(&seq->waiters); ovs_mutex_unlock(&seq_mutex); @@ -126,9 +128,11 @@ void seq_change_protected(struct seq *seq) OVS_REQUIRES(seq_mutex) { + uint64_t seq_value = seq_next++; + COVERAGE_INC(seq_change); - seq->value = seq_next++; + atomic_store_explicit(&seq->value, seq_value, memory_order_release); seq_wake_waiters(seq); } @@ -143,18 +147,6 @@ seq_change(struct seq *seq) ovs_mutex_unlock(&seq_mutex); } -/* Returns 'seq''s current sequence number (which could change immediately). - * - * seq_read() and seq_wait() can be used together to yield a race-free wakeup - * when an object changes, even without an ability to lock the object. See - * Usage in seq.h for details. */ -uint64_t -seq_read_protected(const struct seq *seq) - OVS_REQUIRES(seq_mutex) -{ - return seq->value; -} - /* Returns 'seq''s current sequence number (which could change immediately). * * seq_read() and seq_wait() can be used together to yield a race-free wakeup @@ -162,14 +154,12 @@ seq_read_protected(const struct seq *seq) * Usage in seq.h for details. */ uint64_t seq_read(const struct seq *seq) - OVS_EXCLUDED(seq_mutex) { uint64_t value; - ovs_mutex_lock(&seq_mutex); - value = seq_read_protected(seq); - ovs_mutex_unlock(&seq_mutex); - + /* Note that the odd CONST_CAST() is here to keep sparse happy. 
*/ + atomic_read_explicit(&CONST_CAST(struct seq *, seq)->value, &value, + memory_order_acquire); return value; } @@ -226,7 +216,7 @@ seq_wait_at(const struct seq *seq_, uint64_t value, const char *where) struct seq *seq = CONST_CAST(struct seq *, seq_); ovs_mutex_lock(&seq_mutex); - if (value == seq->value) { + if (value == seq_read(seq_)) { seq_wait__(seq, value, where); } else { poll_immediate_wake_at(where); diff --git a/lib/seq.h b/lib/seq.h index c88b9d1c814..fcfa010376d 100644 --- a/lib/seq.h +++ b/lib/seq.h @@ -128,7 +128,6 @@ void seq_unlock(void); /* For observers. */ uint64_t seq_read(const struct seq *); -uint64_t seq_read_protected(const struct seq *); void seq_wait_at(const struct seq *, uint64_t value, const char *where); #define seq_wait(seq, value) seq_wait_at(seq, value, OVS_SOURCE_LOCATOR) diff --git a/lib/sflow_agent.c b/lib/sflow_agent.c index c95f654a59c..743774a27b3 100644 --- a/lib/sflow_agent.c +++ b/lib/sflow_agent.c @@ -510,8 +510,16 @@ void sfl_agent_sysError(SFLAgent *agent, char *modName, char *msg) static void * sflAlloc(SFLAgent *agent, size_t bytes) { - if(agent->allocFn) return (*agent->allocFn)(agent->magic, agent, bytes); - else return SFL_ALLOC(bytes); + void *alloc; + + if (agent->allocFn) { + alloc = (*agent->allocFn)(agent->magic, agent, bytes); + ovs_assert(alloc); + memset(alloc, 0, bytes); + } else { + alloc = SFL_ALLOC(bytes); + } + return alloc; } static void sflFree(SFLAgent *agent, void *obj) diff --git a/lib/sflow_api.h b/lib/sflow_api.h index a0530b37ab4..b884a6a7d09 100644 --- a/lib/sflow_api.h +++ b/lib/sflow_api.h @@ -97,7 +97,7 @@ typedef struct _SFLReceiver { struct _SFLReceiver *nxt; /* MIB fields */ char *sFlowRcvrOwner; - time_t sFlowRcvrTimeout; + u_int32_t sFlowRcvrTimeout; u_int32_t sFlowRcvrMaximumDatagramSize; SFLAddress sFlowRcvrAddress; u_int32_t sFlowRcvrPort; @@ -148,7 +148,7 @@ typedef struct _SFLPoller { /* MIB fields */ SFLDataSource_instance dsi; u_int32_t sFlowCpReceiver; - time_t 
sFlowCpInterval; + u_int32_t sFlowCpInterval; /* public fields */ struct _SFLAgent *agent; /* pointer to my agent */ void *magic; /* ptr to pass back in getCountersFn() */ @@ -156,7 +156,7 @@ typedef struct _SFLPoller { u_int32_t bridgePort; /* port number local to bridge */ /* private fields */ SFLReceiver *myReceiver; - time_t countersCountdown; + u_int32_t countersCountdown; u_int32_t countersSampleSeqNo; } SFLPoller; @@ -251,8 +251,8 @@ SFLSampler *sfl_agent_getSamplerByIfIndex(SFLAgent *agent, u_int32_t ifIndex); /* receiver */ char * sfl_receiver_get_sFlowRcvrOwner(SFLReceiver *receiver); void sfl_receiver_set_sFlowRcvrOwner(SFLReceiver *receiver, char *sFlowRcvrOwner); -time_t sfl_receiver_get_sFlowRcvrTimeout(SFLReceiver *receiver); -void sfl_receiver_set_sFlowRcvrTimeout(SFLReceiver *receiver, time_t sFlowRcvrTimeout); +u_int32_t sfl_receiver_get_sFlowRcvrTimeout(SFLReceiver *receiver); +void sfl_receiver_set_sFlowRcvrTimeout(SFLReceiver *receiver, u_int32_t sFlowRcvrTimeout); u_int32_t sfl_receiver_get_sFlowRcvrMaximumDatagramSize(SFLReceiver *receiver); void sfl_receiver_set_sFlowRcvrMaximumDatagramSize(SFLReceiver *receiver, u_int32_t sFlowRcvrMaximumDatagramSize); SFLAddress *sfl_receiver_get_sFlowRcvrAddress(SFLReceiver *receiver); @@ -337,7 +337,7 @@ void sfl_agent_sysError(SFLAgent *agent, char *modName, char *msg); u_int32_t sfl_receiver_samplePacketsSent(SFLReceiver *receiver); -#define SFL_ALLOC malloc +#define SFL_ALLOC xzalloc #define SFL_FREE free #endif /* SFLOW_API_H */ diff --git a/lib/sflow_poller.c b/lib/sflow_poller.c index 9e6a487bc07..46e40cbd42d 100644 --- a/lib/sflow_poller.c +++ b/lib/sflow_poller.c @@ -6,6 +6,7 @@ */ #include "sflow_api.h" +#include "random.h" /*_________________--------------------------__________________ _________________ sfl_poller_init __________________ @@ -88,7 +89,7 @@ void sfl_poller_set_sFlowCpInterval(SFLPoller *poller, u_int32_t sFlowCpInterval Another smoothing factor is that the tick() function called 
here is usually driven from a fairly "soft" polling loop rather than a hard real-time event. */ - poller->countersCountdown = 1 + (random() % sFlowCpInterval); + poller->countersCountdown = 1 + random_range(sFlowCpInterval); } else { /* Setting sFlowCpInterval to 0 disables counter polling altogether. Thanks to diff --git a/lib/sflow_receiver.c b/lib/sflow_receiver.c index 4162518e3c4..3c5aec897e4 100644 --- a/lib/sflow_receiver.c +++ b/lib/sflow_receiver.c @@ -102,10 +102,10 @@ void sfl_receiver_set_sFlowRcvrOwner(SFLReceiver *receiver, char *sFlowRcvrOwner reset(receiver); } } -time_t sfl_receiver_get_sFlowRcvrTimeout(SFLReceiver *receiver) { +u_int32_t sfl_receiver_get_sFlowRcvrTimeout(SFLReceiver *receiver) { return receiver->sFlowRcvrTimeout; } -void sfl_receiver_set_sFlowRcvrTimeout(SFLReceiver *receiver, time_t sFlowRcvrTimeout) { +void sfl_receiver_set_sFlowRcvrTimeout(SFLReceiver *receiver, u_int32_t sFlowRcvrTimeout) { receiver->sFlowRcvrTimeout =sFlowRcvrTimeout; } u_int32_t sfl_receiver_get_sFlowRcvrMaximumDatagramSize(SFLReceiver *receiver) { @@ -146,7 +146,8 @@ void sfl_receiver_tick(SFLReceiver *receiver) // if there are any samples to send, flush them now if(receiver->sampleCollector.numSamples > 0) sendSample(receiver); // check the timeout - if(receiver->sFlowRcvrTimeout && (u_int32_t)receiver->sFlowRcvrTimeout != 0xFFFFFFFF) { + if(receiver->sFlowRcvrTimeout + && receiver->sFlowRcvrTimeout != UINT32_MAX) { // count down one tick and reset if we reach 0 if(--receiver->sFlowRcvrTimeout == 0) reset(receiver); } diff --git a/lib/shash.c b/lib/shash.c index a7b2c645829..92260cddf8c 100644 --- a/lib/shash.c +++ b/lib/shash.c @@ -17,6 +17,7 @@ #include #include "openvswitch/shash.h" #include "hash.h" +#include "util.h" static struct shash_node *shash_find__(const struct shash *, const char *name, size_t name_len, @@ -100,6 +101,7 @@ shash_is_empty(const struct shash *shash) size_t shash_count(const struct shash *shash) { + ovs_assert(shash); return 
hmap_count(&shash->map); } @@ -203,6 +205,10 @@ shash_delete(struct shash *sh, struct shash_node *node) char * shash_steal(struct shash *sh, struct shash_node *node) { + if (!node) { + return NULL; + } + char *name = node->name; hmap_remove(&sh->map, &node->node); @@ -263,7 +269,7 @@ void * shash_find_and_delete_assert(struct shash *sh, const char *name) { void *data = shash_find_and_delete(sh, name); - ovs_assert(data != NULL); + ovs_assert(data); return data; } diff --git a/lib/simap.c b/lib/simap.c index 0ee08d74d52..1c01d4ebe22 100644 --- a/lib/simap.c +++ b/lib/simap.c @@ -17,6 +17,7 @@ #include #include "simap.h" #include "hash.h" +#include "util.h" static size_t hash_name(const char *, size_t length); static struct simap_node *simap_find__(const struct simap *, @@ -84,6 +85,7 @@ simap_is_empty(const struct simap *simap) size_t simap_count(const struct simap *simap) { + ovs_assert(simap); return hmap_count(&simap->map); } diff --git a/lib/smap.c b/lib/smap.c index c1633e2a18d..122adca2717 100644 --- a/lib/smap.c +++ b/lib/smap.c @@ -100,7 +100,7 @@ smap_add_format(struct smap *smap, const char *key, const char *format, ...) /* Adds 'key' paired with a string representation of 'addr'. It is the * caller's responsibility to avoid duplicate keys if desirable. */ void -smap_add_ipv6(struct smap *smap, const char *key, struct in6_addr *addr) +smap_add_ipv6(struct smap *smap, const char *key, const struct in6_addr *addr) { char buf[INET6_ADDRSTRLEN]; ipv6_string_mapped(buf, addr); @@ -300,6 +300,7 @@ smap_is_empty(const struct smap *smap) size_t smap_count(const struct smap *smap) { + ovs_assert(smap); return hmap_count(&smap->map); } diff --git a/lib/smap.h b/lib/smap.h index 2fe6c540a71..d1d2ae6f20a 100644 --- a/lib/smap.h +++ b/lib/smap.h @@ -100,7 +100,7 @@ struct smap_node *smap_add_nocopy(struct smap *, char *, char *); bool smap_add_once(struct smap *, const char *, const char *); void smap_add_format(struct smap *, const char *key, const char *, ...) 
OVS_PRINTF_FORMAT(3, 4); -void smap_add_ipv6(struct smap *, const char *, struct in6_addr *); +void smap_add_ipv6(struct smap *, const char *, const struct in6_addr *); void smap_replace(struct smap *, const char *, const char *); void smap_replace_nocopy(struct smap *, const char *, char *); diff --git a/lib/socket-util.c b/lib/socket-util.c index 38705cc51e0..c569b7d1664 100644 --- a/lib/socket-util.c +++ b/lib/socket-util.c @@ -546,9 +546,15 @@ inet_parse_active(const char *target_, int default_port, if (!host) { VLOG_ERR("%s: host must be specified", target_); ok = false; + if (dns_failure) { + *dns_failure = false; + } } else if (!port && default_port < 0) { VLOG_ERR("%s: port must be specified", target_); ok = false; + if (dns_failure) { + *dns_failure = false; + } } else { ok = parse_sockaddr_components(ss, host, port, default_port, target_, resolve_host, dns_failure); @@ -660,7 +666,8 @@ inet_open_active(int style, const char *target, int default_port, * zeros '*ss' and returns false. 
*/ bool inet_parse_passive(const char *target_, int default_port, - struct sockaddr_storage *ss) + struct sockaddr_storage *ss, + bool resolve_host, bool *dns_failure) { char *target = xstrdup(target_); char *port, *host; @@ -670,9 +677,12 @@ inet_parse_passive(const char *target_, int default_port, if (!port && default_port < 0) { VLOG_ERR("%s: port must be specified", target_); ok = false; + if (dns_failure) { + *dns_failure = false; + } } else { ok = parse_sockaddr_components(ss, host, port, default_port, - target_, true, NULL); + target_, resolve_host, dns_failure); } if (!ok) { memset(ss, 0, sizeof *ss); @@ -710,8 +720,14 @@ inet_open_passive(int style, const char *target, int default_port, struct sockaddr_storage ss; int fd = 0, error; unsigned int yes = 1; + bool dns_failure; - if (!inet_parse_passive(target, default_port, &ss)) { + if (!inet_parse_passive(target, default_port, &ss, true, &dns_failure)) { + if (dns_failure) { + /* DNS failure means asynchronous DNS resolution is in progress, + * or that the name does currently not resolve. */ + return -EAGAIN; + } return -EAFNOSUPPORT; } kernel_chooses_port = ss_get_port(&ss) == 0; @@ -753,7 +769,7 @@ inet_open_passive(int style, const char *target, int default_port, } /* Listen. 
*/ - if (style == SOCK_STREAM && listen(fd, 10) < 0) { + if (style == SOCK_STREAM && listen(fd, 64) < 0) { error = sock_errno(); VLOG_ERR("%s: listen: %s", target, sock_strerror(error)); goto error; diff --git a/lib/socket-util.h b/lib/socket-util.h index bf66393df94..4eec627e3ed 100644 --- a/lib/socket-util.h +++ b/lib/socket-util.h @@ -55,7 +55,8 @@ int inet_open_active(int style, const char *target, int default_port, struct sockaddr_storage *ssp, int *fdp, uint8_t dscp); bool inet_parse_passive(const char *target, int default_port, - struct sockaddr_storage *ssp); + struct sockaddr_storage *ssp, + bool resolve_host, bool *dns_failure); int inet_open_passive(int style, const char *target, int default_port, struct sockaddr_storage *ssp, uint8_t dscp, bool kernel_print_port); diff --git a/lib/sset.c b/lib/sset.c index aa179002020..fda26812906 100644 --- a/lib/sset.c +++ b/lib/sset.c @@ -261,6 +261,11 @@ char * sset_pop(struct sset *set) { const char *name = SSET_FIRST(set); + + if (!name) { + return NULL; + } + char *copy = xstrdup(name); sset_delete(set, SSET_NODE_FROM_NAME(name)); return copy; diff --git a/lib/stp.c b/lib/stp.c index a869b5f390c..f37337992a3 100644 --- a/lib/stp.c +++ b/lib/stp.c @@ -313,7 +313,7 @@ stp_create(const char *name, stp_identifier bridge_id, for (p = stp->ports; p < &stp->ports[ARRAY_SIZE(stp->ports)]; p++) { p->stp = stp; p->port_id = (stp_port_no(p) + 1) | (STP_DEFAULT_PORT_PRIORITY << 8); - p->path_cost = 19; /* Recommended default for 100 Mb/s link. */ + p->path_cost = 2; /* Recommended default for 10 Gb/s link. */ stp_initialize_port(p, STP_DISABLED); } ovs_refcount_init(&stp->ref_cnt); @@ -989,7 +989,7 @@ stp_convert_speed_to_cost(unsigned int speed) : speed >= 16 ? 62 /* 16 Mb/s. */ : speed >= 10 ? 100 /* 10 Mb/s. */ : speed >= 4 ? 250 /* 4 Mb/s. */ - : 19; /* 100 Mb/s (guess). */ + : 2; /* 10 Gb/s (guess). 
*/ ovs_mutex_unlock(&mutex); return ret; } diff --git a/lib/stream-ssl.c b/lib/stream-ssl.c index f4fe3432e77..86747e58ba2 100644 --- a/lib/stream-ssl.c +++ b/lib/stream-ssl.c @@ -193,7 +193,9 @@ static void ssl_clear_txbuf(struct ssl_stream *); static void interpret_queued_ssl_error(const char *function); static int interpret_ssl_error(const char *function, int ret, int error, int *want); +#if OPENSSL_VERSION_NUMBER < 0x3000000fL static DH *tmp_dh_callback(SSL *ssl, int is_export OVS_UNUSED, int keylength); +#endif static void log_ca_cert(const char *file_name, X509 *cert); static void stream_ssl_set_ca_cert_file__(const char *file_name, bool bootstrap, bool force); @@ -471,7 +473,11 @@ static char * get_peer_common_name(const struct ssl_stream *sslv) { char *peer_name = NULL; +#if OPENSSL_VERSION_NUMBER < 0x3000000fL X509 *peer_cert = SSL_get_peer_certificate(sslv->ssl); +#else + X509 *peer_cert = SSL_get1_peer_certificate(sslv->ssl); +#endif if (!peer_cert) { return NULL; } @@ -1069,8 +1075,18 @@ do_ssl_init(void) VLOG_ERR("SSL_CTX_new: %s", ERR_error_string(ERR_get_error(), NULL)); return ENOPROTOOPT; } - SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3); + + long options = SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3; +#ifdef SSL_OP_IGNORE_UNEXPECTED_EOF + options |= SSL_OP_IGNORE_UNEXPECTED_EOF; +#endif + SSL_CTX_set_options(ctx, options); + +#if OPENSSL_VERSION_NUMBER < 0x3000000fL SSL_CTX_set_tmp_dh_callback(ctx, tmp_dh_callback); +#else + SSL_CTX_set_dh_auto(ctx, 1); +#endif SSL_CTX_set_mode(ctx, SSL_MODE_ENABLE_PARTIAL_WRITE); SSL_CTX_set_mode(ctx, SSL_MODE_ACCEPT_MOVING_WRITE_BUFFER); SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER | SSL_VERIFY_FAIL_IF_NO_PEER_CERT, @@ -1081,6 +1097,7 @@ do_ssl_init(void) return 0; } +#if OPENSSL_VERSION_NUMBER < 0x3000000fL static DH * tmp_dh_callback(SSL *ssl OVS_UNUSED, int is_export OVS_UNUSED, int keylength) { @@ -1112,6 +1129,7 @@ tmp_dh_callback(SSL *ssl OVS_UNUSED, int is_export OVS_UNUSED, int keylength) keylength); 
return NULL; } +#endif /* Returns true if SSL is at least partially configured. */ bool diff --git a/lib/table.c b/lib/table.c index 48d18b65182..b7addbf390f 100644 --- a/lib/table.c +++ b/lib/table.c @@ -522,7 +522,7 @@ table_print_json__(const struct table *table, const struct table_style *style, json_object_put_string(json, "caption", table->caption); } if (table->timestamp) { - json_object_put_nocopy( + json_object_put( json, "time", json_string_create_nocopy(table_format_timestamp__())); } diff --git a/lib/tc.c b/lib/tc.c index 94044cde606..e55ba3b1bbc 100644 --- a/lib/tc.c +++ b/lib/tc.c @@ -36,8 +36,10 @@ #include #include "byte-order.h" +#include "coverage.h" #include "netlink-socket.h" #include "netlink.h" +#include "odp-util.h" #include "openvswitch/ofpbuf.h" #include "openvswitch/util.h" #include "openvswitch/vlog.h" @@ -67,6 +69,8 @@ VLOG_DEFINE_THIS_MODULE(tc); +COVERAGE_DEFINE(tc_netlink_malformed_reply); + static struct vlog_rate_limit error_rl = VLOG_RATE_LIMIT_INIT(60, 5); static enum tc_offload_policy tc_policy = TC_POLICY_NONE; @@ -84,6 +88,11 @@ struct flower_key_to_pedit { int boundary_shift; }; +struct tc_flow_stats { + uint64_t n_packets; + uint64_t n_bytes; +}; + static struct flower_key_to_pedit flower_pedit_map[] = { { TCA_PEDIT_KEY_EX_HDR_TYPE_IP4, @@ -691,9 +700,41 @@ nl_parse_geneve_key(const struct nlattr *in_nlattr, return 0; } +static int +nl_parse_vxlan_key(const struct nlattr *in_nlattr, + struct tc_flower_tunnel *tunnel) +{ + const struct ofpbuf *msg; + struct nlattr *nla; + struct ofpbuf buf; + uint32_t gbp_raw; + size_t left; + + nl_attr_get_nested(in_nlattr, &buf); + msg = &buf; + + NL_ATTR_FOR_EACH (nla, left, ofpbuf_at(msg, 0, 0), msg->size) { + uint16_t type = nl_attr_type(nla); + + switch (type) { + case TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP: + gbp_raw = nl_attr_get_u32(nla); + odp_decode_gbp_raw(gbp_raw, &tunnel->gbp.id, + &tunnel->gbp.flags); + tunnel->gbp.id_present = true; + break; + default: + VLOG_WARN_RL(&error_rl, 
"failed to parse vxlan tun options"); + return EINVAL; + } + } + + return 0; +} + static int nl_parse_flower_tunnel_opts(struct nlattr *options, - struct tun_metadata *metadata) + struct tc_flower_tunnel *tunnel) { const struct ofpbuf *msg; struct nlattr *nla; @@ -708,7 +749,14 @@ nl_parse_flower_tunnel_opts(struct nlattr *options, uint16_t type = nl_attr_type(nla); switch (type) { case TCA_FLOWER_KEY_ENC_OPTS_GENEVE: - err = nl_parse_geneve_key(nla, metadata); + err = nl_parse_geneve_key(nla, &tunnel->metadata); + if (err) { + return err; + } + + break; + case TCA_FLOWER_KEY_ENC_OPTS_VXLAN: + err = nl_parse_vxlan_key(nla, tunnel); if (err) { return err; } @@ -820,13 +868,13 @@ nl_parse_flower_tunnel(struct nlattr **attrs, struct tc_flower *flower) if (attrs[TCA_FLOWER_KEY_ENC_OPTS] && attrs[TCA_FLOWER_KEY_ENC_OPTS_MASK]) { err = nl_parse_flower_tunnel_opts(attrs[TCA_FLOWER_KEY_ENC_OPTS], - &flower->key.tunnel.metadata); + &flower->key.tunnel); if (err) { return err; } err = nl_parse_flower_tunnel_opts(attrs[TCA_FLOWER_KEY_ENC_OPTS_MASK], - &flower->mask.tunnel.metadata); + &flower->mask.tunnel); if (err) { return err; } @@ -1087,6 +1135,10 @@ nl_parse_act_pedit(struct nlattr *options, struct tc_flower *flower) } ex_type = nl_attr_find_nested(nla, TCA_PEDIT_KEY_EX_HTYPE); + if (!ex_type) { + return EOPNOTSUPP; + } + type = nl_attr_get_u16(ex_type); err = csum_update_flag(flower, type); @@ -1114,7 +1166,7 @@ nl_parse_act_pedit(struct nlattr *options, struct tc_flower *flower) int diff = flower_off + (keys->off - mf); ovs_be32 *dst = (void *) (rewrite_key + diff); ovs_be32 *dst_m = (void *) (rewrite_mask + diff); - ovs_be32 mask, mask_word, data_word; + ovs_be32 mask, mask_word, data_word, val; uint32_t zero_bits; mask_word = htonl(ntohl(keys->mask) << m->boundary_shift); @@ -1129,8 +1181,13 @@ nl_parse_act_pedit(struct nlattr *options, struct tc_flower *flower) mask &= htonl(UINT32_MAX << zero_bits); } - *dst_m |= mask; - *dst |= data_word & mask; + val = 
get_unaligned_be32(dst_m); + val |= mask; + put_unaligned_be32(dst_m, val); + + val = get_unaligned_be32(dst); + val |= data_word & mask; + put_unaligned_be32(dst, val); } } @@ -1233,6 +1290,35 @@ nl_parse_act_geneve_opts(const struct nlattr *in_nlattr, return 0; } +static int +nl_parse_act_vxlan_opts(struct nlattr *in_nlattr, struct tc_action *action) +{ + const struct ofpbuf *msg; + struct nlattr *nla; + struct ofpbuf buf; + size_t left; + + nl_attr_get_nested(in_nlattr, &buf); + msg = &buf; + + NL_ATTR_FOR_EACH (nla, left, ofpbuf_at(msg, 0, 0), msg->size) { + uint16_t type = nl_attr_type(nla); + int32_t gbp_raw; + + switch (type) { + case TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP: + gbp_raw = nl_attr_get_u32(nla); + odp_decode_gbp_raw(gbp_raw, &action->encap.gbp.id, + &action->encap.gbp.flags); + action->encap.gbp.id_present = true; + + break; + } + } + + return 0; +} + static int nl_parse_act_tunnel_opts(struct nlattr *options, struct tc_action *action) { @@ -1257,7 +1343,12 @@ nl_parse_act_tunnel_opts(struct nlattr *options, struct tc_action *action) if (err) { return err; } - + break; + case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN: + err = nl_parse_act_vxlan_opts(nla, action); + if (err) { + return err; + } break; } } @@ -1352,7 +1443,19 @@ get_user_hz(void) static void nl_parse_tcf(const struct tcf_t *tm, struct tc_flower *flower) { - uint64_t lastused = time_msec() - (tm->lastuse * 1000 / get_user_hz()); + uint64_t lastused; + + /* On creation both tm->install and tm->lastuse are set to jiffies + * by the kernel. So if both values are the same, the flow has not been + * used yet. + * + * Note that tm->firstuse can not be used due to some kernel bug, i.e., + * hardware offloaded flows do not update tm->firstuse. 
*/ + if (tm->lastuse == tm->install) { + lastused = 0; + } else { + lastused = time_msec() - (tm->lastuse * 1000 / get_user_hz()); + } if (flower->lastused < lastused) { flower->lastused = lastused; @@ -1401,6 +1504,8 @@ static const struct nl_policy police_policy[] = { [TCA_POLICE_RATE] = { .type = NL_A_UNSPEC, .min_len = 1024, .optional = true, }, + [TCA_POLICE_RATE64] = { .type = NL_A_U32, + .optional = true, }, [TCA_POLICE_PEAKRATE] = { .type = NL_A_UNSPEC, .min_len = 1024, .optional = true, }, @@ -1541,6 +1646,9 @@ static const struct nl_policy ct_policy[] = { .optional = true, }, [TCA_CT_NAT_PORT_MAX] = { .type = NL_A_U16, .optional = true, }, + [TCA_CT_TM] = { .type = NL_A_UNSPEC, + .min_len = sizeof(struct tcf_t), + .optional = true, }, }; static int @@ -1551,6 +1659,7 @@ nl_parse_act_ct(struct nlattr *options, struct tc_flower *flower) struct tc_action *action; const struct tc_ct *ct; uint16_t ct_action = 0; + struct tcf_t tm; if (!nl_parse_nested(options, ct_policy, ct_attrs, ARRAY_SIZE(ct_policy))) { @@ -1636,6 +1745,11 @@ nl_parse_act_ct(struct nlattr *options, struct tc_flower *flower) } action->type = TC_ACT_CT; + if (ct_attrs[TCA_CT_TM]) { + memcpy(&tm, nl_attr_get_unspec(ct_attrs[TCA_CT_TM], sizeof tm), + sizeof tm); + nl_parse_tcf(&tm, flower); + } nl_parse_action_pc(ct->action, action); return 0; } @@ -1834,66 +1948,89 @@ static const struct nl_policy act_policy[] = { [TCA_ACT_STATS] = { .type = NL_A_NESTED, .optional = false, }, }; -static const struct nl_policy stats_policy[] = { - [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, - .min_len = sizeof(struct gnet_stats_basic), - .optional = false, }, - [TCA_STATS_BASIC_HW] = { .type = NL_A_UNSPEC, - .min_len = sizeof(struct gnet_stats_basic), - .optional = true, }, - [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, - .min_len = sizeof(struct gnet_stats_queue), - .optional = true, }, -}; - static int nl_parse_action_stats(struct nlattr *act_stats, struct ovs_flow_stats *stats_sw, struct ovs_flow_stats 
*stats_hw, struct ovs_flow_stats *stats_dropped) { - struct nlattr *stats_attrs[ARRAY_SIZE(stats_policy)]; - struct gnet_stats_basic bs_all, bs_sw, bs_hw; - const struct gnet_stats_queue *qs; + struct tc_flow_stats s_sw = {0}, s_hw = {0}; + const struct gnet_stats_queue *qs = NULL; + uint16_t prev_type = __TCA_STATS_MAX; + const struct nlattr *nla; + unsigned int seen = 0; + size_t left; - if (!nl_parse_nested(act_stats, stats_policy, stats_attrs, - ARRAY_SIZE(stats_policy))) { - VLOG_ERR_RL(&error_rl, "Failed to parse action stats policy"); - return EPROTO; - } + /* Cannot use nl_parse_nested due to duplicate attributes. */ + NL_NESTED_FOR_EACH (nla, left, act_stats) { + struct gnet_stats_basic stats_basic; + uint16_t type = nl_attr_type(nla); - memcpy(&bs_all, - nl_attr_get_unspec(stats_attrs[TCA_STATS_BASIC], sizeof bs_all), - sizeof bs_all); - if (stats_attrs[TCA_STATS_BASIC_HW]) { - memcpy(&bs_hw, nl_attr_get_unspec(stats_attrs[TCA_STATS_BASIC_HW], - sizeof bs_hw), - sizeof bs_hw); + seen |= 1 << type; - bs_sw.packets = bs_all.packets - bs_hw.packets; - bs_sw.bytes = bs_all.bytes - bs_hw.bytes; - } else { - bs_sw.packets = bs_all.packets; - bs_sw.bytes = bs_all.bytes; + switch (type) { + case TCA_STATS_BASIC: + memcpy(&stats_basic, nl_attr_get_unspec(nla, sizeof stats_basic), + sizeof stats_basic); + s_sw.n_packets = stats_basic.packets; + s_sw.n_bytes = stats_basic.bytes; + break; + + case TCA_STATS_BASIC_HW: + memcpy(&stats_basic, nl_attr_get_unspec(nla, sizeof stats_basic), + sizeof stats_basic); + s_hw.n_packets = stats_basic.packets; + s_hw.n_bytes = stats_basic.bytes; + break; + + case TCA_STATS_QUEUE: + qs = nl_attr_get_unspec(nla, sizeof *qs); + break; + + case TCA_STATS_PKT64: + if (prev_type == TCA_STATS_BASIC) { + s_sw.n_packets = nl_attr_get_u64(nla); + } else if (prev_type == TCA_STATS_BASIC_HW) { + s_hw.n_packets = nl_attr_get_u64(nla); + } else { + goto err; + } + break; + + default: + break; + } + prev_type = type; } - if (bs_sw.packets > 
get_32aligned_u64(&stats_sw->n_packets)) { - put_32aligned_u64(&stats_sw->n_packets, bs_sw.packets); - put_32aligned_u64(&stats_sw->n_bytes, bs_sw.bytes); + if (!(seen & (1 << TCA_STATS_BASIC))) { + goto err; } - if (stats_attrs[TCA_STATS_BASIC_HW] - && bs_hw.packets > get_32aligned_u64(&stats_hw->n_packets)) { - put_32aligned_u64(&stats_hw->n_packets, bs_hw.packets); - put_32aligned_u64(&stats_hw->n_bytes, bs_hw.bytes); + if (seen & (1 << TCA_STATS_BASIC_HW)) { + s_sw.n_packets = s_sw.n_packets - s_hw.n_packets; + s_sw.n_bytes = s_sw.n_bytes - s_hw.n_bytes; + + if (s_hw.n_packets > get_32aligned_u64(&stats_hw->n_packets)) { + put_32aligned_u64(&stats_hw->n_packets, s_hw.n_packets); + put_32aligned_u64(&stats_hw->n_bytes, s_hw.n_bytes); + } } - if (stats_dropped && stats_attrs[TCA_STATS_QUEUE]) { - qs = nl_attr_get_unspec(stats_attrs[TCA_STATS_QUEUE], sizeof *qs); + if (s_sw.n_packets > get_32aligned_u64(&stats_sw->n_packets)) { + put_32aligned_u64(&stats_sw->n_packets, s_sw.n_packets); + put_32aligned_u64(&stats_sw->n_bytes, s_sw.n_bytes); + } + + if (stats_dropped && qs) { put_32aligned_u64(&stats_dropped->n_packets, qs->drops); } return 0; + +err: + VLOG_ERR_RL(&error_rl, "Failed to parse action stats policy"); + return EPROTO; } static int @@ -1904,8 +2041,6 @@ nl_parse_single_action(struct nlattr *action, struct tc_flower *flower, struct nlattr *act_cookie; const char *act_kind; struct nlattr *action_attrs[ARRAY_SIZE(act_policy)]; - int act_index = flower->action_count; - bool is_meter = false; int err = 0; if (!nl_parse_nested(action, act_policy, action_attrs, @@ -1943,7 +2078,6 @@ nl_parse_single_action(struct nlattr *action, struct tc_flower *flower, nl_parse_act_ct(act_options, flower); } else if (!strcmp(act_kind, "police")) { nl_parse_act_police(act_options, flower); - is_meter = tc_is_meter_index(flower->actions[act_index].police.index); } else { VLOG_ERR_RL(&error_rl, "unknown tc action kind: %s", act_kind); err = EINVAL; @@ -1958,14 +2092,6 @@ 
nl_parse_single_action(struct nlattr *action, struct tc_flower *flower, flower->act_cookie.len = nl_attr_get_size(act_cookie); } - /* Skip the stats update when act_police is meter since there are always - * some other actions following meter. For other potential kinds of - * act_police actions, whose stats could not be skipped (e.g. filter has - * only one police action), update the action stats to the flow rule. */ - if (is_meter) { - return 0; - } - return nl_parse_action_stats(action_attrs[TCA_ACT_STATS], &flower->stats_sw, &flower->stats_hw, NULL); } @@ -2143,18 +2269,19 @@ int parse_netlink_to_tc_flower(struct ofpbuf *reply, struct tcf_id *id, struct tc_flower *flower, bool terse) { - struct tcmsg *tc; + struct ofpbuf b = ofpbuf_const_initializer(reply->data, reply->size); + struct nlmsghdr *nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg); + struct tcmsg *tc = ofpbuf_try_pull(&b, sizeof *tc); struct nlattr *ta[ARRAY_SIZE(tca_policy)]; const char *kind; - if (NLMSG_HDRLEN + sizeof *tc > reply->size) { + if (!nlmsg || !tc) { + COVERAGE_INC(tc_netlink_malformed_reply); return EPROTO; } memset(flower, 0, sizeof *flower); - tc = ofpbuf_at_assert(reply, NLMSG_HDRLEN, sizeof *tc); - flower->key.eth_type = (OVS_FORCE ovs_be16) tc_get_minor(tc->tcm_info); flower->mask.eth_type = OVS_BE16_MAX; id->prio = tc_get_major(tc->tcm_info); @@ -2168,8 +2295,7 @@ parse_netlink_to_tc_flower(struct ofpbuf *reply, struct tcf_id *id, return EAGAIN; } - if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof *tc, - tca_policy, ta, ARRAY_SIZE(ta))) { + if (!nl_policy_parse(&b, 0, tca_policy, ta, ARRAY_SIZE(ta))) { VLOG_ERR_RL(&error_rl, "failed to parse tca policy"); return EPROTO; } @@ -2190,13 +2316,17 @@ parse_netlink_to_tc_flower(struct ofpbuf *reply, struct tcf_id *id, int parse_netlink_to_tc_chain(struct ofpbuf *reply, uint32_t *chain) { + struct ofpbuf b = ofpbuf_const_initializer(reply->data, reply->size); + struct nlmsghdr *nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg); + struct tcmsg *tc 
= ofpbuf_try_pull(&b, sizeof *tc); struct nlattr *ta[ARRAY_SIZE(tca_chain_policy)]; - struct tcmsg *tc; - tc = ofpbuf_at_assert(reply, NLMSG_HDRLEN, sizeof *tc); + if (!nlmsg || !tc) { + COVERAGE_INC(tc_netlink_malformed_reply); + return EPROTO; + } - if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof *tc, - tca_chain_policy, ta, ARRAY_SIZE(ta))) { + if (!nl_policy_parse(&b, 0, tca_chain_policy, ta, ARRAY_SIZE(ta))) { VLOG_ERR_RL(&error_rl, "failed to parse tca chain policy"); return EINVAL; } @@ -2260,21 +2390,27 @@ int parse_netlink_to_tc_policer(struct ofpbuf *reply, uint32_t police_idx[]) { static struct nl_policy actions_orders_policy[TCA_ACT_MAX_PRIO] = {}; + struct ofpbuf b = ofpbuf_const_initializer(reply->data, reply->size); struct nlattr *actions_orders[ARRAY_SIZE(actions_orders_policy)]; + struct nlmsghdr *nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg); const int max_size = ARRAY_SIZE(actions_orders_policy); + struct tcamsg *tca = ofpbuf_try_pull(&b, sizeof *tca); const struct nlattr *actions; struct tc_flower flower; - struct tcamsg *tca; int i, cnt = 0; int err; + if (!nlmsg || !tca) { + COVERAGE_INC(tc_netlink_malformed_reply); + return EPROTO; + } + for (i = 0; i < max_size; i++) { actions_orders_policy[i].type = NL_A_NESTED; actions_orders_policy[i].optional = true; } - tca = ofpbuf_at_assert(reply, NLMSG_HDRLEN, sizeof *tca); - actions = nl_attr_find(reply, NLMSG_HDRLEN + sizeof *tca, TCA_ACT_TAB); + actions = nl_attr_find(&b, 0, TCA_ACT_TAB); if (!actions || !nl_parse_nested(actions, actions_orders_policy, actions_orders, max_size)) { VLOG_ERR_RL(&error_rl, @@ -2302,14 +2438,23 @@ parse_netlink_to_tc_policer(struct ofpbuf *reply, uint32_t police_idx[]) } int -tc_del_filter(struct tcf_id *id) +tc_del_filter(struct tcf_id *id, const char *kind) { struct ofpbuf request; request_from_tcf_id(id, 0, RTM_DELTFILTER, NLM_F_ACK, &request); + if (kind) { + nl_msg_put_string(&request, TCA_KIND, kind); + } return tc_transact(&request, NULL); } +int 
+tc_del_flower_filter(struct tcf_id *id) +{ + return tc_del_filter(id, "flower"); +} + int tc_get_flower(struct tcf_id *id, struct tc_flower *flower) { @@ -2318,6 +2463,7 @@ tc_get_flower(struct tcf_id *id, struct tc_flower *flower) int error; request_from_tcf_id(id, 0, RTM_GETTFILTER, NLM_F_ECHO, &request); + nl_msg_put_string(&request, TCA_KIND, "flower"); error = tc_transact(&request, &reply); if (error) { return error; @@ -2498,13 +2644,13 @@ nl_msg_put_act_tunnel_key_release(struct ofpbuf *request) static void nl_msg_put_act_tunnel_geneve_option(struct ofpbuf *request, - struct tun_metadata tun_metadata) + struct tun_metadata *tun_metadata) { const struct geneve_opt *opt; size_t outer, inner; int len, cnt = 0; - len = tun_metadata.present.len; + len = tun_metadata->present.len; if (!len) { return; } @@ -2512,7 +2658,7 @@ nl_msg_put_act_tunnel_geneve_option(struct ofpbuf *request, outer = nl_msg_start_nested(request, TCA_TUNNEL_KEY_ENC_OPTS); while (len) { - opt = &tun_metadata.opts.gnv[cnt]; + opt = &tun_metadata->opts.gnv[cnt]; inner = nl_msg_start_nested(request, TCA_TUNNEL_KEY_ENC_OPTS_GENEVE); nl_msg_put_be16(request, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS, @@ -2531,13 +2677,30 @@ nl_msg_put_act_tunnel_geneve_option(struct ofpbuf *request, } static void -nl_msg_put_act_tunnel_key_set(struct ofpbuf *request, bool id_present, - ovs_be64 id, ovs_be32 ipv4_src, - ovs_be32 ipv4_dst, struct in6_addr *ipv6_src, - struct in6_addr *ipv6_dst, - ovs_be16 tp_dst, uint8_t tos, uint8_t ttl, - struct tun_metadata tun_metadata, - uint8_t no_csum, uint32_t action_pc) +nl_msg_put_act_tunnel_vxlan_opts(struct ofpbuf *request, + struct tc_action_encap *encap) +{ + size_t outer, inner; + uint32_t gbp_raw; + + if (!encap->gbp.id_present) { + return; + } + + gbp_raw = odp_encode_gbp_raw(encap->gbp.flags, + encap->gbp.id); + outer = nl_msg_start_nested_with_flag(request, TCA_TUNNEL_KEY_ENC_OPTS); + inner = nl_msg_start_nested_with_flag(request, + TCA_TUNNEL_KEY_ENC_OPTS_VXLAN); + 
nl_msg_put_u32(request, TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP, gbp_raw); + nl_msg_end_nested(request, inner); + nl_msg_end_nested(request, outer); +} + +static void +nl_msg_put_act_tunnel_key_set(struct ofpbuf *request, + struct tc_action_encap *encap, + uint32_t action_pc) { size_t offset; @@ -2549,30 +2712,34 @@ nl_msg_put_act_tunnel_key_set(struct ofpbuf *request, bool id_present, nl_msg_put_unspec(request, TCA_TUNNEL_KEY_PARMS, &tun, sizeof tun); - ovs_be32 id32 = be64_to_be32(id); - if (id_present) { + ovs_be32 id32 = be64_to_be32(encap->id); + if (encap->id_present) { nl_msg_put_be32(request, TCA_TUNNEL_KEY_ENC_KEY_ID, id32); } - if (ipv4_dst) { - nl_msg_put_be32(request, TCA_TUNNEL_KEY_ENC_IPV4_SRC, ipv4_src); - nl_msg_put_be32(request, TCA_TUNNEL_KEY_ENC_IPV4_DST, ipv4_dst); - } else if (ipv6_addr_is_set(ipv6_dst)) { + if (encap->ipv4.ipv4_dst) { + nl_msg_put_be32(request, TCA_TUNNEL_KEY_ENC_IPV4_SRC, + encap->ipv4.ipv4_src); + nl_msg_put_be32(request, TCA_TUNNEL_KEY_ENC_IPV4_DST, + encap->ipv4.ipv4_dst); + } else if (ipv6_addr_is_set(&encap->ipv6.ipv6_dst)) { nl_msg_put_in6_addr(request, TCA_TUNNEL_KEY_ENC_IPV6_DST, - ipv6_dst); + &encap->ipv6.ipv6_dst); nl_msg_put_in6_addr(request, TCA_TUNNEL_KEY_ENC_IPV6_SRC, - ipv6_src); + &encap->ipv6.ipv6_src); } - if (tos) { - nl_msg_put_u8(request, TCA_TUNNEL_KEY_ENC_TOS, tos); + if (encap->tos) { + nl_msg_put_u8(request, TCA_TUNNEL_KEY_ENC_TOS, encap->tos); } - if (ttl) { - nl_msg_put_u8(request, TCA_TUNNEL_KEY_ENC_TTL, ttl); + if (encap->ttl) { + nl_msg_put_u8(request, TCA_TUNNEL_KEY_ENC_TTL, encap->ttl); } - if (tp_dst) { - nl_msg_put_be16(request, TCA_TUNNEL_KEY_ENC_DST_PORT, tp_dst); + if (encap->tp_dst) { + nl_msg_put_be16(request, TCA_TUNNEL_KEY_ENC_DST_PORT, + encap->tp_dst); } - nl_msg_put_act_tunnel_geneve_option(request, tun_metadata); - nl_msg_put_u8(request, TCA_TUNNEL_KEY_NO_CSUM, no_csum); + nl_msg_put_act_tunnel_vxlan_opts(request, encap); + nl_msg_put_act_tunnel_geneve_option(request, &encap->data); + 
nl_msg_put_u8(request, TCA_TUNNEL_KEY_NO_CSUM, encap->no_csum); } nl_msg_end_nested(request, offset); } @@ -2806,11 +2973,18 @@ csum_update_flag(struct tc_flower *flower, } else if (flower->key.ip_proto == IPPROTO_UDP) { flower->needs_full_ip_proto_mask = true; flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_UDP; - } else if (flower->key.ip_proto == IPPROTO_ICMP) { + } else if (flower->key.ip_proto == IPPROTO_ICMP || + flower->key.ip_proto == IPPROTO_IGMP || + flower->key.ip_proto == IPPROTO_SCTP || + flower->key.ip_proto == IPPROTO_IPIP || + flower->key.ip_proto == IPPROTO_GRE) { flower->needs_full_ip_proto_mask = true; } else if (flower->key.ip_proto == IPPROTO_ICMPV6) { flower->needs_full_ip_proto_mask = true; flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_ICMP; + } else if (flower->key.ip_proto == IPPROTO_UDPLITE) { + flower->needs_full_ip_proto_mask = true; + flower->csum_update_flags |= TCA_CSUM_UPDATE_FLAG_UDPLITE; } else { VLOG_WARN_RL(&error_rl, "can't offload rewrite of IP/IPV6 with ip_proto: %d", @@ -2882,17 +3056,17 @@ nl_msg_put_flower_rewrite_pedits(struct ofpbuf *request, struct tc_action *action, uint32_t action_pc) { - struct { + union { struct tc_pedit sel; - struct tc_pedit_key keys[MAX_PEDIT_OFFSETS]; - struct tc_pedit_key_ex keys_ex[MAX_PEDIT_OFFSETS]; - } sel = { - .sel = { - .nkeys = 0 - } - }; + uint8_t buffer[sizeof(struct tc_pedit) + + MAX_PEDIT_OFFSETS * sizeof(struct tc_pedit_key)]; + } sel; + struct tc_pedit_key_ex keys_ex[MAX_PEDIT_OFFSETS]; int i, j, err; + memset(&sel, 0, sizeof sel); + memset(keys_ex, 0, sizeof keys_ex); + for (i = 0; i < ARRAY_SIZE(flower_pedit_map); i++) { struct flower_key_to_pedit *m = &flower_pedit_map[i]; struct tc_pedit_key *pedit_key = NULL; @@ -2926,8 +3100,8 @@ nl_msg_put_flower_rewrite_pedits(struct ofpbuf *request, return EOPNOTSUPP; } - pedit_key = &sel.keys[sel.sel.nkeys]; - pedit_key_ex = &sel.keys_ex[sel.sel.nkeys]; + pedit_key = &sel.sel.keys[sel.sel.nkeys]; + pedit_key_ex = 
&keys_ex[sel.sel.nkeys]; pedit_key_ex->cmd = TCA_PEDIT_KEY_EX_CMD_SET; pedit_key_ex->htype = m->htype; pedit_key->off = cur_offset; @@ -2947,7 +3121,7 @@ nl_msg_put_flower_rewrite_pedits(struct ofpbuf *request, } } } - nl_msg_put_act_pedit(request, &sel.sel, sel.keys_ex, + nl_msg_put_act_pedit(request, &sel.sel, keys_ex, flower->csum_update_flags ? TC_ACT_PIPE : action_pc); return 0; @@ -3126,7 +3300,11 @@ nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower) uint32_t action_pc; /* Programmatic Control */ if (!action->jump_action) { - action_pc = TC_ACT_PIPE; + if (i == flower->action_count - 1) { + action_pc = TC_ACT_SHOT; + } else { + action_pc = TC_ACT_PIPE; + } } else if (action->jump_action == JUMP_ACTION_STOP) { action_pc = TC_ACT_STOLEN; } else { @@ -3191,17 +3369,7 @@ nl_msg_put_flower_acts(struct ofpbuf *request, struct tc_flower *flower) } act_offset = nl_msg_start_nested(request, act_index++); - nl_msg_put_act_tunnel_key_set(request, action->encap.id_present, - action->encap.id, - action->encap.ipv4.ipv4_src, - action->encap.ipv4.ipv4_dst, - &action->encap.ipv6.ipv6_src, - &action->encap.ipv6.ipv6_dst, - action->encap.tp_dst, - action->encap.tos, - action->encap.ttl, - action->encap.data, - action->encap.no_csum, + nl_msg_put_act_tunnel_key_set(request, &action->encap, action_pc); nl_msg_put_act_flags(request); nl_msg_end_nested(request, act_offset); @@ -3371,22 +3539,18 @@ nl_msg_put_masked_value(struct ofpbuf *request, uint16_t type, } static void -nl_msg_put_flower_tunnel_opts(struct ofpbuf *request, uint16_t type, - struct tun_metadata metadata) +nl_msg_put_flower_geneve(struct ofpbuf *request, + const struct tc_flower_tunnel *tunnel) { - struct geneve_opt *opt; - size_t outer, inner; + const struct tun_metadata *metadata = &tunnel->metadata; + const struct geneve_opt *opt; int len, cnt = 0; + size_t offset; - len = metadata.present.len; - if (!len) { - return; - } - - outer = nl_msg_start_nested(request, type); + len = 
metadata->present.len; while (len) { - opt = &metadata.opts.gnv[cnt]; - inner = nl_msg_start_nested(request, TCA_FLOWER_KEY_ENC_OPTS_GENEVE); + opt = &metadata->opts.gnv[cnt]; + offset = nl_msg_start_nested(request, TCA_FLOWER_KEY_ENC_OPTS_GENEVE); nl_msg_put_be16(request, TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS, opt->opt_class); @@ -3397,8 +3561,41 @@ nl_msg_put_flower_tunnel_opts(struct ofpbuf *request, uint16_t type, cnt += sizeof(struct geneve_opt) / 4 + opt->length; len -= sizeof(struct geneve_opt) + opt->length * 4; - nl_msg_end_nested(request, inner); + nl_msg_end_nested(request, offset); + } +} + +static void +nl_msg_put_flower_vxlan_tun_opts(struct ofpbuf *request, + const struct tc_flower_tunnel *tunnel) +{ + uint32_t gbp_raw; + size_t offset; + + if (!tunnel->gbp.id_present) { + return; + } + + gbp_raw = odp_encode_gbp_raw(tunnel->gbp.flags, tunnel->gbp.id); + offset = nl_msg_start_nested_with_flag(request, + TCA_FLOWER_KEY_ENC_OPTS_VXLAN); + nl_msg_put_u32(request, TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP, gbp_raw); + nl_msg_end_nested(request, offset); +} + +static void +nl_msg_put_flower_tunnel_opts(struct ofpbuf *request, uint16_t type, + struct tc_flower_tunnel *tunnel) +{ + size_t outer; + + if (!tunnel->metadata.present.len && !tunnel->gbp.id_present) { + return; } + + outer = nl_msg_start_nested(request, type); + nl_msg_put_flower_geneve(request, tunnel); + nl_msg_put_flower_vxlan_tun_opts(request, tunnel); nl_msg_end_nested(request, outer); } @@ -3462,9 +3659,9 @@ nl_msg_put_flower_tunnel(struct ofpbuf *request, struct tc_flower *flower) nl_msg_put_be32(request, TCA_FLOWER_KEY_ENC_KEY_ID, id); } nl_msg_put_flower_tunnel_opts(request, TCA_FLOWER_KEY_ENC_OPTS, - flower->key.tunnel.metadata); + &flower->key.tunnel); nl_msg_put_flower_tunnel_opts(request, TCA_FLOWER_KEY_ENC_OPTS_MASK, - flower->mask.tunnel.metadata); + &flower->mask.tunnel); } #define FLOWER_PUT_MASKED_VALUE(member, type) \ @@ -3661,15 +3858,13 @@ log_tc_flower_match(const char *msg, 
ds_put_cstr(&s, "\nExpected Actions:\n"); for (i = 0, action = a->actions; i < a->action_count; i++, action++) { - ds_put_cstr(&s, " - "); - ds_put_hex(&s, action, sizeof *action); - ds_put_cstr(&s, "\n"); + ds_put_format(&s, " - %d -\n", i); + ds_put_sparse_hex_dump(&s, action, sizeof *action, 0, false); } - ds_put_cstr(&s, "Received Actions:\n"); + ds_put_cstr(&s, "\nReceived Actions:\n"); for (i = 0, action = b->actions; i < b->action_count; i++, action++) { - ds_put_cstr(&s, " - "); - ds_put_hex(&s, action, sizeof *action); - ds_put_cstr(&s, "\n"); + ds_put_format(&s, " - %d -\n", i); + ds_put_sparse_hex_dump(&s, action, sizeof *action, 0, false); } } else { /* Only dump the delta in actions. */ @@ -3678,12 +3873,13 @@ log_tc_flower_match(const char *msg, for (int i = 0; i < a->action_count; i++, action_a++, action_b++) { if (memcmp(action_a, action_b, sizeof *action_a)) { - ds_put_format(&s, - "\nAction %d mismatch:\n - Expected Action: ", - i); - ds_put_hex(&s, action_a, sizeof *action_a); - ds_put_cstr(&s, "\n - Received Action: "); - ds_put_hex(&s, action_b, sizeof *action_b); + ds_put_format(&s, "\nAction %d mismatch:\n" + " - Expected Action:\n", i); + ds_put_sparse_hex_dump(&s, action_a, sizeof *action_a, + 0, false); + ds_put_cstr(&s, " - Received Action:\n"); + ds_put_sparse_hex_dump(&s, action_b, sizeof *action_b, + 0, false); } } } @@ -3762,8 +3958,15 @@ tc_replace_flower(struct tcf_id *id, struct tc_flower *flower) error = tc_transact(&request, &reply); if (!error) { - struct tcmsg *tc = - ofpbuf_at_assert(reply, NLMSG_HDRLEN, sizeof *tc); + struct ofpbuf b = ofpbuf_const_initializer(reply->data, reply->size); + struct nlmsghdr *nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg); + struct tcmsg *tc = ofpbuf_try_pull(&b, sizeof *tc); + + if (!nlmsg || !tc) { + COVERAGE_INC(tc_netlink_malformed_reply); + ofpbuf_delete(reply); + return EPROTO; + } id->prio = tc_get_major(tc->tcm_info); id->handle = tc->tcm_handle; @@ -3808,3 +4011,24 @@ tc_set_policy(const 
char *policy) VLOG_INFO("tc: Using policy '%s'", policy); } + +void +nl_msg_put_act_tc_policy_flag(struct ofpbuf *request) +{ + int flag = 0; + + if (!request) { + return; + } + + if (tc_policy == TC_POLICY_SKIP_HW) { + flag = TCA_ACT_FLAGS_SKIP_HW; + } else if (tc_policy == TC_POLICY_SKIP_SW) { + flag = TCA_ACT_FLAGS_SKIP_SW; + } + + if (flag) { + struct nla_bitfield32 flags = { flag, flag }; + nl_msg_put_unspec(request, TCA_ACT_FLAGS, &flags, sizeof flags); + } +} diff --git a/lib/tc.h b/lib/tc.h index 2e64ad37259..8442c8d8b8c 100644 --- a/lib/tc.h +++ b/lib/tc.h @@ -49,6 +49,9 @@ enum tc_flower_reserved_prio { TC_RESERVED_PRIORITY_NONE, TC_RESERVED_PRIORITY_POLICE, + TC_RESERVED_PRIORITY_IPV4, + TC_RESERVED_PRIORITY_IPV6, + TC_RESERVED_PRIORITY_VLAN, __TC_RESERVED_PRIORITY_MAX }; #define TC_RESERVED_PRIORITY_MAX (__TC_RESERVED_PRIORITY_MAX -1) @@ -103,6 +106,30 @@ struct tc_cookie { size_t len; }; +struct tc_tunnel_gbp { + ovs_be16 id; + uint8_t flags; + bool id_present; +}; + +struct tc_flower_tunnel { + struct { + ovs_be32 ipv4_src; + ovs_be32 ipv4_dst; + } ipv4; + struct { + struct in6_addr ipv6_src; + struct in6_addr ipv6_dst; + } ipv6; + uint8_t tos; + uint8_t ttl; + ovs_be16 tp_src; + ovs_be16 tp_dst; + struct tc_tunnel_gbp gbp; + ovs_be64 id; + struct tun_metadata metadata; +}; + struct tc_flower_key { ovs_be16 eth_type; uint8_t ip_proto; @@ -159,22 +186,7 @@ struct tc_flower_key { uint8_t rewrite_tclass; } ipv6; - struct { - struct { - ovs_be32 ipv4_src; - ovs_be32 ipv4_dst; - } ipv4; - struct { - struct in6_addr ipv6_src; - struct in6_addr ipv6_dst; - } ipv6; - uint8_t tos; - uint8_t ttl; - ovs_be16 tp_src; - ovs_be16 tp_dst; - ovs_be64 id; - struct tun_metadata metadata; - } tunnel; + struct tc_flower_tunnel tunnel; }; enum tc_action_type { @@ -199,6 +211,27 @@ enum nat_type { TC_NAT_RESTORE, }; +struct tc_action_encap { + bool id_present; + ovs_be64 id; + /* ovs_be16 tp_src; Could have been here, but there is no + * TCA_TUNNEL_KEY_ENC_ attribute for 
it in the kernel. */ + ovs_be16 tp_dst; + uint8_t tos; + uint8_t ttl; + uint8_t no_csum; + struct { + ovs_be32 ipv4_src; + ovs_be32 ipv4_dst; + } ipv4; + struct { + struct in6_addr ipv6_src; + struct in6_addr ipv6_dst; + } ipv6; + struct tun_metadata data; + struct tc_tunnel_gbp gbp; +}; + struct tc_action { union { int chain; @@ -222,24 +255,7 @@ struct tc_action { uint8_t bos; } mpls; - struct { - bool id_present; - ovs_be64 id; - ovs_be16 tp_src; - ovs_be16 tp_dst; - uint8_t tos; - uint8_t ttl; - uint8_t no_csum; - struct { - ovs_be32 ipv4_src; - ovs_be32 ipv4_dst; - } ipv4; - struct { - struct in6_addr ipv6_src; - struct in6_addr ipv6_dst; - } ipv6; - struct tun_metadata data; - } encap; + struct tc_action_encap encap; struct { uint16_t zone; @@ -341,7 +357,6 @@ static inline bool is_tcf_id_eq(struct tcf_id *id1, struct tcf_id *id2) { return id1->prio == id2->prio - && id1->handle == id2->handle && id1->handle == id2->handle && id1->hook == id2->hook && id1->block_id == id2->block_id @@ -382,7 +397,8 @@ struct tc_flower { }; int tc_replace_flower(struct tcf_id *id, struct tc_flower *flower); -int tc_del_filter(struct tcf_id *id); +int tc_del_filter(struct tcf_id *id, const char *kind); +int tc_del_flower_filter(struct tcf_id *id); int tc_get_flower(struct tcf_id *id, struct tc_flower *flower); int tc_dump_flower_start(struct tcf_id *id, struct nl_dump *dump, bool terse); int tc_dump_tc_chain_start(struct tcf_id *id, struct nl_dump *dump); @@ -399,4 +415,6 @@ int tc_parse_action_stats(struct nlattr *action, int tc_dump_tc_action_start(char *name, struct nl_dump *dump); int parse_netlink_to_tc_policer(struct ofpbuf *reply, uint32_t police_idx[]); +void nl_msg_put_act_tc_policy_flag(struct ofpbuf *request); + #endif /* tc.h */ diff --git a/lib/timeval.c b/lib/timeval.c index 193c7bab178..10c1b9ca15a 100644 --- a/lib/timeval.c +++ b/lib/timeval.c @@ -41,6 +41,8 @@ VLOG_DEFINE_THIS_MODULE(timeval); +COVERAGE_DEFINE(long_poll_interval); + #if 
!defined(HAVE_CLOCK_GETTIME) typedef unsigned int clockid_t; static int clock_gettime(clock_t id, struct timespec *ts); @@ -644,6 +646,8 @@ log_poll_interval(long long int last_wakeup) const struct rusage *last_rusage = get_recent_rusage(); struct rusage rusage; + COVERAGE_INC(long_poll_interval); + if (!getrusage_thread(&rusage)) { VLOG_WARN("Unreasonably long %lldms poll interval" " (%lldms user, %lldms system)", @@ -763,17 +767,22 @@ get_cpu_usage(void) /* "time/stop" stops the monotonic time returned by e.g. time_msec() from * advancing, except due to later calls to "time/warp". */ -static void -timeval_stop_cb(struct unixctl_conn *conn, - int argc OVS_UNUSED, const char *argv[] OVS_UNUSED, - void *aux OVS_UNUSED) +void +timeval_stop(void) { ovs_mutex_lock(&monotonic_clock.mutex); atomic_store_relaxed(&monotonic_clock.slow_path, true); monotonic_clock.stopped = true; xclock_gettime(monotonic_clock.id, &monotonic_clock.cache); ovs_mutex_unlock(&monotonic_clock.mutex); +} +static void +timeval_stop_cb(struct unixctl_conn *conn, + int argc OVS_UNUSED, const char *argv[] OVS_UNUSED, + void *aux OVS_UNUSED) +{ + timeval_stop(); unixctl_command_reply(conn, NULL); } @@ -814,6 +823,21 @@ timeval_warp_cb(struct unixctl_conn *conn, timewarp_work(); } +/* Direct monotonic clock into slow path and advance the current monotonic + * time by 'msecs' milliseconds directly. This is for use in unit tests. 
*/ +void +timeval_warp(long long int msecs) +{ + struct clock *c = &monotonic_clock; + struct timespec warp; + + ovs_mutex_lock(&monotonic_clock.mutex); + atomic_store_relaxed(&monotonic_clock.slow_path, true); + msec_to_timespec(msecs, &warp); + timespec_add(&c->warp, &c->warp, &warp); + ovs_mutex_unlock(&monotonic_clock.mutex); +} + void timeval_dummy_register(void) { diff --git a/lib/timeval.h b/lib/timeval.h index 502f703d4c2..1c40530e27e 100644 --- a/lib/timeval.h +++ b/lib/timeval.h @@ -81,6 +81,9 @@ long long int time_boot_msec(void); void timewarp_run(void); +void timeval_stop(void); +void timeval_warp(long long int msecs); + #ifdef __cplusplus } #endif diff --git a/lib/tnl-ports.c b/lib/tnl-ports.c index 050eafa6b8c..bb0b0b0c55f 100644 --- a/lib/tnl-ports.c +++ b/lib/tnl-ports.c @@ -112,7 +112,7 @@ map_insert(odp_port_t port, struct eth_addr mac, struct in6_addr *addr, tnl_port_init_flow(&match.flow, mac, addr, nw_proto, tp_port); do { - cr = classifier_lookup(&cls, OVS_VERSION_MAX, &match.flow, NULL); + cr = classifier_lookup(&cls, OVS_VERSION_MAX, &match.flow, NULL, NULL); p = tnl_port_cast(cr); /* Try again if the rule was released before we get the reference. */ } while (p && !ovs_refcount_try_ref_rcu(&p->ref_cnt)); @@ -126,7 +126,7 @@ map_insert(odp_port_t port, struct eth_addr mac, struct in6_addr *addr, /* XXX: No fragments support. */ match.wc.masks.nw_frag = FLOW_NW_FRAG_MASK; - /* 'tp_port' is zero for GRE tunnels. In this case it + /* 'tp_port' is zero for GRE and SRv6 tunnels. In this case it * doesn't make sense to match on UDP port numbers. 
*/ if (tp_port) { match.wc.masks.tp_dst = OVS_BE16_MAX; @@ -161,40 +161,31 @@ map_insert_ipdev__(struct ip_device *ip_dev, char dev_name[], } } -static uint8_t -tnl_type_to_nw_proto(const char type[]) +static void +tnl_type_to_nw_proto(const char type[], uint8_t nw_protos[2]) { - if (!strcmp(type, "geneve")) { - return IPPROTO_UDP; - } - if (!strcmp(type, "stt")) { - return IPPROTO_TCP; - } - if (!strcmp(type, "gre") || !strcmp(type, "erspan") || - !strcmp(type, "ip6erspan") || !strcmp(type, "ip6gre")) { - return IPPROTO_GRE; - } - if (!strcmp(type, "vxlan")) { - return IPPROTO_UDP; - } - if (!strcmp(type, "gtpu")) { - return IPPROTO_UDP; + nw_protos[0] = nw_protos[1] = 0; + + if (!strcmp(type, "geneve") || !strcmp(type, "vxlan") || + !strcmp(type, "gtpu")) { + nw_protos[0] = IPPROTO_UDP; + } else if (!strcmp(type, "stt")) { + nw_protos[0] = IPPROTO_TCP; + } else if (!strcmp(type, "gre") || !strcmp(type, "erspan") || + !strcmp(type, "ip6erspan") || !strcmp(type, "ip6gre")) { + nw_protos[0] = IPPROTO_GRE; + } else if (!strcmp(type, "srv6")) { + nw_protos[0] = IPPROTO_IPIP; + nw_protos[1] = IPPROTO_IPV6; } - return 0; } -void -tnl_port_map_insert(odp_port_t port, ovs_be16 tp_port, - const char dev_name[], const char type[]) +static void +tnl_port_map_insert__(odp_port_t port, ovs_be16 tp_port, + const char dev_name[], uint8_t nw_proto) { struct tnl_port *p; struct ip_device *ip_dev; - uint8_t nw_proto; - - nw_proto = tnl_type_to_nw_proto(type); - if (!nw_proto) { - return; - } ovs_mutex_lock(&mutex); LIST_FOR_EACH(p, node, &port_list) { @@ -220,6 +211,22 @@ tnl_port_map_insert(odp_port_t port, ovs_be16 tp_port, ovs_mutex_unlock(&mutex); } +void +tnl_port_map_insert(odp_port_t port, ovs_be16 tp_port, + const char dev_name[], const char type[]) +{ + uint8_t nw_protos[2]; + int i; + + tnl_type_to_nw_proto(type, nw_protos); + + for (i = 0; i < 2; i++) { + if (nw_protos[i]) { + tnl_port_map_insert__(port, tp_port, dev_name, nw_protos[i]); + } + } +} + static void 
tnl_port_unref(const struct cls_rule *cr) { @@ -240,7 +247,7 @@ map_delete(struct eth_addr mac, struct in6_addr *addr, tnl_port_init_flow(&flow, mac, addr, nw_proto, tp_port); - cr = classifier_lookup(&cls, OVS_VERSION_MAX, &flow, NULL); + cr = classifier_lookup(&cls, OVS_VERSION_MAX, &flow, NULL, NULL); tnl_port_unref(cr); } @@ -256,14 +263,11 @@ ipdev_map_delete(struct ip_device *ip_dev, ovs_be16 tp_port, uint8_t nw_proto) } } -void -tnl_port_map_delete(odp_port_t port, const char type[]) +static void +tnl_port_map_delete__(odp_port_t port, uint8_t nw_proto) { struct tnl_port *p; struct ip_device *ip_dev; - uint8_t nw_proto; - - nw_proto = tnl_type_to_nw_proto(type); ovs_mutex_lock(&mutex); LIST_FOR_EACH_SAFE (p, node, &port_list) { @@ -280,13 +284,28 @@ tnl_port_map_delete(odp_port_t port, const char type[]) ovs_mutex_unlock(&mutex); } +void +tnl_port_map_delete(odp_port_t port, const char type[]) +{ + uint8_t nw_protos[2]; + int i; + + tnl_type_to_nw_proto(type, nw_protos); + + for (i = 0; i < 2; i++) { + if (nw_protos[i]) { + tnl_port_map_delete__(port, nw_protos[i]); + } + } +} + /* 'flow' is non-const to allow for temporary modifications during the lookup. * Any changes are restored before returning. */ odp_port_t tnl_port_map_lookup(struct flow *flow, struct flow_wildcards *wc) { const struct cls_rule *cr = classifier_lookup(&cls, OVS_VERSION_MAX, flow, - wc); + wc, NULL); return (cr) ? 
tnl_port_cast(cr)->portno : ODPP_NONE; } diff --git a/lib/unaligned.h b/lib/unaligned.h index f40e4e10df6..15334e3c764 100644 --- a/lib/unaligned.h +++ b/lib/unaligned.h @@ -95,7 +95,7 @@ GCC_UNALIGNED_ACCESSORS(ovs_be64, be64); static inline uint16_t get_unaligned_u16(const uint16_t *p_) { const uint8_t *p = (const uint8_t *) p_; - return ntohs((p[0] << 8) | p[1]); + return ntohs(((uint16_t) p[0] << 8) | (uint16_t) p[1]); } static inline void put_unaligned_u16(uint16_t *p_, uint16_t x_) @@ -110,7 +110,8 @@ static inline void put_unaligned_u16(uint16_t *p_, uint16_t x_) static inline uint32_t get_unaligned_u32(const uint32_t *p_) { const uint8_t *p = (const uint8_t *) p_; - return ntohl((p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]); + return ntohl(((uint32_t) p[0] << 24) | ((uint32_t) p[1] << 16) | + ((uint32_t) p[2] << 8) | (uint32_t) p[3]); } static inline void put_unaligned_u32(uint32_t *p_, uint32_t x_) @@ -131,10 +132,10 @@ static inline uint64_t get_unaligned_u64__(const uint64_t *p_) | ((uint64_t) p[1] << 48) | ((uint64_t) p[2] << 40) | ((uint64_t) p[3] << 32) - | (p[4] << 24) - | (p[5] << 16) - | (p[6] << 8) - | p[7]); + | ((uint64_t) p[4] << 24) + | ((uint64_t) p[5] << 16) + | ((uint64_t) p[6] << 8) + | (uint64_t) p[7]); } static inline void put_unaligned_u64__(uint64_t *p_, uint64_t x_) diff --git a/lib/unixctl.c b/lib/unixctl.c index 103357ee91b..c060e86597d 100644 --- a/lib/unixctl.c +++ b/lib/unixctl.c @@ -17,7 +17,9 @@ #include #include "unixctl.h" #include +#include #include +#include "command-line.h" #include "coverage.h" #include "dirs.h" #include "openvswitch/dynamic-string.h" @@ -50,6 +52,8 @@ struct unixctl_conn { /* Only one request can be in progress at a time. While the request is * being processed, 'request_id' is populated, otherwise it is null. */ struct json *request_id; /* ID of the currently active request. */ + + enum unixctl_output_fmt fmt; /* Output format of current connection. */ }; /* Server for control connection. 
*/ @@ -63,28 +67,68 @@ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); static struct shash commands = SHASH_INITIALIZER(&commands); +const char * +unixctl_output_fmt_to_string(enum unixctl_output_fmt fmt) +{ + switch (fmt) { + case UNIXCTL_OUTPUT_FMT_TEXT: return "text"; + case UNIXCTL_OUTPUT_FMT_JSON: return "json"; + default: return ""; + } +} + +bool +unixctl_output_fmt_from_string(const char *string, + enum unixctl_output_fmt *fmt) +{ + if (!strcasecmp(string, "text")) { + *fmt = UNIXCTL_OUTPUT_FMT_TEXT; + } else if (!strcasecmp(string, "json")) { + *fmt = UNIXCTL_OUTPUT_FMT_JSON; + } else { + return false; + } + return true; +} + static void unixctl_list_commands(struct unixctl_conn *conn, int argc OVS_UNUSED, const char *argv[] OVS_UNUSED, void *aux OVS_UNUSED) { - struct ds ds = DS_EMPTY_INITIALIZER; - const struct shash_node **nodes = shash_sort(&commands); - size_t i; + if (unixctl_command_get_output_format(conn) == UNIXCTL_OUTPUT_FMT_JSON) { + struct json *json_commands = json_object_create(); + const struct shash_node *node; + + SHASH_FOR_EACH (node, &commands) { + const struct unixctl_command *command = node->data; + + if (command->usage) { + json_object_put_string(json_commands, node->name, + command->usage); + } + } + unixctl_command_reply_json(conn, json_commands); + } else { + struct ds ds = DS_EMPTY_INITIALIZER; + const struct shash_node **nodes = shash_sort(&commands); + size_t i; - ds_put_cstr(&ds, "The available commands are:\n"); + ds_put_cstr(&ds, "The available commands are:\n"); - for (i = 0; i < shash_count(&commands); i++) { - const struct shash_node *node = nodes[i]; - const struct unixctl_command *command = node->data; + for (i = 0; i < shash_count(&commands); ++i) { + const struct shash_node *node = nodes[i]; + const struct unixctl_command *command = node->data; - if (command->usage) { - ds_put_format(&ds, " %-23s %s\n", node->name, command->usage); + if (command->usage) { + ds_put_format(&ds, " %-23s %s\n", node->name, + 
command->usage); + } } - } - free(nodes); + free(nodes); - unixctl_command_reply(conn, ds_cstr(&ds)); - ds_destroy(&ds); + unixctl_command_reply(conn, ds_cstr(&ds)); + ds_destroy(&ds); + } } static void @@ -94,6 +138,52 @@ unixctl_version(struct unixctl_conn *conn, int argc OVS_UNUSED, unixctl_command_reply(conn, ovs_get_program_version()); } +static void +unixctl_set_options(struct unixctl_conn *conn, int argc, const char *argv[], + void *aux OVS_UNUSED) +{ + struct ovs_cmdl_parsed_option *parsed_options = NULL; + size_t n_parsed_options; + char *error = NULL; + + static const struct option options[] = { + {"format", required_argument, NULL, 'f'}, + {NULL, 0, NULL, 0}, + }; + + error = ovs_cmdl_parse_all(argc--, (char **) (argv++), options, + &parsed_options, &n_parsed_options); + if (error) { + goto error; + } + + for (size_t i = 0; i < n_parsed_options; i++) { + struct ovs_cmdl_parsed_option *parsed_option = &parsed_options[i]; + + switch (parsed_option->o->val) { + case 'f': + if (!unixctl_output_fmt_from_string(parsed_option->arg, + &conn->fmt)) { + error = xasprintf("option format has invalid value %s", + parsed_option->arg); + goto error; + } + break; + + default: + OVS_NOT_REACHED(); + } + } + + unixctl_command_reply(conn, NULL); + free(parsed_options); + return; +error: + unixctl_command_reply_error(conn, error); + free(error); + free(parsed_options); +} + /* Registers a unixctl command with the given 'name'. 'usage' describes the * arguments to the command; it is used only for presentation to the user in * "list-commands" output. (If 'usage' is NULL, then the command is hidden.) @@ -128,36 +218,35 @@ unixctl_command_register(const char *name, const char *usage, shash_add(&commands, name, command); } +enum unixctl_output_fmt +unixctl_command_get_output_format(struct unixctl_conn *conn) +{ + return conn->fmt; +} + +/* Takes ownership of the 'body'. 
*/ static void unixctl_command_reply__(struct unixctl_conn *conn, - bool success, const char *body) + bool success, struct json *body) { - struct json *body_json; struct jsonrpc_msg *reply; COVERAGE_INC(unixctl_replied); ovs_assert(conn->request_id); - if (!body) { - body = ""; - } - - if (body[0] && body[strlen(body) - 1] != '\n') { - body_json = json_string_create_nocopy(xasprintf("%s\n", body)); - } else { - body_json = json_string_create(body); - } - if (success) { - reply = jsonrpc_create_reply(body_json, conn->request_id); + reply = jsonrpc_create_reply(body, conn->request_id); } else { - reply = jsonrpc_create_error(body_json, conn->request_id); + reply = jsonrpc_create_error(body, conn->request_id); } if (VLOG_IS_DBG_ENABLED()) { char *id = json_to_string(conn->request_id, 0); + char *msg = json_to_string(body, JSSF_SORT); + VLOG_DBG("replying with %s, id=%s: \"%s\"", - success ? "success" : "error", id, body); + success ? "success" : "error", id, msg); + free(msg); free(id); } @@ -169,23 +258,52 @@ unixctl_command_reply__(struct unixctl_conn *conn, } /* Replies to the active unixctl connection 'conn'. 'result' is sent to the - * client indicating the command was processed successfully. Only one call to - * unixctl_command_reply() or unixctl_command_reply_error() may be made per - * request. */ + * client indicating the command was processed successfully. 'result' should + * be plain-text; use unixctl_command_reply_json() to return a JSON document + * when JSON output has been requested. Only one call to + * unixctl_command_reply*() functions may be made per request. */ void unixctl_command_reply(struct unixctl_conn *conn, const char *result) { - unixctl_command_reply__(conn, true, result); + struct json *json_result = json_string_create(result ? result : ""); + + if (conn->fmt == UNIXCTL_OUTPUT_FMT_JSON) { + /* Wrap plain-text reply in provisional JSON document when JSON output + * has been requested. 
*/ + struct json *json_reply = json_object_create(); + + json_object_put_string(json_reply, "reply-format", "plain"); + json_object_put(json_reply, "reply", json_result); + + json_result = json_reply; + } + + unixctl_command_reply__(conn, true, json_result); +} + +/* Replies to the active unixctl connection 'conn'. 'body' is sent to the + * client indicating the command was processed successfully. Use this function + * when JSON output has been requested; otherwise use unixctl_command_reply() + * for plain-text output. Only one call to unixctl_command_reply*() functions + * may be made per request. + * + * Takes ownership of the 'body'. */ +void +unixctl_command_reply_json(struct unixctl_conn *conn, struct json *body) +{ + ovs_assert(conn->fmt == UNIXCTL_OUTPUT_FMT_JSON); + unixctl_command_reply__(conn, true, body); } /* Replies to the active unixctl connection 'conn'. 'error' is sent to the - * client indicating an error occurred processing the command. Only one call to - * unixctl_command_reply() or unixctl_command_reply_error() may be made per - * request. */ + * client indicating an error occurred processing the command. 'error' should + * be plain-text. Only one call to unixctl_command_reply*() functions may be + * made per request. */ void unixctl_command_reply_error(struct unixctl_conn *conn, const char *error) { - unixctl_command_reply__(conn, false, error); + unixctl_command_reply__(conn, false, + json_string_create(error ? 
error : "")); } /* Creates a unixctl server listening on 'path', which for POSIX may be: @@ -250,6 +368,8 @@ unixctl_server_create(const char *path, struct unixctl_server **serverp) unixctl_command_register("list-commands", "", 0, 0, unixctl_list_commands, NULL); unixctl_command_register("version", "", 0, 0, unixctl_version, NULL); + unixctl_command_register("set-options", "[--format text|json]", 1, 2, + unixctl_set_options, NULL); struct unixctl_server *server = xmalloc(sizeof *server); server->listener = listener; @@ -381,6 +501,7 @@ unixctl_server_run(struct unixctl_server *server) struct unixctl_conn *conn = xzalloc(sizeof *conn); ovs_list_push_back(&server->conns, &conn->node); conn->rpc = jsonrpc_open(stream); + conn->fmt = UNIXCTL_OUTPUT_FMT_TEXT; } else if (error == EAGAIN) { break; } else { @@ -483,7 +604,7 @@ unixctl_client_create(const char *path, struct jsonrpc **client) * '*err' if not NULL. */ int unixctl_client_transact(struct jsonrpc *client, const char *command, int argc, - char *argv[], char **result, char **err) + char *argv[], struct json **result, struct json **err) { struct jsonrpc_msg *request, *reply; struct json **json_args, *params; @@ -506,24 +627,15 @@ unixctl_client_transact(struct jsonrpc *client, const char *command, int argc, return error; } - if (reply->error) { - if (reply->error->type == JSON_STRING) { - *err = xstrdup(json_string(reply->error)); - } else { - VLOG_WARN("%s: unexpected error type in JSON RPC reply: %s", - jsonrpc_get_name(client), - json_type_to_string(reply->error->type)); - error = EINVAL; - } - } else if (reply->result) { - if (reply->result->type == JSON_STRING) { - *result = xstrdup(json_string(reply->result)); - } else { - VLOG_WARN("%s: unexpected result type in JSON rpc reply: %s", - jsonrpc_get_name(client), - json_type_to_string(reply->result->type)); - error = EINVAL; - } + if (reply->result && reply->error) { + VLOG_WARN("unexpected response when communicating with %s: %s\n %s", + 
jsonrpc_get_name(client), + json_to_string(reply->result, JSSF_SORT), + json_to_string(reply->error, JSSF_SORT)); + error = EINVAL; + } else { + *result = json_nullable_clone(reply->result); + *err = json_nullable_clone(reply->error); } jsonrpc_msg_destroy(reply); diff --git a/lib/unixctl.h b/lib/unixctl.h index 4562dbc4911..1965f100dc2 100644 --- a/lib/unixctl.h +++ b/lib/unixctl.h @@ -17,10 +17,21 @@ #ifndef UNIXCTL_H #define UNIXCTL_H 1 +#include + #ifdef __cplusplus extern "C" { #endif +struct json; +enum unixctl_output_fmt { + UNIXCTL_OUTPUT_FMT_TEXT = 1 << 0, + UNIXCTL_OUTPUT_FMT_JSON = 1 << 1, +}; + +const char *unixctl_output_fmt_to_string(enum unixctl_output_fmt); +bool unixctl_output_fmt_from_string(const char *, enum unixctl_output_fmt *); + /* Server for Unix domain socket control connection. */ struct unixctl_server; int unixctl_server_create(const char *path, struct unixctl_server **); @@ -36,7 +47,7 @@ int unixctl_client_create(const char *path, struct jsonrpc **client); int unixctl_client_transact(struct jsonrpc *client, const char *command, int argc, char *argv[], - char **result, char **error); + struct json **result, struct json **error); /* Command registration. 
*/ struct unixctl_conn; @@ -45,8 +56,12 @@ typedef void unixctl_cb_func(struct unixctl_conn *, void unixctl_command_register(const char *name, const char *usage, int min_args, int max_args, unixctl_cb_func *cb, void *aux); +enum unixctl_output_fmt unixctl_command_get_output_format( + struct unixctl_conn *); void unixctl_command_reply_error(struct unixctl_conn *, const char *error); void unixctl_command_reply(struct unixctl_conn *, const char *body); +void unixctl_command_reply_json(struct unixctl_conn *, + struct json *body); #ifdef __cplusplus } diff --git a/lib/util.c b/lib/util.c index 1195c798211..bdd6408b2a6 100644 --- a/lib/util.c +++ b/lib/util.c @@ -25,6 +25,10 @@ #include #include #include +#ifdef __linux__ +#include +#include +#endif #include #include #include "bitmap.h" @@ -63,8 +67,8 @@ DEFINE_PER_THREAD_MALLOCED_DATA(char *, subprogram_name); /* --version option output. */ static char *program_version; -/* 'true' if mlockall() succeeded. */ -static bool is_memory_locked = false; +/* 'true' if mlockall() succeeded, but doesn't support ONFAULT. */ +static bool is_all_memory_locked = false; /* Buffer used by ovs_strerror() and ovs_format_message(). 
*/ DEFINE_STATIC_PER_THREAD_DATA(struct { char s[128]; }, @@ -98,15 +102,15 @@ ovs_assert_failure(const char *where, const char *function, } void -set_memory_locked(void) +set_all_memory_locked(void) { - is_memory_locked = true; + is_all_memory_locked = true; } bool -memory_locked(void) +memory_all_locked(void) { - return is_memory_locked; + return is_all_memory_locked; } void @@ -221,6 +225,8 @@ xvasprintf(const char *format, va_list args) size_t needed; char *s; + ovs_assert(format); + va_copy(args2, args); needed = vsnprintf(NULL, 0, format, args); @@ -614,12 +620,14 @@ ovs_set_program_name(const char *argv0, const char *version) program_name = basename; free(program_version); - if (!strcmp(version, VERSION)) { - program_version = xasprintf("%s (Open vSwitch) "VERSION"\n", + if (!strcmp(version, VERSION VERSION_SUFFIX)) { + program_version = xasprintf("%s (Open vSwitch) "VERSION + VERSION_SUFFIX, program_name); } else { program_version = xasprintf("%s %s\n" - "Open vSwitch Library "VERSION"\n", + "Open vSwitch Library "VERSION + VERSION_SUFFIX, program_name, version); } } @@ -642,6 +650,12 @@ set_subprogram_name(const char *subprogram_name) free(subprogram_name_set(pname)); #if HAVE_GLIBC_PTHREAD_SETNAME_NP + /* The maximum supported thread name including '\0' is 16. + * Add '>' at 0th position to highlight that the name was truncated. */ + if (strlen(pname) > 15) { + memmove(pname, &pname[strlen(pname) - 15], 15 + 1); + pname[0] = '>'; + } pthread_setname_np(pthread_self(), pname); #elif HAVE_NETBSD_PTHREAD_SETNAME_NP pthread_setname_np(pthread_self(), "%s", pname); @@ -750,7 +764,7 @@ ovs_get_program_name(void) void ovs_print_version(uint8_t min_ofp, uint8_t max_ofp) { - printf("%s", program_version); + printf("%s\n", program_version); if (min_ofp || max_ofp) { printf("OpenFlow versions %#x:%#x\n", min_ofp, max_ofp); } @@ -2371,11 +2385,9 @@ xsleep(unsigned int seconds) ovsrcu_quiesce_end(); } -/* High resolution sleep. 
*/ -void -xnanosleep(uint64_t nanoseconds) +static void +xnanosleep__(uint64_t nanoseconds) { - ovsrcu_quiesce_start(); #ifndef _WIN32 int retval; struct timespec ts_sleep; @@ -2403,9 +2415,37 @@ xnanosleep(uint64_t nanoseconds) ovs_lasterror_to_string()); } #endif +} + +/* High resolution sleep with thread quiesce. */ +void +xnanosleep(uint64_t nanoseconds) +{ + ovsrcu_quiesce_start(); + xnanosleep__(nanoseconds); ovsrcu_quiesce_end(); } +/* High resolution sleep without thread quiesce. */ +void +xnanosleep_no_quiesce(uint64_t nanoseconds) +{ + xnanosleep__(nanoseconds); +} + +#if __linux__ +void +set_timer_resolution(unsigned long nanoseconds) +{ + prctl(PR_SET_TIMERSLACK, nanoseconds); +} +#else +void +set_timer_resolution(unsigned long nanoseconds OVS_UNUSED) +{ +} +#endif + /* Determine whether standard output is a tty or not. This is useful to decide * whether to use color output or not when --color option for utilities is set * to `auto`. @@ -2465,3 +2505,29 @@ OVS_CONSTRUCTOR(winsock_start) { } } #endif + +#ifdef __linux__ +bool +ovs_kernel_is_version_or_newer(int target_major, int target_minor) +{ + static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; + static int current_major, current_minor = -1; + + if (ovsthread_once_start(&once)) { + struct utsname utsname; + + if (uname(&utsname) == -1) { + VLOG_WARN("uname failed (%s)", ovs_strerror(errno)); + } else if (!ovs_scan(utsname.release, "%d.%d", + &current_major, &current_minor)) { + VLOG_WARN("uname reported bad OS release (%s)", utsname.release); + } + ovsthread_once_done(&once); + } + if (current_major == -1 || current_minor == -1) { + return false; + } + return current_major > target_major || ( + current_major == target_major && current_minor >= target_minor); +} +#endif diff --git a/lib/util.h b/lib/util.h index f1521b1abd6..385a425b0e7 100644 --- a/lib/util.h +++ b/lib/util.h @@ -156,34 +156,36 @@ void ctl_timeout_setup(unsigned int secs); void ovs_print_version(uint8_t min_ofp, uint8_t max_ofp);
-void set_memory_locked(void); -bool memory_locked(void); +void set_all_memory_locked(void); +bool memory_all_locked(void); OVS_NO_RETURN void out_of_memory(void); /* Allocation wrappers that abort if memory is exhausted. */ -void *xmalloc(size_t) MALLOC_LIKE; -void *xcalloc(size_t, size_t) MALLOC_LIKE; -void *xzalloc(size_t) MALLOC_LIKE; -void *xrealloc(void *, size_t); -void *xmemdup(const void *, size_t) MALLOC_LIKE; -char *xmemdup0(const char *, size_t) MALLOC_LIKE; -char *xstrdup(const char *) MALLOC_LIKE; +OVS_RETURNS_NONNULL void *xmalloc(size_t) MALLOC_LIKE; +OVS_RETURNS_NONNULL void *xcalloc(size_t, size_t) MALLOC_LIKE; +OVS_RETURNS_NONNULL void *xzalloc(size_t) MALLOC_LIKE; +OVS_RETURNS_NONNULL void *xrealloc(void *, size_t); +OVS_RETURNS_NONNULL void *xmemdup(const void *, size_t) MALLOC_LIKE; +OVS_RETURNS_NONNULL char *xmemdup0(const char *, size_t) MALLOC_LIKE; +OVS_RETURNS_NONNULL char *xstrdup(const char *) MALLOC_LIKE; char *nullable_xstrdup(const char *) MALLOC_LIKE; bool nullable_string_is_equal(const char *a, const char *b); -char *xasprintf(const char *format, ...) OVS_PRINTF_FORMAT(1, 2) MALLOC_LIKE; -char *xvasprintf(const char *format, va_list) OVS_PRINTF_FORMAT(1, 0) MALLOC_LIKE; -void *x2nrealloc(void *p, size_t *n, size_t s); +OVS_RETURNS_NONNULL char *xasprintf(const char *format, ...) + OVS_PRINTF_FORMAT(1, 2) MALLOC_LIKE; +OVS_RETURNS_NONNULL char *xvasprintf(const char *format, va_list) + OVS_PRINTF_FORMAT(1, 0) MALLOC_LIKE; +OVS_RETURNS_NONNULL void *x2nrealloc(void *p, size_t *n, size_t s); /* Allocation wrappers for specialized situations where coverage counters * cannot be used. 
*/ -void *xmalloc__(size_t) MALLOC_LIKE; -void *xcalloc__(size_t, size_t) MALLOC_LIKE; -void *xzalloc__(size_t) MALLOC_LIKE; -void *xrealloc__(void *, size_t); +OVS_RETURNS_NONNULL void *xmalloc__(size_t) MALLOC_LIKE; +OVS_RETURNS_NONNULL void *xcalloc__(size_t, size_t) MALLOC_LIKE; +OVS_RETURNS_NONNULL void *xzalloc__(size_t) MALLOC_LIKE; +OVS_RETURNS_NONNULL void *xrealloc__(void *, size_t); -void *xmalloc_cacheline(size_t) MALLOC_LIKE; -void *xzalloc_cacheline(size_t) MALLOC_LIKE; +OVS_RETURNS_NONNULL void *xmalloc_cacheline(size_t) MALLOC_LIKE; +OVS_RETURNS_NONNULL void *xzalloc_cacheline(size_t) MALLOC_LIKE; void free_cacheline(void *); void ovs_strlcpy(char *dst, const char *src, size_t size); @@ -191,9 +193,9 @@ void ovs_strzcpy(char *dst, const char *src, size_t size); int string_ends_with(const char *str, const char *suffix); -void *xmalloc_pagealign(size_t) MALLOC_LIKE; +OVS_RETURNS_NONNULL void *xmalloc_pagealign(size_t) MALLOC_LIKE; void free_pagealign(void *); -void *xmalloc_size_align(size_t, size_t) MALLOC_LIKE; +OVS_RETURNS_NONNULL void *xmalloc_size_align(size_t, size_t) MALLOC_LIKE; void free_size_align(void *); /* The C standards say that neither the 'dst' nor 'src' argument to @@ -593,6 +595,8 @@ ovs_u128_is_superset(ovs_u128 super, ovs_u128 sub) void xsleep(unsigned int seconds); void xnanosleep(uint64_t nanoseconds); +void xnanosleep_no_quiesce(uint64_t nanoseconds); +void set_timer_resolution(unsigned long nanoseconds); bool is_stdout_a_tty(void); @@ -607,4 +611,8 @@ int ftruncate(int fd, off_t length); } #endif +#ifdef __linux__ +bool ovs_kernel_is_version_or_newer(int target_major, int target_minor); +#endif + #endif /* util.h */ diff --git a/lib/vconn.c b/lib/vconn.c index b5567622779..e9603432d2d 100644 --- a/lib/vconn.c +++ b/lib/vconn.c @@ -682,7 +682,6 @@ do_send(struct vconn *vconn, struct ofpbuf *msg) ofpmsg_update_length(msg); if (!VLOG_IS_DBG_ENABLED()) { - COVERAGE_INC(vconn_sent); retval = (vconn->vclass->send)(vconn, msg); } 
else { char *s = ofp_to_string(msg->data, msg->size, NULL, NULL, 1); @@ -693,6 +692,9 @@ do_send(struct vconn *vconn, struct ofpbuf *msg) } free(s); } + if (!retval) { + COVERAGE_INC(vconn_sent); + } return retval; } diff --git a/lib/versions.h b/lib/versions.h index d92f0a319e6..724880cb7e4 100644 --- a/lib/versions.h +++ b/lib/versions.h @@ -36,7 +36,7 @@ struct versions { }; #define VERSIONS_INITIALIZER(ADD, REMOVE) \ - (struct versions){ ADD, ATOMIC_VAR_INIT(REMOVE) } + (struct versions){ ADD, REMOVE } static inline void versions_set_remove_version(struct versions *versions, ovs_version_t version) diff --git a/lib/vlog.c b/lib/vlog.c index 0a615bb664b..59b524b097e 100644 --- a/lib/vlog.c +++ b/lib/vlog.c @@ -29,6 +29,7 @@ #include #include #include "async-append.h" +#include "backtrace.h" #include "coverage.h" #include "dirs.h" #include "openvswitch/dynamic-string.h" @@ -118,7 +119,7 @@ static struct ovs_list vlog_modules OVS_GUARDED_BY(log_file_mutex) static int syslog_fd OVS_GUARDED_BY(pattern_rwlock) = -1; /* Log facility configuration. */ -static atomic_int log_facility = ATOMIC_VAR_INIT(0); +static atomic_int log_facility = 0; /* Facility name and its value. */ struct vlog_facility { @@ -410,10 +411,10 @@ vlog_set_log_file__(char *new_log_file_name) /* Close old log file, if any. */ ovs_mutex_lock(&log_file_mutex); + async_append_destroy(log_writer); if (log_fd >= 0) { close(log_fd); } - async_append_destroy(log_writer); free(log_file_name); /* Install new log file. */ @@ -664,6 +665,13 @@ vlog_direct_write_to_log_file_unsafe(const char *s) } } +int +vlog_get_log_file_fd_unsafe(void) + OVS_NO_THREAD_SAFETY_ANALYSIS +{ + return log_fd; +} + /* Returns 'false' if 'facility' is not a valid string. If 'facility' * is a valid string, sets 'value' with the integer value of 'facility' * and returns 'true'. */ @@ -1267,8 +1275,9 @@ vlog_fatal(const struct vlog_module *module, const char *message, ...) 
va_end(args); } -/* Logs 'message' to 'module' at maximum verbosity, then calls abort(). Always - * writes the message to stderr, even if the console destination is disabled. +/* Attempts to log a stack trace, logs 'message' to 'module' at maximum + * verbosity, then calls abort(). Always writes the message to stderr, even + * if the console destination is disabled. * * Choose this function instead of vlog_fatal_valist() if the daemon monitoring * facility should automatically restart the current daemon. */ @@ -1282,6 +1291,10 @@ vlog_abort_valist(const struct vlog_module *module_, * message written by the later ovs_abort_valist(). */ module->levels[VLF_CONSOLE] = VLL_OFF; + /* Printing the stack trace before the 'message', because the 'message' + * will flush the async log queue (VLL_EMER). With a different order we + * would need to flush the queue manually again. */ + log_backtrace(); vlog_valist(module, VLL_EMER, message, args); ovs_abort_valist(0, message, args); } diff --git a/m4/ax_check_openssl.m4 b/m4/ax_check_openssl.m4 index 281d4dc65eb..faa5babde26 100644 --- a/m4/ax_check_openssl.m4 +++ b/m4/ax_check_openssl.m4 @@ -81,7 +81,8 @@ AC_DEFUN([AX_CHECK_OPENSSL], [ SSL_INCLUDES="-I$ssldir/include" SSL_LDFLAGS="-L$ssldir/lib" if test "$WIN32" = "yes"; then - SSL_LIBS="-lssleay32 -llibeay32" + SSL_LDFLAGS="$SSL_LDFLAGS -L$ssldir/lib/VC/x64/MT" + SSL_LIBS="-llibssl -llibcrypto" SSL_DIR=/$(echo ${ssldir} | ${SED} -e 's/://') else SSL_LIBS="-lssl -lcrypto" diff --git a/m4/openvswitch.m4 b/m4/openvswitch.m4 index 14d9249b89c..47aa9da16a1 100644 --- a/m4/openvswitch.m4 +++ b/m4/openvswitch.m4 @@ -360,8 +360,12 @@ AC_DEFUN([OVS_CHECK_DBDIR], dnl Defines HAVE_BACKTRACE if backtrace() is found. 
AC_DEFUN([OVS_CHECK_BACKTRACE], [AC_SEARCH_LIBS([backtrace], [execinfo ubacktrace], - [AC_DEFINE([HAVE_BACKTRACE], [1], - [Define to 1 if you have backtrace(3).])])]) + [HAVE_BACKTRACE=yes], [HAVE_BACKTRACE=no]) + if test "$HAVE_BACKTRACE" = "yes"; then + AC_DEFINE([HAVE_BACKTRACE], [1], [Define to 1 if you have backtrace(3).]) + fi + AM_CONDITIONAL([HAVE_BACKTRACE], [test "$HAVE_BACKTRACE" = "yes"]) + AC_SUBST([HAVE_BACKTRACE])]) dnl Defines HAVE_PERF_EVENT if linux/perf_event.h is found. AC_DEFUN([OVS_CHECK_PERF_EVENT], @@ -371,16 +375,16 @@ dnl Checks for valgrind/valgrind.h. AC_DEFUN([OVS_CHECK_VALGRIND], [AC_CHECK_HEADERS([valgrind/valgrind.h])]) -dnl Checks for Python 3.4 or later. +dnl Checks for Python 3.6 or later. AC_DEFUN([OVS_CHECK_PYTHON3], [AC_CACHE_CHECK( - [for Python 3 (version 3.4 or later)], + [for Python 3 (version 3.6 or later)], [ovs_cv_python3], [if test -n "$PYTHON3"; then ovs_cv_python3=$PYTHON3 else ovs_cv_python3=no - for binary in python3 python3.4 python3.5 python3.6 python3.7; do + for binary in python3 python3.6 python3.7 python3.8 python3.9 python3.10 python3.11 python3.12; do ovs_save_IFS=$IFS; IFS=$PATH_SEPARATOR for dir in $PATH; do IFS=$ovs_save_IFS @@ -397,7 +401,7 @@ else: done fi]) if test "$ovs_cv_python3" = no; then - AC_MSG_ERROR([Python 3.4 or later is required but not found in $PATH, please install it or set $PYTHON3 to point to it]) + AC_MSG_ERROR([Python 3.6 or later is required but not found in $PATH, please install it or set $PYTHON3 to point to it]) fi AC_ARG_VAR([PYTHON3]) PYTHON3=$ovs_cv_python3]) diff --git a/ofproto/automake.mk b/ofproto/automake.mk index 7c08b563bc3..cb1361b8a61 100644 --- a/ofproto/automake.mk +++ b/ofproto/automake.mk @@ -30,6 +30,8 @@ ofproto_libofproto_la_SOURCES = \ ofproto/ofproto-dpif.h \ ofproto/ofproto-dpif-ipfix.c \ ofproto/ofproto-dpif-ipfix.h \ + ofproto/ofproto-dpif-lsample.c \ + ofproto/ofproto-dpif-lsample.h \ ofproto/ofproto-dpif-mirror.c \ ofproto/ofproto-dpif-mirror.h \ 
ofproto/ofproto-dpif-monitor.c \ diff --git a/ofproto/bond.c b/ofproto/bond.c index 47630a6b06a..c31869a4c76 100644 --- a/ofproto/bond.c +++ b/ofproto/bond.c @@ -186,7 +186,7 @@ static struct bond_member *choose_output_member(const struct bond *, struct flow_wildcards *, uint16_t vlan) OVS_REQ_RDLOCK(rwlock); -static void update_recirc_rules__(struct bond *); +static void update_recirc_rules(struct bond *) OVS_REQ_WRLOCK(rwlock); static bool bond_may_recirc(const struct bond *); static void bond_update_post_recirc_rules__(struct bond *, bool force) OVS_REQ_WRLOCK(rwlock); @@ -299,7 +299,10 @@ bond_unref(struct bond *bond) } free(bond->hash); bond->hash = NULL; - update_recirc_rules__(bond); + + ovs_rwlock_wrlock(&rwlock); + update_recirc_rules(bond); + ovs_rwlock_unlock(&rwlock); hmap_destroy(&bond->pr_rule_ops); free(bond->primary); @@ -331,17 +334,8 @@ add_pr_rule(struct bond *bond, const struct match *match, hmap_insert(&bond->pr_rule_ops, &pr_op->hmap_node, hash); } -/* This function should almost never be called directly. - * 'update_recirc_rules()' should be called instead. Since - * this function modifies 'bond->pr_rule_ops', it is only - * safe when 'rwlock' is held. - * - * However, when the 'bond' is the only reference in the system, - * calling this function avoid acquiring lock only to satisfy - * lock annotation. Currently, only 'bond_unref()' calls - * this function directly. */ static void -update_recirc_rules__(struct bond *bond) +update_recirc_rules(struct bond *bond) OVS_REQ_WRLOCK(rwlock) { struct match match; struct bond_pr_rule_op *pr_op; @@ -407,6 +401,15 @@ update_recirc_rules__(struct bond *bond) VLOG_ERR("failed to remove post recirculation flow %s", err_s); free(err_s); + } else if (bond->hash) { + /* If the flow deletion failed, a subsequent call to + * ofproto_dpif_add_internal_flow() would just modify the + * flow preserving its statistics. Therefore, only reset + * the entry's byte counter if it succeeds. 
*/ + uint32_t hash = pr_op->match.flow.dp_hash & BOND_MASK; + struct bond_entry *entry = &bond->hash[hash]; + + entry->pr_tx_bytes = 0; } hmap_remove(&bond->pr_rule_ops, &pr_op->hmap_node); @@ -421,12 +424,6 @@ update_recirc_rules__(struct bond *bond) ofpbuf_uninit(&ofpacts); } -static void -update_recirc_rules(struct bond *bond) - OVS_REQ_RDLOCK(rwlock) -{ - update_recirc_rules__(bond); -} /* Updates 'bond''s overall configuration to 's'. * @@ -897,7 +894,7 @@ bond_check_admissibility(struct bond *bond, const void *member_, if (!member->enabled && member->may_enable) { VLOG_DBG_RL(&rl, "bond %s: member %s: " "main thread has not yet enabled member", - bond->name, bond->active_member->name); + bond->name, member->name); } goto out; case LACP_CONFIGURED: diff --git a/ofproto/connmgr.c b/ofproto/connmgr.c index 7b14cae7733..f7f7b127996 100644 --- a/ofproto/connmgr.c +++ b/ofproto/connmgr.c @@ -1209,7 +1209,7 @@ ofconn_create(struct ofservice *ofservice, struct rconn *rconn, hmap_init(&ofconn->bundles); ofconn->next_bundle_expiry_check = time_msec() + BUNDLE_EXPIRY_INTERVAL; - ofconn_set_rate_limit(ofconn, settings->rate_limit, settings->burst_limit); + ofservice_reconfigure(ofservice, settings); ovs_mutex_unlock(&ofproto_mutex); } @@ -1649,6 +1649,8 @@ connmgr_send_table_status(struct connmgr *mgr, } } +COVERAGE_DEFINE(connmgr_async_unsent); + /* Given 'pin', sends an OFPT_PACKET_IN message to each OpenFlow controller as * necessary according to their individual configurations. 
*/ void @@ -1656,6 +1658,7 @@ connmgr_send_async_msg(struct connmgr *mgr, const struct ofproto_async_msg *am) { struct ofconn *ofconn; + bool sent = false; LIST_FOR_EACH (ofconn, connmgr_node, &mgr->conns) { enum ofputil_protocol protocol = ofconn_get_protocol(ofconn); @@ -1677,6 +1680,11 @@ connmgr_send_async_msg(struct connmgr *mgr, am->pin.up.base.flow_metadata.flow.in_port.ofp_port, msg, &txq); do_send_packet_ins(ofconn, &txq); + sent = true; + } + + if (!sent) { + COVERAGE_INC(connmgr_async_unsent); } } @@ -1907,10 +1915,7 @@ connmgr_count_hidden_rules(const struct connmgr *mgr) } /* Creates a new ofservice for 'target' in 'mgr'. Returns 0 if successful, - * otherwise a positive errno value. - * - * ofservice_reconfigure() must be called to fully configure the new - * ofservice. */ + * otherwise a positive errno value. */ static void ofservice_create(struct connmgr *mgr, const char *target, const struct ofproto_controller *c) @@ -1920,7 +1925,8 @@ ofservice_create(struct connmgr *mgr, const char *target, struct rconn *rconn = NULL; if (!vconn_verify_name(target)) { char *name = ofconn_make_name(mgr, target); - rconn = rconn_create(5, 8, c->dscp, c->allowed_versions); + rconn = rconn_create(c->probe_interval, c->max_backoff, + c->dscp, c->allowed_versions); rconn_connect(rconn, target, name); free(name); } else if (!pvconn_verify_name(target)) { @@ -1943,7 +1949,6 @@ ofservice_create(struct connmgr *mgr, const char *target, ofservice->rconn = rconn; ofservice->pvconn = pvconn; ofservice->s = *c; - ofservice_reconfigure(ofservice, c); VLOG_INFO("%s: added %s controller \"%s\"", mgr->name, ofconn_type_to_string(ofservice->type), target); diff --git a/ofproto/ofproto-dpif-ipfix.c b/ofproto/ofproto-dpif-ipfix.c index 742eed39981..15b65623351 100644 --- a/ofproto/ofproto-dpif-ipfix.c +++ b/ofproto/ofproto-dpif-ipfix.c @@ -124,15 +124,24 @@ struct dpif_ipfix_port { uint32_t ifindex; }; +struct dpif_ipfix_domain { + struct hmap_node hmap_node; /* In struct 
dpif_ipfix_exporter's domains. */ + time_t last_template_set_time; +}; + struct dpif_ipfix_exporter { uint32_t exporter_id; /* Exporting Process identifier */ - struct collectors *collectors; uint32_t seq_number; - time_t last_template_set_time; + struct collectors *collectors; + struct hmap domains; /* Contains struct dpif_ipfix_domain indexed by + observation domain id. */ + time_t last_stats_sent_time; struct hmap cache_flow_key_map; /* ipfix_flow_cache_entry. */ struct ovs_list cache_flow_start_timestamp_list; /* ipfix_flow_cache_entry. */ uint32_t cache_active_timeout; /* In seconds. */ uint32_t cache_max_flows; + uint32_t stats_interval; + uint32_t template_interval; char *virtual_obs_id; uint8_t virtual_obs_len; @@ -167,11 +176,6 @@ struct dpif_ipfix { #define IPFIX_VERSION 0x000a -/* When using UDP, IPFIX Template Records must be re-sent regularly. - * The standard default interval is 10 minutes (600 seconds). - * Cf. IETF RFC 5101 Section 10.3.6. */ -#define IPFIX_TEMPLATE_INTERVAL 600 - /* Cf. IETF RFC 5101 Section 3.1. 
*/ OVS_PACKED( struct ipfix_header { @@ -617,6 +621,9 @@ static void get_export_time_now(uint64_t *, uint32_t *); static void dpif_ipfix_cache_expire_now(struct dpif_ipfix_exporter *, bool); +static void dpif_ipfix_exporter_del_domain(struct dpif_ipfix_exporter *, + struct dpif_ipfix_domain *); + static bool ofproto_ipfix_bridge_exporter_options_equal( const struct ofproto_ipfix_bridge_exporter_options *a, @@ -627,6 +634,8 @@ ofproto_ipfix_bridge_exporter_options_equal( && a->sampling_rate == b->sampling_rate && a->cache_active_timeout == b->cache_active_timeout && a->cache_max_flows == b->cache_max_flows + && a->stats_interval == b->stats_interval + && a->template_interval == b->template_interval && a->enable_tunnel_sampling == b->enable_tunnel_sampling && a->enable_input_sampling == b->enable_input_sampling && a->enable_output_sampling == b->enable_output_sampling @@ -664,6 +673,8 @@ ofproto_ipfix_flow_exporter_options_equal( return (a->collector_set_id == b->collector_set_id && a->cache_active_timeout == b->cache_active_timeout && a->cache_max_flows == b->cache_max_flows + && a->stats_interval == b->stats_interval + && a->template_interval == b->template_interval && a->enable_tunnel_sampling == b->enable_tunnel_sampling && sset_equals(&a->targets, &b->targets) && nullable_string_is_equal(a->virtual_obs_id, b->virtual_obs_id)); @@ -697,13 +708,17 @@ dpif_ipfix_exporter_init(struct dpif_ipfix_exporter *exporter) exporter->exporter_id = ++exporter_total_count; exporter->collectors = NULL; exporter->seq_number = 1; - exporter->last_template_set_time = 0; + exporter->last_stats_sent_time = 0; hmap_init(&exporter->cache_flow_key_map); ovs_list_init(&exporter->cache_flow_start_timestamp_list); exporter->cache_active_timeout = 0; exporter->cache_max_flows = 0; + exporter->stats_interval = OFPROTO_IPFIX_DEFAULT_TEMPLATE_INTERVAL; + exporter->template_interval = OFPROTO_IPFIX_DEFAULT_TEMPLATE_INTERVAL; + exporter->last_stats_sent_time = 0; exporter->virtual_obs_id = NULL; 
exporter->virtual_obs_len = 0; + hmap_init(&exporter->domains); memset(&exporter->ipfix_global_stats, 0, sizeof(struct dpif_ipfix_global_stats)); @@ -711,6 +726,7 @@ dpif_ipfix_exporter_init(struct dpif_ipfix_exporter *exporter) static void dpif_ipfix_exporter_clear(struct dpif_ipfix_exporter *exporter) + OVS_REQUIRES(mutex) { /* Flush the cache with flow end reason "forced end." */ dpif_ipfix_cache_expire_now(exporter, true); @@ -719,22 +735,32 @@ dpif_ipfix_exporter_clear(struct dpif_ipfix_exporter *exporter) exporter->exporter_id = 0; exporter->collectors = NULL; exporter->seq_number = 1; - exporter->last_template_set_time = 0; + exporter->last_stats_sent_time = 0; exporter->cache_active_timeout = 0; exporter->cache_max_flows = 0; + exporter->stats_interval = OFPROTO_IPFIX_DEFAULT_TEMPLATE_INTERVAL; + exporter->template_interval = OFPROTO_IPFIX_DEFAULT_TEMPLATE_INTERVAL; + exporter->last_stats_sent_time = 0; free(exporter->virtual_obs_id); exporter->virtual_obs_id = NULL; exporter->virtual_obs_len = 0; + struct dpif_ipfix_domain *dom; + HMAP_FOR_EACH_SAFE (dom, hmap_node, &exporter->domains) { + dpif_ipfix_exporter_del_domain(exporter, dom); + } + memset(&exporter->ipfix_global_stats, 0, sizeof(struct dpif_ipfix_global_stats)); } static void dpif_ipfix_exporter_destroy(struct dpif_ipfix_exporter *exporter) + OVS_REQUIRES(mutex) { dpif_ipfix_exporter_clear(exporter); hmap_destroy(&exporter->cache_flow_key_map); + hmap_destroy(&exporter->domains); } static bool @@ -742,7 +768,9 @@ dpif_ipfix_exporter_set_options(struct dpif_ipfix_exporter *exporter, const struct sset *targets, const uint32_t cache_active_timeout, const uint32_t cache_max_flows, - const char *virtual_obs_id) + const uint32_t stats_interval, + const uint32_t template_interval, + const char *virtual_obs_id) OVS_REQUIRES(mutex) { size_t virtual_obs_len; collectors_destroy(exporter->collectors); @@ -756,6 +784,8 @@ dpif_ipfix_exporter_set_options(struct dpif_ipfix_exporter *exporter, } 
exporter->cache_active_timeout = cache_active_timeout; exporter->cache_max_flows = cache_max_flows; + exporter->stats_interval = stats_interval; + exporter->template_interval = template_interval; virtual_obs_len = virtual_obs_id ? strlen(virtual_obs_id) : 0; if (virtual_obs_len > IPFIX_VIRTUAL_OBS_MAX_LEN) { VLOG_WARN_RL(&rl, "Virtual obsevation ID too long (%d bytes), " @@ -769,6 +799,37 @@ dpif_ipfix_exporter_set_options(struct dpif_ipfix_exporter *exporter, return true; } +static struct dpif_ipfix_domain * +dpif_ipfix_exporter_find_domain(const struct dpif_ipfix_exporter *exporter, + uint32_t domain_id) OVS_REQUIRES(mutex) +{ + struct dpif_ipfix_domain *dom; + HMAP_FOR_EACH_WITH_HASH (dom, hmap_node, hash_int(domain_id, 0), + &exporter->domains) { + return dom; + } + return NULL; +} + +static struct dpif_ipfix_domain * +dpif_ipfix_exporter_insert_domain(struct dpif_ipfix_exporter *exporter, + const uint32_t domain_id) OVS_REQUIRES(mutex) +{ + struct dpif_ipfix_domain *dom = xmalloc(sizeof *dom); + dom->last_template_set_time = 0; + hmap_insert(&exporter->domains, &dom->hmap_node, hash_int(domain_id, 0)); + return dom; +} + +static void +dpif_ipfix_exporter_del_domain(struct dpif_ipfix_exporter *exporter, + struct dpif_ipfix_domain *dom) + OVS_REQUIRES(mutex) +{ + hmap_remove(&exporter->domains, &dom->hmap_node); + free(dom); +} + static struct dpif_ipfix_port * dpif_ipfix_find_port(const struct dpif_ipfix *di, odp_port_t odp_port) OVS_REQUIRES(mutex) @@ -909,6 +970,7 @@ dpif_ipfix_bridge_exporter_init(struct dpif_ipfix_bridge_exporter *exporter) static void dpif_ipfix_bridge_exporter_clear(struct dpif_ipfix_bridge_exporter *exporter) + OVS_REQUIRES(mutex) { dpif_ipfix_exporter_clear(&exporter->exporter); ofproto_ipfix_bridge_exporter_options_destroy(exporter->options); @@ -918,6 +980,7 @@ dpif_ipfix_bridge_exporter_clear(struct dpif_ipfix_bridge_exporter *exporter) static void dpif_ipfix_bridge_exporter_destroy(struct dpif_ipfix_bridge_exporter *exporter) + 
OVS_REQUIRES(mutex) { dpif_ipfix_bridge_exporter_clear(exporter); dpif_ipfix_exporter_destroy(&exporter->exporter); @@ -927,7 +990,7 @@ static void dpif_ipfix_bridge_exporter_set_options( struct dpif_ipfix_bridge_exporter *exporter, const struct ofproto_ipfix_bridge_exporter_options *options, - bool *options_changed) + bool *options_changed) OVS_REQUIRES(mutex) { if (!options || sset_is_empty(&options->targets)) { /* No point in doing any work if there are no targets. */ @@ -955,6 +1018,7 @@ dpif_ipfix_bridge_exporter_set_options( if (!dpif_ipfix_exporter_set_options( &exporter->exporter, &options->targets, options->cache_active_timeout, options->cache_max_flows, + options->stats_interval, options->template_interval, options->virtual_obs_id)) { return; } @@ -970,6 +1034,14 @@ dpif_ipfix_bridge_exporter_set_options( exporter->probability = MAX(1, UINT32_MAX / exporter->options->sampling_rate); + /* Configure static observation_domain_id. */ + struct dpif_ipfix_domain *dom; + HMAP_FOR_EACH_SAFE (dom, hmap_node, &(exporter->exporter.domains)) { + dpif_ipfix_exporter_del_domain(&exporter->exporter, dom); + } + dpif_ipfix_exporter_insert_domain(&exporter->exporter, + options->obs_domain_id); + /* Run over the cache as some entries might have expired after * changing the timeouts. 
*/ dpif_ipfix_cache_expire_now(&exporter->exporter, false); @@ -1003,6 +1075,7 @@ dpif_ipfix_flow_exporter_init(struct dpif_ipfix_flow_exporter *exporter) static void dpif_ipfix_flow_exporter_clear(struct dpif_ipfix_flow_exporter *exporter) + OVS_REQUIRES(mutex) { dpif_ipfix_exporter_clear(&exporter->exporter); ofproto_ipfix_flow_exporter_options_destroy(exporter->options); @@ -1011,6 +1084,7 @@ dpif_ipfix_flow_exporter_clear(struct dpif_ipfix_flow_exporter *exporter) static void dpif_ipfix_flow_exporter_destroy(struct dpif_ipfix_flow_exporter *exporter) + OVS_REQUIRES(mutex) { dpif_ipfix_flow_exporter_clear(exporter); dpif_ipfix_exporter_destroy(&exporter->exporter); @@ -1020,7 +1094,7 @@ static bool dpif_ipfix_flow_exporter_set_options( struct dpif_ipfix_flow_exporter *exporter, const struct ofproto_ipfix_flow_exporter_options *options, - bool *options_changed) + bool *options_changed) OVS_REQUIRES(mutex) { if (sset_is_empty(&options->targets)) { /* No point in doing any work if there are no targets. 
*/ @@ -1048,6 +1122,7 @@ dpif_ipfix_flow_exporter_set_options( if (!dpif_ipfix_exporter_set_options( &exporter->exporter, &options->targets, options->cache_active_timeout, options->cache_max_flows, + options->stats_interval, options->template_interval, options->virtual_obs_id)) { return false; } @@ -1071,6 +1146,7 @@ dpif_ipfix_flow_exporter_set_options( static void remove_flow_exporter(struct dpif_ipfix *di, struct dpif_ipfix_flow_exporter_map_node *node) + OVS_REQUIRES(mutex) { hmap_remove(&di->flow_exporter_map, &node->node); dpif_ipfix_flow_exporter_destroy(&node->exporter); @@ -2000,6 +2076,7 @@ static void ipfix_cache_update(struct dpif_ipfix_exporter *exporter, struct ipfix_flow_cache_entry *entry, enum ipfix_sampled_packet_type sampled_pkt_type) + OVS_REQUIRES(mutex) { struct ipfix_flow_cache_entry *old_entry; size_t current_flows = 0; @@ -2811,14 +2888,36 @@ dpif_ipfix_flow_sample(struct dpif_ipfix *di, const struct dp_packet *packet, ovs_mutex_unlock(&mutex); } +static bool +dpif_ipfix_should_send_template(struct dpif_ipfix_exporter *exporter, + const uint32_t observation_domain_id, + const uint32_t export_time_sec) + OVS_REQUIRES(mutex) +{ + struct dpif_ipfix_domain *domain; + domain = dpif_ipfix_exporter_find_domain(exporter, + observation_domain_id); + if (!domain) { + /* First time we see this obs_domain_id. 
*/ + domain = dpif_ipfix_exporter_insert_domain(exporter, + observation_domain_id); + } + + if ((domain->last_template_set_time + exporter->template_interval) + <= export_time_sec) { + domain->last_template_set_time = export_time_sec; + return true; + } + return false; +} + static void dpif_ipfix_cache_expire(struct dpif_ipfix_exporter *exporter, bool forced_end, const uint64_t export_time_usec, - const uint32_t export_time_sec) + const uint32_t export_time_sec) OVS_REQUIRES(mutex) { struct ipfix_flow_cache_entry *entry; uint64_t max_flow_start_timestamp_usec; - bool template_msg_sent = false; enum ipfix_flow_end_reason flow_end_reason; if (ovs_list_is_empty(&exporter->cache_flow_start_timestamp_list)) { @@ -2844,25 +2943,25 @@ dpif_ipfix_cache_expire(struct dpif_ipfix_exporter *exporter, break; } - ovs_list_remove(&entry->cache_flow_start_timestamp_list_node); - hmap_remove(&exporter->cache_flow_key_map, - &entry->flow_key_map_node); + if ((exporter->last_stats_sent_time + exporter->stats_interval) + <= export_time_sec) { + exporter->last_stats_sent_time = export_time_sec; + ipfix_send_exporter_data_msg(exporter, export_time_sec); + } - /* XXX: Make frequency of the (Options) Template and Exporter Process - * Statistics transmission configurable. - * Cf. IETF RFC 5101 Section 4.3. and 10.3.6. */ - if (!template_msg_sent - && (exporter->last_template_set_time + IPFIX_TEMPLATE_INTERVAL) - <= export_time_sec) { + if (dpif_ipfix_should_send_template(exporter, + entry->flow_key.obs_domain_id, + export_time_sec)) { + VLOG_DBG("Sending templates for ObservationDomainID %"PRIu32, + entry->flow_key.obs_domain_id); ipfix_send_template_msgs(exporter, export_time_sec, entry->flow_key.obs_domain_id); - exporter->last_template_set_time = export_time_sec; - template_msg_sent = true; - - /* Send Exporter Process Statistics. 
*/ - ipfix_send_exporter_data_msg(exporter, export_time_sec); } + ovs_list_remove(&entry->cache_flow_start_timestamp_list_node); + hmap_remove(&exporter->cache_flow_key_map, + &entry->flow_key_map_node); + /* XXX: Group multiple data records for the same obs domain id * into the same message. */ ipfix_send_data_msg(exporter, export_time_sec, entry, flow_end_reason); @@ -2883,7 +2982,7 @@ get_export_time_now(uint64_t *export_time_usec, uint32_t *export_time_sec) static void dpif_ipfix_cache_expire_now(struct dpif_ipfix_exporter *exporter, - bool forced_end) + bool forced_end) OVS_REQUIRES(mutex) { uint64_t export_time_usec; uint32_t export_time_sec; @@ -3036,6 +3135,8 @@ dpif_ipfix_read_actions(const struct flow *flow, case OVS_ACTION_ATTR_UNSPEC: case OVS_ACTION_ATTR_DROP: case OVS_ACTION_ATTR_ADD_MPLS: + case OVS_ACTION_ATTR_DEC_TTL: + case OVS_ACTION_ATTR_PSAMPLE: case __OVS_ACTION_ATTR_MAX: default: break; diff --git a/ofproto/ofproto-dpif-lsample.c b/ofproto/ofproto-dpif-lsample.c new file mode 100644 index 00000000000..11706e3635c --- /dev/null +++ b/ofproto/ofproto-dpif-lsample.c @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2024 Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <config.h> +#include "ofproto-dpif-lsample.h" + +#include "cmap.h" +#include "hash.h" +#include "ofproto.h" +#include "openvswitch/thread.h" + +/* Dpif local sampling. 
+ * + * Thread safety: dpif_lsample allows lockless concurrent reads of local + * sampling exporters as long as the following restrictions are met: + * 1) While the last reference is being dropped, i.e., a thread is calling + * "dpif_lsample_unref" on the last reference, other threads cannot call + * "dpif_lsample_ref". + * 2) Threads do not quiesce while holding references to internal + * lsample_exporter objects. + */ + +struct dpif_lsample { + struct cmap exporters; /* Contains lsample_exporter_node instances + * indexed by collector_set_id. */ + struct ovs_mutex mutex; /* Protects concurrent insertion/deletion + * of exporters. */ + struct ovs_refcount ref_cnt; /* Controls references to this instance. */ +}; + +struct lsample_exporter { + struct ofproto_lsample_options options; +}; + +struct lsample_exporter_node { + struct cmap_node node; /* In dpif_lsample->exporters. */ + struct lsample_exporter exporter; +}; + +static void +dpif_lsample_delete_exporter(struct dpif_lsample *lsample, + struct lsample_exporter_node *node) +{ + ovs_mutex_lock(&lsample->mutex); + cmap_remove(&lsample->exporters, &node->node, + hash_int(node->exporter.options.collector_set_id, 0)); + ovs_mutex_unlock(&lsample->mutex); + + ovsrcu_postpone(free, node); +} + +/* Adds an exporter with the provided options which are copied. 
*/ +static struct lsample_exporter_node * +dpif_lsample_add_exporter(struct dpif_lsample *lsample, + const struct ofproto_lsample_options *options) +{ + struct lsample_exporter_node *node; + + node = xzalloc(sizeof *node); + node->exporter.options = *options; + + ovs_mutex_lock(&lsample->mutex); + cmap_insert(&lsample->exporters, &node->node, + hash_int(options->collector_set_id, 0)); + ovs_mutex_unlock(&lsample->mutex); + + return node; +} + +static struct lsample_exporter_node * +dpif_lsample_find_exporter_node(const struct dpif_lsample *lsample, + const uint32_t collector_set_id) +{ + struct lsample_exporter_node *node; + + CMAP_FOR_EACH_WITH_HASH (node, node, hash_int(collector_set_id, 0), + &lsample->exporters) { + if (node->exporter.options.collector_set_id == collector_set_id) { + return node; + } + } + return NULL; +} + +/* Sets the lsample configuration and returns true if the configuration + * has changed. */ +bool +dpif_lsample_set_options(struct dpif_lsample *lsample, + const struct ofproto_lsample_options *options, + size_t n_options) +{ + const struct ofproto_lsample_options *opt; + struct lsample_exporter_node *node; + bool changed = false; + int i; + + for (i = 0; i < n_options; i++) { + opt = &options[i]; + node = dpif_lsample_find_exporter_node(lsample, + opt->collector_set_id); + if (!node) { + dpif_lsample_add_exporter(lsample, opt); + changed = true; + } else if (memcmp(&node->exporter.options, opt, sizeof *opt)) { + dpif_lsample_delete_exporter(lsample, node); + dpif_lsample_add_exporter(lsample, opt); + changed = true; + } + } + + /* Delete exporters that have been removed. 
*/ + CMAP_FOR_EACH (node, node, &lsample->exporters) { + for (i = 0; i < n_options; i++) { + if (node->exporter.options.collector_set_id + == options[i].collector_set_id) { + break; + } + } + if (i == n_options) { + dpif_lsample_delete_exporter(lsample, node); + changed = true; + } + } + + return changed; +} + +/* Returns the group_id for a given collector_set_id, if it exists. */ +bool +dpif_lsample_get_group_id(struct dpif_lsample *ps, uint32_t collector_set_id, + uint32_t *group_id) +{ + struct lsample_exporter_node *node; + + node = dpif_lsample_find_exporter_node(ps, collector_set_id); + if (node) { + *group_id = node->exporter.options.group_id; + } + return !!node; +} + +struct dpif_lsample * +dpif_lsample_create(void) +{ + struct dpif_lsample *lsample; + + lsample = xzalloc(sizeof *lsample); + cmap_init(&lsample->exporters); + ovs_mutex_init(&lsample->mutex); + ovs_refcount_init(&lsample->ref_cnt); + + return lsample; +} + +static void +dpif_lsample_destroy(struct dpif_lsample *lsample) +{ + if (lsample) { + struct lsample_exporter_node *node; + + CMAP_FOR_EACH (node, node, &lsample->exporters) { + dpif_lsample_delete_exporter(lsample, node); + } + cmap_destroy(&lsample->exporters); + free(lsample); + } +} + +struct dpif_lsample * +dpif_lsample_ref(const struct dpif_lsample *lsample_) +{ + struct dpif_lsample *lsample = CONST_CAST(struct dpif_lsample *, lsample_); + + if (lsample) { + ovs_refcount_ref(&lsample->ref_cnt); + } + return lsample; +} + +void +dpif_lsample_unref(struct dpif_lsample *lsample) +{ + if (lsample && ovs_refcount_unref_relaxed(&lsample->ref_cnt) == 1) { + dpif_lsample_destroy(lsample); + } +} diff --git a/ofproto/ofproto-dpif-lsample.h b/ofproto/ofproto-dpif-lsample.h new file mode 100644 index 00000000000..26517a64590 --- /dev/null +++ b/ofproto/ofproto-dpif-lsample.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2024 Red Hat, Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef OFPROTO_DPIF_LSAMPLE_H +#define OFPROTO_DPIF_LSAMPLE_H 1 + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +struct dpif_lsample; +struct ofproto_lsample_options; + +struct dpif_lsample *dpif_lsample_create(void); + +struct dpif_lsample *dpif_lsample_ref(const struct dpif_lsample *); +void dpif_lsample_unref(struct dpif_lsample *); + +bool dpif_lsample_set_options(struct dpif_lsample *, + const struct ofproto_lsample_options *, + size_t n_opts); + +bool dpif_lsample_get_group_id(struct dpif_lsample *, + uint32_t collector_set_id, + uint32_t *group_id); + +#endif /* OFPROTO_DPIF_LSAMPLE_H */ diff --git a/ofproto/ofproto-dpif-mirror.c b/ofproto/ofproto-dpif-mirror.c index 343b75f0ed0..e8a2830fb44 100644 --- a/ofproto/ofproto-dpif-mirror.c +++ b/ofproto/ofproto-dpif-mirror.c @@ -21,6 +21,7 @@ #include "cmap.h" #include "hmapx.h" #include "ofproto.h" +#include "ofproto-dpif-trace.h" #include "vlan-bitmap.h" #include "openvswitch/vlog.h" @@ -48,6 +49,11 @@ struct mbundle { mirror_mask_t mirror_out; /* Mirrors that output to this mbundle. */ }; +struct filtermask { + struct miniflow *flow; + struct minimask *mask; +}; + struct mirror { struct mbridge *mbridge; /* Owning ofproto. */ size_t idx; /* In ofproto's "mirrors" array. */ @@ -57,6 +63,10 @@ struct mirror { struct hmapx srcs; /* Contains "struct mbundle*"s. */ struct hmapx dsts; /* Contains "struct mbundle*"s. */ + /* Filter criteria. 
*/ + OVSRCU_TYPE(struct filtermask *) filter_mask; + char *filter_str; + /* This is accessed by handler threads assuming RCU protection (see * mirror_get()), but can be manipulated by mirror_set() without any * explicit synchronization. */ @@ -83,6 +93,25 @@ static void mbundle_lookup_multiple(const struct mbridge *, struct ofbundle **, static int mirror_scan(struct mbridge *); static void mirror_update_dups(struct mbridge *); +static void +filtermask_free(struct filtermask *fm) +{ + free(fm->flow); + free(fm->mask); + free(fm); +} + +static struct filtermask * +filtermask_create(struct flow *flow, struct flow_wildcards *wc) +{ + struct filtermask *fm; + + fm = xmalloc(sizeof *fm); + fm->flow = miniflow_create(flow); + fm->mask = minimask_create(wc); + return fm; +} + struct mbridge * mbridge_create(void) { @@ -207,19 +236,22 @@ mirror_bundle_dst(struct mbridge *mbridge, struct ofbundle *ofbundle) } int -mirror_set(struct mbridge *mbridge, void *aux, const char *name, - struct ofbundle **srcs, size_t n_srcs, - struct ofbundle **dsts, size_t n_dsts, - unsigned long *src_vlans, struct ofbundle *out_bundle, - uint16_t snaplen, - uint16_t out_vlan) +mirror_set(struct mbridge *mbridge, const struct ofproto *ofproto, + void *aux, const struct ofproto_mirror_settings *ms, + const struct mirror_bundles *mb) { struct mbundle *mbundle, *out; mirror_mask_t mirror_bit; struct mirror *mirror; struct hmapx srcs_map; /* Contains "struct ofbundle *"s. */ struct hmapx dsts_map; /* Contains "struct ofbundle *"s. 
*/ + uint16_t out_vlan; + if (!ms || !mbridge) { + return EINVAL; + } + + out_vlan = ms->out_vlan; mirror = mirror_lookup(mbridge, aux); if (!mirror) { int idx; @@ -227,7 +259,7 @@ mirror_set(struct mbridge *mbridge, void *aux, const char *name, idx = mirror_scan(mbridge); if (idx < 0) { VLOG_WARN("maximum of %d port mirrors reached, cannot create %s", - MAX_MIRRORS, name); + MAX_MIRRORS, ms->name); return EFBIG; } @@ -242,8 +274,8 @@ mirror_set(struct mbridge *mbridge, void *aux, const char *name, unsigned long *vlans = ovsrcu_get(unsigned long *, &mirror->vlans); /* Get the new configuration. */ - if (out_bundle) { - out = mbundle_lookup(mbridge, out_bundle); + if (mb->out_bundle) { + out = mbundle_lookup(mbridge, mb->out_bundle); if (!out) { mirror_destroy(mbridge, mirror->aux); return EINVAL; @@ -252,20 +284,22 @@ mirror_set(struct mbridge *mbridge, void *aux, const char *name, } else { out = NULL; } - mbundle_lookup_multiple(mbridge, srcs, n_srcs, &srcs_map); - mbundle_lookup_multiple(mbridge, dsts, n_dsts, &dsts_map); + mbundle_lookup_multiple(mbridge, mb->srcs, mb->n_srcs, &srcs_map); + mbundle_lookup_multiple(mbridge, mb->dsts, mb->n_dsts, &dsts_map); /* If the configuration has not changed, do nothing. */ if (hmapx_equals(&srcs_map, &mirror->srcs) && hmapx_equals(&dsts_map, &mirror->dsts) - && vlan_bitmap_equal(vlans, src_vlans) + && vlan_bitmap_equal(vlans, ms->src_vlans) && mirror->out == out && mirror->out_vlan == out_vlan - && mirror->snaplen == snaplen) + && mirror->snaplen == ms->snaplen + && nullable_string_is_equal(mirror->filter_str, ms->filter) + && !ms->filter) { hmapx_destroy(&srcs_map); hmapx_destroy(&dsts_map); - return 0; + return ECANCELED; } /* XXX: Not sure if these need to be thread safe. 
*/ @@ -275,15 +309,59 @@ mirror_set(struct mbridge *mbridge, void *aux, const char *name, hmapx_swap(&dsts_map, &mirror->dsts); hmapx_destroy(&dsts_map); - if (vlans || src_vlans) { + if (vlans || ms->src_vlans) { ovsrcu_postpone(free, vlans); - vlans = vlan_bitmap_clone(src_vlans); + vlans = vlan_bitmap_clone(ms->src_vlans); ovsrcu_set(&mirror->vlans, vlans); } mirror->out = out; mirror->out_vlan = out_vlan; - mirror->snaplen = snaplen; + mirror->snaplen = ms->snaplen; + + if (!nullable_string_is_equal(mirror->filter_str, ms->filter)) { + if (mirror->filter_str) { + ovsrcu_postpone(filtermask_free, + ovsrcu_get(struct filtermask *, + &mirror->filter_mask)); + free(mirror->filter_str); + mirror->filter_str = NULL; + ovsrcu_set(&mirror->filter_mask, NULL); + } + + if (ms->filter && strlen(ms->filter)) { + struct ofputil_port_map map = OFPUTIL_PORT_MAP_INITIALIZER(&map); + struct flow_wildcards wc; + struct flow flow; + char *err; + + ofproto_append_ports_to_map(&map, ofproto->ports); + err = parse_ofp_exact_flow(&flow, &wc, + ofproto_get_tun_tab(ofproto), + ms->filter, &map); + ofputil_port_map_destroy(&map); + if (err) { + VLOG_WARN("filter is invalid: %s", err); + free(err); + mirror_destroy(mbridge, mirror->aux); + return EINVAL; + } + + /* If the user wants to filter on in_port, they should use the srcs + * bundle. Users setting in_port could experience unexpected + * behavior, and it would be overly complex to detect all possible + * issues. So instead we attempt to extract the in_port and error + * if successful. */ + if (wc.masks.in_port.ofp_port) { + VLOG_WARN("filter is invalid due to in_port field."); + mirror_destroy(mbridge, mirror->aux); + return EINVAL; + } + + mirror->filter_str = xstrdup(ms->filter); + ovsrcu_set(&mirror->filter_mask, filtermask_create(&flow, &wc)); + } + } /* Update mbundles. 
*/ mirror_bit = MIRROR_MASK_C(1) << mirror->idx; @@ -340,6 +418,15 @@ mirror_destroy(struct mbridge *mbridge, void *aux) ovsrcu_postpone(free, vlans); } + if (mirror->filter_str) { + ovsrcu_postpone(filtermask_free, + ovsrcu_get(struct filtermask *, + &mirror->filter_mask)); + free(mirror->filter_str); + mirror->filter_str = NULL; + ovsrcu_set(&mirror->filter_mask, NULL); + } + mbridge->mirrors[mirror->idx] = NULL; /* mirror_get() might have just read the pointer, so we must postpone the * free. */ @@ -406,23 +493,25 @@ mirror_update_stats(struct mbridge *mbridge, mirror_mask_t mirrors, /* Retrieves the mirror numbered 'index' in 'mbridge'. Returns true if such a * mirror exists, false otherwise. * - * If successful, '*vlans' receives the mirror's VLAN membership information, + * If successful 'mc->vlans' receives the mirror's VLAN membership information, * either a null pointer if the mirror includes all VLANs or a 4096-bit bitmap * in which a 1-bit indicates that the mirror includes a particular VLAN, - * '*dup_mirrors' receives a bitmap of mirrors whose output duplicates mirror - * 'index', '*out' receives the output ofbundle (if any), and '*out_vlan' - * receives the output VLAN (if any). + * 'mc->dup_mirrors' receives a bitmap of mirrors whose output duplicates + * mirror 'index', 'mc->out' receives the output ofbundle (if any), + * and 'mc->out_vlan' receives the output VLAN (if any). In cases where the + * mirror has a filter configured 'mc->filter_flow' and 'mc->filter_mask' + * receives the flow and mask that this mirror should collect. * * Everything returned here is assumed to be RCU protected. 
*/ bool -mirror_get(struct mbridge *mbridge, int index, const unsigned long **vlans, - mirror_mask_t *dup_mirrors, struct ofbundle **out, - int *snaplen, int *out_vlan) +mirror_get(struct mbridge *mbridge, int index, + struct mirror_config *mc) { + struct filtermask *fm; struct mirror *mirror; - if (!mbridge) { + if (!mc || !mbridge) { return false; } @@ -433,11 +522,19 @@ mirror_get(struct mbridge *mbridge, int index, const unsigned long **vlans, /* Assume 'mirror' is RCU protected, i.e., it will not be freed until this * thread quiesces. */ - *vlans = ovsrcu_get(unsigned long *, &mirror->vlans); - *dup_mirrors = mirror->dup_mirrors; - *out = mirror->out ? mirror->out->ofbundle : NULL; - *out_vlan = mirror->out_vlan; - *snaplen = mirror->snaplen; + mc->vlans = ovsrcu_get(unsigned long *, &mirror->vlans); + mc->dup_mirrors = mirror->dup_mirrors; + mc->out_bundle = mirror->out ? mirror->out->ofbundle : NULL; + mc->out_vlan = mirror->out_vlan; + mc->snaplen = mirror->snaplen; + fm = ovsrcu_get(struct filtermask *, &mirror->filter_mask); + if (fm) { + mc->filter_flow = fm->flow; + mc->filter_mask = fm->mask; + } else { + mc->filter_flow = NULL; + mc->filter_mask = NULL; + } return true; } diff --git a/ofproto/ofproto-dpif-mirror.h b/ofproto/ofproto-dpif-mirror.h index eed63ec4a48..a03dd82356f 100644 --- a/ofproto/ofproto-dpif-mirror.h +++ b/ofproto/ofproto-dpif-mirror.h @@ -22,8 +22,40 @@ #define MAX_MIRRORS 32 typedef uint32_t mirror_mask_t; -struct ofproto_dpif; +struct ofproto_mirror_settings; struct ofbundle; +struct ofproto; + +struct mirror_bundles { + struct ofbundle **srcs; + size_t n_srcs; + + struct ofbundle **dsts; + size_t n_dsts; + + struct ofbundle *out_bundle; +}; + +struct mirror_config { + /* A bitmap of mirrors that duplicate the current mirror. */ + mirror_mask_t dup_mirrors; + + /* VLANs of packets to select for mirroring. */ + unsigned long *vlans; /* vlan_bitmap, NULL selects all VLANs. 
*/ + + /* Miniflow and minimask if a filter is configured, else both are NULL. */ + struct miniflow *filter_flow; + struct minimask *filter_mask; + + /* Output (mutually exclusive). */ + struct ofbundle *out_bundle; /* A registered ofbundle handle or NULL. */ + uint16_t out_vlan; /* Output VLAN, not used if out_bundle is + set. */ + + /* Max size of a mirrored packet in bytes, if set to zero then no + * truncation will occur. */ + uint16_t snaplen; +}; /* The following functions are used by handler threads without any locking, * assuming RCU protection. */ @@ -38,9 +70,7 @@ mirror_mask_t mirror_bundle_dst(struct mbridge *, struct ofbundle *); void mirror_update_stats(struct mbridge*, mirror_mask_t, uint64_t packets, uint64_t bytes); -bool mirror_get(struct mbridge *, int index, const unsigned long **vlans, - mirror_mask_t *dup_mirrors, struct ofbundle **out, - int *snaplen, int *out_vlan); +bool mirror_get(struct mbridge *, int index, struct mirror_config *); /* The remaining functions are assumed to be called by the main thread only. 
*/ @@ -50,11 +80,9 @@ bool mbridge_need_revalidate(struct mbridge *); void mbridge_register_bundle(struct mbridge *, struct ofbundle *); void mbridge_unregister_bundle(struct mbridge *, struct ofbundle *); -int mirror_set(struct mbridge *, void *aux, const char *name, - struct ofbundle **srcs, size_t n_srcs, - struct ofbundle **dsts, size_t n_dsts, - unsigned long *src_vlans, struct ofbundle *out_bundle, - uint16_t snaplen, uint16_t out_vlan); +int mirror_set(struct mbridge *, const struct ofproto *, void *aux, + const struct ofproto_mirror_settings *, + const struct mirror_bundles *); void mirror_destroy(struct mbridge *, void *aux); int mirror_get_stats(struct mbridge *, void *aux, uint64_t *packets, uint64_t *bytes); diff --git a/ofproto/ofproto-dpif-monitor.c b/ofproto/ofproto-dpif-monitor.c index bb0e4909101..5132f9c952f 100644 --- a/ofproto/ofproto-dpif-monitor.c +++ b/ofproto/ofproto-dpif-monitor.c @@ -275,19 +275,16 @@ monitor_mport_run(struct mport *mport, struct dp_packet *packet) long long int lldp_wake_time = LLONG_MAX; if (mport->cfm && cfm_should_send_ccm(mport->cfm)) { - dp_packet_clear(packet); cfm_compose_ccm(mport->cfm, packet, mport->hw_addr); ofproto_dpif_send_packet(mport->ofport, false, packet); } if (mport->bfd && bfd_should_send_packet(mport->bfd)) { bool oam; - dp_packet_clear(packet); bfd_put_packet(mport->bfd, packet, mport->hw_addr, &oam); ofproto_dpif_send_packet(mport->ofport, oam, packet); } if (mport->lldp && lldp_should_send_packet(mport->lldp)) { - dp_packet_clear(packet); lldp_put_packet(mport->lldp, packet, mport->hw_addr); ofproto_dpif_send_packet(mport->ofport, false, packet); } diff --git a/ofproto/ofproto-dpif-sflow.c b/ofproto/ofproto-dpif-sflow.c index a405eb0563f..fb12cf41927 100644 --- a/ofproto/ofproto-dpif-sflow.c +++ b/ofproto/ofproto-dpif-sflow.c @@ -306,6 +306,7 @@ sflow_agent_get_counters(void *ds_, SFLPoller *poller, struct netdev_stats stats; enum netdev_flags flags; struct lacp_member_stats lacp_stats; + uint32_t 
curr_speed; const char *ifName; dsp = dpif_sflow_find_port(ds, u32_to_odp(poller->bridgePort)); @@ -320,13 +321,19 @@ sflow_agent_get_counters(void *ds_, SFLPoller *poller, if (!netdev_get_features(dsp->ofport->netdev, &current, NULL, NULL, NULL)) { /* The values of ifDirection come from MAU MIB (RFC 2668): 0 = unknown, 1 = full-duplex, 2 = half-duplex, 3 = in, 4=out */ - counters->ifSpeed = netdev_features_to_bps(current, 0); counters->ifDirection = (netdev_features_is_full_duplex(current) ? 1 : 2); } else { - counters->ifSpeed = 100000000; counters->ifDirection = 0; } + + netdev_get_speed(dsp->ofport->netdev, &curr_speed, NULL); + if (curr_speed) { + counters->ifSpeed = curr_speed * 1000000; + } else { + counters->ifSpeed = 100000000; + } + if (!netdev_get_flags(dsp->ofport->netdev, &flags) && flags & NETDEV_UP) { counters->ifStatus = 1; /* ifAdminStatus up. */ if (netdev_get_carrier(dsp->ofport->netdev)) { @@ -801,7 +808,7 @@ dpif_sflow_set_options(struct dpif_sflow *ds, receiver = sfl_agent_addReceiver(ds->sflow_agent); sfl_receiver_set_sFlowRcvrOwner(receiver, "Open vSwitch sFlow"); - sfl_receiver_set_sFlowRcvrTimeout(receiver, 0xffffffff); + sfl_receiver_set_sFlowRcvrTimeout(receiver, UINT32_MAX); /* Set the sampling_rate down in the datapath. */ ds->probability = MAX(1, UINT32_MAX / ds->options->sampling_rate); @@ -1229,6 +1236,8 @@ dpif_sflow_read_actions(const struct flow *flow, case OVS_ACTION_ATTR_CHECK_PKT_LEN: case OVS_ACTION_ATTR_DROP: case OVS_ACTION_ATTR_ADD_MPLS: + case OVS_ACTION_ATTR_DEC_TTL: + case OVS_ACTION_ATTR_PSAMPLE: case __OVS_ACTION_ATTR_MAX: default: break; diff --git a/ofproto/ofproto-dpif-trace.c b/ofproto/ofproto-dpif-trace.c index 527e2f17ede..e43d9f88c9c 100644 --- a/ofproto/ofproto-dpif-trace.c +++ b/ofproto/ofproto-dpif-trace.c @@ -102,7 +102,7 @@ oftrace_add_recirc_node(struct ovs_list *recirc_queue, node->flow = *flow; node->flow.recirc_id = recirc_id; node->flow.ct_zone = zone; - node->nat_act = ofn; + node->nat_act = ofn ? 
xmemdup(ofn, sizeof *ofn) : NULL; node->packet = packet ? dp_packet_clone(packet) : NULL; return true; @@ -113,6 +113,7 @@ oftrace_recirc_node_destroy(struct oftrace_recirc_node *node) { if (node) { recirc_free_id(node->recirc_id); + free(node->nat_act); dp_packet_delete(node->packet); free(node); } @@ -440,7 +441,7 @@ parse_flow_and_packet(int argc, const char *argv[], if (generate_packet) { /* Generate a packet, as requested. */ packet = dp_packet_new(0); - flow_compose(packet, flow, l7, l7_len); + flow_compose(packet, flow, l7, l7_len, false); } else if (packet) { /* Use the metadata from the flow and the packet argument to * reconstruct the flow. */ @@ -845,17 +846,35 @@ ofproto_trace(struct ofproto_dpif *ofproto, const struct flow *flow, bool names) { struct ovs_list recirc_queue = OVS_LIST_INITIALIZER(&recirc_queue); + int recirculations = 0; + ofproto_trace__(ofproto, flow, packet, &recirc_queue, ofpacts, ofpacts_len, output, names); struct oftrace_recirc_node *recirc_node; LIST_FOR_EACH_POP (recirc_node, node, &recirc_queue) { + if (recirculations++ > 4096) { + ds_put_cstr(output, "\n\n"); + ds_put_char_multiple(output, '=', 79); + ds_put_cstr(output, "\nTrace reached the recirculation limit." + " Sopping the trace here."); + ds_put_format(output, + "\nQueued but not processed: %"PRIuSIZE + " recirculations.", + ovs_list_size(&recirc_queue) + 1); + oftrace_recirc_node_destroy(recirc_node); + break; + } ofproto_trace_recirc_node(recirc_node, next_ct_states, output); ofproto_trace__(ofproto, &recirc_node->flow, recirc_node->packet, &recirc_queue, ofpacts, ofpacts_len, output, names); oftrace_recirc_node_destroy(recirc_node); } + /* Destroy remaining recirculation nodes, if any. 
*/ + LIST_FOR_EACH_POP (recirc_node, node, &recirc_queue) { + oftrace_recirc_node_destroy(recirc_node); + } } void diff --git a/ofproto/ofproto-dpif-trace.h b/ofproto/ofproto-dpif-trace.h index f579a5ca468..f023b10cdf4 100644 --- a/ofproto/ofproto-dpif-trace.h +++ b/ofproto/ofproto-dpif-trace.h @@ -73,7 +73,7 @@ struct oftrace_recirc_node { uint32_t recirc_id; struct flow flow; struct dp_packet *packet; - const struct ofpact_nat *nat_act; + struct ofpact_nat *nat_act; }; /* A node within a next_ct_states list. */ diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c index 7ad728adffd..4d39bc5a713 100644 --- a/ofproto/ofproto-dpif-upcall.c +++ b/ofproto/ofproto-dpif-upcall.c @@ -42,22 +42,31 @@ #include "seq.h" #include "tunnel.h" #include "unixctl.h" +#include "openvswitch/usdt-probes.h" #include "openvswitch/vlog.h" #include "lib/netdev-provider.h" #define UPCALL_MAX_BATCH 64 #define REVALIDATE_MAX_BATCH 50 +#define UINT64_THREE_QUARTERS (UINT64_MAX / 4 * 3) VLOG_DEFINE_THIS_MODULE(ofproto_dpif_upcall); COVERAGE_DEFINE(dumped_duplicate_flow); +COVERAGE_DEFINE(dumped_inconsistent_flow); COVERAGE_DEFINE(dumped_new_flow); COVERAGE_DEFINE(handler_duplicate_upcall); -COVERAGE_DEFINE(upcall_ukey_contention); -COVERAGE_DEFINE(upcall_ukey_replace); COVERAGE_DEFINE(revalidate_missed_dp_flow); +COVERAGE_DEFINE(ukey_dp_change); +COVERAGE_DEFINE(ukey_invalid_stat_reset); +COVERAGE_DEFINE(ukey_replace_contention); +COVERAGE_DEFINE(upcall_flow_limit_grew); COVERAGE_DEFINE(upcall_flow_limit_hit); COVERAGE_DEFINE(upcall_flow_limit_kill); +COVERAGE_DEFINE(upcall_flow_limit_reduced); +COVERAGE_DEFINE(upcall_flow_limit_scaled); +COVERAGE_DEFINE(upcall_ukey_contention); +COVERAGE_DEFINE(upcall_ukey_replace); /* A thread that reads upcalls from dpif, forwards each upcall's packet, * and possibly sets up a kernel flow as a cache. */ @@ -254,12 +263,29 @@ enum ukey_state { UKEY_CREATED = 0, UKEY_VISIBLE, /* Ukey is in umap, datapath flow install is queued. 
*/ UKEY_OPERATIONAL, /* Ukey is in umap, datapath flow is installed. */ + UKEY_INCONSISTENT, /* Ukey is in umap, datapath flow is inconsistent. */ UKEY_EVICTING, /* Ukey is in umap, datapath flow delete is queued. */ UKEY_EVICTED, /* Ukey is in umap, datapath flow is deleted. */ UKEY_DELETED, /* Ukey removed from umap, ukey free is deferred. */ }; #define N_UKEY_STATES (UKEY_DELETED + 1) +/* Ukey delete reasons used by USDT probes. Please keep in sync with the + * definition in utilities/usdt-scripts/flow_reval_monitor.py. */ +enum flow_del_reason { + FDR_NONE = 0, /* No delete reason specified. */ + FDR_AVOID_CACHING, /* Cache avoidance flag set. */ + FDR_BAD_ODP_FIT, /* Bad ODP flow fit. */ + FDR_FLOW_IDLE, /* Flow idle timeout. */ + FDR_FLOW_LIMIT, /* Kill all flows condition reached. */ + FDR_FLOW_WILDCARDED, /* Flow needs a narrower wildcard mask. */ + FDR_NO_OFPROTO, /* Bridge not found. */ + FDR_PURGE, /* User requested flow deletion. */ + FDR_TOO_EXPENSIVE, /* Too expensive to revalidate. */ + FDR_UPDATE_FAIL, /* Datapath update failed. */ + FDR_XLATION_ERROR, /* Flow translation error. */ +}; + /* 'udpif_key's are responsible for tracking the little bit of state udpif * needs to do flow expiration which can't be pulled directly from the * datapath. They may be created by any handler or revalidator thread at any @@ -287,6 +313,7 @@ struct udpif_key { struct ovs_mutex mutex; /* Guards the following. */ struct dpif_flow_stats stats OVS_GUARDED; /* Last known stats.*/ + const char *dp_layer OVS_GUARDED; /* Last known dp_layer. */ long long int created OVS_GUARDED; /* Estimate of creation time. */ uint64_t dump_seq OVS_GUARDED; /* Tracks udpif->dump_seq. */ uint64_t reval_seq OVS_GUARDED; /* Tracks udpif->reval_seq. 
*/ @@ -356,6 +383,7 @@ static void upcall_unixctl_disable_ufid(struct unixctl_conn *, int argc, const char *argv[], void *aux); static void upcall_unixctl_enable_ufid(struct unixctl_conn *, int argc, const char *argv[], void *aux); + static void upcall_unixctl_set_flow_limit(struct unixctl_conn *conn, int argc, const char *argv[], void *aux); static void upcall_unixctl_dump_wait(struct unixctl_conn *conn, int argc, @@ -367,6 +395,9 @@ static void upcall_unixctl_pause(struct unixctl_conn *conn, int argc, static void upcall_unixctl_resume(struct unixctl_conn *conn, int argc, const char *argv[], void *aux); +static void upcall_unixctl_ofproto_detrace(struct unixctl_conn *, int argc, + const char *argv[], void *aux); + static struct udpif_key *ukey_create_from_upcall(struct upcall *, struct flow_wildcards *); static int ukey_create_from_dpif_flow(const struct udpif *, @@ -402,7 +433,8 @@ static int upcall_receive(struct upcall *, const struct dpif_backer *, const struct dp_packet *packet, enum dpif_upcall_type, const struct nlattr *userdata, const struct flow *, const unsigned int mru, - const ovs_u128 *ufid, const unsigned pmd_id); + const ovs_u128 *ufid, const unsigned pmd_id, + char **errorp); static void upcall_uninit(struct upcall *); static void udpif_flow_rebalance(struct udpif *udpif); @@ -414,8 +446,8 @@ static int udpif_flow_unprogram(struct udpif *udpif, struct udpif_key *ukey, static upcall_callback upcall_cb; static dp_purge_callback dp_purge_cb; -static atomic_bool enable_megaflows = ATOMIC_VAR_INIT(true); -static atomic_bool enable_ufid = ATOMIC_VAR_INIT(true); +static atomic_bool enable_megaflows = true; +static atomic_bool enable_ufid = true; void udpif_init(void) @@ -442,6 +474,8 @@ udpif_init(void) upcall_unixctl_pause, NULL); unixctl_command_register("revalidator/resume", NULL, 0, 0, upcall_unixctl_resume, NULL); + unixctl_command_register("ofproto/detrace", "UFID [pmd=PMD-ID]", 1, 2, + upcall_unixctl_ofproto_detrace, NULL); 
ovsthread_once_done(&once); } } @@ -573,7 +607,7 @@ static void udpif_start_threads(struct udpif *udpif, uint32_t n_handlers_, uint32_t n_revalidators_) { - if (udpif && n_handlers_ && n_revalidators_) { + if (udpif && n_revalidators_) { /* Creating a thread can take a significant amount of time on some * systems, even hundred of milliseconds, so quiesce around it. */ ovsrcu_quiesce_start(); @@ -581,14 +615,19 @@ udpif_start_threads(struct udpif *udpif, uint32_t n_handlers_, udpif->n_handlers = n_handlers_; udpif->n_revalidators = n_revalidators_; - udpif->handlers = xzalloc(udpif->n_handlers * sizeof *udpif->handlers); - for (size_t i = 0; i < udpif->n_handlers; i++) { - struct handler *handler = &udpif->handlers[i]; + if (udpif->n_handlers) { + udpif->handlers = xzalloc(udpif->n_handlers + * sizeof *udpif->handlers); + for (size_t i = 0; i < udpif->n_handlers; i++) { + struct handler *handler = &udpif->handlers[i]; - handler->udpif = udpif; - handler->handler_id = i; - handler->thread = ovs_thread_create( - "handler", udpif_upcall_handler, handler); + handler->udpif = udpif; + handler->handler_id = i; + handler->thread = ovs_thread_create( + "handler", udpif_upcall_handler, handler); + } + } else { + udpif->handlers = NULL; } atomic_init(&udpif->enable_ufid, udpif->backer->rt_support.ufid); @@ -651,7 +690,9 @@ udpif_set_threads(struct udpif *udpif, uint32_t n_handlers_, if (dpif_number_handlers_required(udpif->dpif, &n_handlers_requested)) { forced = true; if (!n_revalidators_) { - n_revalidators_requested = n_handlers_requested / 4 + 1; + n_revalidators_requested = (n_handlers_requested + ? 
n_handlers_requested + : MAX(count_cpu_cores(), 2)) / 4 + 1; } else { n_revalidators_requested = n_revalidators_; } @@ -779,6 +820,17 @@ udpif_get_n_flows(struct udpif *udpif) atomic_store_relaxed(&udpif->n_flows_timestamp, now); dpif_get_dp_stats(udpif->dpif, &stats); flow_count = stats.n_flows; + + if (!dpif_synced_dp_layers(udpif->dpif)) { + /* If the dpif layer does not sync the flows, we need to include + * the hardware offloaded flows separately. */ + uint64_t hw_flows; + + if (!dpif_get_n_offloaded_flows(udpif->dpif, &hw_flows)) { + flow_count += hw_flows; + } + } + atomic_store_relaxed(&udpif->n_flows, flow_count); ovs_mutex_unlock(&udpif->n_flows_mutex); } else { @@ -827,6 +879,7 @@ recv_upcalls(struct handler *handler) struct upcall *upcall = &upcalls[n_upcalls]; struct flow *flow = &flows[n_upcalls]; unsigned int mru = 0; + char *errorp = NULL; uint64_t hash = 0; int error; @@ -853,7 +906,7 @@ recv_upcalls(struct handler *handler) error = upcall_receive(upcall, udpif->backer, &dupcall->packet, dupcall->type, dupcall->userdata, flow, mru, - &dupcall->ufid, PMD_ID_NULL); + &dupcall->ufid, PMD_ID_NULL, &errorp); if (error) { if (error == ENODEV) { /* Received packet on datapath port for which we couldn't @@ -864,8 +917,11 @@ recv_upcalls(struct handler *handler) dupcall->key_len, NULL, 0, NULL, 0, &dupcall->ufid, PMD_ID_NULL, NULL); VLOG_INFO_RL(&rl, "received packet on unassociated datapath " - "port %"PRIu32, flow->in_port.odp_port); + "port %"PRIu32"%s%s%s", flow->in_port.odp_port, + errorp ? " (" : "", errorp ? errorp : "", + errorp ? 
")" : ""); } + free(errorp); goto free_dupcall; } @@ -967,19 +1023,25 @@ udpif_revalidator(void *arg) udpif->reval_exit = latch_is_set(&udpif->exit_latch); start_time = time_msec(); - if (!udpif->reval_exit) { + if (!udpif->reval_exit && !udpif->pause) { bool terse_dump; terse_dump = udpif_use_ufid(udpif); udpif->dump = dpif_flow_dump_create(udpif->dpif, terse_dump, NULL); + OVS_USDT_PROBE(udpif_revalidator, start_dump, udpif, n_flows); } } - /* Wait for the leader to start the flow dump. */ + /* Wait for the leader to reach this point. */ ovs_barrier_block(&udpif->reval_barrier); if (udpif->pause) { revalidator_pause(revalidator); + if (!udpif->reval_exit) { + /* The main thread resumed all validators, but the leader + * didn't start the dump, go to next iteration. */ + continue; + } } if (udpif->reval_exit) { @@ -1010,20 +1072,26 @@ udpif_revalidator(void *arg) udpif->dump_duration = duration; if (duration > 2000) { flow_limit /= duration / 1000; + COVERAGE_INC(upcall_flow_limit_scaled); } else if (duration > 1300) { flow_limit = flow_limit * 3 / 4; + COVERAGE_INC(upcall_flow_limit_reduced); } else if (duration < 1000 && flow_limit < n_flows * 1000 / duration) { flow_limit += 1000; + COVERAGE_INC(upcall_flow_limit_grew); } flow_limit = MIN(ofproto_flow_limit, MAX(flow_limit, 1000)); atomic_store_relaxed(&udpif->flow_limit, flow_limit); if (duration > 2000) { - VLOG_INFO("Spent an unreasonably long %lldms dumping flows", + VLOG_WARN("Spent an unreasonably long %lldms dumping flows", duration); } + OVS_USDT_PROBE(udpif_revalidator, sweep_done, udpif, n_flows, + MIN(ofproto_max_idle, ofproto_max_revalidator)); + poll_timer_wait_until(start_time + MIN(ofproto_max_idle, ofproto_max_revalidator)); seq_wait(udpif->reval_seq, last_reval_seq); @@ -1151,7 +1219,8 @@ upcall_receive(struct upcall *upcall, const struct dpif_backer *backer, const struct dp_packet *packet, enum dpif_upcall_type type, const struct nlattr *userdata, const struct flow *flow, const unsigned int 
mru, - const ovs_u128 *ufid, const unsigned pmd_id) + const ovs_u128 *ufid, const unsigned pmd_id, + char **errorp) { int error; @@ -1160,7 +1229,8 @@ upcall_receive(struct upcall *upcall, const struct dpif_backer *backer, return EAGAIN; } else if (upcall->type == MISS_UPCALL) { error = xlate_lookup(backer, flow, &upcall->ofproto, &upcall->ipfix, - &upcall->sflow, NULL, &upcall->ofp_in_port); + &upcall->sflow, NULL, &upcall->ofp_in_port, + errorp); if (error) { return error; } @@ -1168,7 +1238,11 @@ upcall_receive(struct upcall *upcall, const struct dpif_backer *backer, struct ofproto_dpif *ofproto = ofproto_dpif_lookup_by_uuid(&upcall->cookie.ofproto_uuid); if (!ofproto) { - VLOG_INFO_RL(&rl, "upcall could not find ofproto"); + if (errorp) { + *errorp = xstrdup("upcall could not find ofproto"); + } else { + VLOG_INFO_RL(&rl, "upcall could not find ofproto"); + } return ENODEV; } upcall->ofproto = ofproto; @@ -1358,7 +1432,7 @@ upcall_cb(const struct dp_packet *packet, const struct flow *flow, ovs_u128 *ufi atomic_read_relaxed(&enable_megaflows, &megaflow); error = upcall_receive(&upcall, udpif->backer, packet, type, userdata, - flow, 0, ufid, pmd_id); + flow, 0, ufid, pmd_id, NULL); if (error) { return error; } @@ -1384,8 +1458,6 @@ upcall_cb(const struct dp_packet *packet, const struct flow *flow, ovs_u128 *ufi } if (upcall.ukey && !ukey_install(udpif, upcall.ukey)) { - static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 1); - VLOG_WARN_RL(&rll, "upcall_cb failure: ukey installation fails"); error = ENOSPC; } out: @@ -1755,6 +1827,7 @@ ukey_create__(const struct nlattr *key, size_t key_len, ukey->created = ukey->flow_time = time_msec(); memset(&ukey->stats, 0, sizeof ukey->stats); ukey->stats.used = used; + ukey->dp_layer = NULL; ukey->xcache = NULL; ukey->offloaded = false; @@ -1877,19 +1950,20 @@ try_ukey_replace(struct umap *umap, struct udpif_key *old_ukey, ovs_mutex_lock(&new_ukey->mutex); cmap_replace(&umap->cmap, &old_ukey->cmap_node, 
&new_ukey->cmap_node, new_ukey->hash); + new_ukey->dump_seq = old_ukey->dump_seq; ovsrcu_postpone(ukey_delete__, old_ukey); transition_ukey(old_ukey, UKEY_DELETED); transition_ukey(new_ukey, UKEY_VISIBLE); replaced = true; + COVERAGE_INC(upcall_ukey_replace); + } else { + COVERAGE_INC(handler_duplicate_upcall); } ovs_mutex_unlock(&old_ukey->mutex); - } - - if (replaced) { - COVERAGE_INC(upcall_ukey_replace); } else { - COVERAGE_INC(handler_duplicate_upcall); + COVERAGE_INC(ukey_replace_contention); } + return replaced; } @@ -1966,6 +2040,10 @@ transition_ukey_at(struct udpif_key *ukey, enum ukey_state dst, * UKEY_VISIBLE -> UKEY_EVICTED * A handler attempts to install the flow, but the datapath rejects it. * Consider that the datapath has already destroyed it. + * UKEY_OPERATIONAL -> UKEY_INCONSISTENT + * A revalidator modifies the flow with error returns. + * UKEY_INCONSISTENT -> UKEY_EVICTING + * A revalidator decides to evict the datapath flow. * UKEY_OPERATIONAL -> UKEY_EVICTING * A revalidator decides to evict the datapath flow. * UKEY_EVICTING -> UKEY_EVICTED @@ -1973,8 +2051,9 @@ transition_ukey_at(struct udpif_key *ukey, enum ukey_state dst, * UKEY_EVICTED -> UKEY_DELETED * A revalidator has removed the ukey from the umap and is deleting it. 
*/ - if (ukey->state == dst - 1 || (ukey->state == UKEY_VISIBLE && - dst < UKEY_DELETED)) { + if (ukey->state == dst - 1 || + (ukey->state == UKEY_VISIBLE && dst < UKEY_DELETED) || + (ukey->state == UKEY_OPERATIONAL && dst == UKEY_EVICTING)) { ukey->state = dst; } else { struct ds ds = DS_EMPTY_INITIALIZER; @@ -2083,10 +2162,16 @@ ukey_delete(struct umap *umap, struct udpif_key *ukey) } static bool -should_revalidate(const struct udpif *udpif, uint64_t packets, - long long int used) +should_revalidate(const struct udpif *udpif, const struct udpif_key *ukey, + uint64_t packets) + OVS_REQUIRES(ukey->mutex) { long long int metric, now, duration; + long long int used = ukey->stats.used; + + if (!ofproto_min_revalidate_pps) { + return true; + } if (!used) { /* Always revalidate the first time a flow is dumped. */ @@ -2113,8 +2198,12 @@ should_revalidate(const struct udpif *udpif, uint64_t packets, duration = now - used; metric = duration / packets; - if (metric < 1000 / ofproto_min_revalidate_pps) { - /* The flow is receiving more than min-revalidate-pps, so keep it. */ + if (metric < 1000 / ofproto_min_revalidate_pps || + (ukey->offloaded && duration < ofproto_offloaded_stats_delay)) { + /* The flow is receiving more than min-revalidate-pps, so keep it. + * Or it's a hardware offloaded flow that might take up to X seconds + * to update its statistics. Until we are sure the statistics had a + * chance to be updated, also keep it. 
*/ return true; } return false; @@ -2154,7 +2243,7 @@ xlate_key(struct udpif *udpif, const struct nlattr *key, unsigned int len, } error = xlate_lookup(udpif->backer, &ctx->flow, &ofproto, NULL, NULL, - ctx->netflow, &ofp_in_port); + ctx->netflow, &ofp_in_port, NULL); if (error) { return error; } @@ -2211,7 +2300,8 @@ populate_xcache(struct udpif *udpif, struct udpif_key *ukey, static enum reval_result revalidate_ukey__(struct udpif *udpif, const struct udpif_key *ukey, uint16_t tcp_flags, struct ofpbuf *odp_actions, - struct recirc_refs *recircs, struct xlate_cache *xcache) + struct recirc_refs *recircs, struct xlate_cache *xcache, + enum flow_del_reason *del_reason) { struct xlate_out *xoutp; struct netflow *netflow; @@ -2224,16 +2314,21 @@ revalidate_ukey__(struct udpif *udpif, const struct udpif_key *ukey, .wc = &wc, }; + OVS_USDT_PROBE(revalidate_ukey__, entry, udpif, ukey, tcp_flags, + odp_actions, recircs, xcache); + result = UKEY_DELETE; xoutp = NULL; netflow = NULL; if (xlate_ukey(udpif, ukey, tcp_flags, &ctx)) { + *del_reason = FDR_XLATION_ERROR; goto exit; } xoutp = &ctx.xout; if (xoutp->avoid_caching) { + *del_reason = FDR_AVOID_CACHING; goto exit; } @@ -2247,6 +2342,7 @@ revalidate_ukey__(struct udpif *udpif, const struct udpif_key *ukey, ofpbuf_clear(odp_actions); if (!ofproto) { + *del_reason = FDR_NO_OFPROTO; goto exit; } @@ -2258,6 +2354,7 @@ revalidate_ukey__(struct udpif *udpif, const struct udpif_key *ukey, if (odp_flow_key_to_mask(ukey->mask, ukey->mask_len, &dp_mask, &ctx.flow, NULL) == ODP_FIT_ERROR) { + *del_reason = FDR_BAD_ODP_FIT; goto exit; } @@ -2267,6 +2364,7 @@ revalidate_ukey__(struct udpif *udpif, const struct udpif_key *ukey, * down. Note that we do not know if the datapath has ignored any of the * wildcarded bits, so we may be overly conservative here. 
*/ if (flow_wildcards_has_extra(&dp_mask, ctx.wc)) { + *del_reason = FDR_FLOW_WILDCARDED; goto exit; } @@ -2287,9 +2385,33 @@ revalidate_ukey__(struct udpif *udpif, const struct udpif_key *ukey, netflow_flow_clear(netflow, &ctx.flow); } xlate_out_uninit(xoutp); + + OVS_USDT_PROBE(revalidate_ukey__, exit, udpif, ukey, result); + return result; } +static void +log_unexpected_stats_jump(struct udpif_key *ukey, + const struct dpif_flow_stats *stats) + OVS_REQUIRES(ukey->mutex) +{ + static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 5); + struct ds ds = DS_EMPTY_INITIALIZER; + struct ofpbuf *actions; + + odp_format_ufid(&ukey->ufid, &ds); + ds_put_cstr(&ds, ", "); + odp_flow_key_format(ukey->key, ukey->key_len, &ds); + ds_put_cstr(&ds, ", actions:"); + actions = ovsrcu_get(struct ofpbuf *, &ukey->actions); + format_odp_actions(&ds, actions->data, actions->size, NULL); + VLOG_WARN_RL(&rll, "Unexpected jump in packet stats from %"PRIu64 + " to %"PRIu64" when handling ukey %s", + ukey->stats.n_packets, stats->n_packets, ds_cstr(&ds)); + ds_destroy(&ds); +} + /* Verifies that the datapath actions of 'ukey' are still correct, and pushes * 'stats' for it. * @@ -2312,7 +2434,7 @@ static enum reval_result revalidate_ukey(struct udpif *udpif, struct udpif_key *ukey, const struct dpif_flow_stats *stats, struct ofpbuf *odp_actions, uint64_t reval_seq, - struct recirc_refs *recircs, bool offloaded) + struct recirc_refs *recircs, enum flow_del_reason *del_reason) OVS_REQUIRES(ukey->mutex) { bool need_revalidate = ukey->reval_seq != reval_seq; @@ -2323,23 +2445,31 @@ revalidate_ukey(struct udpif *udpif, struct udpif_key *ukey, push.used = stats->used; push.tcp_flags = stats->tcp_flags; - push.n_packets = (stats->n_packets > ukey->stats.n_packets - ? stats->n_packets - ukey->stats.n_packets - : 0); - push.n_bytes = (stats->n_bytes > ukey->stats.n_bytes - ? 
stats->n_bytes - ukey->stats.n_bytes - : 0); + push.n_packets = stats->n_packets - ukey->stats.n_packets; + push.n_bytes = stats->n_bytes - ukey->stats.n_bytes; + + if (stats->n_packets < ukey->stats.n_packets && + ukey->stats.n_packets < UINT64_THREE_QUARTERS) { + /* Report cases where the packet counter is lower than the previous + * instance, but exclude the potential wrapping of an uint64_t. */ + COVERAGE_INC(ukey_invalid_stat_reset); + log_unexpected_stats_jump(ukey, stats); + } if (need_revalidate) { - if (should_revalidate(udpif, push.n_packets, ukey->stats.used)) { + if (should_revalidate(udpif, ukey, push.n_packets)) { if (!ukey->xcache) { ukey->xcache = xlate_cache_new(); } else { xlate_cache_clear(ukey->xcache); } result = revalidate_ukey__(udpif, ukey, push.tcp_flags, - odp_actions, recircs, ukey->xcache); - } /* else delete; too expensive to revalidate */ + odp_actions, recircs, ukey->xcache, + del_reason); + } else { + /* Delete, since it is too expensive to revalidate. */ + *del_reason = FDR_TOO_EXPENSIVE; + } } else if (!push.n_packets || ukey->xcache || !populate_xcache(udpif, ukey, push.tcp_flags)) { result = UKEY_KEEP; @@ -2347,7 +2477,7 @@ revalidate_ukey(struct udpif *udpif, struct udpif_key *ukey, /* Stats for deleted flows will be attributed upon flow deletion. Skip. */ if (result != UKEY_DELETE) { - xlate_push_stats(ukey->xcache, &push, offloaded); + xlate_push_stats(ukey->xcache, &push, ukey->offloaded); ukey->stats = *stats; ukey->reval_seq = reval_seq; } @@ -2416,26 +2546,31 @@ push_dp_ops(struct udpif *udpif, struct ukey_op *ops, size_t n_ops) for (i = 0; i < n_ops; i++) { struct ukey_op *op = &ops[i]; - struct dpif_flow_stats *push, *stats, push_buf; - - stats = op->dop.flow_del.stats; - push = &push_buf; - - if (op->dop.type != DPIF_OP_FLOW_DEL) { - /* Only deleted flows need their stats pushed. */ - continue; - } if (op->dop.error) { - /* flow_del error, 'stats' is unusable. 
*/ if (op->ukey) { ovs_mutex_lock(&op->ukey->mutex); - transition_ukey(op->ukey, UKEY_EVICTED); + if (op->dop.type == DPIF_OP_FLOW_DEL) { + transition_ukey(op->ukey, UKEY_EVICTED); + } else { + /* Modification of the flow failed. */ + transition_ukey(op->ukey, UKEY_INCONSISTENT); + } ovs_mutex_unlock(&op->ukey->mutex); } continue; } + if (op->dop.type != DPIF_OP_FLOW_DEL) { + /* Only deleted flows need their stats pushed. */ + continue; + } + + struct dpif_flow_stats *push, *stats, push_buf; + + stats = op->dop.flow_del.stats; + push = &push_buf; + if (op->ukey) { ovs_mutex_lock(&op->ukey->mutex); transition_ukey(op->ukey, UKEY_EVICTED); @@ -2443,6 +2578,15 @@ push_dp_ops(struct udpif *udpif, struct ukey_op *ops, size_t n_ops) push->tcp_flags = stats->tcp_flags | op->ukey->stats.tcp_flags; push->n_packets = stats->n_packets - op->ukey->stats.n_packets; push->n_bytes = stats->n_bytes - op->ukey->stats.n_bytes; + + if (stats->n_packets < op->ukey->stats.n_packets && + op->ukey->stats.n_packets < UINT64_THREE_QUARTERS) { + /* Report cases where the packet counter is lower than the + * previous instance, but exclude the potential wrapping of an + * uint64_t. */ + COVERAGE_INC(ukey_invalid_stat_reset); + } + ovs_mutex_unlock(&op->ukey->mutex); } else { push = stats; @@ -2685,8 +2829,6 @@ revalidate(struct revalidator *revalidator) break; } - now = time_msec(); - /* In normal operation we want to keep flows around until they have * been idle for 'ofproto_max_idle' milliseconds. However: * @@ -2723,10 +2865,11 @@ revalidate(struct revalidator *revalidator) max_idle = n_dp_flows > flow_limit ? 
100 : ofproto_max_idle; - udpif->dpif->current_ms = time_msec(); + udpif->dpif->current_ms = now = time_msec(); for (f = flows; f < &flows[n_dumped]; f++) { long long int used = f->stats.used; struct recirc_refs recircs = RECIRC_REFS_EMPTY_INITIALIZER; + enum flow_del_reason del_reason = FDR_NONE; struct dpif_flow_stats stats = f->stats; enum reval_result result; struct udpif_key *ukey; @@ -2747,6 +2890,22 @@ revalidate(struct revalidator *revalidator) continue; } + ukey->offloaded = f->attrs.offloaded; + if (!ukey->dp_layer + || (!dpif_synced_dp_layers(udpif->dpif) + && strcmp(ukey->dp_layer, f->attrs.dp_layer))) { + + if (ukey->dp_layer) { + /* The dp_layer has changed this is probably due to an + * earlier revalidate cycle moving it to/from hw offload. + * In this case we should reset the ukey stored statistics, + * as they are from the deleted DP flow. */ + COVERAGE_INC(ukey_dp_change); + memset(&ukey->stats, 0, sizeof ukey->stats); + } + ukey->dp_layer = f->attrs.dp_layer; + } + already_dumped = ukey->dump_seq == dump_seq; if (already_dumped) { /* The flow has already been handled during this flow dump @@ -2760,6 +2919,15 @@ revalidate(struct revalidator *revalidator) continue; } + if (ukey->state == UKEY_INCONSISTENT) { + ukey->dump_seq = dump_seq; + reval_op_init(&ops[n_ops++], UKEY_DELETE, udpif, ukey, + &recircs, &odp_actions); + ovs_mutex_unlock(&ukey->mutex); + COVERAGE_INC(dumped_inconsistent_flow); + continue; + } + if (ukey->state <= UKEY_OPERATIONAL) { /* The flow is now confirmed to be in the datapath. */ transition_ukey(ukey, UKEY_OPERATIONAL); @@ -2776,10 +2944,10 @@ revalidate(struct revalidator *revalidator) } if (kill_them_all || (used && used < now - max_idle)) { result = UKEY_DELETE; + del_reason = (kill_them_all) ? 
FDR_FLOW_LIMIT : FDR_FLOW_IDLE; } else { result = revalidate_ukey(udpif, ukey, &stats, &odp_actions, - reval_seq, &recircs, - f->attrs.offloaded); + reval_seq, &recircs, &del_reason); } ukey->dump_seq = dump_seq; @@ -2788,6 +2956,8 @@ revalidate(struct revalidator *revalidator) udpif_update_flow_pps(udpif, ukey, f); } + OVS_USDT_PROBE(revalidate, flow_result, udpif, ukey, result, + del_reason); if (result != UKEY_KEEP) { /* Takes ownership of 'recircs'. */ reval_op_init(&ops[n_ops++], result, udpif, ukey, &recircs, @@ -2840,37 +3010,43 @@ revalidator_sweep__(struct revalidator *revalidator, bool purge) size_t n_ops = 0; CMAP_FOR_EACH(ukey, cmap_node, &umap->cmap) { + enum flow_del_reason del_reason = FDR_NONE; enum ukey_state ukey_state; /* Handler threads could be holding a ukey lock while it installs a * new flow, so don't hang around waiting for access to it. */ if (ovs_mutex_trylock(&ukey->mutex)) { + COVERAGE_INC(upcall_ukey_contention); continue; } ukey_state = ukey->state; if (ukey_state == UKEY_OPERATIONAL + || (ukey_state == UKEY_INCONSISTENT) || (ukey_state == UKEY_VISIBLE && purge)) { struct recirc_refs recircs = RECIRC_REFS_EMPTY_INITIALIZER; bool seq_mismatch = (ukey->dump_seq != dump_seq && ukey->reval_seq != reval_seq); enum reval_result result; - if (purge) { + if (purge || ukey_state == UKEY_INCONSISTENT) { result = UKEY_DELETE; + del_reason = purge ? FDR_PURGE : FDR_UPDATE_FAIL; } else if (!seq_mismatch) { result = UKEY_KEEP; } else { struct dpif_flow_stats stats; COVERAGE_INC(revalidate_missed_dp_flow); - memset(&stats, 0, sizeof stats); + memcpy(&stats, &ukey->stats, sizeof stats); result = revalidate_ukey(udpif, ukey, &stats, &odp_actions, - reval_seq, &recircs, false); + reval_seq, &recircs, &del_reason); } if (result != UKEY_KEEP) { /* Clears 'recircs' if filled by revalidate_ukey(). 
*/ reval_op_init(&ops[n_ops++], result, udpif, ukey, &recircs, &odp_actions); } + OVS_USDT_PROBE(revalidator_sweep__, flow_sweep_result, udpif, + ukey, result, del_reason); } ovs_mutex_unlock(&ukey->mutex); @@ -3099,11 +3275,19 @@ upcall_unixctl_purge(struct unixctl_conn *conn, int argc OVS_UNUSED, struct udpif *udpif; LIST_FOR_EACH (udpif, list_node, &all_udpifs) { + bool wake_up = false; int n; + if (!latch_is_set(&udpif->pause_latch)) { + udpif_pause_revalidators(udpif); + wake_up = true; + } for (n = 0; n < udpif->n_revalidators; n++) { revalidator_purge(&udpif->revalidators[n]); } + if (wake_up) { + udpif_resume_revalidators(udpif); + } } unixctl_command_reply(conn, ""); } @@ -3132,6 +3316,51 @@ upcall_unixctl_resume(struct unixctl_conn *conn, int argc OVS_UNUSED, unixctl_command_reply(conn, ""); } +static void +upcall_unixctl_ofproto_detrace(struct unixctl_conn *conn, int argc, + const char *argv[], void *aux OVS_UNUSED) +{ + unsigned int pmd_id = NON_PMD_CORE_ID; + const char *key_s = argv[1]; + ovs_u128 ufid; + + if (odp_ufid_from_string(key_s, &ufid) <= 0) { + unixctl_command_reply_error(conn, "failed to parse ufid"); + return; + } + + if (argc == 3) { + const char *pmd_str = argv[2]; + if (!ovs_scan(pmd_str, "pmd=%d", &pmd_id)) { + unixctl_command_reply_error(conn, + "Invalid pmd argument format. " + "Expecting 'pmd=PMD-ID'"); + return; + } + } + + struct ds ds = DS_EMPTY_INITIALIZER; + struct udpif *udpif; + + LIST_FOR_EACH (udpif, list_node, &all_udpifs) { + struct udpif_key *ukey = ukey_lookup(udpif, &ufid, pmd_id); + if (!ukey) { + continue; + } + + ovs_mutex_lock(&ukey->mutex); + /* It only makes sense to format rules for ukeys that are (still) + * in use. 
*/ + if ((ukey->state == UKEY_VISIBLE || ukey->state == UKEY_OPERATIONAL) + && ukey->xcache) { + xlate_xcache_format(&ds, ukey->xcache); + } + ovs_mutex_unlock(&ukey->mutex); + } + unixctl_command_reply(conn, ds_cstr(&ds)); + ds_destroy(&ds); +} + /* Flows are sorted in the following order: * netdev, flow state (offloaded/kernel path), flow_pps_rate. diff --git a/ofproto/ofproto-dpif-xlate-cache.c b/ofproto/ofproto-dpif-xlate-cache.c index 9224ee2e6d5..c6d935cf0ae 100644 --- a/ofproto/ofproto-dpif-xlate-cache.c +++ b/ofproto/ofproto-dpif-xlate-cache.c @@ -125,7 +125,7 @@ xlate_push_stats_entry(struct xc_entry *entry, case XC_LEARN: { enum ofperr error; error = ofproto_flow_mod_learn(entry->learn.ofm, true, - entry->learn.limit, NULL); + entry->learn.limit, NULL, stats->used); if (error) { static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); VLOG_WARN_RL(&rl, "xcache LEARN action execution failed."); @@ -301,3 +301,37 @@ xlate_cache_steal_entries(struct xlate_cache *dst, struct xlate_cache *src) memcpy(p, src_entries->data, src_entries->size); ofpbuf_clear(src_entries); } + +void +xlate_xcache_format(struct ds *s, const struct xlate_cache *xcache) +{ + struct ofpbuf entries = xcache->entries; + struct xc_entry *entry; + struct ofgroup *ofg; + + XC_ENTRY_FOR_EACH (entry, &entries) { + switch (entry->type) { + case XC_RULE: + ofproto_rule_stats_ds(s, &entry->rule->up, true); + break; + case XC_GROUP: + ofg = &entry->group.group->up; + ofputil_group_format(s, ofg->group_id, ofg->type, + entry->group.bucket, &ofg->buckets, + &ofg->props, OFP15_VERSION, + false, NULL, NULL); + break; + case XC_TABLE: + case XC_BOND: + case XC_NETDEV: + case XC_NETFLOW: + case XC_MIRROR: + case XC_LEARN: + case XC_NORMAL: + case XC_FIN_TIMEOUT: + case XC_TNL_NEIGH: + case XC_TUNNEL_HEADER: + break; + } + } +} diff --git a/ofproto/ofproto-dpif-xlate-cache.h b/ofproto/ofproto-dpif-xlate-cache.h index 0fc6d2ea60c..e701734d796 100644 --- a/ofproto/ofproto-dpif-xlate-cache.h +++ 
b/ofproto/ofproto-dpif-xlate-cache.h @@ -151,4 +151,6 @@ void xlate_cache_uninit(struct xlate_cache *); void xlate_cache_delete(struct xlate_cache *); void xlate_cache_steal_entries(struct xlate_cache *, struct xlate_cache *); +void xlate_xcache_format(struct ds *, const struct xlate_cache *); + #endif /* ofproto-dpif-xlate-cache.h */ diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c index f8bc857d438..2cddc4a846b 100644 --- a/ofproto/ofproto-dpif-xlate.c +++ b/ofproto/ofproto-dpif-xlate.c @@ -46,6 +46,7 @@ #include "nx-match.h" #include "odp-execute.h" #include "ofproto/ofproto-dpif-ipfix.h" +#include "ofproto/ofproto-dpif-lsample.h" #include "ofproto/ofproto-dpif-mirror.h" #include "ofproto/ofproto-dpif-monitor.h" #include "ofproto/ofproto-dpif-sflow.h" @@ -67,6 +68,18 @@ #include "tunnel.h" #include "util.h" #include "uuid.h" +#include "vlan-bitmap.h" + +#if defined(P4OVS) +#include +#include +#include +#include + +#include "lib/netdev.h" +#include "lib/p4ovs.h" +#include "ovsp4rt/ovs-p4rt.h" +#endif #if defined(P4OVS) #include @@ -127,6 +140,7 @@ struct xbridge { struct mbridge *mbridge; /* Mirroring. */ struct dpif_sflow *sflow; /* SFlow handle, or null. */ struct dpif_ipfix *ipfix; /* Ipfix handle, or null. */ + struct dpif_lsample *lsample; /* Local sample handle, or null. */ struct netflow *netflow; /* Netflow handle, or null. */ struct stp *stp; /* STP or null if disabled. */ struct rstp *rstp; /* RSTP or null if disabled. */ @@ -243,6 +257,9 @@ struct xlate_ctx { * wants actions. */ struct ofpbuf *odp_actions; + /* Set of matching conjunctive flows, or NULL. */ + struct hmapx *conj_flows; + /* Statistics maintained by xlate_table_action(). 
* * These statistics limit the amount of work that a single flow @@ -515,6 +532,84 @@ ctx_cancel_freeze(struct xlate_ctx *ctx) static void finish_freezing(struct xlate_ctx *ctx); +/* These functions and structure are used to save stack space in actions that + * need to retain a large amount of xlate_ctx state. */ +struct xretained_state { + union mf_subvalue new_stack[1024 / sizeof(union mf_subvalue)]; + uint64_t actset_stub[1024 / 8]; + struct ofpbuf old_stack; + struct ofpbuf old_action_set; + struct flow old_flow; + struct flow old_base; + struct flow_tnl flow_tnl_mask; +}; + +/* The return of this function must be freed by + * xretain_state_restore_and_free(). */ +static struct xretained_state * +xretain_state_save(struct xlate_ctx *ctx) +{ + struct xretained_state *retained = xmalloc(sizeof *retained); + + retained->old_flow = ctx->xin->flow; + retained->old_stack = ctx->stack; + retained->old_action_set = ctx->action_set; + ofpbuf_use_stub(&ctx->stack, retained->new_stack, + sizeof retained->new_stack); + ofpbuf_use_stub(&ctx->action_set, retained->actset_stub, + sizeof retained->actset_stub); + + return retained; +} + +static void +xretain_tunnel_mask_save(const struct xlate_ctx *ctx, + struct xretained_state *retained) +{ + retained->flow_tnl_mask = ctx->wc->masks.tunnel; +} + +static void +xretain_base_flow_save(const struct xlate_ctx *ctx, + struct xretained_state *retained) +{ + retained->old_base = ctx->base_flow; +} + +static void +xretain_base_flow_restore(struct xlate_ctx *ctx, + const struct xretained_state *retained) +{ + ctx->base_flow = retained->old_base; +} + +static void +xretain_flow_restore(struct xlate_ctx *ctx, + const struct xretained_state *retained) +{ + ctx->xin->flow = retained->old_flow; +} + +static void +xretain_tunnel_mask_restore(struct xlate_ctx *ctx, + const struct xretained_state *retained) +{ + ctx->wc->masks.tunnel = retained->flow_tnl_mask; +} + +static void +xretain_state_restore_and_free(struct xlate_ctx *ctx, + struct 
xretained_state *retained) +{ + ctx->xin->flow = retained->old_flow; + ofpbuf_uninit(&ctx->action_set); + ctx->action_set = retained->old_action_set; + ofpbuf_uninit(&ctx->stack); + ctx->stack = retained->old_stack; + + free(retained); +} + /* A controller may use OFPP_NONE as the ingress port to indicate that * it did not arrive on a "real" port. 'ofpp_none_bundle' exists for * when an input bundle is needed for validation (e.g., mirroring or @@ -619,6 +714,7 @@ static void xlate_xbridge_set(struct xbridge *, struct dpif *, const struct mbridge *, const struct dpif_sflow *, const struct dpif_ipfix *, + const struct dpif_lsample *, const struct netflow *, bool forward_bpdu, bool has_in_band, const struct dpif_backer_support *, @@ -812,6 +908,34 @@ xlate_report_action_set(const struct xlate_ctx *ctx, const char *verb) } } +static void +xlate_report_conj_matches(const struct xlate_ctx *ctx, + const struct ofputil_port_map *map) +{ + struct ds s = DS_EMPTY_INITIALIZER; + struct hmapx_node *node; + struct cls_rule *rule; + + /* NOTE: The conj flows have meaning in order. For each flow that is a + * component of conj flows, 'k' in 'conjunction(id, k/n)' represents the + * dimension. When there are multiple flows with the same id, it may be + * implicitly expected that they would be output in ascending order of 'k'. + * + * However, because of the use of hmapx structure and the fact that the + * classifier returns them in arbitrary order, they are output in arbitrary + * order here. */ + HMAPX_FOR_EACH (node, ctx->conj_flows) { + ds_clear(&s); + + rule = node->data; + + cls_rule_format(rule, ofproto_get_tun_tab(&ctx->xin->ofproto->up), + map, &s); + xlate_report(ctx, OFT_DETAIL, "conj. 
%s", ds_cstr(&s)); + } + + ds_destroy(&s); +} /* If tracing is enabled in 'ctx', appends a node representing 'rule' (in * OpenFlow table 'table_id') to the trace and makes this node the parent for @@ -828,6 +952,8 @@ xlate_report_table(const struct xlate_ctx *ctx, struct rule_dpif *rule, return; } + struct ofputil_port_map map = OFPUTIL_PORT_MAP_INITIALIZER(&map); + struct ds s = DS_EMPTY_INITIALIZER; ds_put_format(&s, "%2d. ", table_id); if (rule == ctx->xin->ofproto->miss_rule) { @@ -838,8 +964,6 @@ xlate_report_table(const struct xlate_ctx *ctx, struct rule_dpif *rule, ds_put_cstr(&s, "Packets are IP fragments and " "the fragment handling mode is \"drop\"."); } else { - struct ofputil_port_map map = OFPUTIL_PORT_MAP_INITIALIZER(&map); - if (ctx->xin->names) { struct ofproto_dpif *ofprotop; ofprotop = ofproto_dpif_lookup_by_name(ctx->xbridge->name); @@ -850,8 +974,6 @@ xlate_report_table(const struct xlate_ctx *ctx, struct rule_dpif *rule, ofproto_get_tun_tab(&ctx->xin->ofproto->up), &map, &s, OFP_DEFAULT_PRIORITY); - ofputil_port_map_destroy(&map); - if (ds_last(&s) != ' ') { ds_put_cstr(&s, ", "); } @@ -864,6 +986,9 @@ xlate_report_table(const struct xlate_ctx *ctx, struct rule_dpif *rule, ctx->xin->trace = &oftrace_report(ctx->xin->trace, OFT_TABLE, ds_cstr(&s))->subs; ds_destroy(&s); + + xlate_report_conj_matches(ctx, &map); + ofputil_port_map_destroy(&map); } /* If tracing is enabled in 'ctx', adds an OFT_DETAIL trace node to 'ctx' @@ -983,6 +1108,7 @@ xlate_xbridge_set(struct xbridge *xbridge, const struct mbridge *mbridge, const struct dpif_sflow *sflow, const struct dpif_ipfix *ipfix, + const struct dpif_lsample *lsample, const struct netflow *netflow, bool forward_bpdu, bool has_in_band, const struct dpif_backer_support *support, @@ -1013,6 +1139,11 @@ xlate_xbridge_set(struct xbridge *xbridge, xbridge->ipfix = dpif_ipfix_ref(ipfix); } + if (xbridge->lsample != lsample) { + dpif_lsample_unref(xbridge->lsample); + xbridge->lsample = 
dpif_lsample_ref(lsample); + } + if (xbridge->stp != stp) { stp_unref(xbridge->stp); xbridge->stp = stp_ref(stp); @@ -1063,7 +1194,10 @@ xlate_xbundle_set(struct xbundle *xbundle, xbundle->qinq_ethtype = qinq_ethtype; xbundle->vlan = vlan; xbundle->trunks = trunks; - xbundle->cvlans = cvlans; + if (!vlan_bitmap_equal(xbundle->cvlans, cvlans)) { + free(xbundle->cvlans); + xbundle->cvlans = vlan_bitmap_clone(cvlans); + } xbundle->use_priority_tags = use_priority_tags; xbundle->floodable = floodable; xbundle->protected = protected; @@ -1138,9 +1272,10 @@ xlate_xbridge_copy(struct xbridge *xbridge) xlate_xbridge_set(new_xbridge, xbridge->dpif, xbridge->ml, xbridge->stp, xbridge->rstp, xbridge->ms, xbridge->mbridge, - xbridge->sflow, xbridge->ipfix, xbridge->netflow, - xbridge->forward_bpdu, xbridge->has_in_band, - &xbridge->support, xbridge->addr); + xbridge->sflow, xbridge->ipfix, xbridge->lsample, + xbridge->netflow, xbridge->forward_bpdu, + xbridge->has_in_band, &xbridge->support, + xbridge->addr); LIST_FOR_EACH (xbundle, list_node, &xbridge->xbundles) { xlate_xbundle_copy(new_xbridge, xbundle); } @@ -1305,6 +1440,7 @@ xlate_ofproto_set(struct ofproto_dpif *ofproto, const char *name, const struct mbridge *mbridge, const struct dpif_sflow *sflow, const struct dpif_ipfix *ipfix, + const struct dpif_lsample *lsample, const struct netflow *netflow, bool forward_bpdu, bool has_in_band, const struct dpif_backer_support *support) @@ -1329,7 +1465,7 @@ xlate_ofproto_set(struct ofproto_dpif *ofproto, const char *name, old_addr = xbridge->addr; xlate_xbridge_set(xbridge, dpif, ml, stp, rstp, ms, mbridge, sflow, ipfix, - netflow, forward_bpdu, has_in_band, support, + lsample, netflow, forward_bpdu, has_in_band, support, xbridge_addr); if (xbridge_addr != old_addr) { @@ -1361,6 +1497,7 @@ xlate_xbridge_remove(struct xlate_cfg *xcfg, struct xbridge *xbridge) mbridge_unref(xbridge->mbridge); dpif_sflow_unref(xbridge->sflow); dpif_ipfix_unref(xbridge->ipfix); + 
dpif_lsample_unref(xbridge->lsample); netflow_unref(xbridge->netflow); stp_unref(xbridge->stp); rstp_unref(xbridge->rstp); @@ -1444,6 +1581,7 @@ xlate_xbundle_remove(struct xlate_cfg *xcfg, struct xbundle *xbundle) ovs_list_remove(&xbundle->list_node); bond_unref(xbundle->bond); lacp_unref(xbundle->lacp); + free(xbundle->cvlans); free(xbundle->name); free(xbundle); } @@ -1596,7 +1734,8 @@ xlate_lookup_ofproto_(const struct dpif_backer *backer, } ofp_port_t in_port = recirc_id_node->state.metadata.in_port; - if (in_port != OFPP_NONE && in_port != OFPP_CONTROLLER) { + if (in_port != OFPP_NONE && in_port != OFPP_CONTROLLER && + !uuid_is_zero(&recirc_id_node->state.xport_uuid)) { struct uuid xport_uuid = recirc_id_node->state.xport_uuid; xport = xport_lookup_by_uuid(xcfg, &xport_uuid); if (xport && xport->xbridge && xport->xbridge->ofproto) { @@ -1607,11 +1746,19 @@ xlate_lookup_ofproto_(const struct dpif_backer *backer, * that the packet originated from the controller via an OpenFlow * "packet-out". The right thing to do is to find just the * ofproto. There is no xport, which is OK. + * Also a zeroed xport_uuid with a valid in_port, means that + * the packet originated from OFPP_CONTROLLER passed + * through a patch port. * * OFPP_NONE can also indicate that a bond caused recirculation. */ struct uuid uuid = recirc_id_node->state.ofproto_uuid; const struct xbridge *bridge = xbridge_lookup_by_uuid(xcfg, &uuid); + if (bridge && bridge->ofproto) { + if (in_port != OFPP_CONTROLLER && in_port != OFPP_NONE && + !get_ofp_port(bridge, in_port)) { + goto xport_lookup; + } if (errorp) { *errorp = NULL; } @@ -1624,6 +1771,7 @@ xlate_lookup_ofproto_(const struct dpif_backer *backer, } } +xport_lookup: xport = xport_lookup(xcfg, tnl_port_should_receive(flow) ? tnl_port_receive(flow) : odp_port_to_ofport(backer, flow->in_port.odp_port)); @@ -1667,17 +1815,19 @@ xlate_lookup_ofproto(const struct dpif_backer *backer, const struct flow *flow, * be taken. 
* * Returns 0 if successful, ENODEV if the parsed flow has no associated ofproto. + * Sets an extended error string to 'errorp'. Callers are responsible for + * freeing that string. */ int xlate_lookup(const struct dpif_backer *backer, const struct flow *flow, struct ofproto_dpif **ofprotop, struct dpif_ipfix **ipfix, struct dpif_sflow **sflow, struct netflow **netflow, - ofp_port_t *ofp_in_port) + ofp_port_t *ofp_in_port, char **errorp) { struct ofproto_dpif *ofproto; const struct xport *xport; - ofproto = xlate_lookup_ofproto_(backer, flow, ofp_in_port, &xport, NULL); + ofproto = xlate_lookup_ofproto_(backer, flow, ofp_in_port, &xport, errorp); if (!ofproto) { return ENODEV; @@ -1986,8 +2136,8 @@ group_is_alive(const struct xlate_ctx *ctx, uint32_t group_id, int depth) #define MAX_LIVENESS_RECURSION 128 /* Arbitrary limit */ static bool -bucket_is_alive(const struct xlate_ctx *ctx, - struct ofputil_bucket *bucket, int depth) +bucket_is_alive(const struct xlate_ctx *ctx, const struct group_dpif *group, + const struct ofputil_bucket *bucket, int depth) { if (depth >= MAX_LIVENESS_RECURSION) { xlate_report_error(ctx, "bucket chaining exceeded %d links", @@ -1995,6 +2145,12 @@ bucket_is_alive(const struct xlate_ctx *ctx, return false; } + /* In "select" groups, buckets with weight 0 are not used. + * In other kinds of groups, weight does not matter. 
*/ + if (group->up.type == OFPGT11_SELECT && bucket->weight == 0) { + return false; + } + return (!ofputil_bucket_has_liveness(bucket) || (bucket->watch_port != OFPP_ANY && bucket->watch_port != OFPP_CONTROLLER @@ -2035,7 +2191,7 @@ group_first_live_bucket(const struct xlate_ctx *ctx, { struct ofputil_bucket *bucket; LIST_FOR_EACH (bucket, list_node, &group->up.buckets) { - if (bucket_is_alive(ctx, bucket, depth)) { + if (bucket_is_alive(ctx, group, bucket, depth)) { return bucket; } xlate_report_bucket_not_live(ctx, bucket); @@ -2054,7 +2210,7 @@ group_best_live_bucket(const struct xlate_ctx *ctx, struct ofputil_bucket *bucket; LIST_FOR_EACH (bucket, list_node, &group->up.buckets) { - if (bucket_is_alive(ctx, bucket, 0)) { + if (bucket_is_alive(ctx, group, bucket, 0)) { uint32_t score = (hash_int(bucket->bucket_id, basis) & 0xffff) * bucket->weight; if (score >= best_score) { @@ -2181,7 +2337,8 @@ lookup_input_bundle(const struct xlate_ctx *ctx, /* Mirrors the packet represented by 'ctx' to appropriate mirror destinations, * given the packet is ingressing or egressing on 'xbundle', which has ingress - * or egress (as appropriate) mirrors 'mirrors'. */ + * or egress (as appropriate) mirrors 'mirrors'. In cases where a mirror is + * filtered, the current wildcard for the flow's current filter is modified. */ static void mirror_packet(struct xlate_ctx *ctx, struct xbundle *xbundle, mirror_mask_t mirrors) @@ -2210,16 +2367,11 @@ mirror_packet(struct xlate_ctx *ctx, struct xbundle *xbundle, * 'used_mirrors', as long as some candidates remain. */ mirror_mask_t used_mirrors = 0; while (mirrors) { - const unsigned long *vlans; - mirror_mask_t dup_mirrors; - struct ofbundle *out; - int out_vlan; - int snaplen; + struct mirror_config mc; /* Get the details of the mirror represented by the rightmost 1-bit. 
*/ - if (OVS_UNLIKELY(!mirror_get(xbridge->mbridge, raw_ctz(mirrors), - &vlans, &dup_mirrors, - &out, &snaplen, &out_vlan))) { + if (OVS_UNLIKELY(!mirror_get(xbridge->mbridge, + raw_ctz(mirrors), &mc))) { /* The mirror got reconfigured before we got to read it's * configuration. */ mirrors = zero_rightmost_1bit(mirrors); @@ -2229,14 +2381,26 @@ mirror_packet(struct xlate_ctx *ctx, struct xbundle *xbundle, /* If this mirror selects on the basis of VLAN, and it does not select * 'vlan', then discard this mirror and go on to the next one. */ - if (vlans) { + if (mc.vlans) { ctx->wc->masks.vlans[0].tci |= htons(VLAN_CFI | VLAN_VID_MASK); } - if (vlans && !bitmap_is_set(vlans, xvlan.v[0].vid)) { + if (mc.vlans && !bitmap_is_set(mc.vlans, xvlan.v[0].vid)) { mirrors = zero_rightmost_1bit(mirrors); continue; } + /* After the VLAN check, apply a flow mask if a filter is specified. */ + if (mc.filter_flow) { + flow_wildcards_union_with_minimask(ctx->wc, mc.filter_mask); + if (!OVS_UNLIKELY( + miniflow_equal_flow_in_minimask(mc.filter_flow, + &ctx->xin->flow, + mc.filter_mask))) { + mirrors = zero_rightmost_1bit(mirrors); + continue; + } + } + /* We sent a packet to this mirror. */ used_mirrors |= rightmost_1bit(mirrors); @@ -2244,21 +2408,22 @@ mirror_packet(struct xlate_ctx *ctx, struct xbundle *xbundle, * destination, so that we don't mirror to them again. This must be * done now to ensure that output_normal(), below, doesn't recursively * output to the same mirrors. */ - ctx->mirrors |= dup_mirrors; - ctx->mirror_snaplen = snaplen; + ctx->mirrors |= mc.dup_mirrors; + ctx->mirror_snaplen = mc.snaplen; /* Send the packet to the mirror. 
*/ - if (out) { - struct xbundle *out_xbundle = xbundle_lookup(ctx->xcfg, out); + if (mc.out_bundle) { + struct xbundle *out_xbundle = xbundle_lookup(ctx->xcfg, + mc.out_bundle); if (out_xbundle) { output_normal(ctx, out_xbundle, &xvlan); } - } else if (xvlan.v[0].vid != out_vlan + } else if (xvlan.v[0].vid != mc.out_vlan && !eth_addr_is_reserved(ctx->xin->flow.dl_dst)) { struct xbundle *xb; uint16_t old_vid = xvlan.v[0].vid; - xvlan.v[0].vid = out_vlan; + xvlan.v[0].vid = mc.out_vlan; LIST_FOR_EACH (xb, list_node, &xbridge->xbundles) { if (xbundle_includes_vlan(xb, &xvlan) && !xbundle_mirror_out(xbridge, xb)) { @@ -2727,6 +2892,7 @@ update_mcast_snooping_table4__(const struct xlate_ctx *ctx, OVS_REQ_WRLOCK(ms->rwlock) { const struct igmp_header *igmp; + enum mcast_group_proto grp_proto; int count; size_t offset; ovs_be32 ip4 = flow->igmp_group_ip4; @@ -2744,7 +2910,11 @@ update_mcast_snooping_table4__(const struct xlate_ctx *ctx, switch (ntohs(flow->tp_src)) { case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: - if (mcast_snooping_add_group4(ms, ip4, vlan, in_xbundle->ofbundle)) { + grp_proto = ntohs(flow->tp_src) == IGMP_HOST_MEMBERSHIP_REPORT + ? 
MCAST_GROUP_IGMPV1 + : MCAST_GROUP_IGMPV2; + if (mcast_snooping_add_group4(ms, ip4, vlan, in_xbundle->ofbundle, + grp_proto)) { xlate_report_debug(ctx, OFT_DETAIL, "multicast snooping learned that " IP_FMT" is on port %s in VLAN %d", @@ -3198,17 +3368,20 @@ update_ip_mac_map_info(const struct flow *flow, return -1; } - memcpy(ip_mac_map_info->src_mac_addr, flow->dl_src.ea, sizeof(ip_mac_map_info->src_mac_addr)); - memcpy(ip_mac_map_info->dst_mac_addr, flow->dl_dst.ea, sizeof(ip_mac_map_info->dst_mac_addr)); + memcpy(ip_mac_map_info->src_mac_addr, flow->dl_src.ea, + sizeof(ip_mac_map_info->src_mac_addr)); + memcpy(ip_mac_map_info->dst_mac_addr, flow->dl_dst.ea, + sizeof(ip_mac_map_info->dst_mac_addr)); - //Program the entiry only for an ARP response where we have valid IP's and MAC for both src and dst + // Program the entry only for an ARP response where we have valid IPs + // and MAC for both src and dst. if (valid_ip_addr(flow->nw_src) && !eth_addr_is_broadcast(flow->dl_src) && - valid_ip_addr(flow->nw_dst) && !eth_addr_is_broadcast(flow->dl_dst)) { - ip_mac_map_info->src_ip_addr.family = AF_INET; - ip_mac_map_info->src_ip_addr.ip.v4addr.s_addr = flow->nw_src; + valid_ip_addr(flow->nw_dst) && !eth_addr_is_broadcast(flow->dl_dst)) { + ip_mac_map_info->src_ip_addr.family = AF_INET; + ip_mac_map_info->src_ip_addr.ip.v4addr.s_addr = flow->nw_src; - ip_mac_map_info->dst_ip_addr.family = AF_INET; - ip_mac_map_info->dst_ip_addr.ip.v4addr.s_addr = flow->nw_dst; + ip_mac_map_info->dst_ip_addr.family = AF_INET; + ip_mac_map_info->dst_ip_addr.ip.v4addr.s_addr = flow->nw_dst; } return -1; @@ -3221,10 +3394,10 @@ xlate_normal(struct xlate_ctx *ctx) { struct flow_wildcards *wc = ctx->wc; struct flow *flow = &ctx->xin->flow; + struct xbundle *in_xbundle; #if defined(P4OVS) - bool is_mac_learn_required = false; + bool need_update = false; #endif - struct xbundle *in_xbundle; struct xport *in_port; struct mac_entry *mac; void *mac_port; @@ -3288,18 +3461,22 @@ xlate_normal(struct 
xlate_ctx *ctx) && in_port && in_port->pt_mode != NETDEV_PT_LEGACY_L3 ) { #if defined(P4OVS) - is_mac_learn_required = is_mac_learning_update_needed(ctx->xbridge->ml, - flow->dl_src, vlan,is_grat_arp, - in_xbundle->bond != NULL, - in_xbundle->ofbundle); + bool is_static_move = false; + need_update = is_mac_learning_update_needed(ctx->xbridge->ml, + flow->dl_src, + vlan, is_grat_arp, + in_xbundle->bond != NULL, + in_xbundle->ofbundle, + &is_static_move); + /* ignore is_static_move */ #endif - //The function below calls mac_learning_insert + // The function below calls mac_learning_insert update_learning_table(ctx, in_xbundle, flow->dl_src, vlan, is_grat_arp); } #if defined(P4OVS) - if (is_mac_learn_required) { + if (need_update) { /* Dynamic MAC is learnt, program P4 forwarding table */ struct xport *ovs_port = get_ofp_port(in_xbundle->xbridge, flow->in_port.ofp_port); @@ -3496,61 +3673,100 @@ xlate_normal(struct xlate_ctx *ctx) } } -/* Appends a "sample" action for sFlow or IPFIX to 'ctx->odp_actions'. The - * 'probability' is the number of packets out of UINT32_MAX to sample. The - * 'cookie' is passed back in the callback for each sampled packet. - * 'tunnel_out_port', if not ODPP_NONE, is added as the - * OVS_USERSPACE_ATTR_EGRESS_TUN_PORT attribute. If 'include_actions', - * an OVS_USERSPACE_ATTR_ACTIONS attribute is added. If - * 'emit_set_tunnel', sample(sampling_port=1) would translate into - * datapath sample action set(tunnel(...)), sample(...) and it is used - * for sampling egress tunnel information. - */ +/* Psample-related arguments for compose_sample_action. */ +struct sample_psample_args { + uint32_t group_id; /* Group to be used in psample. */ + ovs_32aligned_be64 cookie; /* Cookie to be used in psample. */ +}; + +/* Userspace-related arguments for compose_sample_action. */ +struct sample_userspace_args { + struct user_action_cookie cookie; /* Data passed back in the upcall + * for each sampled packet. 
*/ + odp_port_t tunnel_out_port; /* If not ODPP_NONE, it is added in + * OVS_USERSPACE_ATTR_EGRESS_TUN_PORT + * attribute. */ + bool include_actions; /* Whether OVS_USERSPACE_ATTR_ACTIONS + * is to be set. */ + +}; + +/* Arguments for compose_sample_action. */ +struct compose_sample_args { + uint32_t probability; /* Number of packets out of + * UINT32_MAX to sample. */ + struct sample_userspace_args *userspace; /* Optional, + * arguments for userspace. */ + struct sample_psample_args *psample; /* Optional, + * arguments for psample. */ +}; + +/* Composes sample action according to 'args'. */ static size_t compose_sample_action(struct xlate_ctx *ctx, - const uint32_t probability, - const struct user_action_cookie *cookie, - const odp_port_t tunnel_out_port, - bool include_actions) + const struct compose_sample_args *args) { - if (probability == 0) { + if (args->probability == 0) { /* No need to generate sampling or the inner action. */ return 0; } + /* At least one of userspace or psample config must be provided. */ + ovs_assert(args->userspace || args->psample); + /* If the slow path meter is configured by the controller, * insert a meter action before the user space action. */ struct ofproto *ofproto = &ctx->xin->ofproto->up; uint32_t meter_id = ofproto->slowpath_meter_id; - - /* When meter action is not required, avoid generate sample action - * for 100% sampling rate. */ - bool is_sample = probability < UINT32_MAX || meter_id != UINT32_MAX; + size_t observe_offset = UINT32_MAX; + size_t cookie_offset = 0; + + /* The meter action is only used to throttle userspace actions. + * If they are not needed and the sampling rate is 100%, avoid generating + * a sample action. 
*/ + bool is_sample = (args->probability < UINT32_MAX || + (args->userspace && meter_id != UINT32_MAX)); size_t sample_offset = 0, actions_offset = 0; if (is_sample) { sample_offset = nl_msg_start_nested(ctx->odp_actions, OVS_ACTION_ATTR_SAMPLE); nl_msg_put_u32(ctx->odp_actions, OVS_SAMPLE_ATTR_PROBABILITY, - probability); + args->probability); actions_offset = nl_msg_start_nested(ctx->odp_actions, OVS_SAMPLE_ATTR_ACTIONS); } - if (meter_id != UINT32_MAX) { - nl_msg_put_u32(ctx->odp_actions, OVS_ACTION_ATTR_METER, meter_id); + if (args->psample) { + observe_offset = ctx->odp_actions->size; + odp_put_psample_action(ctx->odp_actions, + args->psample->group_id, + (void *) &args->psample->cookie, + sizeof args->psample->cookie); + } + + if (args->userspace) { + if (meter_id != UINT32_MAX) { + nl_msg_put_u32(ctx->odp_actions, OVS_ACTION_ATTR_METER, meter_id); + } + + observe_offset = ctx->odp_actions->size; + odp_port_t odp_port = ofp_port_to_odp_port( + ctx->xbridge, ctx->xin->flow.in_port.ofp_port); + uint32_t pid = dpif_port_get_pid(ctx->xbridge->dpif, odp_port); + int res = odp_put_userspace_action(pid, &args->userspace->cookie, + sizeof args->userspace->cookie, + args->userspace->tunnel_out_port, + args->userspace->include_actions, + ctx->odp_actions, &cookie_offset); + ovs_assert(res == 0); } - odp_port_t odp_port = ofp_port_to_odp_port( - ctx->xbridge, ctx->xin->flow.in_port.ofp_port); - uint32_t pid = dpif_port_get_pid(ctx->xbridge->dpif, odp_port); - size_t cookie_offset; - int res = odp_put_userspace_action(pid, cookie, sizeof *cookie, - tunnel_out_port, include_actions, - ctx->odp_actions, &cookie_offset); - ovs_assert(res == 0); if (is_sample) { nl_msg_end_nested(ctx->odp_actions, actions_offset); nl_msg_end_nested(ctx->odp_actions, sample_offset); + ctx->xout->last_observe_offset = sample_offset; + } else { + ctx->xout->last_observe_offset = observe_offset; } return cookie_offset; @@ -3567,19 +3783,24 @@ static size_t compose_sflow_action(struct xlate_ctx 
*ctx) { struct dpif_sflow *sflow = ctx->xbridge->sflow; + struct sample_userspace_args userspace; + struct compose_sample_args args = {0}; + if (!sflow || ctx->xin->flow.in_port.ofp_port == OFPP_NONE) { return 0; } - struct user_action_cookie cookie; + memset(&userspace, 0, sizeof userspace); + userspace.cookie.type = USER_ACTION_COOKIE_SFLOW; + userspace.cookie.ofp_in_port = ctx->xin->flow.in_port.ofp_port; + userspace.cookie.ofproto_uuid = ctx->xbridge->ofproto->uuid; + userspace.tunnel_out_port = ODPP_NONE; + userspace.include_actions = true; - memset(&cookie, 0, sizeof cookie); - cookie.type = USER_ACTION_COOKIE_SFLOW; - cookie.ofp_in_port = ctx->xin->flow.in_port.ofp_port; - cookie.ofproto_uuid = ctx->xbridge->ofproto->uuid; + args.probability = dpif_sflow_get_probability(sflow); + args.userspace = &userspace; - return compose_sample_action(ctx, dpif_sflow_get_probability(sflow), - &cookie, ODPP_NONE, true); + return compose_sample_action(ctx, &args); } /* If flow IPFIX is enabled, make sure IPFIX flow sample action @@ -3590,7 +3811,11 @@ static void compose_ipfix_action(struct xlate_ctx *ctx, odp_port_t output_odp_port) { struct dpif_ipfix *ipfix = ctx->xbridge->ipfix; - odp_port_t tunnel_out_port = ODPP_NONE; + struct sample_userspace_args userspace; + struct compose_sample_args args = {0}; + + memset(&userspace, 0, sizeof userspace); + userspace.tunnel_out_port = ODPP_NONE; if (!ipfix || (output_odp_port == ODPP_NONE && @@ -3615,21 +3840,20 @@ compose_ipfix_action(struct xlate_ctx *ctx, odp_port_t output_odp_port) */ if (dpif_ipfix_get_bridge_exporter_tunnel_sampling(ipfix) && dpif_ipfix_is_tunnel_port(ipfix, output_odp_port) ) { - tunnel_out_port = output_odp_port; + userspace.tunnel_out_port = output_odp_port; } } - struct user_action_cookie cookie; + userspace.cookie.type = USER_ACTION_COOKIE_IPFIX; + userspace.cookie.ofp_in_port = ctx->xin->flow.in_port.ofp_port; + userspace.cookie.ofproto_uuid = ctx->xbridge->ofproto->uuid; + 
userspace.cookie.ipfix.output_odp_port = output_odp_port; + userspace.include_actions = false; - memset(&cookie, 0, sizeof cookie); - cookie.type = USER_ACTION_COOKIE_IPFIX; - cookie.ofp_in_port = ctx->xin->flow.in_port.ofp_port; - cookie.ofproto_uuid = ctx->xbridge->ofproto->uuid; - cookie.ipfix.output_odp_port = output_odp_port; + args.probability = dpif_ipfix_get_bridge_exporter_probability(ipfix); + args.userspace = &userspace; - compose_sample_action(ctx, - dpif_ipfix_get_bridge_exporter_probability(ipfix), - &cookie, tunnel_out_port, false); + compose_sample_action(ctx, &args); } /* Fix "sample" action according to data collected while composing ODP actions, @@ -3901,6 +4125,10 @@ propagate_tunnel_data_to_flow(struct xlate_ctx *ctx, struct eth_addr dmac, case OVS_VPORT_TYPE_BAREUDP: nw_proto = IPPROTO_UDP; break; + case OVS_VPORT_TYPE_SRV6: + nw_proto = (flow->dl_type == htons(ETH_TYPE_IP)) + ? IPPROTO_IPIP : IPPROTO_IPV6; + break; case OVS_VPORT_TYPE_LISP: case OVS_VPORT_TYPE_STT: case OVS_VPORT_TYPE_UNSPEC: @@ -3950,6 +4178,8 @@ native_tunnel_output(struct xlate_ctx *ctx, const struct xport *xport, if (flow->tunnel.ip_src) { in6_addr_set_mapped_ipv4(&s_ip6, flow->tunnel.ip_src); + } else if (ipv6_addr_is_set(&flow->tunnel.ipv6_src)) { + s_ip6 = flow->tunnel.ipv6_src; } err = tnl_route_lookup_flow(ctx, flow, &d_ip6, &s_ip6, &out_dev); @@ -4175,20 +4405,17 @@ static void patch_port_output(struct xlate_ctx *ctx, const struct xport *in_dev, struct xport *out_dev, bool is_last_action) { + bool old_was_mpls = ctx->was_mpls; struct flow *flow = &ctx->xin->flow; - struct flow old_flow = ctx->xin->flow; - struct flow_tnl old_flow_tnl_wc = ctx->wc->masks.tunnel; bool old_conntrack = ctx->conntracked; - bool old_was_mpls = ctx->was_mpls; - ovs_version_t old_version = ctx->xin->tables_version; - struct ofpbuf old_stack = ctx->stack; - uint8_t new_stack[1024]; - struct ofpbuf old_action_set = ctx->action_set; + struct xretained_state *retained_state; struct ovs_list 
*old_trace = ctx->xin->trace; - uint64_t actset_stub[1024 / 8]; + ovs_version_t old_version = ctx->xin->tables_version; + + retained_state = xretain_state_save(ctx); + + xretain_tunnel_mask_save(ctx, retained_state); - ofpbuf_use_stub(&ctx->stack, new_stack, sizeof new_stack); - ofpbuf_use_stub(&ctx->action_set, actset_stub, sizeof actset_stub); flow->in_port.ofp_port = out_dev->ofp_port; flow->metadata = htonll(0); memset(&flow->tunnel, 0, sizeof flow->tunnel); @@ -4227,14 +4454,15 @@ patch_port_output(struct xlate_ctx *ctx, const struct xport *in_dev, } else { /* Forwarding is disabled by STP and RSTP. Let OFPP_NORMAL and * the learning action look at the packet, then drop it. */ - struct flow old_base_flow = ctx->base_flow; size_t old_size = ctx->odp_actions->size; + + xretain_base_flow_save(ctx, retained_state); mirror_mask_t old_mirrors2 = ctx->mirrors; xlate_table_action(ctx, flow->in_port.ofp_port, 0, true, true, false, is_last_action, clone_xlate_actions); ctx->mirrors = old_mirrors2; - ctx->base_flow = old_base_flow; + xretain_base_flow_restore(ctx, retained_state); ctx->odp_actions->size = old_size; /* Undo changes that may have been done for freezing. */ @@ -4246,18 +4474,15 @@ patch_port_output(struct xlate_ctx *ctx, const struct xport *in_dev, if (independent_mirrors) { ctx->mirrors = old_mirrors; } - ctx->xin->flow = old_flow; ctx->xbridge = in_dev->xbridge; - ofpbuf_uninit(&ctx->action_set); - ctx->action_set = old_action_set; - ofpbuf_uninit(&ctx->stack); - ctx->stack = old_stack; /* Restore calling bridge's lookup version. */ ctx->xin->tables_version = old_version; - /* Restore to calling bridge tunneling information */ - ctx->wc->masks.tunnel = old_flow_tnl_wc; + /* Restore to calling bridge tunneling information; the ctx flow, actions, + * and stack. And free the retained state. 
*/ + xretain_tunnel_mask_restore(ctx, retained_state); + xretain_state_restore_and_free(ctx, retained_state); /* The out bridge popping MPLS should have no effect on the original * bridge. */ @@ -4447,6 +4672,16 @@ xport_has_ip(const struct xport *xport) return n_in6 ? true : false; } +static bool check_neighbor_reply(struct xlate_ctx *ctx, struct flow *flow) +{ + if (flow->dl_type == htons(ETH_TYPE_ARP) || + flow->nw_proto == IPPROTO_ICMPV6) { + return is_neighbor_reply_correct(ctx, flow); + } + + return false; +} + static bool terminate_native_tunnel(struct xlate_ctx *ctx, const struct xport *xport, struct flow *flow, struct flow_wildcards *wc, @@ -4467,9 +4702,7 @@ terminate_native_tunnel(struct xlate_ctx *ctx, const struct xport *xport, /* If no tunnel port was found and it's about an ARP or ICMPv6 packet, * do tunnel neighbor snooping. */ if (*tnl_port == ODPP_NONE && - (flow->dl_type == htons(ETH_TYPE_ARP) || - flow->nw_proto == IPPROTO_ICMPV6) && - is_neighbor_reply_correct(ctx, flow)) { + (check_neighbor_reply(ctx, flow) || is_garp(flow, wc))) { tnl_neigh_snoop(flow, wc, ctx->xbridge->name, ctx->xin->allow_side_effects); } else if (*tnl_port != ODPP_NONE && @@ -4499,7 +4732,7 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port, const struct xport *xport = get_ofp_port(ctx->xbridge, ofp_port); struct flow_wildcards *wc = ctx->wc; struct flow *flow = &ctx->xin->flow; - struct flow_tnl flow_tnl; + struct flow_tnl *flow_tnl = NULL; union flow_vlan_hdr flow_vlans[FLOW_MAX_VLAN_HEADERS]; uint8_t flow_nw_tos; odp_port_t out_port, odp_port, odp_tnl_port; @@ -4513,7 +4746,6 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port, /* If 'struct flow' gets additional metadata, we'll need to zero it out * before traversing a patch port. 
*/ BUILD_ASSERT_DECL(FLOW_WC_SEQ == 42); - memset(&flow_tnl, 0, sizeof flow_tnl); if (!check_output_prerequisites(ctx, xport, flow, check_stp)) { return; @@ -4557,7 +4789,7 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port, * the Logical (tunnel) Port are not visible for any further * matches, while explicit set actions on tunnel metadata are. */ - flow_tnl = flow->tunnel; + flow_tnl = xmemdup(&flow->tunnel, sizeof *flow_tnl); odp_port = tnl_port_send(xport->ofport, flow, ctx->wc); if (odp_port == ODPP_NONE) { xlate_report(ctx, OFT_WARN, "Tunneling decided against output"); @@ -4588,7 +4820,7 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port, tnl_type = tnl_port_get_type(xport->ofport); commit_odp_tunnel_action(flow, &ctx->base_flow, ctx->odp_actions, tnl_type); - flow->tunnel = flow_tnl; /* Restore tunnel metadata */ + flow->tunnel = *flow_tnl; /* Restore tunnel metadata. */ } } else { odp_port = xport->odp_port; @@ -4632,7 +4864,8 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port, /* Output to native tunnel port. */ native_tunnel_output(ctx, xport, flow, odp_port, truncate, is_last_action); - flow->tunnel = flow_tnl; /* Restore tunnel metadata */ + ovs_assert(flow_tnl); + flow->tunnel = *flow_tnl; /* Restore tunnel metadata. 
*/ } else if (terminate_native_tunnel(ctx, xport, flow, wc, &odp_tnl_port)) { @@ -4675,7 +4908,7 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port, xport->xbundle)); } - out: +out: /* Restore flow */ memcpy(flow->vlans, flow_vlans, sizeof flow->vlans); flow->nw_tos = flow_nw_tos; @@ -4683,6 +4916,7 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port, flow->dl_src = flow_dl_src; flow->packet_type = flow_packet_type; flow->dl_type = flow_dl_type; + free(flow_tnl); } static void @@ -4821,7 +5055,7 @@ xlate_table_action(struct xlate_ctx *ctx, ofp_port_t in_port, uint8_t table_id, ctx->xin->resubmit_stats, &ctx->table_id, in_port, may_packet_in, honor_table_miss, - ctx->xin->xcache); + ctx->xin->xcache, ctx->conj_flows); /* Swap back. */ if (with_ct_orig) { tuple_swap(&ctx->xin->flow, ctx->wc); @@ -4842,6 +5076,11 @@ xlate_table_action(struct xlate_ctx *ctx, ofp_port_t in_port, uint8_t table_id, struct ovs_list *old_trace = ctx->xin->trace; xlate_report_table(ctx, rule, table_id); + + if (OVS_UNLIKELY(ctx->xin->trace)) { + hmapx_clear(ctx->conj_flows); + } + xlate_recursively(ctx, rule, table_id <= old_table_id, is_last_action, xlator); ctx->xin->trace = old_trace; @@ -5030,7 +5269,7 @@ pick_dp_hash_select_group(struct xlate_ctx *ctx, struct group_dpif *group) for (int i = 0; i <= hash_mask; i++) { struct ofputil_bucket *b = group->hash_map[(dp_hash + i) & hash_mask]; - if (bucket_is_alive(ctx, b, 0)) { + if (bucket_is_alive(ctx, group, b, 0)) { return b; } } @@ -5204,10 +5443,37 @@ put_controller_user_action(struct xlate_ctx *ctx, bool dont_send, bool continuation, uint32_t recirc_id, int len, enum ofp_packet_in_reason reason, + uint32_t provider_meter_id, uint16_t controller_id) { struct user_action_cookie cookie; + /* If the controller action didn't request a meter (indicated by a + * 'meter_id' argument other than NX_CTLR_NO_METER), see if one was + * configured through the "controller" virtual meter. 
+ * + * Internally, ovs-vswitchd uses UINT32_MAX to indicate no meter is + * configured. */ + uint32_t meter_id; + if (provider_meter_id == UINT32_MAX) { + meter_id = ctx->xbridge->ofproto->up.controller_meter_id; + } else { + meter_id = provider_meter_id; + } + + size_t offset; + size_t ac_offset; + if (meter_id != UINT32_MAX) { + /* If controller meter is configured, generate + * clone(meter,userspace) action. */ + offset = nl_msg_start_nested(ctx->odp_actions, OVS_ACTION_ATTR_SAMPLE); + nl_msg_put_u32(ctx->odp_actions, OVS_SAMPLE_ATTR_PROBABILITY, + UINT32_MAX); + ac_offset = nl_msg_start_nested(ctx->odp_actions, + OVS_SAMPLE_ATTR_ACTIONS); + nl_msg_put_u32(ctx->odp_actions, OVS_ACTION_ATTR_METER, meter_id); + } + memset(&cookie, 0, sizeof cookie); cookie.type = USER_ACTION_COOKIE_CONTROLLER; cookie.ofp_in_port = OFPP_NONE, @@ -5225,6 +5491,11 @@ put_controller_user_action(struct xlate_ctx *ctx, uint32_t pid = dpif_port_get_pid(ctx->xbridge->dpif, odp_port); odp_put_userspace_action(pid, &cookie, sizeof cookie, ODPP_NONE, false, ctx->odp_actions, NULL); + + if (meter_id != UINT32_MAX) { + nl_msg_end_nested(ctx->odp_actions, ac_offset); + nl_msg_end_nested(ctx->odp_actions, offset); + } } static void @@ -5269,32 +5540,6 @@ xlate_controller_action(struct xlate_ctx *ctx, int len, } recirc_refs_add(&ctx->xout->recircs, recirc_id); - /* If the controller action didn't request a meter (indicated by a - * 'meter_id' argument other than NX_CTLR_NO_METER), see if one was - * configured through the "controller" virtual meter. - * - * Internally, ovs-vswitchd uses UINT32_MAX to indicate no meter is - * configured. */ - uint32_t meter_id; - if (provider_meter_id == UINT32_MAX) { - meter_id = ctx->xbridge->ofproto->up.controller_meter_id; - } else { - meter_id = provider_meter_id; - } - - size_t offset; - size_t ac_offset; - if (meter_id != UINT32_MAX) { - /* If controller meter is configured, generate clone(meter, userspace) - * action. 
*/ - offset = nl_msg_start_nested(ctx->odp_actions, OVS_ACTION_ATTR_SAMPLE); - nl_msg_put_u32(ctx->odp_actions, OVS_SAMPLE_ATTR_PROBABILITY, - UINT32_MAX); - ac_offset = nl_msg_start_nested(ctx->odp_actions, - OVS_SAMPLE_ATTR_ACTIONS); - nl_msg_put_u32(ctx->odp_actions, OVS_ACTION_ATTR_METER, meter_id); - } - /* Generate the datapath flows even if we don't send the packet-in * so that debugging more closely represents normal state. */ bool dont_send = false; @@ -5302,12 +5547,7 @@ xlate_controller_action(struct xlate_ctx *ctx, int len, dont_send = true; } put_controller_user_action(ctx, dont_send, false, recirc_id, len, - reason, controller_id); - - if (meter_id != UINT32_MAX) { - nl_msg_end_nested(ctx->odp_actions, ac_offset); - nl_msg_end_nested(ctx->odp_actions, offset); - } + reason, provider_meter_id, controller_id); } /* Creates a frozen state, and allocates a unique recirc id for the given @@ -5359,6 +5599,7 @@ finish_freezing__(struct xlate_ctx *ctx, uint8_t table) put_controller_user_action(ctx, false, true, recirc_id, ctx->pause->max_len, ctx->pause->reason, + ctx->pause->provider_meter_id, ctx->pause->controller_id); } else { if (ctx->recirc_update_dp_hash) { @@ -5472,6 +5713,7 @@ compose_dec_ttl(struct xlate_ctx *ctx, struct ofpact_cnt_ids *ids) } ctx->wc->masks.nw_ttl = 0xff; + WC_MASK_FIELD(ctx->wc, nw_proto); if (flow->nw_ttl > 1) { flow->nw_ttl--; return false; @@ -5661,11 +5903,8 @@ xlate_output_reg_action(struct xlate_ctx *ctx, uint64_t port = mf_get_subfield(&or->src, &ctx->xin->flow); if (port <= UINT16_MAX) { xlate_report(ctx, OFT_DETAIL, "output port is %"PRIu64, port); - - union mf_subvalue value; - - memset(&value, 0xff, sizeof value); - mf_write_subfield_flow(&or->src, &value, &ctx->wc->masks); + mf_write_subfield_flow(&or->src, &exact_sub_match_mask, + &ctx->wc->masks); xlate_output_action(ctx, u16_to_ofp(port), or->max_len, false, is_last_action, false, group_bucket_action); @@ -5877,8 +6116,16 @@ xlate_learn_action(struct xlate_ctx *ctx, 
const struct ofpact_learn *learn) if (!error) { bool success = true; if (ctx->xin->allow_side_effects) { + long long int last_used; + + if (ctx->xin->resubmit_stats) { + last_used = ctx->xin->resubmit_stats->used; + } else { + last_used = time_msec(); + } error = ofproto_flow_mod_learn(ofm, ctx->xin->xcache != NULL, - learn->limit, &success); + learn->limit, &success, + last_used); } else if (learn->limit) { if (!ofm->temp_rule || ofm->temp_rule->state != RULE_INSERTED) { @@ -5959,23 +6206,51 @@ xlate_fin_timeout(struct xlate_ctx *ctx, } } +static uint32_t +ofpact_sample_get_domain(struct xlate_ctx *ctx, + const struct ofpact_sample *os) +{ + if (os->obs_domain_src.field) { + uint32_t obs_domain_id; + + obs_domain_id = mf_get_subfield(&os->obs_domain_src, &ctx->xin->flow); + mf_write_subfield_flow(&os->obs_domain_src, &exact_sub_match_mask, + &ctx->wc->masks); + + return obs_domain_id; + } else { + return os->obs_domain_imm; + } +} + +static uint32_t +ofpact_sample_get_point(struct xlate_ctx *ctx, + const struct ofpact_sample *os) +{ + if (os->obs_point_src.field) { + uint32_t obs_point_id; + + obs_point_id = mf_get_subfield(&os->obs_point_src, &ctx->xin->flow); + mf_write_subfield_flow(&os->obs_point_src, &exact_sub_match_mask, + &ctx->wc->masks); + + return obs_point_id; + } else { + return os->obs_point_imm; + } +} + static void -xlate_sample_action(struct xlate_ctx *ctx, - const struct ofpact_sample *os) +xlate_fill_ipfix_sample(struct xlate_ctx *ctx, + const struct ofpact_sample *os, + const struct dpif_ipfix *ipfix, + struct sample_userspace_args *userspace) { odp_port_t output_odp_port = ODPP_NONE; - odp_port_t tunnel_out_port = ODPP_NONE; - struct dpif_ipfix *ipfix = ctx->xbridge->ipfix; bool emit_set_tunnel = false; - if (!ipfix || ctx->xin->flow.in_port.ofp_port == OFPP_NONE) { - return; - } - - /* Scale the probability from 16-bit to 32-bit while representing - * the same percentage. 
*/ - uint32_t probability = - ((uint32_t) os->probability << 16) | os->probability; + memset(userspace, 0, sizeof *userspace); + userspace->tunnel_out_port = ODPP_NONE; /* If ofp_port in flow sample action is equel to ofp_port, * this sample action is a input port action. */ @@ -5992,7 +6267,7 @@ xlate_sample_action(struct xlate_ctx *ctx, if (dpif_ipfix_get_flow_exporter_tunnel_sampling(ipfix, os->collector_set_id) && dpif_ipfix_is_tunnel_port(ipfix, output_odp_port)) { - tunnel_out_port = output_odp_port; + userspace->tunnel_out_port = output_odp_port; emit_set_tunnel = true; } } @@ -6009,13 +6284,15 @@ xlate_sample_action(struct xlate_ctx *ctx, struct flow *flow = &ctx->xin->flow; tnl_port_send(xport->ofport, flow, ctx->wc); if (!ovs_native_tunneling_is_on(ctx->xbridge->ofproto)) { - struct flow_tnl flow_tnl = flow->tunnel; + struct flow_tnl *flow_tnl; const char *tnl_type; + flow_tnl = xmemdup(&flow->tunnel, sizeof *flow_tnl); tnl_type = tnl_port_get_type(xport->ofport); commit_odp_tunnel_action(flow, &ctx->base_flow, ctx->odp_actions, tnl_type); - flow->tunnel = flow_tnl; + flow->tunnel = *flow_tnl; + free(flow_tnl); } } else { xlate_report_error(ctx, @@ -6024,20 +6301,59 @@ xlate_sample_action(struct xlate_ctx *ctx, } } - struct user_action_cookie cookie; + userspace->cookie.type = USER_ACTION_COOKIE_FLOW_SAMPLE; + userspace->cookie.ofp_in_port = ctx->xin->flow.in_port.ofp_port; + userspace->cookie.ofproto_uuid = ctx->xbridge->ofproto->uuid; + userspace->cookie.flow_sample.probability = os->probability; + userspace->cookie.flow_sample.collector_set_id = os->collector_set_id; + userspace->cookie.flow_sample.obs_domain_id = + ofpact_sample_get_domain(ctx, os); + userspace->cookie.flow_sample.obs_point_id = + ofpact_sample_get_point(ctx, os); + userspace->cookie.flow_sample.output_odp_port = output_odp_port; + userspace->cookie.flow_sample.direction = os->direction; + userspace->include_actions = false; +} - memset(&cookie, 0, sizeof cookie); - cookie.type = 
USER_ACTION_COOKIE_FLOW_SAMPLE; - cookie.ofp_in_port = ctx->xin->flow.in_port.ofp_port; - cookie.ofproto_uuid = ctx->xbridge->ofproto->uuid; - cookie.flow_sample.probability = os->probability; - cookie.flow_sample.collector_set_id = os->collector_set_id; - cookie.flow_sample.obs_domain_id = os->obs_domain_id; - cookie.flow_sample.obs_point_id = os->obs_point_id; - cookie.flow_sample.output_odp_port = output_odp_port; - cookie.flow_sample.direction = os->direction; +static void +xlate_sample_action(struct xlate_ctx *ctx, + const struct ofpact_sample *os) +{ + struct dpif_lsample *lsample = ctx->xbridge->lsample; + struct dpif_ipfix *ipfix = ctx->xbridge->ipfix; + struct compose_sample_args compose_args = {0}; + struct sample_userspace_args userspace; + struct sample_psample_args psample; + + if (!ipfix && !lsample) { + return; + } - compose_sample_action(ctx, probability, &cookie, tunnel_out_port, false); + /* Scale the probability from 16-bit to 32-bit while representing + * the same percentage. 
*/ + compose_args.probability = + ((uint32_t) os->probability << 16) | os->probability; + + if (ipfix) { + xlate_fill_ipfix_sample(ctx, os, ipfix, &userspace); + compose_args.userspace = &userspace; + } + + if (lsample && + dpif_lsample_get_group_id(lsample, + os->collector_set_id, + &psample.group_id)) { + psample.cookie.hi = htonl(ofpact_sample_get_domain(ctx, os)); + psample.cookie.lo = htonl(ofpact_sample_get_point(ctx, os)); + + compose_args.psample = &psample; + } + + if (!compose_args.userspace && !compose_args.psample) { + return; + } + + compose_sample_action(ctx, &compose_args); } /* Determine if an datapath action translated from the openflow action @@ -6125,21 +6441,12 @@ clone_xlate_actions(const struct ofpact *actions, size_t actions_len, struct xlate_ctx *ctx, bool is_last_action, bool group_bucket_action OVS_UNUSED) { - struct ofpbuf old_stack = ctx->stack; - union mf_subvalue new_stack[1024 / sizeof(union mf_subvalue)]; - ofpbuf_use_stub(&ctx->stack, new_stack, sizeof new_stack); - ofpbuf_put(&ctx->stack, old_stack.data, old_stack.size); - - struct ofpbuf old_action_set = ctx->action_set; - uint64_t actset_stub[1024 / 8]; - ofpbuf_use_stub(&ctx->action_set, actset_stub, sizeof actset_stub); - ofpbuf_put(&ctx->action_set, old_action_set.data, old_action_set.size); - + struct xretained_state *retained_state; size_t offset, ac_offset; - struct flow old_flow = ctx->xin->flow; + + retained_state = xretain_state_save(ctx); if (reversible_actions(actions, actions_len) || is_last_action) { - old_flow = ctx->xin->flow; do_xlate_actions(actions, actions_len, ctx, is_last_action, false); if (!ctx->freezing) { xlate_action_set(ctx); @@ -6154,7 +6461,8 @@ clone_xlate_actions(const struct ofpact *actions, size_t actions_len, * avoid emitting those actions twice. Once inside * the clone, another time for the action after clone. 
*/ xlate_commit_actions(ctx); - struct flow old_base = ctx->base_flow; + xretain_base_flow_save(ctx, retained_state); + bool old_was_mpls = ctx->was_mpls; bool old_conntracked = ctx->conntracked; @@ -6211,14 +6519,10 @@ clone_xlate_actions(const struct ofpact *actions, size_t actions_len, ctx->was_mpls = old_was_mpls; /* Restore the 'base_flow' for the next action. */ - ctx->base_flow = old_base; + xretain_base_flow_restore(ctx, retained_state); xlate_done: - ofpbuf_uninit(&ctx->action_set); - ctx->action_set = old_action_set; - ofpbuf_uninit(&ctx->stack); - ctx->stack = old_stack; - ctx->xin->flow = old_flow; + xretain_state_restore_and_free(ctx, retained_state); } static void @@ -6594,9 +6898,6 @@ compose_conntrack_action(struct xlate_ctx *ctx, struct ofpact_conntrack *ofc, { uint16_t zone; if (ofc->zone_src.field) { - union mf_subvalue value; - memset(&value, 0xff, sizeof(value)); - zone = mf_get_subfield(&ofc->zone_src, &ctx->xin->flow); if (ctx->xin->frozen_state) { /* If the upcall is a resume of a recirculation, we only need to @@ -6605,11 +6906,12 @@ compose_conntrack_action(struct xlate_ctx *ctx, struct ofpact_conntrack *ofc, * which will invalidate the megaflow with old the recirc_id. 
*/ if (!mf_is_frozen_metadata(ofc->zone_src.field)) { - mf_write_subfield_flow(&ofc->zone_src, &value, + mf_write_subfield_flow(&ofc->zone_src, &exact_sub_match_mask, &ctx->wc->masks); } } else { - mf_write_subfield_flow(&ofc->zone_src, &value, &ctx->wc->masks); + mf_write_subfield_flow(&ofc->zone_src, &exact_sub_match_mask, + &ctx->wc->masks); } } else { zone = ofc->zone_imm; @@ -6700,16 +7002,16 @@ xlate_check_pkt_larger(struct xlate_ctx *ctx, const struct ofpact *remaining_acts, size_t remaining_acts_len) { - union mf_subvalue value; - memset(&value, 0, sizeof value); + union mf_subvalue *value = xmalloc(sizeof *value); + memset(value, 0, sizeof *value); if (!ctx->xbridge->support.check_pkt_len) { uint8_t is_pkt_larger = 0; if (ctx->xin->packet) { is_pkt_larger = dp_packet_size(ctx->xin->packet) > check_pkt_larger->pkt_len; } - value.u8_val = is_pkt_larger; - mf_write_subfield_flow(&check_pkt_larger->dst, &value, + value->u8_val = is_pkt_larger; + mf_write_subfield_flow(&check_pkt_larger->dst, value, &ctx->xin->flow); /* If datapath doesn't support check_pkt_len action, then set the * SLOW_ACTION flag. If we don't set SLOW_ACTION, we @@ -6719,22 +7021,17 @@ xlate_check_pkt_larger(struct xlate_ctx *ctx, * the packet length. This results in wrong actions being applied. 
*/ ctx->xout->slow |= SLOW_ACTION; + free(value); return; } - struct ofpbuf old_stack = ctx->stack; - union mf_subvalue new_stack[1024 / sizeof(union mf_subvalue)]; - ofpbuf_use_stub(&ctx->stack, new_stack, sizeof new_stack); - ofpbuf_put(&ctx->stack, old_stack.data, old_stack.size); + struct xretained_state *retained_state; - struct ofpbuf old_action_set = ctx->action_set; - uint64_t actset_stub[1024 / 8]; - ofpbuf_use_stub(&ctx->action_set, actset_stub, sizeof actset_stub); - ofpbuf_put(&ctx->action_set, old_action_set.data, old_action_set.size); + retained_state = xretain_state_save(ctx); - struct flow old_flow = ctx->xin->flow; xlate_commit_actions(ctx); - struct flow old_base = ctx->base_flow; + xretain_base_flow_save(ctx, retained_state); + bool old_was_mpls = ctx->was_mpls; bool old_conntracked = ctx->conntracked; @@ -6744,8 +7041,8 @@ xlate_check_pkt_larger(struct xlate_ctx *ctx, check_pkt_larger->pkt_len); size_t offset_attr = nl_msg_start_nested( ctx->odp_actions, OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER); - value.u8_val = 1; - mf_write_subfield_flow(&check_pkt_larger->dst, &value, &ctx->xin->flow); + value->u8_val = 1; + mf_write_subfield_flow(&check_pkt_larger->dst, value, &ctx->xin->flow); do_xlate_actions(remaining_acts, remaining_acts_len, ctx, true, false); if (!ctx->freezing) { xlate_action_set(ctx); @@ -6755,10 +7052,10 @@ xlate_check_pkt_larger(struct xlate_ctx *ctx, } nl_msg_end_nested(ctx->odp_actions, offset_attr); - ctx->base_flow = old_base; + xretain_base_flow_restore(ctx, retained_state); + xretain_flow_restore(ctx, retained_state); ctx->was_mpls = old_was_mpls; ctx->conntracked = old_conntracked; - ctx->xin->flow = old_flow; /* If the flow translation for the IF_GREATER case requires freezing, * then ctx->exit would be true. 
Reset to false so that we can @@ -6769,8 +7066,8 @@ xlate_check_pkt_larger(struct xlate_ctx *ctx, offset_attr = nl_msg_start_nested( ctx->odp_actions, OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL); - value.u8_val = 0; - mf_write_subfield_flow(&check_pkt_larger->dst, &value, &ctx->xin->flow); + value->u8_val = 0; + mf_write_subfield_flow(&check_pkt_larger->dst, value, &ctx->xin->flow); do_xlate_actions(remaining_acts, remaining_acts_len, ctx, true, false); if (!ctx->freezing) { xlate_action_set(ctx); @@ -6781,15 +7078,12 @@ xlate_check_pkt_larger(struct xlate_ctx *ctx, nl_msg_end_nested(ctx->odp_actions, offset_attr); nl_msg_end_nested(ctx->odp_actions, offset); - ofpbuf_uninit(&ctx->action_set); - ctx->action_set = old_action_set; - ofpbuf_uninit(&ctx->stack); - ctx->stack = old_stack; - ctx->base_flow = old_base; ctx->was_mpls = old_was_mpls; ctx->conntracked = old_conntracked; - ctx->xin->flow = old_flow; ctx->exit = old_exit; + xretain_base_flow_restore(ctx, retained_state); + xretain_state_restore_and_free(ctx, retained_state); + free(value); } static void @@ -7240,6 +7534,132 @@ xlate_ofpact_unroll_xlate(struct xlate_ctx *ctx, "cookie=%#"PRIx64, a->rule_table_id, a->rule_cookie); } +/* Reset the mirror context if we modify the packet and would like to mirror + * the new copy. 
*/ +static void +reset_mirror_ctx(struct xlate_ctx *ctx, const struct flow *flow, + const struct ofpact *a) +{ + switch (a->type) { + case OFPACT_STRIP_VLAN: + case OFPACT_PUSH_VLAN: + case OFPACT_SET_ETH_SRC: + case OFPACT_SET_ETH_DST: + case OFPACT_PUSH_MPLS: + case OFPACT_POP_MPLS: + case OFPACT_SET_MPLS_LABEL: + case OFPACT_SET_MPLS_TC: + case OFPACT_SET_MPLS_TTL: + case OFPACT_DEC_MPLS_TTL: + case OFPACT_DEC_NSH_TTL: + case OFPACT_DEC_TTL: + case OFPACT_SET_VLAN_VID: + case OFPACT_SET_VLAN_PCP: + case OFPACT_ENCAP: + case OFPACT_DECAP: + case OFPACT_NAT: + ctx->mirrors = 0; + return; + + case OFPACT_SET_FIELD: { + const struct ofpact_set_field *set_field; + const struct mf_field *mf; + + set_field = ofpact_get_SET_FIELD(a); + mf = set_field->field; + if (mf_are_prereqs_ok(mf, flow, NULL) && !mf_is_tun_metadata(mf)) { + ctx->mirrors = 0; + } + return; + } + + case OFPACT_SET_IPV4_SRC: + case OFPACT_SET_IPV4_DST: + if (flow->dl_type == htons(ETH_TYPE_IP)) { + ctx->mirrors = 0; + } + return; + + case OFPACT_SET_IP_DSCP: + case OFPACT_SET_IP_ECN: + case OFPACT_SET_IP_TTL: + if (is_ip_any(flow)) { + ctx->mirrors = 0; + } + return; + + case OFPACT_SET_L4_SRC_PORT: + case OFPACT_SET_L4_DST_PORT: + if (is_ip_any(flow) && !(flow->nw_frag & FLOW_NW_FRAG_LATER)) { + ctx->mirrors = 0; + } + return; + + case OFPACT_OUTPUT_REG: + case OFPACT_OUTPUT_TRUNC: + case OFPACT_GROUP: + case OFPACT_OUTPUT: + case OFPACT_CONTROLLER: + case OFPACT_RESUBMIT: + case OFPACT_GOTO_TABLE: + case OFPACT_WRITE_METADATA: + case OFPACT_SET_TUNNEL: + case OFPACT_REG_MOVE: + case OFPACT_STACK_PUSH: + case OFPACT_STACK_POP: + case OFPACT_LEARN: + case OFPACT_ENQUEUE: + case OFPACT_SET_QUEUE: + case OFPACT_POP_QUEUE: + case OFPACT_MULTIPATH: + case OFPACT_BUNDLE: + case OFPACT_EXIT: + case OFPACT_UNROLL_XLATE: + case OFPACT_FIN_TIMEOUT: + case OFPACT_CLEAR_ACTIONS: + case OFPACT_WRITE_ACTIONS: + case OFPACT_METER: + case OFPACT_SAMPLE: + case OFPACT_CLONE: + case OFPACT_DEBUG_RECIRC: + case 
OFPACT_DEBUG_SLOW: + case OFPACT_CT: + case OFPACT_CT_CLEAR: + case OFPACT_CHECK_PKT_LARGER: + case OFPACT_DELETE_FIELD: + case OFPACT_NOTE: + case OFPACT_CONJUNCTION: + return; + } + + OVS_NOT_REACHED(); +} + +static void +xlate_trace(struct xlate_ctx *ctx, const struct ofpact *a) +{ + struct ofputil_port_map *map; + + map = xmalloc(sizeof *map); + ofputil_port_map_init(map); + + if (ctx->xin->names) { + struct ofproto_dpif *ofprotop; + + ofprotop = ofproto_dpif_lookup_by_name(ctx->xbridge->name); + ofproto_append_ports_to_map(map, ofprotop->up.ports); + } + + struct ds s = DS_EMPTY_INITIALIZER; + struct ofpact_format_params fp = { .s = &s, .port_map = map }; + + ofpacts_format(a, OFPACT_ALIGN(a->len), &fp); + xlate_report(ctx, OFT_ACTION, "%s", ds_cstr(&s)); + ds_destroy(&s); + ofputil_port_map_destroy(map); + free(map); +} + static void do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len, struct xlate_ctx *ctx, bool is_last_action, @@ -7281,21 +7701,10 @@ do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len, break; } - if (OVS_UNLIKELY(ctx->xin->trace)) { - struct ofputil_port_map map = OFPUTIL_PORT_MAP_INITIALIZER(&map); + reset_mirror_ctx(ctx, flow, a); - if (ctx->xin->names) { - struct ofproto_dpif *ofprotop; - ofprotop = ofproto_dpif_lookup_by_name(ctx->xbridge->name); - ofproto_append_ports_to_map(&map, ofprotop->up.ports); - } - - struct ds s = DS_EMPTY_INITIALIZER; - struct ofpact_format_params fp = { .s = &s, .port_map = &map }; - ofpacts_format(a, OFPACT_ALIGN(a->len), &fp); - xlate_report(ctx, OFT_ACTION, "%s", ds_cstr(&s)); - ds_destroy(&s); - ofputil_port_map_destroy(&map); + if (OVS_UNLIKELY(ctx->xin->trace)) { + xlate_trace(ctx, a); } switch (a->type) { @@ -7389,6 +7798,7 @@ do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len, case OFPACT_SET_IPV4_SRC: if (flow->dl_type == htons(ETH_TYPE_IP)) { memset(&wc->masks.nw_src, 0xff, sizeof wc->masks.nw_src); + WC_MASK_FIELD(wc, nw_proto); flow->nw_src = 
ofpact_get_SET_IPV4_SRC(a)->ipv4; } break; @@ -7396,12 +7806,14 @@ do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len, case OFPACT_SET_IPV4_DST: if (flow->dl_type == htons(ETH_TYPE_IP)) { memset(&wc->masks.nw_dst, 0xff, sizeof wc->masks.nw_dst); + WC_MASK_FIELD(wc, nw_proto); flow->nw_dst = ofpact_get_SET_IPV4_DST(a)->ipv4; } break; case OFPACT_SET_IP_DSCP: if (is_ip_any(flow)) { + WC_MASK_FIELD(wc, nw_proto); wc->masks.nw_tos |= IP_DSCP_MASK; flow->nw_tos &= ~IP_DSCP_MASK; flow->nw_tos |= ofpact_get_SET_IP_DSCP(a)->dscp; @@ -7410,6 +7822,7 @@ do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len, case OFPACT_SET_IP_ECN: if (is_ip_any(flow)) { + WC_MASK_FIELD(wc, nw_proto); wc->masks.nw_tos |= IP_ECN_MASK; flow->nw_tos &= ~IP_ECN_MASK; flow->nw_tos |= ofpact_get_SET_IP_ECN(a)->ecn; @@ -7418,6 +7831,7 @@ do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len, case OFPACT_SET_IP_TTL: if (is_ip_any(flow)) { + WC_MASK_FIELD(wc, nw_proto); wc->masks.nw_ttl = 0xff; flow->nw_ttl = ofpact_get_SET_IP_TTL(a)->ttl; } @@ -7485,6 +7899,7 @@ do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len, /* Set the field only if the packet actually has it. */ if (mf_are_prereqs_ok(mf, flow, wc)) { + mf_set_mask_l3_prereqs(mf, flow, wc); mf_mask_field_masked(mf, ofpact_set_field_mask(set_field), wc); mf_set_flow_value_masked(mf, set_field->value, ofpact_set_field_mask(set_field), @@ -7541,6 +7956,7 @@ do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len, case OFPACT_DEC_TTL: wc->masks.nw_ttl = 0xff; + WC_MASK_FIELD(wc, nw_proto); if (compose_dec_ttl(ctx, ofpact_get_DEC_TTL(a))) { return; } @@ -7977,12 +8393,16 @@ xlate_wc_finish(struct xlate_ctx *ctx) } } -/* This will optimize the odp actions generated. For now, it will remove - * trailing clone actions that are unnecessary. */ +/* This will tweak the odp actions generated. For now, it will: + * - Remove trailing clone actions that are unnecessary. 
+ * - Add an explicit drop action if the action list is empty. + * - Add an explicit drop action if the last action is an observability + * sample. This tweak is controlled by a configurable knob. */ static void -xlate_optimize_odp_actions(struct xlate_in *xin) +xlate_tweak_odp_actions(struct xlate_ctx *ctx) { - struct ofpbuf *actions = xin->odp_actions; + uint32_t last_observe_offset = ctx->xout->last_observe_offset; + struct ofpbuf *actions = ctx->xin->odp_actions; struct nlattr *last_action = NULL; struct nlattr *a; int left; @@ -7996,11 +8416,28 @@ xlate_optimize_odp_actions(struct xlate_in *xin) last_action = a; } + if (!last_action) { + if (ovs_explicit_drop_action_supported(ctx->xbridge->ofproto)) { + put_drop_action(actions, XLATE_OK); + } + return; + } + /* Remove the trailing clone() action, by directly embedding the nested * actions. */ - if (last_action && nl_attr_type(last_action) == OVS_ACTION_ATTR_CLONE) { + if (nl_attr_type(last_action) == OVS_ACTION_ATTR_CLONE) { void *dest; + if (last_observe_offset != UINT32_MAX && + (unsigned char *) actions->data + last_observe_offset > + (unsigned char *) last_action) { + /* The last sample is inside the trailing clone. + * Adjust its offset. */ + last_observe_offset -= (unsigned char *) nl_attr_get(last_action) - + (unsigned char *) last_action; + ctx->xout->last_observe_offset = last_observe_offset; + } + nl_msg_reset_size(actions, (unsigned char *) last_action - (unsigned char *) actions->data); @@ -8008,6 +8445,16 @@ xlate_optimize_odp_actions(struct xlate_in *xin) dest = nl_msg_put_uninit(actions, nl_attr_get_size(last_action)); memmove(dest, nl_attr_get(last_action), nl_attr_get_size(last_action)); } + + /* If the last action of the list is an observability action, add an + * explicit drop action so that drop statistics remain reliable. 
*/ + if (ctx->xbridge->ofproto->explicit_sampled_drops && + ovs_explicit_drop_action_supported(ctx->xbridge->ofproto) && + last_observe_offset != UINT32_MAX && + (unsigned char *) last_action == (unsigned char *) actions->data + + last_observe_offset) { + put_drop_action(actions, XLATE_OK); + } } /* Translates the flow, actions, or rule in 'xin' into datapath actions in @@ -8024,6 +8471,7 @@ xlate_actions(struct xlate_in *xin, struct xlate_out *xout) *xout = (struct xlate_out) { .slow = 0, .recircs = RECIRC_REFS_EMPTY_INITIALIZER, + .last_observe_offset = UINT32_MAX, }; struct xlate_cfg *xcfg = ovsrcu_get(struct xlate_cfg *, &xcfgp); @@ -8098,6 +8546,13 @@ xlate_actions(struct xlate_in *xin, struct xlate_out *xout) COVERAGE_INC(xlate_actions); + ctx.conj_flows = NULL; + + if (OVS_UNLIKELY(xin->trace)) { + ctx.conj_flows = xzalloc(sizeof *ctx.conj_flows); + hmapx_init(ctx.conj_flows); + } + xin->trace = xlate_report(&ctx, OFT_BRIDGE, "bridge(\"%s\")", xbridge->name); if (xin->frozen_state) { @@ -8235,7 +8690,8 @@ xlate_actions(struct xlate_in *xin, struct xlate_out *xout) ctx.rule = rule_dpif_lookup_from_table( ctx.xbridge->ofproto, ctx.xin->tables_version, flow, ctx.wc, ctx.xin->resubmit_stats, &ctx.table_id, - flow->in_port.ofp_port, true, true, ctx.xin->xcache); + flow->in_port.ofp_port, true, true, ctx.xin->xcache, + ctx.conj_flows); if (ctx.xin->resubmit_stats) { rule_dpif_credit_stats(ctx.rule, ctx.xin->resubmit_stats, false); } @@ -8248,6 +8704,10 @@ xlate_actions(struct xlate_in *xin, struct xlate_out *xout) } xlate_report_table(&ctx, ctx.rule, ctx.table_id); + + if (OVS_UNLIKELY(ctx.xin->trace)) { + hmapx_clear(ctx.conj_flows); + } } /* Tunnel stats only for not-thawed packets. */ @@ -8429,22 +8889,26 @@ xlate_actions(struct xlate_in *xin, struct xlate_out *xout) ofpbuf_uninit(&scratch_actions); ofpbuf_delete(ctx.encap_data); + /* Clean up 'conj_flows' as it is no longer needed. 
*/ + if (OVS_UNLIKELY(xin->trace)) { + hmapx_destroy(ctx.conj_flows); + free(ctx.conj_flows); + } + /* Make sure we return a "drop flow" in case of an error. */ if (ctx.error) { xout->slow = 0; if (xin->odp_actions) { ofpbuf_clear(xin->odp_actions); + /* Make the drop explicit if the datapath supports it. */ + if (ovs_explicit_drop_action_supported(ctx.xbridge->ofproto)) { + put_drop_action(xin->odp_actions, ctx.error); + } } } else { - /* In the non-error case, see if we can further optimize the datapath - * rules by removing redundant (clone) actions. */ - xlate_optimize_odp_actions(xin); - } - - /* Install drop action if datapath supports explicit drop action. */ - if (xin->odp_actions && !xin->odp_actions->size && - ovs_explicit_drop_action_supported(ctx.xbridge->ofproto)) { - put_drop_action(xin->odp_actions, ctx.error); + /* In the non-error case, see if we can further optimize or tweak + * datapath actions. */ + xlate_tweak_odp_actions(&ctx); } /* Since congestion drop and forwarding drop are not exactly diff --git a/ofproto/ofproto-dpif-xlate.h b/ofproto/ofproto-dpif-xlate.h index a68357ebdd3..46977dd931f 100644 --- a/ofproto/ofproto-dpif-xlate.h +++ b/ofproto/ofproto-dpif-xlate.h @@ -62,6 +62,10 @@ struct xlate_out { /* Recirc action IDs on which references are held. */ struct recirc_refs recircs; + + /* Keep track of the last action whose purpose is purely observational. + * e.g: IPFIX, sFlow, local sampling. 
*/ + uint32_t last_observe_offset; }; struct xlate_in { @@ -177,8 +181,9 @@ void xlate_ofproto_set(struct ofproto_dpif *, const char *name, struct dpif *, const struct mac_learning *, struct stp *, struct rstp *, const struct mcast_snooping *, const struct mbridge *, const struct dpif_sflow *, - const struct dpif_ipfix *, const struct netflow *, - bool forward_bpdu, bool has_in_band, + const struct dpif_ipfix *, const struct dpif_lsample *, + const struct netflow *, bool forward_bpdu, + bool has_in_band, const struct dpif_backer_support *support); void xlate_remove_ofproto(struct ofproto_dpif *); struct ofproto_dpif *xlate_ofproto_lookup(const struct uuid *uuid); @@ -220,7 +225,7 @@ struct ofproto_dpif * xlate_lookup_ofproto(const struct dpif_backer *, int xlate_lookup(const struct dpif_backer *, const struct flow *, struct ofproto_dpif **, struct dpif_ipfix **, struct dpif_sflow **, struct netflow **, - ofp_port_t *ofp_in_port); + ofp_port_t *ofp_in_port, char **errorp); const char *xlate_strerror(enum xlate_error error); diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index 825d16883a8..e35a0ac2319 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -29,6 +29,7 @@ #include "fail-open.h" #include "guarded-list.h" #include "hmapx.h" +#include "openvswitch/json.h" #include "lacp.h" #include "learn.h" #include "mac-learning.h" @@ -50,6 +51,7 @@ #include "ofproto-dpif-sflow.h" #include "ofproto-dpif-trace.h" #include "ofproto-dpif-upcall.h" +#include "ofproto-dpif-lsample.h" #include "ofproto-dpif-xlate.h" #include "ofproto-dpif-xlate-cache.h" #include "openvswitch/ofp-actions.h" @@ -228,6 +230,8 @@ static void ofproto_unixctl_init(void); static void ct_zone_config_init(struct dpif_backer *backer); static void ct_zone_config_uninit(struct dpif_backer *backer); static void ct_zone_timeout_policy_sweep(struct dpif_backer *backer); +static void ct_zone_limits_commit(struct dpif_backer *backer); +static bool 
recheck_support_explicit_drop_action(struct dpif_backer *backer); static inline struct ofproto_dpif * ofproto_dpif_cast(const struct ofproto *ofproto) @@ -398,6 +402,10 @@ type_run(const char *type) udpif_set_threads(backer->udpif, n_handlers, n_revalidators); } + if (recheck_support_explicit_drop_action(backer)) { + backer->need_revalidate = REV_RECONFIGURE; + } + if (backer->need_revalidate) { struct ofproto_dpif *ofproto; struct simap_node *node; @@ -486,7 +494,7 @@ type_run(const char *type) ofproto->backer->dpif, ofproto->ml, ofproto->stp, ofproto->rstp, ofproto->ms, ofproto->mbridge, ofproto->sflow, ofproto->ipfix, - ofproto->netflow, + ofproto->lsample, ofproto->netflow, ofproto->up.forward_bpdu, connmgr_has_in_band(ofproto->up.connmgr), &ofproto->backer->rt_support); @@ -531,6 +539,7 @@ type_run(const char *type) process_dpif_port_changes(backer); ct_zone_timeout_policy_sweep(backer); + ct_zone_limits_commit(backer); return 0; } @@ -732,13 +741,9 @@ close_dpif_backer(struct dpif_backer *backer, bool del) free(backer); } -/* Datapath port slated for removal from datapath. */ -struct odp_garbage { - struct ovs_list list_node; - odp_port_t odp_port; -}; - static void check_support(struct dpif_backer *backer); +static void copy_support(struct dpif_backer_support *dst, + struct dpif_backer_support *src); static int open_dpif_backer(const char *type, struct dpif_backer **backerp) @@ -747,8 +752,6 @@ open_dpif_backer(const char *type, struct dpif_backer **backerp) struct dpif_port_dump port_dump; struct dpif_port port; struct shash_node *node; - struct ovs_list garbage_list; - struct odp_garbage *garbage; struct sset names; char *backer_name; @@ -810,25 +813,23 @@ open_dpif_backer(const char *type, struct dpif_backer **backerp) dpif_flow_flush(backer->dpif); } - /* Loop through the ports already on the datapath and remove any - * that we don't need anymore. 
*/ - ovs_list_init(&garbage_list); + /* Loop through the ports already on the datapath and find ones that are + * not on the initial OpenFlow ports list. These are stale ports, that we + * do not need anymore, or tunnel backing interfaces, that do not generally + * match the name of OpenFlow tunnel ports, or both. Add all of them to + * the list of tunnel backers. type_run() will garbage collect those that + * are not active tunnel backing interfaces during revalidation. */ dpif_port_dump_start(&port_dump, backer->dpif); while (dpif_port_dump_next(&port_dump, &port)) { node = shash_find(&init_ofp_ports, port.name); if (!node && strcmp(port.name, dpif_base_name(backer->dpif))) { - garbage = xmalloc(sizeof *garbage); - garbage->odp_port = port.port_no; - ovs_list_push_front(&garbage_list, &garbage->list_node); + simap_put(&backer->tnl_backers, port.name, + odp_to_u32(port.port_no)); + backer->need_revalidate = REV_RECONFIGURE; } } dpif_port_dump_done(&port_dump); - LIST_FOR_EACH_POP (garbage, list_node, &garbage_list) { - dpif_port_del(backer->dpif, garbage->odp_port, false); - free(garbage); - } - shash_add(&all_dpif_backers, type, backer); check_support(backer); @@ -863,7 +864,7 @@ open_dpif_backer(const char *type, struct dpif_backer **backerp) * 'boottime_support' can be checked to prevent 'support' to be changed * beyond the datapath capabilities. In case 'support' is changed by * the user, 'boottime_support' can be used to restore it. 
*/ - backer->bt_support = backer->rt_support; + copy_support(&backer->bt_support, &backer->rt_support); return error; } @@ -879,7 +880,11 @@ ovs_native_tunneling_is_on(struct ofproto_dpif *ofproto) bool ovs_explicit_drop_action_supported(struct ofproto_dpif *ofproto) { - return ofproto->backer->rt_support.explicit_drop_action; + bool value; + + atomic_read_relaxed(&ofproto->backer->rt_support.explicit_drop_action, + &value); + return value; } bool @@ -888,6 +893,12 @@ ovs_lb_output_action_supported(struct ofproto_dpif *ofproto) return ofproto->backer->rt_support.lb_output_action; } +bool +ovs_psample_supported(struct ofproto_dpif *ofproto) +{ + return ofproto->backer->rt_support.psample; +} + /* Tests whether 'backer''s datapath supports recirculation. Only newer * datapaths support OVS_KEY_ATTR_RECIRC_ID in keys. We need to disable some * features on older datapaths that don't support this feature. @@ -1283,7 +1294,7 @@ check_ct_eventmask(struct dpif_backer *backer) /* Compose a dummy UDP packet. */ dp_packet_init(&packet, 0); - flow_compose(&packet, &flow, NULL, 64); + flow_compose(&packet, &flow, NULL, 64, false); /* Execute the actions. On older datapaths this fails with EINVAL, on * newer datapaths it succeeds. */ @@ -1376,7 +1387,7 @@ check_ct_timeout_policy(struct dpif_backer *backer) /* Compose a dummy UDP packet. */ dp_packet_init(&packet, 0); - flow_compose(&packet, &flow, NULL, 64); + flow_compose(&packet, &flow, NULL, 64, false); /* Execute the actions. On older datapaths this fails with EINVAL, on * newer datapaths it succeeds. */ @@ -1403,6 +1414,40 @@ check_ct_timeout_policy(struct dpif_backer *backer) return !error; } +/* Tests whether backer's datapath supports the OVS_ACTION_ATTR_DROP action. 
*/ +static bool +check_drop_action(struct dpif_backer *backer) +{ + struct odputil_keybuf keybuf; + uint8_t actbuf[NL_A_U32_SIZE]; + struct ofpbuf actions; + struct ofpbuf key; + bool supported; + + struct flow flow = { + .dl_type = CONSTANT_HTONS(0x1234), /* bogus */ + }; + struct odp_flow_key_parms odp_parms = { + .flow = &flow, + .probe = true, + }; + + ofpbuf_use_stack(&key, &keybuf, sizeof keybuf); + odp_flow_key_from_flow(&odp_parms, &key); + + ofpbuf_use_stack(&actions, &actbuf, sizeof actbuf); + nl_msg_put_u32(&actions, OVS_ACTION_ATTR_DROP, XLATE_OK); + + supported = dpif_may_support_explicit_drop_action(backer->dpif) && + dpif_probe_feature(backer->dpif, "drop", &key, &actions, NULL); + + VLOG_INFO("%s: Datapath %s explicit drop action", + dpif_name(backer->dpif), + (supported) ? "supports" : "does not support"); + + return supported; +} + /* Tests whether 'backer''s datapath supports the all-zero SNAT case. */ static bool dpif_supports_ct_zero_snat(struct dpif_backer *backer) @@ -1590,6 +1635,44 @@ check_add_mpls(struct dpif_backer *backer) return supported; } +/* Tests whether 'backer''s datapath supports the OVS_ACTION_ATTR_PSAMPLE + * action. */ +static bool +check_psample(struct dpif_backer *backer) +{ + uint8_t cookie[OVS_PSAMPLE_COOKIE_MAX_SIZE]; + struct odputil_keybuf keybuf; + struct ofpbuf actions; + struct ofpbuf key; + bool supported; + + /* Intentionally bogus dl_type. */ + struct flow flow = { + .dl_type = CONSTANT_HTONS(0x1234), + }; + struct odp_flow_key_parms odp_parms = { + .flow = &flow, + .probe = true, + }; + + ofpbuf_use_stack(&key, &keybuf, sizeof keybuf); + odp_flow_key_from_flow(&odp_parms, &key); + ofpbuf_init(&actions, 32); + + /* Generate a random max-size cookie. 
*/ + random_bytes(cookie, sizeof cookie); + + odp_put_psample_action(&actions, 10, cookie, sizeof cookie); + + supported = dpif_may_support_psample(backer->dpif) && + dpif_probe_feature(backer->dpif, "psample", &key, &actions, NULL); + + ofpbuf_uninit(&actions); + VLOG_INFO("%s: Datapath %s psample action", dpif_name(backer->dpif), + supported ? "supports" : "does not support"); + return supported; +} + #define CHECK_FEATURE__(NAME, SUPPORT, FIELD, VALUE, ETHTYPE) \ static bool \ check_##NAME(struct dpif_backer *backer) \ @@ -1637,6 +1720,24 @@ CHECK_FEATURE__(ct_orig_tuple6, ct_orig_tuple6, ct_nw_proto, 1, ETH_TYPE_IPV6) #undef CHECK_FEATURE #undef CHECK_FEATURE__ +static void +copy_support(struct dpif_backer_support *dst, struct dpif_backer_support *src) +{ +#define DPIF_SUPPORT_FIELD(TYPE, NAME, TITLE) \ + if (!strcmp(#TYPE, "atomic_bool")) { \ + bool value; \ + atomic_read_relaxed((atomic_bool *) &src->NAME, &value); \ + atomic_store_relaxed((atomic_bool *) &dst->NAME, value); \ + } else { \ + dst->NAME = src->NAME; \ + } + + DPIF_SUPPORT_FIELDS +#undef DPIF_SUPPORT_FIELD + + dst->odp = src->odp; +} + static void check_support(struct dpif_backer *backer) { @@ -1655,12 +1756,13 @@ check_support(struct dpif_backer *backer) backer->rt_support.max_hash_alg = check_max_dp_hash_alg(backer); backer->rt_support.check_pkt_len = check_check_pkt_len(backer); backer->rt_support.ct_timeout = check_ct_timeout_policy(backer); - backer->rt_support.explicit_drop_action = - dpif_supports_explicit_drop_action(backer->dpif); + atomic_store_relaxed(&backer->rt_support.explicit_drop_action, + check_drop_action(backer)); backer->rt_support.lb_output_action = dpif_supports_lb_output_action(backer->dpif); backer->rt_support.ct_zero_snat = dpif_supports_ct_zero_snat(backer); backer->rt_support.add_mpls = check_add_mpls(backer); + backer->rt_support.psample = check_psample(backer); /* Flow fields. 
*/ backer->rt_support.odp.ct_state = check_ct_state(backer); @@ -1673,6 +1775,28 @@ check_support(struct dpif_backer *backer) backer->rt_support.odp.nd_ext = check_nd_extensions(backer); } +/* TC does not support offloading the explicit drop action. As such we need to + * re-probe the datapath if hw-offload has been modified. + * Note: We don't support true --> false transition as that requires a restart. + * See netdev_set_flow_api_enabled(). */ +static bool +recheck_support_explicit_drop_action(struct dpif_backer *backer) +{ + bool explicit_drop_action; + + atomic_read_relaxed(&backer->rt_support.explicit_drop_action, + &explicit_drop_action); + + if (explicit_drop_action + && !dpif_may_support_explicit_drop_action(backer->dpif)) { + ovs_assert(!check_drop_action(backer)); + atomic_store_relaxed(&backer->rt_support.explicit_drop_action, false); + return true; + } + + return false; +} + static int construct(struct ofproto *ofproto_) { @@ -1713,6 +1837,7 @@ construct(struct ofproto *ofproto_) ofproto->change_seq = 0; ofproto->ams_seq = seq_create(); ofproto->ams_seqno = seq_read(ofproto->ams_seq); + ofproto->explicit_sampled_drops = false; SHASH_FOR_EACH_SAFE (node, &init_ofp_ports) { @@ -1852,6 +1977,7 @@ destruct(struct ofproto *ofproto_, bool del) netflow_unref(ofproto->netflow); dpif_sflow_unref(ofproto->sflow); dpif_ipfix_unref(ofproto->ipfix); + dpif_lsample_unref(ofproto->lsample); hmap_destroy(&ofproto->bundles); mac_learning_unref(ofproto->ml); mcast_snooping_unref(ofproto->ms); @@ -1984,6 +2110,11 @@ run(struct ofproto *ofproto_) } } } + + if (ofproto->explicit_sampled_drops != ofproto_explicit_sampled_drops) { + ofproto->explicit_sampled_drops = ofproto_explicit_sampled_drops; + ofproto->backer->need_revalidate = REV_RECONFIGURE; + } return 0; } @@ -2189,8 +2320,7 @@ port_destruct(struct ofport *port_, bool del) struct ofproto_dpif *ofproto = ofproto_dpif_cast(port->up.ofproto); const char *devname = netdev_get_name(port->up.netdev); const char 
*netdev_type = netdev_get_type(port->up.netdev); - char namebuf[NETDEV_VPORT_NAME_BUFSIZE]; - const char *dp_port_name; + struct dpif_port dpif_port; ofproto->backer->need_revalidate = REV_RECONFIGURE; xlate_txn_start(); @@ -2204,9 +2334,13 @@ port_destruct(struct ofport *port_, bool del) del = dpif_cleanup_required(ofproto->backer->dpif); } - dp_port_name = netdev_vport_get_dpif_port(port->up.netdev, namebuf, - sizeof namebuf); - if (del && dpif_port_exists(ofproto->backer->dpif, dp_port_name)) { + /* Don't try to delete ports that are not part of the datapath. */ + if (del && port->odp_port == ODPP_NONE) { + del = false; + } + + if (del && !dpif_port_query_by_number(ofproto->backer->dpif, + port->odp_port, &dpif_port, false)) { /* The underlying device is still there, so delete it. This * happens when the ofproto is being destroyed, since the caller * assumes that removal of attached ports will happen as part of @@ -2214,6 +2348,7 @@ port_destruct(struct ofport *port_, bool del) if (!port->is_tunnel) { dpif_port_del(ofproto->backer->dpif, port->odp_port, false); } + dpif_port_destroy(&dpif_port); } else if (del) { /* The underlying device is already deleted (e.g. tunctl -d). 
* Calling dpif_port_remove to do local cleanup for the netdev */ @@ -2407,6 +2542,41 @@ get_ipfix_stats(const struct ofproto *ofproto_, return dpif_ipfix_get_stats(di, bridge_ipfix, replies); } +static int +set_local_sample(struct ofproto *ofproto_, + const struct ofproto_lsample_options *options, + size_t n_opts) +{ + struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofproto_); + struct dpif_lsample *lsample = ofproto->lsample; + bool changed = false; + + if (!ofproto->backer->rt_support.psample) { + return EOPNOTSUPP; + } + + if (n_opts && !lsample) { + lsample = ofproto->lsample = dpif_lsample_create(); + changed = true; + } + + if (lsample) { + if (!n_opts) { + dpif_lsample_unref(lsample); + lsample = ofproto->lsample = NULL; + changed = true; + } else if (dpif_lsample_set_options(lsample, options, n_opts)) { + changed = true; + } + } + + if (changed) { + ofproto->backer->need_revalidate = REV_RECONFIGURE; + } + + return 0; +} + static int set_cfm(struct ofport *ofport_, const struct cfm_settings *s) { @@ -3670,7 +3840,7 @@ mirror_set__(struct ofproto *ofproto_, void *aux, const struct ofproto_mirror_settings *s) { struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofproto_); - struct ofbundle **srcs, **dsts; + struct mirror_bundles mb; int error; size_t i; @@ -3679,23 +3849,34 @@ mirror_set__(struct ofproto *ofproto_, void *aux, return 0; } - srcs = xmalloc(s->n_srcs * sizeof *srcs); - dsts = xmalloc(s->n_dsts * sizeof *dsts); + mb.srcs = xmalloc(s->n_srcs * sizeof *mb.srcs); + mb.dsts = xmalloc(s->n_dsts * sizeof *mb.dsts); for (i = 0; i < s->n_srcs; i++) { - srcs[i] = bundle_lookup(ofproto, s->srcs[i]); + mb.srcs[i] = bundle_lookup(ofproto, s->srcs[i]); } for (i = 0; i < s->n_dsts; i++) { - dsts[i] = bundle_lookup(ofproto, s->dsts[i]); + mb.dsts[i] = bundle_lookup(ofproto, s->dsts[i]); + } + + mb.n_srcs = s->n_srcs; + mb.n_dsts = s->n_dsts; + mb.out_bundle = bundle_lookup(ofproto, s->out_bundle); + + error = mirror_set(ofproto->mbridge, ofproto_, aux, s, &mb); 
+ + if (!error) { + ofproto->backer->need_revalidate = REV_RECONFIGURE; + } else if (error == ECANCELED) { + /* The user requested a change that is identical to the current state, + * the reconfiguration is canceled, but don't log an error message + * about that. */ + error = 0; } - error = mirror_set(ofproto->mbridge, aux, s->name, srcs, s->n_srcs, dsts, - s->n_dsts, s->src_vlans, - bundle_lookup(ofproto, s->out_bundle), - s->snaplen, s->out_vlan); - free(srcs); - free(dsts); + free(mb.srcs); + free(mb.dsts); return error; } @@ -3929,15 +4110,21 @@ port_query_by_name(const struct ofproto *ofproto_, const char *devname, int error; if (sset_contains(&ofproto->ghost_ports, devname)) { - const char *type = netdev_get_type_from_name(devname); - /* We may be called before ofproto->up.port_by_name is populated with * the appropriate ofport. For this reason, we must get the name and - * type from the netdev layer directly. */ - if (type) { - const struct ofport *ofport; + * type from the netdev layer directly. + * However, when a port deleted, the corresponding netdev is also + * removed from netdev_shash. netdev_get_type_from_name returns NULL + * in such case and we should try to get type from ofport->netdev. */ + const char *type = netdev_get_type_from_name(devname); + const struct ofport *ofport = + shash_find_data(&ofproto->up.port_by_name, devname); - ofport = shash_find_data(&ofproto->up.port_by_name, devname); + if (!type && ofport && ofport->netdev) { + type = netdev_get_type(ofport->netdev); + } + + if (type) { ofproto_port->ofp_port = ofport ? ofport->ofp_port : OFPP_NONE; ofproto_port->name = xstrdup(devname); ofproto_port->type = xstrdup(type); @@ -4411,15 +4598,20 @@ ofproto_dpif_get_tables_version(struct ofproto_dpif *ofproto) * a reference. * * 'flow' is non-const to allow for temporary modifications during the lookup. - * Any changes are restored before returning. */ + * Any changes are restored before returning. 
+ * + * 'conj_flows' is an optional parameter. If it is non-null, the matching + * conjunctive flows are inserted. */ static struct rule_dpif * rule_dpif_lookup_in_table(struct ofproto_dpif *ofproto, ovs_version_t version, uint8_t table_id, struct flow *flow, - struct flow_wildcards *wc) + struct flow_wildcards *wc, + struct hmapx *conj_flows) { struct classifier *cls = &ofproto->up.tables[table_id].cls; return rule_dpif_cast(rule_from_cls_rule(classifier_lookup(cls, version, - flow, wc))); + flow, wc, + conj_flows))); } void @@ -4461,7 +4653,10 @@ ofproto_dpif_credit_table_stats(struct ofproto_dpif *ofproto, uint8_t table_id, * 'in_port'. This is needed for resubmit action support. * * 'flow' is non-const to allow for temporary modifications during the lookup. - * Any changes are restored before returning. */ + * Any changes are restored before returning. + * + * 'conj_flows' is an optional parameter. If it is non-null, the matching + * conjunctive flows are inserted. */ struct rule_dpif * rule_dpif_lookup_from_table(struct ofproto_dpif *ofproto, ovs_version_t version, struct flow *flow, @@ -4469,7 +4664,8 @@ rule_dpif_lookup_from_table(struct ofproto_dpif *ofproto, const struct dpif_flow_stats *stats, uint8_t *table_id, ofp_port_t in_port, bool may_packet_in, bool honor_table_miss, - struct xlate_cache *xcache) + struct xlate_cache *xcache, + struct hmapx *conj_flows) { ovs_be16 old_tp_src = flow->tp_src, old_tp_dst = flow->tp_dst; ofp_port_t old_in_port = flow->in_port.ofp_port; @@ -4525,7 +4721,8 @@ rule_dpif_lookup_from_table(struct ofproto_dpif *ofproto, next_id++, next_id += (next_id == TBL_INTERNAL)) { *table_id = next_id; - rule = rule_dpif_lookup_in_table(ofproto, version, next_id, flow, wc); + rule = rule_dpif_lookup_in_table(ofproto, version, next_id, flow, wc, + conj_flows); if (stats) { struct oftable *tbl = &ofproto->up.tables[next_id]; unsigned long orig; @@ -4908,7 +5105,7 @@ packet_xlate(struct ofproto *ofproto_, struct ofproto_packet_out *opo) if 
(entry->type == XC_LEARN) { struct ofproto_flow_mod *ofm = entry->learn.ofm; - error = ofproto_flow_mod_learn_refresh(ofm); + error = ofproto_flow_mod_learn_refresh(ofm, time_msec()); if (error) { goto error_out; } @@ -5380,11 +5577,12 @@ type_set_config(const char *type, const struct smap *other_config) } static void -ct_flush(const struct ofproto *ofproto_, const uint16_t *zone) +ct_flush(const struct ofproto *ofproto_, const uint16_t *zone, + const struct ofp_ct_match *match) { struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofproto_); - ct_dpif_flush(ofproto->backer->dpif, zone, NULL); + ct_dpif_flush(ofproto->backer->dpif, zone, match); } static struct ct_timeout_policy * @@ -5549,6 +5747,8 @@ ct_zone_config_init(struct dpif_backer *backer) cmap_init(&backer->ct_zones); hmap_init(&backer->ct_tps); ovs_list_init(&backer->ct_tp_kill_list); + ovs_list_init(&backer->ct_zone_limits_to_add); + ovs_list_init(&backer->ct_zone_limits_to_del); clear_existing_ct_timeout_policies(backer); } @@ -5572,6 +5772,8 @@ ct_zone_config_uninit(struct dpif_backer *backer) id_pool_destroy(backer->tp_ids); cmap_destroy(&backer->ct_zones); hmap_destroy(&backer->ct_tps); + ct_dpif_free_zone_limits(&backer->ct_zone_limits_to_add); + ct_dpif_free_zone_limits(&backer->ct_zone_limits_to_del); } static void @@ -5652,50 +5854,100 @@ ct_del_zone_timeout_policy(const char *datapath_type, uint16_t zone_id) } } +static void +ct_zone_limit_update(const char *datapath_type, int32_t zone_id, + int64_t *limit) +{ + struct dpif_backer *backer = shash_find_data(&all_dpif_backers, + datapath_type); + if (!backer) { + return; + } + + if (limit) { + ct_dpif_push_zone_limit(&backer->ct_zone_limits_to_add, zone_id, + *limit, 0); + } else { + ct_dpif_push_zone_limit(&backer->ct_zone_limits_to_del, zone_id, 0, 0); + } +} + +static void +ct_zone_limits_commit(struct dpif_backer *backer) +{ + if (!ovs_list_is_empty(&backer->ct_zone_limits_to_add)) { + ct_dpif_set_limits(backer->dpif, 
&backer->ct_zone_limits_to_add); + ct_dpif_free_zone_limits(&backer->ct_zone_limits_to_add); + } + + if (!ovs_list_is_empty(&backer->ct_zone_limits_to_del)) { + ct_dpif_del_limits(backer->dpif, &backer->ct_zone_limits_to_del); + ct_dpif_free_zone_limits(&backer->ct_zone_limits_to_del); + } +} + +static void +ct_zone_limit_protection_update(const char *datapath_type, bool protected) +{ + struct dpif_backer *backer = shash_find_data(&all_dpif_backers, + datapath_type); + if (!backer) { + return; + } + + ct_dpif_set_zone_limit_protection(backer->dpif, protected); +} + static void get_datapath_cap(const char *datapath_type, struct smap *cap) { - struct odp_support odp; - struct dpif_backer_support s; + bool explicit_drop_action; + struct dpif_backer_support *s; struct dpif_backer *backer = shash_find_data(&all_dpif_backers, datapath_type); if (!backer) { return; } - s = backer->rt_support; - odp = s.odp; + s = &backer->rt_support; /* ODP_SUPPORT_FIELDS */ smap_add_format(cap, "max_vlan_headers", "%"PRIuSIZE, - odp.max_vlan_headers); - smap_add_format(cap, "max_mpls_depth", "%"PRIuSIZE, odp.max_mpls_depth); - smap_add(cap, "recirc", odp.recirc ? "true" : "false"); - smap_add(cap, "ct_state", odp.ct_state ? "true" : "false"); - smap_add(cap, "ct_zone", odp.ct_zone ? "true" : "false"); - smap_add(cap, "ct_mark", odp.ct_mark ? "true" : "false"); - smap_add(cap, "ct_label", odp.ct_label ? "true" : "false"); - smap_add(cap, "ct_state_nat", odp.ct_state_nat ? "true" : "false"); - smap_add(cap, "ct_orig_tuple", odp.ct_orig_tuple ? "true" : "false"); - smap_add(cap, "ct_orig_tuple6", odp.ct_orig_tuple6 ? "true" : "false"); - smap_add(cap, "nd_ext", odp.nd_ext ? "true" : "false"); + s->odp.max_vlan_headers); + smap_add_format(cap, "max_mpls_depth", "%"PRIuSIZE, s->odp.max_mpls_depth); + smap_add(cap, "recirc", s->odp.recirc ? "true" : "false"); + smap_add(cap, "ct_state", s->odp.ct_state ? "true" : "false"); + smap_add(cap, "ct_zone", s->odp.ct_zone ? 
"true" : "false"); + smap_add(cap, "ct_mark", s->odp.ct_mark ? "true" : "false"); + smap_add(cap, "ct_label", s->odp.ct_label ? "true" : "false"); + smap_add(cap, "ct_state_nat", s->odp.ct_state_nat ? "true" : "false"); + smap_add(cap, "ct_orig_tuple", s->odp.ct_orig_tuple ? "true" : "false"); + smap_add(cap, "ct_orig_tuple6", s->odp.ct_orig_tuple6 ? "true" : "false"); + smap_add(cap, "nd_ext", s->odp.nd_ext ? "true" : "false"); /* DPIF_SUPPORT_FIELDS */ - smap_add(cap, "masked_set_action", s.masked_set_action ? "true" : "false"); - smap_add(cap, "tnl_push_pop", s.tnl_push_pop ? "true" : "false"); - smap_add(cap, "ufid", s.ufid ? "true" : "false"); - smap_add(cap, "trunc", s.trunc ? "true" : "false"); - smap_add(cap, "clone", s.clone ? "true" : "false"); - smap_add(cap, "sample_nesting", s.sample_nesting ? "true" : "false"); - smap_add(cap, "ct_eventmask", s.ct_eventmask ? "true" : "false"); - smap_add(cap, "ct_clear", s.ct_clear ? "true" : "false"); - smap_add_format(cap, "max_hash_alg", "%"PRIuSIZE, s.max_hash_alg); - smap_add(cap, "check_pkt_len", s.check_pkt_len ? "true" : "false"); - smap_add(cap, "ct_timeout", s.ct_timeout ? "true" : "false"); + smap_add(cap, "masked_set_action", + s->masked_set_action ? "true" : "false"); + smap_add(cap, "tnl_push_pop", s->tnl_push_pop ? "true" : "false"); + smap_add(cap, "ufid", s->ufid ? "true" : "false"); + smap_add(cap, "trunc", s->trunc ? "true" : "false"); + smap_add(cap, "clone", s->clone ? "true" : "false"); + smap_add(cap, "sample_nesting", s->sample_nesting ? "true" : "false"); + smap_add(cap, "ct_eventmask", s->ct_eventmask ? "true" : "false"); + smap_add(cap, "ct_clear", s->ct_clear ? "true" : "false"); + smap_add_format(cap, "max_hash_alg", "%"PRIuSIZE, s->max_hash_alg); + smap_add(cap, "check_pkt_len", s->check_pkt_len ? "true" : "false"); + smap_add(cap, "ct_timeout", s->ct_timeout ? 
"true" : "false"); + atomic_read_relaxed(&s->explicit_drop_action, &explicit_drop_action); smap_add(cap, "explicit_drop_action", - s.explicit_drop_action ? "true" :"false"); - smap_add(cap, "lb_output_action", s.lb_output_action ? "true" : "false"); - smap_add(cap, "ct_zero_snat", s.ct_zero_snat ? "true" : "false"); - smap_add(cap, "add_mpls", s.add_mpls ? "true" : "false"); + explicit_drop_action ? "true" :"false"); + smap_add(cap, "lb_output_action", s->lb_output_action ? "true" : "false"); + smap_add(cap, "ct_zero_snat", s->ct_zero_snat ? "true" : "false"); + smap_add(cap, "add_mpls", s->add_mpls ? "true" : "false"); + smap_add(cap, "psample", s->psample ? "true" : "false"); + + /* The ct_tuple_flush is implemented on dpif level, so it is supported + * for all backers. */ + smap_add(cap, "ct_flush", "true"); } /* Gets timeout policy name in 'backer' based on 'zone', 'dl_type' and @@ -5968,7 +6220,7 @@ ofproto_unixctl_fdb_add(struct unixctl_conn *conn, int argc OVS_UNUSED, const char *port_name = argv[2]; uint16_t vlan = atoi(argv[3]); struct eth_addr mac; - int age; + time_t age; ofproto = ofproto_dpif_lookup_by_name(br_name); if (!ofproto) { @@ -6135,7 +6387,7 @@ ofproto_unixctl_mcast_snooping_show(struct unixctl_conn *conn, return; } - ds_put_cstr(&ds, " port VLAN GROUP Age\n"); + ds_put_cstr(&ds, " port VLAN protocol GROUP Age\n"); ovs_rwlock_rdlock(&ofproto->ms->rwlock); LIST_FOR_EACH (grp, group_node, &ofproto->ms->group_lru) { LIST_FOR_EACH(b, bundle_node, &grp->bundle_lru) { @@ -6144,7 +6396,9 @@ ofproto_unixctl_mcast_snooping_show(struct unixctl_conn *conn, bundle = b->port; ofputil_port_to_string(ofbundle_get_a_port(bundle)->up.ofp_port, NULL, name, sizeof name); - ds_put_format(&ds, "%5s %4d ", name, grp->vlan); + ds_put_format(&ds, "%5s %4d %-8s ", name, grp->vlan, + mcast_snooping_group_protocol_str( + grp->protocol_version)); ipv6_format_mapped(&grp->addr, &ds); ds_put_format(&ds, " %3d\n", mcast_bundle_age(ofproto->ms, b)); @@ -6158,8 +6412,9 @@ 
ofproto_unixctl_mcast_snooping_show(struct unixctl_conn *conn, bundle = mrouter->port; ofputil_port_to_string(ofbundle_get_a_port(bundle)->up.ofp_port, NULL, name, sizeof name); - ds_put_format(&ds, "%5s %4d querier %3d\n", + ds_put_format(&ds, "%5s %4d %-8s querier %3d\n", name, mrouter->vlan, + mcast_snooping_group_protocol_str(-1), mcast_mrouter_age(ofproto->ms, mrouter)); } ovs_rwlock_unlock(&ofproto->ms->rwlock); @@ -6209,20 +6464,30 @@ ofproto_unixctl_dpif_dump_dps(struct unixctl_conn *conn, int argc OVS_UNUSED, } static void -show_dp_feature_bool(struct ds *ds, const char *feature, bool b) +show_dp_feature_bool(struct ds *ds, const char *feature, const bool *b) +{ + ds_put_format(ds, "%s: %s\n", feature, *b ? "Yes" : "No"); +} + +static void +show_dp_feature_atomic_bool(struct ds *ds, const char *feature, + const atomic_bool *b) { - ds_put_format(ds, "%s: %s\n", feature, b ? "Yes" : "No"); + bool value; + atomic_read_relaxed((atomic_bool *) b, &value); + ds_put_format(ds, "%s: %s\n", feature, value ? 
"Yes" : "No"); } static void -show_dp_feature_size_t(struct ds *ds, const char *feature, size_t s) +show_dp_feature_size_t(struct ds *ds, const char *feature, const size_t *s) { - ds_put_format(ds, "%s: %"PRIuSIZE"\n", feature, s); + ds_put_format(ds, "%s: %"PRIuSIZE"\n", feature, *s); } enum dpif_support_field_type { DPIF_SUPPORT_FIELD_bool, DPIF_SUPPORT_FIELD_size_t, + DPIF_SUPPORT_FIELD_atomic_bool, }; struct dpif_support_field { @@ -6239,12 +6504,12 @@ static void dpif_show_support(const struct dpif_backer_support *support, struct ds *ds) { #define DPIF_SUPPORT_FIELD(TYPE, NAME, TITLE) \ - show_dp_feature_##TYPE (ds, TITLE, support->NAME); + show_dp_feature_##TYPE (ds, TITLE, &support->NAME); DPIF_SUPPORT_FIELDS #undef DPIF_SUPPORT_FIELD #define ODP_SUPPORT_FIELD(TYPE, NAME, TITLE) \ - show_dp_feature_##TYPE (ds, TITLE, support->odp.NAME ); + show_dp_feature_##TYPE (ds, TITLE, &support->odp.NAME ); ODP_SUPPORT_FIELDS #undef ODP_SUPPORT_FIELD } @@ -6263,6 +6528,16 @@ display_support_field(const char *name, b ? "true" : "false"); break; } + case DPIF_SUPPORT_FIELD_atomic_bool: { + bool b, v; + + atomic_read_relaxed((atomic_bool *) field->rt_ptr, &v); + atomic_read_relaxed((atomic_bool *) field->bt_ptr, &b); + ds_put_format(ds, "%s (%s) : [run time]:%s, [boot time]:%s\n", name, + field->title, v ? "true" : "false", + b ? 
"true" : "false"); + break; + } case DPIF_SUPPORT_FIELD_size_t: ds_put_format(ds, "%s (%s) : [run time]:%"PRIuSIZE ", [boot time]:%"PRIuSIZE"\n", name, @@ -6280,7 +6555,8 @@ display_support_field(const char *name, static bool dpif_set_support(struct dpif_backer_support *rt_support, struct dpif_backer_support *bt_support, - const char *name, const char *value, struct ds *ds) + const char *name, const char *value, bool force, + struct ds *ds) { struct shash all_fields = SHASH_INITIALIZER(&all_fields); struct dpif_support_field *field; @@ -6332,8 +6608,13 @@ dpif_set_support(struct dpif_backer_support *rt_support, if (field->type == DPIF_SUPPORT_FIELD_bool) { if (!strcasecmp(value, "true")) { - if (*(bool *)field->bt_ptr) { - *(bool *)field->rt_ptr = true; + if (*(bool *) field->bt_ptr || force) { + if (force) { + VLOG_WARN( + "Enabling an unsupported feature is very dangerous" + ); + } + *(bool *) field->rt_ptr = true; changed = true; } else { ds_put_cstr(ds, "Can not enable features not supported by the datapth"); @@ -6367,19 +6648,108 @@ dpif_set_support(struct dpif_backer_support *rt_support, return changed; } +static struct json * +dpif_show_backer_json(struct json *backers, const struct dpif_backer *backer) +{ + struct json *json_backer = json_object_create(); + + /* Add datapath as new JSON object using its name as key. */ + json_object_put(backers, dpif_name(backer->dpif), json_backer); + + /* Add datapath's stats under "stats" key. */ + struct json *json_dp_stats = json_object_create(); + struct dpif_dp_stats dp_stats; + + dpif_get_dp_stats(backer->dpif, &dp_stats); + json_object_put_format(json_dp_stats, "hit", "%"PRIu64, dp_stats.n_hit); + json_object_put_format(json_dp_stats, "missed", "%"PRIu64, + dp_stats.n_missed); + json_object_put(json_backer, "stats", json_dp_stats); + + /* Add datapath's bridges under "bridges" key. 
*/ + struct json *json_dp_bridges = json_object_create(); + + struct shash ofproto_shash = SHASH_INITIALIZER(&ofproto_shash); + free(get_ofprotos(&ofproto_shash)); + + struct shash_node *node; + SHASH_FOR_EACH (node, &ofproto_shash) { + struct ofproto_dpif *ofproto = node->data; + + if (ofproto->backer != backer) { + continue; + } + + /* Add bridge to "bridges" dictionary using its name as key. */ + struct json *json_ofproto = json_object_create(); + + /* Add bridge ports to the current bridge dictionary. */ + const struct shash_node *port; + SHASH_FOR_EACH (port, &ofproto->up.port_by_name) { + /* Add bridge port to a bridge's dict using port name as key. */ + struct json *json_ofproto_port = json_object_create(); + struct ofport *ofport = port->data; + + /* Add OpenFlow port associated with a bridge port. */ + json_object_put_format(json_ofproto_port, "ofport", "%"PRIu32, + ofport->ofp_port); + + /* Add bridge port number. */ + odp_port_t odp_port = ofp_port_to_odp_port(ofproto, + ofport->ofp_port); + if (odp_port != ODPP_NONE) { + json_object_put_format(json_ofproto_port, "port_no", + "%"PRIu32, odp_port); + } else { + json_object_put_string(json_ofproto_port, "port_no", "none"); + } + + /* Add type of a bridge port. */ + json_object_put_string(json_ofproto_port, "type", + netdev_get_type(ofport->netdev)); + + /* Add config entries for a bridge port. */ + + struct smap config = SMAP_INITIALIZER(&config); + + if (!netdev_get_config(ofport->netdev, &config) + && smap_count(&config)) { + struct json *json_port_config = json_object_create(); + struct smap_node *cfg_node; + + SMAP_FOR_EACH (cfg_node, &config) { + json_object_put_string(json_port_config, cfg_node->key, + cfg_node->value); + } + json_object_put(json_ofproto_port, "config", json_port_config); + } + smap_destroy(&config); + + json_object_put(json_ofproto, netdev_get_name(ofport->netdev), + json_ofproto_port); + } /* End of bridge port(s). 
*/ + + json_object_put(json_dp_bridges, ofproto->up.name, json_ofproto); + } /* End of bridge(s). */ + + shash_destroy(&ofproto_shash); + + json_object_put(json_backer, "bridges", json_dp_bridges); + return json_backer; +} + static void -dpif_show_backer(const struct dpif_backer *backer, struct ds *ds) +dpif_show_backer_text(const struct dpif_backer *backer, struct ds *ds) { + struct shash ofproto_shash = SHASH_INITIALIZER(&ofproto_shash); const struct shash_node **ofprotos; struct dpif_dp_stats dp_stats; - struct shash ofproto_shash; size_t i; dpif_get_dp_stats(backer->dpif, &dp_stats); ds_put_format(ds, "%s: hit:%"PRIu64" missed:%"PRIu64"\n", dpif_name(backer->dpif), dp_stats.n_hit, dp_stats.n_missed); - shash_init(&ofproto_shash); ofprotos = get_ofprotos(&ofproto_shash); for (i = 0; i < shash_count(&ofproto_shash); i++) { struct ofproto_dpif *ofproto = ofprotos[i]->data; @@ -6435,18 +6805,26 @@ static void ofproto_unixctl_dpif_show(struct unixctl_conn *conn, int argc OVS_UNUSED, const char *argv[] OVS_UNUSED, void *aux OVS_UNUSED) { - struct ds ds = DS_EMPTY_INITIALIZER; - const struct shash_node **backers; - int i; + if (unixctl_command_get_output_format(conn) == UNIXCTL_OUTPUT_FMT_JSON) { + struct json *backers = json_object_create(); + const struct shash_node *backer; - backers = shash_sort(&all_dpif_backers); - for (i = 0; i < shash_count(&all_dpif_backers); i++) { - dpif_show_backer(backers[i]->data, &ds); - } - free(backers); + SHASH_FOR_EACH (backer, &all_dpif_backers) { + dpif_show_backer_json(backers, backer->data); + } + unixctl_command_reply_json(conn, backers); + } else { + const struct shash_node **backers = shash_sort(&all_dpif_backers); + struct ds ds = DS_EMPTY_INITIALIZER; - unixctl_command_reply(conn, ds_cstr(&ds)); - ds_destroy(&ds); + for (int i = 0; i < shash_count(&all_dpif_backers); i++) { + dpif_show_backer_text(backers[i]->data, &ds); + } + free(backers); + + unixctl_command_reply(conn, ds_cstr(&ds)); + ds_destroy(&ds); + } } static void 
@@ -6568,10 +6946,19 @@ ofproto_unixctl_dpif_set_dp_features(struct unixctl_conn *conn, void *aux OVS_UNUSED) { struct ds ds = DS_EMPTY_INITIALIZER; - const char *br = argv[1]; + struct ofproto_dpif *ofproto; + bool changed, force = false; const char *name, *value; - struct ofproto_dpif *ofproto = ofproto_dpif_lookup_by_name(br); - bool changed; + const char *br; + + if (argc > 2 && !strcmp(argv[1], "--force")) { + force = true; + argc--; + argv++; + } + + br = argv[1]; + ofproto = ofproto_dpif_lookup_by_name(br); if (!ofproto) { unixctl_command_reply_error(conn, "no such bridge"); @@ -6582,7 +6969,7 @@ ofproto_unixctl_dpif_set_dp_features(struct unixctl_conn *conn, value = argc > 3 ? argv[3] : NULL; changed = dpif_set_support(&ofproto->backer->rt_support, &ofproto->backer->bt_support, - name, value, &ds); + name, value, force, &ds); if (changed) { xlate_set_support(ofproto, &ofproto->backer->rt_support); udpif_flush(ofproto->backer->udpif); @@ -6625,7 +7012,8 @@ ofproto_unixctl_init(void) unixctl_command_register("dpif/dump-flows", "[-m] [--names | --no-names] bridge", 1, INT_MAX, ofproto_unixctl_dpif_dump_flows, NULL); - unixctl_command_register("dpif/set-dp-features", "bridge", 1, 3 , + unixctl_command_register("dpif/set-dp-features", + "[--force] bridge [feature [value]]", 1, 4, ofproto_unixctl_dpif_set_dp_features, NULL); } @@ -6703,7 +7091,8 @@ ofproto_dpif_add_internal_flow(struct ofproto_dpif *ofproto, rule = rule_dpif_lookup_in_table(ofproto, ofproto_dpif_get_tables_version(ofproto), - TBL_INTERNAL, &match->flow, &match->wc); + TBL_INTERNAL, &match->flow, &match->wc, + NULL); if (rule) { *rulep = &rule->up; } else { @@ -6888,6 +7277,7 @@ const struct ofproto_class ofproto_dpif_class = { set_sflow, set_ipfix, get_ipfix_stats, + set_local_sample, set_cfm, cfm_status_changed, get_cfm_status, @@ -6937,4 +7327,6 @@ const struct ofproto_class ofproto_dpif_class = { ct_flush, /* ct_flush */ ct_set_zone_timeout_policy, ct_del_zone_timeout_policy, + 
ct_zone_limit_update, + ct_zone_limit_protection_update, }; diff --git a/ofproto/ofproto-dpif.h b/ofproto/ofproto-dpif.h index d8e0cd37ac5..f8d3df5ab5a 100644 --- a/ofproto/ofproto-dpif.h +++ b/ofproto/ofproto-dpif.h @@ -51,6 +51,7 @@ #include "hmapx.h" #include "odp-util.h" #include "id-pool.h" +#include "ovs-atomic.h" #include "ovs-thread.h" #include "ofproto-provider.h" #include "util.h" @@ -103,7 +104,8 @@ struct rule_dpif *rule_dpif_lookup_from_table(struct ofproto_dpif *, ofp_port_t in_port, bool may_packet_in, bool honor_table_miss, - struct xlate_cache *); + struct xlate_cache *, + struct hmapx *conj_flows); void rule_dpif_credit_stats(struct rule_dpif *, const struct dpif_flow_stats *, bool); @@ -201,7 +203,8 @@ struct group_dpif *group_dpif_lookup(struct ofproto_dpif *, DPIF_SUPPORT_FIELD(bool, ct_timeout, "Conntrack timeout policy") \ \ /* True if the datapath supports explicit drop action. */ \ - DPIF_SUPPORT_FIELD(bool, explicit_drop_action, "Explicit Drop action") \ + DPIF_SUPPORT_FIELD(atomic_bool, explicit_drop_action, \ + "Explicit Drop action") \ \ /* True if the datapath supports balance_tcp optimization */ \ DPIF_SUPPORT_FIELD(bool, lb_output_action, "Optimized Balance TCP mode")\ @@ -210,7 +213,10 @@ struct group_dpif *group_dpif_lookup(struct ofproto_dpif *, DPIF_SUPPORT_FIELD(bool, ct_zero_snat, "Conntrack all-zero IP SNAT") \ \ /* True if the datapath supports add_mpls action. */ \ - DPIF_SUPPORT_FIELD(bool, add_mpls, "MPLS Label add") + DPIF_SUPPORT_FIELD(bool, add_mpls, "MPLS Label add") \ + \ + /* True if the datapath supports psample action. */ \ + DPIF_SUPPORT_FIELD(bool, psample, "psample action") /* Stores the various features which the corresponding backer supports. */ @@ -284,6 +290,11 @@ struct dpif_backer { feature than 'bt_support'. */ struct atomic_count tnl_count; + + struct ovs_list ct_zone_limits_to_add; /* CT zone limits queued for + * addition into datapath. 
*/ + struct ovs_list ct_zone_limits_to_del; /* CT zone limits queued for + * deletion from datapath. */ }; /* All existing ofproto_backer instances, indexed by ofproto->up.type. */ @@ -320,6 +331,7 @@ struct ofproto_dpif { struct netflow *netflow; struct dpif_sflow *sflow; struct dpif_ipfix *ipfix; + struct dpif_lsample *lsample; struct hmap bundles; /* Contains "struct ofbundle"s. */ struct mac_learning *ml; struct mcast_snooping *ms; @@ -353,6 +365,8 @@ struct ofproto_dpif { bool is_controller_connected; /* True if any controller admitted this * switch connection. */ + bool explicit_sampled_drops; /* If explicit drop actions must added after + * trailing sample actions. */ }; struct ofproto_dpif *ofproto_dpif_lookup_by_name(const char *name); @@ -403,5 +417,6 @@ bool ofproto_dpif_ct_zone_timeout_policy_get_name( uint8_t nw_proto, char **tp_name, bool *unwildcard); bool ovs_explicit_drop_action_supported(struct ofproto_dpif *); +bool ovs_psample_supported(struct ofproto_dpif *); #endif /* ofproto-dpif.h */ diff --git a/ofproto/ofproto-provider.h b/ofproto/ofproto-provider.h index 7e3fb669852..7df3f524691 100644 --- a/ofproto/ofproto-provider.h +++ b/ofproto/ofproto-provider.h @@ -42,6 +42,7 @@ #include "ofproto/ofproto.h" #include "openvswitch/list.h" #include "openvswitch/ofp-actions.h" +#include "openvswitch/ofp-ct.h" #include "openvswitch/ofp-errors.h" #include "openvswitch/ofp-flow.h" #include "openvswitch/ofp-group.h" @@ -449,6 +450,8 @@ void ofproto_rule_ref(struct rule *); bool ofproto_rule_try_ref(struct rule *); void ofproto_rule_unref(struct rule *); +void ofproto_rule_stats_ds(struct ds *, struct rule *, bool offload_stats); + static inline const struct rule_actions * rule_get_actions(const struct rule *); static inline bool rule_is_table_miss(const struct rule *); static inline bool rule_is_hidden(const struct rule *); @@ -540,10 +543,19 @@ extern unsigned ofproto_max_revalidator; * duration exceeds half of max-revalidator config variable. 
*/ extern unsigned ofproto_min_revalidate_pps; +/* Worst case delay (in ms) it might take before statistics of offloaded flows + * are updated. Offloaded flows younger than this delay will always be + * revalidated regardless of ofproto_min_revalidate_pps. */ +extern unsigned ofproto_offloaded_stats_delay; + /* Number of upcall handler and revalidator threads. Only affects the * ofproto-dpif implementation. */ extern uint32_t n_handlers, n_revalidators; +/* If an explicit datapath drop action shall be added after trailing sample + * actions coming from IPFIX / sFlow / local sampling. */ +extern bool ofproto_explicit_sampled_drops; + static inline struct rule *rule_from_cls_rule(const struct cls_rule *); void ofproto_rule_expire(struct rule *rule, uint8_t reason) @@ -1483,6 +1495,15 @@ struct ofproto_class { bool bridge_ipfix, struct ovs_list *replies ); + /* Configures local sampling on 'ofproto' according to the options array + * of 'options' which contains 'n_options' elements. + * + * EOPNOTSUPP as a return value indicates that 'ofproto' does not support + * local sampling. */ + int (*set_local_sample)(struct ofproto *ofproto, + const struct ofproto_lsample_options *options, + size_t n_options); + /* Configures connectivity fault management on 'ofport'. * * If 'cfm_settings' is nonnull, configures CFM according to its members. @@ -1902,8 +1923,10 @@ struct ofproto_class { /* ## Connection tracking ## */ /* ## ------------------- ## */ /* Flushes the connection tracking tables. If 'zone' is not NULL, - * only deletes connections in '*zone'. */ - void (*ct_flush)(const struct ofproto *, const uint16_t *zone); + * only deletes connections in '*zone'. If 'match' is not NULL, + * deletes connections specified by the match. */ + void (*ct_flush)(const struct ofproto *, const uint16_t *zone, + const struct ofp_ct_match *match); /* Sets conntrack timeout policy specified by 'timeout_policy' to 'zone' * in datapath type 'dp_type'. 
*/ @@ -1913,6 +1936,19 @@ struct ofproto_class { /* Deletes the timeout policy associated with 'zone' in datapath type * 'dp_type'. */ void (*ct_del_zone_timeout_policy)(const char *dp_type, uint16_t zone); + + /* Updates the CT zone limit for specified zone. Setting 'zone' to + * 'OVS_ZONE_LIMIT_DEFAULT_ZONE' represents the default zone. + * 'NULL' passed as 'limit' indicates that the limit should be removed for + * the specified zone. The caller must ensure that the 'limit' value is + * within proper range (0 - UINT32_MAX). */ + void (*ct_zone_limit_update)(const char *dp_type, int32_t zone, + int64_t *limit); + + /* Sets the CT zone limit protection to "protected" for the specified + * datapath type. */ + void (*ct_zone_limit_protection_update)(const char *dp_type, + bool protected); }; extern const struct ofproto_class ofproto_dpif_class; @@ -2019,9 +2055,11 @@ enum ofperr ofproto_flow_mod_init_for_learn(struct ofproto *, struct ofproto_flow_mod *) OVS_EXCLUDED(ofproto_mutex); enum ofperr ofproto_flow_mod_learn(struct ofproto_flow_mod *, bool keep_ref, - unsigned limit, bool *below_limit) + unsigned limit, bool *below_limit, + long long int last_used) OVS_EXCLUDED(ofproto_mutex); -enum ofperr ofproto_flow_mod_learn_refresh(struct ofproto_flow_mod *ofm); +enum ofperr ofproto_flow_mod_learn_refresh(struct ofproto_flow_mod *ofm, + long long int last_used); enum ofperr ofproto_flow_mod_learn_start(struct ofproto_flow_mod *ofm) OVS_REQUIRES(ofproto_mutex); void ofproto_flow_mod_learn_revert(struct ofproto_flow_mod *ofm) diff --git a/ofproto/ofproto-tnl-unixctl.man b/ofproto/ofproto-tnl-unixctl.man index 13a465119a9..a801cfdccc5 100644 --- a/ofproto/ofproto-tnl-unixctl.man +++ b/ofproto/ofproto-tnl-unixctl.man @@ -1,8 +1,9 @@ .SS "OPENVSWITCH TUNNELING COMMANDS" These commands query and modify OVS tunnel components. . -.IP "\fBovs/route/add ipv4_address/plen output_bridge [GW]\fR" -Adds ipv4_address/plen route to vswitchd routing table. 
output_bridge +.IP "\fBovs/route/add \fIip\fB/\fIplen\fB \fIoutput_bridge\fB \ +[\fIgw\fB] [pkt_mark=\fImark\fB] [src=\fIsrc_ip\fB]\fR" +Adds \fIip\fR/\fIplen\fR route to vswitchd routing table. \fIoutput_bridge\fR needs to be OVS bridge name. This command is useful if OVS cached routes does not look right. . @@ -10,8 +11,8 @@ routes does not look right. Print all routes in OVS routing table, This includes routes cached from system routing table and user configured routes. . -.IP "\fBovs/route/del ipv4_address/plen\fR" -Delete ipv4_address/plen route from OVS routing table. +.IP "\fBovs/route/del ip/plen [pkt_mark=mark]\fR" +Delete ip/plen route from OVS routing table. . .IP "\fBtnl/neigh/show\fR" .IP "\fBtnl/arp/show\fR" diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c index 3a527683cb3..982421cddd5 100644 --- a/ofproto/ofproto.c +++ b/ofproto/ofproto.c @@ -42,6 +42,7 @@ #include "openvswitch/meta-flow.h" #include "openvswitch/ofp-actions.h" #include "openvswitch/ofp-bundle.h" +#include "openvswitch/ofp-ct.h" #include "openvswitch/ofp-errors.h" #include "openvswitch/ofp-match.h" #include "openvswitch/ofp-msgs.h" @@ -310,6 +311,8 @@ unsigned ofproto_flow_limit = OFPROTO_FLOW_LIMIT_DEFAULT; unsigned ofproto_max_idle = OFPROTO_MAX_IDLE_DEFAULT; unsigned ofproto_max_revalidator = OFPROTO_MAX_REVALIDATOR_DEFAULT; unsigned ofproto_min_revalidate_pps = OFPROTO_MIN_REVALIDATE_PPS_DEFAULT; +unsigned ofproto_offloaded_stats_delay = OFPROTO_OFFLOADED_STATS_DELAY; +bool ofproto_explicit_sampled_drops = OFPROTO_EXPLICIT_SAMPLED_DROPS_DEFAULT; uint32_t n_handlers, n_revalidators; @@ -723,7 +726,24 @@ ofproto_set_max_revalidator(unsigned max_revalidator) void ofproto_set_min_revalidate_pps(unsigned min_revalidate_pps) { - ofproto_min_revalidate_pps = min_revalidate_pps ? min_revalidate_pps : 1; + ofproto_min_revalidate_pps = min_revalidate_pps; +} + +/* Set worst case delay (in ms) it might take before statistics of offloaded + * flows are updated. 
Offloaded flows younger than this delay will always be + * revalidated regardless of ofproto_min_revalidate_pps. */ +void +ofproto_set_offloaded_stats_delay(unsigned offloaded_stats_delay) +{ + ofproto_offloaded_stats_delay = offloaded_stats_delay; +} + +/* Set if an explicit datapath drop action shall be added after trailing sample + * actions coming from IPFIX / sFlow / local sampling. */ +void +ofproto_set_explicit_sampled_drops(bool explicit_sampled_drops) +{ + ofproto_explicit_sampled_drops = explicit_sampled_drops; } /* If forward_bpdu is true, the NORMAL action will forward frames with @@ -789,7 +809,7 @@ ofproto_type_set_config(const char *datapath_type, const struct smap *cfg) datapath_type = ofproto_normalize_type(datapath_type); class = ofproto_class_find__(datapath_type); - if (class->type_set_config) { + if (class && class->type_set_config) { class->type_set_config(datapath_type, cfg); } } @@ -934,7 +954,30 @@ handle_nxt_ct_flush_zone(struct ofconn *ofconn, const struct ofp_header *oh) uint16_t zone = ntohs(nzi->zone_id); if (ofproto->ofproto_class->ct_flush) { - ofproto->ofproto_class->ct_flush(ofproto, &zone); + ofproto->ofproto_class->ct_flush(ofproto, &zone, NULL); + } else { + return EOPNOTSUPP; + } + + return 0; +} + +static enum ofperr +handle_nxt_ct_flush(struct ofconn *ofconn, const struct ofp_header *oh) +{ + struct ofproto *ofproto = ofconn_get_ofproto(ofconn); + struct ofp_ct_match match = {0}; + bool with_zone = false; + uint16_t zone_id = 0; + + enum ofperr error = ofp_ct_match_decode(&match, &with_zone, &zone_id, oh); + if (error) { + return error; + } + + if (ofproto->ofproto_class->ct_flush) { + ofproto->ofproto_class->ct_flush(ofproto, with_zone ? 
&zone_id : NULL, + &match); } else { return EOPNOTSUPP; } @@ -966,6 +1009,18 @@ ofproto_get_datapath_cap(const char *datapath_type, struct smap *dp_cap) } } +int ofproto_set_local_sample(struct ofproto *ofproto, + const struct ofproto_lsample_options *options, + size_t n_options) +{ + if (ofproto->ofproto_class->set_local_sample) { + return ofproto->ofproto_class->set_local_sample(ofproto, options, + n_options); + } else { + return EOPNOTSUPP; + } +} + /* Connection tracking configuration. */ void ofproto_ct_set_zone_timeout_policy(const char *datapath_type, uint16_t zone_id, @@ -992,6 +1047,29 @@ ofproto_ct_del_zone_timeout_policy(const char *datapath_type, uint16_t zone_id) } +void +ofproto_ct_zone_limit_update(const char *datapath_type, int32_t zone_id, + int64_t *limit) +{ + datapath_type = ofproto_normalize_type(datapath_type); + const struct ofproto_class *class = ofproto_class_find__(datapath_type); + + if (class && class->ct_zone_limit_update) { + class->ct_zone_limit_update(datapath_type, zone_id, limit); + } +} + +void +ofproto_ct_zone_limit_protection_update(const char *datapath_type, + bool protected) +{ + datapath_type = ofproto_normalize_type(datapath_type); + const struct ofproto_class *class = ofproto_class_find__(datapath_type); + + if (class && class->ct_zone_limit_protection_update) { + class->ct_zone_limit_protection_update(datapath_type, protected); + } +} /* Spanning Tree Protocol (STP) configuration. */ @@ -2442,6 +2520,7 @@ ofport_open(struct ofproto *ofproto, struct ofputil_phy_port *pp, struct netdev **p_netdev) { + uint32_t curr_speed, max_speed; enum netdev_flags flags; struct netdev *netdev; int error; @@ -2480,8 +2559,9 @@ ofport_open(struct ofproto *ofproto, pp->state = netdev_get_carrier(netdev) ? 
0 : OFPUTIL_PS_LINK_DOWN; netdev_get_features(netdev, &pp->curr, &pp->advertised, &pp->supported, &pp->peer); - pp->curr_speed = netdev_features_to_bps(pp->curr, 0) / 1000; - pp->max_speed = netdev_features_to_bps(pp->supported, 0) / 1000; + netdev_get_speed(netdev, &curr_speed, &max_speed); + pp->curr_speed = curr_speed * 1000; + pp->max_speed = max_speed * 1000; *p_netdev = netdev; return 0; @@ -4785,9 +4865,9 @@ handle_flow_stats_request(struct ofconn *ofconn, return 0; } -static void -flow_stats_ds(struct ofproto *ofproto, struct rule *rule, struct ds *results, - bool offload_stats) +void +ofproto_rule_stats_ds(struct ds *results, struct rule *rule, + bool offload_stats) { struct pkt_stats stats; const struct rule_actions *actions; @@ -4800,6 +4880,10 @@ flow_stats_ds(struct ofproto *ofproto, struct rule *rule, struct ds *results, created = rule->created; ovs_mutex_unlock(&rule->mutex); + if (rule->flow_cookie != 0) { + ds_put_format(results, "cookie=0x%"PRIx64", ", + ntohll(rule->flow_cookie)); + } if (rule->table_id != 0) { ds_put_format(results, "table_id=%"PRIu8", ", rule->table_id); } @@ -4812,7 +4896,8 @@ flow_stats_ds(struct ofproto *ofproto, struct rule *rule, struct ds *results, ds_put_format(results, "n_offload_bytes=%"PRIu64", ", stats.n_offload_bytes); } - cls_rule_format(&rule->cr, ofproto_get_tun_tab(ofproto), NULL, results); + cls_rule_format(&rule->cr, ofproto_get_tun_tab(rule->ofproto), NULL, + results); ds_put_char(results, ','); ds_put_cstr(results, "actions="); @@ -4834,7 +4919,7 @@ ofproto_get_all_flows(struct ofproto *p, struct ds *results, struct rule *rule; CLS_FOR_EACH (rule, cr, &table->cls) { - flow_stats_ds(p, rule, results, offload_stats); + ofproto_rule_stats_ds(results, rule, offload_stats); } } } @@ -5432,7 +5517,8 @@ ofproto_flow_mod_init_for_learn(struct ofproto *ofproto, } enum ofperr -ofproto_flow_mod_learn_refresh(struct ofproto_flow_mod *ofm) +ofproto_flow_mod_learn_refresh(struct ofproto_flow_mod *ofm, + long long int 
last_used) { enum ofperr error = 0; @@ -5453,9 +5539,37 @@ ofproto_flow_mod_learn_refresh(struct ofproto_flow_mod *ofm) * this function is executed the rule will be reinstated. */ if (rule->state == RULE_REMOVED) { struct cls_rule cr; + struct oftable *table = &rule->ofproto->tables[rule->table_id]; + ovs_version_t tables_version = rule->ofproto->tables_version; + + if (!cls_rule_visible_in_version(&rule->cr, tables_version)) { + const struct cls_rule *curr_cls_rule; + + /* Only check for matching classifier rules and their modified + * time, instead of also checking all rule metadata, with the goal + * of suppressing a learn action update that would replace a more + * recent rule in the classifier. */ + curr_cls_rule = classifier_find_rule_exactly(&table->cls, + &rule->cr, + tables_version); + if (curr_cls_rule) { + struct rule *curr_rule = rule_from_cls_rule(curr_cls_rule); + long long int curr_last_used; + + ovs_mutex_lock(&curr_rule->mutex); + curr_last_used = curr_rule->modified; + ovs_mutex_unlock(&curr_rule->mutex); + + if (curr_last_used > last_used) { + /* In the case of a newer visible rule, don't recreate the + * current rule. */ + return 0; + } + } + } - cls_rule_clone(&cr, &rule->cr); ovs_mutex_lock(&rule->mutex); + cls_rule_clone(&cr, &rule->cr); error = ofproto_rule_create(rule->ofproto, &cr, rule->table_id, rule->flow_cookie, rule->idle_timeout, @@ -5466,6 +5580,7 @@ ofproto_flow_mod_learn_refresh(struct ofproto_flow_mod *ofm) rule->match_tlv_bitmap, rule->ofpacts_tlv_bitmap, &ofm->temp_rule); + ofm->temp_rule->modified = last_used; ovs_mutex_unlock(&rule->mutex); if (!error) { ofproto_rule_unref(rule); /* Release old reference. */ @@ -5473,7 +5588,7 @@ ofproto_flow_mod_learn_refresh(struct ofproto_flow_mod *ofm) } else { /* Refresh the existing rule. 
*/ ovs_mutex_lock(&rule->mutex); - rule->modified = time_msec(); + rule->modified = last_used; ovs_mutex_unlock(&rule->mutex); } return error; @@ -5525,10 +5640,16 @@ ofproto_flow_mod_learn_finish(struct ofproto_flow_mod *ofm, /* Refresh 'ofm->temp_rule', for which the caller holds a reference, if already * in the classifier, insert it otherwise. If the rule has already been - * removed from the classifier, a new rule is created using 'ofm->temp_rule' as - * a template and the reference to the old 'ofm->temp_rule' is freed. If - * 'keep_ref' is true, then a reference to the current rule is held, otherwise - * it is released and 'ofm->temp_rule' is set to NULL. + * removed from the classifier and replaced by another rule, the 'last_used' + * parameter is used to determine whether the newer rule is replaced or kept. + * If 'last_used' is greater than the last modified time of an identical rule + * in the classifier, then a new rule is created using 'ofm->temp_rule' as a + * template and the reference to the old 'ofm->temp_rule' is freed. If the + * rule has been removed but another identical rule doesn't exist in the + * classifier, then it will be recreated. If the rule hasn't been removed + * from the classifier, then 'last_used' is used to update the rules modified + * time. If 'keep_ref' is true, then a reference to the current rule is held, + * otherwise it is released and 'ofm->temp_rule' is set to NULL. * * If 'limit' != 0, insertion will fail if there are more than 'limit' rules * in the same table with the same cookie. If insertion succeeds, @@ -5539,10 +5660,11 @@ ofproto_flow_mod_learn_finish(struct ofproto_flow_mod *ofm, * during the call. 
*/ enum ofperr ofproto_flow_mod_learn(struct ofproto_flow_mod *ofm, bool keep_ref, - unsigned limit, bool *below_limitp) + unsigned limit, bool *below_limitp, + long long int last_used) OVS_EXCLUDED(ofproto_mutex) { - enum ofperr error = ofproto_flow_mod_learn_refresh(ofm); + enum ofperr error = ofproto_flow_mod_learn_refresh(ofm, last_used); struct rule *rule = ofm->temp_rule; bool below_limit = true; @@ -5575,6 +5697,11 @@ ofproto_flow_mod_learn(struct ofproto_flow_mod *ofm, bool keep_ref, error = ofproto_flow_mod_learn_start(ofm); if (!error) { + /* ofproto_flow_mod_learn_start may have overwritten + * modified with current time. */ + ovs_mutex_lock(&ofm->temp_rule->mutex); + ofm->temp_rule->modified = last_used; + ovs_mutex_unlock(&ofm->temp_rule->mutex); error = ofproto_flow_mod_learn_finish(ofm, NULL); } } else { @@ -8787,6 +8914,9 @@ handle_single_part_openflow(struct ofconn *ofconn, const struct ofp_header *oh, case OFPTYPE_CT_FLUSH_ZONE: return handle_nxt_ct_flush_zone(ofconn, oh); + case OFPTYPE_CT_FLUSH: + return handle_nxt_ct_flush(ofconn, oh); + case OFPTYPE_HELLO: case OFPTYPE_ERROR: case OFPTYPE_FEATURES_REPLY: diff --git a/ofproto/ofproto.h b/ofproto/ofproto.h index a15208330c4..ca136da5a76 100644 --- a/ofproto/ofproto.h +++ b/ofproto/ofproto.h @@ -74,6 +74,11 @@ struct ofproto_sflow_options { char *control_ip; }; +/* When using UDP, IPFIX Template Records must be re-sent regularly. + * The standard default interval is 10 minutes (600 seconds). + * Cf. IETF RFC 5101 Section 10.3.6. */ +#define OFPROTO_IPFIX_DEFAULT_TEMPLATE_INTERVAL 600 + struct ofproto_ipfix_bridge_exporter_options { struct sset targets; uint32_t sampling_rate; @@ -81,6 +86,8 @@ struct ofproto_ipfix_bridge_exporter_options { uint32_t obs_point_id; /* Bridge-wide Observation Point ID. 
*/ uint32_t cache_active_timeout; uint32_t cache_max_flows; + uint32_t template_interval; + uint32_t stats_interval; bool enable_tunnel_sampling; bool enable_input_sampling; bool enable_output_sampling; @@ -92,10 +99,17 @@ struct ofproto_ipfix_flow_exporter_options { struct sset targets; uint32_t cache_active_timeout; uint32_t cache_max_flows; + uint32_t template_interval; + uint32_t stats_interval; bool enable_tunnel_sampling; char *virtual_obs_id; }; +struct ofproto_lsample_options { + uint32_t collector_set_id; + uint32_t group_id; +}; + struct ofproto_rstp_status { bool enabled; /* If false, ignore other members. */ rstp_identifier root_id; @@ -313,6 +327,8 @@ int ofproto_port_dump_done(struct ofproto_port_dump *); #define OFPROTO_MAX_IDLE_DEFAULT 10000 /* ms */ #define OFPROTO_MAX_REVALIDATOR_DEFAULT 500 /* ms */ #define OFPROTO_MIN_REVALIDATE_PPS_DEFAULT 5 +#define OFPROTO_OFFLOADED_STATS_DELAY 2000 /* ms */ +#define OFPROTO_EXPLICIT_SAMPLED_DROPS_DEFAULT false const char *ofproto_port_open_type(const struct ofproto *, const char *port_type); @@ -342,6 +358,7 @@ void ofproto_set_flow_limit(unsigned limit); void ofproto_set_max_idle(unsigned max_idle); void ofproto_set_max_revalidator(unsigned max_revalidator); void ofproto_set_min_revalidate_pps(unsigned min_revalidate_pps); +void ofproto_set_offloaded_stats_delay(unsigned offloaded_stats_delay); void ofproto_set_forward_bpdu(struct ofproto *, bool forward_bpdu); void ofproto_set_mac_table_config(struct ofproto *, unsigned idle_time, size_t max_entries); @@ -362,6 +379,9 @@ int ofproto_set_ipfix(struct ofproto *, const struct ofproto_ipfix_bridge_exporter_options *, const struct ofproto_ipfix_flow_exporter_options *, size_t); +int ofproto_set_local_sample(struct ofproto *ofproto, + const struct ofproto_lsample_options *, + size_t n_options); void ofproto_set_flow_restore_wait(bool flow_restore_wait_db); bool ofproto_get_flow_restore_wait(void); int ofproto_set_stp(struct ofproto *, const struct 
ofproto_stp_settings *); @@ -375,8 +395,13 @@ void ofproto_ct_set_zone_timeout_policy(const char *datapath_type, struct simap *timeout_policy); void ofproto_ct_del_zone_timeout_policy(const char *datapath_type, uint16_t zone); +void ofproto_ct_zone_limit_update(const char *datapath_type, int32_t zone_id, + int64_t *limit); +void ofproto_ct_zone_limit_protection_update(const char *datapath_type, + bool protected); void ofproto_get_datapath_cap(const char *datapath_type, struct smap *dp_cap); +void ofproto_set_explicit_sampled_drops(bool explicit_sampled_drops); /* Configuration of ports. */ void ofproto_port_unregister(struct ofproto *, ofp_port_t ofp_port); @@ -491,6 +516,9 @@ struct ofproto_mirror_settings { uint16_t out_vlan; /* Output VLAN, only if out_bundle is NULL. */ uint16_t snaplen; /* Max packet size of a mirrored packet in byte, set to 0 equals 65535. */ + + /* Output filter. */ + char *filter; }; int ofproto_mirror_register(struct ofproto *, void *aux, diff --git a/ofproto/tunnel.c b/ofproto/tunnel.c index 3455ed233b2..f067a6c26c1 100644 --- a/ofproto/tunnel.c +++ b/ofproto/tunnel.c @@ -432,6 +432,7 @@ tnl_port_send(const struct ofport_dpif *ofport, struct flow *flow, flow->tunnel.ipv6_dst = in6addr_any; } } + flow->tunnel.tp_src = 0; /* Do not carry from a previous tunnel. */ flow->tunnel.tp_dst = cfg->dst_port; if (!cfg->out_key_flow) { flow->tunnel.tun_id = cfg->out_key; @@ -464,9 +465,13 @@ tnl_port_send(const struct ofport_dpif *ofport, struct flow *flow, flow->tunnel.flags &= ~(FLOW_TNL_F_MASK & ~FLOW_TNL_PUB_F_MASK); flow->tunnel.flags |= (cfg->dont_fragment ? FLOW_TNL_F_DONT_FRAGMENT : 0) - | (cfg->csum ? FLOW_TNL_F_CSUM : 0) | (cfg->out_key_present ? 
FLOW_TNL_F_KEY : 0); + if (cfg->csum == NETDEV_TNL_CSUM_ENABLED || + (cfg->csum == NETDEV_TNL_CSUM_DEFAULT && !flow->tunnel.ip_dst)) { + flow->tunnel.flags |= FLOW_TNL_F_CSUM; + } + if (cfg->set_egress_pkt_mark) { flow->pkt_mark = cfg->egress_pkt_mark; wc->masks.pkt_mark = UINT32_MAX; @@ -705,8 +710,10 @@ tnl_port_format(const struct tnl_port *tnl_port, struct ds *ds) ds_put_cstr(ds, ", df=false"); } - if (cfg->csum) { + if (cfg->csum == NETDEV_TNL_CSUM_ENABLED) { ds_put_cstr(ds, ", csum=true"); + } else if (cfg->csum == NETDEV_TNL_CSUM_DISABLED) { + ds_put_cstr(ds, ", csum=false"); } ds_put_cstr(ds, ")\n"); diff --git a/ovsdb/automake.mk b/ovsdb/automake.mk index eba713bb6d7..d484fe9debb 100644 --- a/ovsdb/automake.mk +++ b/ovsdb/automake.mk @@ -114,11 +114,13 @@ $(OVSIDL_BUILT): ovsdb/ovsdb-idlc.in python/ovs/dirs.py # ovsdb-doc EXTRA_DIST += ovsdb/ovsdb-doc +FLAKE8_PYFILES += ovsdb/ovsdb-doc OVSDB_DOC = $(run_python) $(srcdir)/ovsdb/ovsdb-doc ovsdb/ovsdb-doc: python/ovs/dirs.py # ovsdb-dot EXTRA_DIST += ovsdb/ovsdb-dot.in ovsdb/dot2pic +FLAKE8_PYFILES += ovsdb/ovsdb-dot.in ovsdb/dot2pic noinst_SCRIPTS += ovsdb/ovsdb-dot CLEANFILES += ovsdb/ovsdb-dot OVSDB_DOT = $(run_python) $(srcdir)/ovsdb/ovsdb-dot.in diff --git a/ovsdb/condition.c b/ovsdb/condition.c index d0016fa7f79..4911fbf59be 100644 --- a/ovsdb/condition.c +++ b/ovsdb/condition.c @@ -47,7 +47,10 @@ ovsdb_clause_from_json(const struct ovsdb_table_schema *ts, /* Column and arg fields are not being used with boolean functions. 
* Use dummy values */ - clause->column = ovsdb_table_schema_get_column(ts, "_uuid"); + const struct ovsdb_column *uuid_column = + ovsdb_table_schema_get_column(ts, "_uuid"); + ovs_assert(uuid_column); + clause->column = uuid_column; clause->index = clause->column->index; ovsdb_datum_init_default(&clause->arg, &clause->column->type); return NULL; @@ -497,6 +500,67 @@ ovsdb_condition_cmp_3way(const struct ovsdb_condition *a, return 0; } +/* Given conditions 'a' and 'b', composes a new condition 'diff' that contains + * clauses that are present in one of the conditions, but not in the other. + * + * If some data doesn't match the resulted 'diff' condition, that means one of: + * 1. The data matches both 'a' and 'b'. + * 2. The data does not match either 'a' or 'b'. + * + * However, that is not true if one of the original conditions is a trivial + * True or False. In this case the function will currently just return an + * empty (True) condition. */ +void +ovsdb_condition_diff(struct ovsdb_condition *diff, + const struct ovsdb_condition *a, + const struct ovsdb_condition *b) +{ + size_t i, j; + int cmp; + + ovsdb_condition_init(diff); + + if (ovsdb_condition_is_trivial(a) || ovsdb_condition_is_trivial(b)) { + return; + } + + diff->clauses = xcalloc(a->n_clauses + b->n_clauses, + sizeof *diff->clauses); + + /* Clauses are sorted. 
*/ + for (i = j = 0; i < a->n_clauses && j < b->n_clauses;) { + cmp = compare_clauses_3way_with_data(&a->clauses[i], &b->clauses[j]); + if (cmp < 0) { + ovsdb_clause_clone(&diff->clauses[diff->n_clauses++], + &a->clauses[i++]); + } else if (cmp > 0) { + ovsdb_clause_clone(&diff->clauses[diff->n_clauses++], + &b->clauses[j++]); + } else { + i++; + j++; + } + } + for (; i < a->n_clauses; i++) { + ovsdb_clause_clone(&diff->clauses[diff->n_clauses++], + &a->clauses[i]); + } + for (; j < b->n_clauses; j++) { + ovsdb_clause_clone(&diff->clauses[diff->n_clauses++], + &b->clauses[j]); + } + + if (diff->n_clauses) { + diff->optimized = a->optimized && b->optimized; + if (diff->optimized) { + ovsdb_condition_optimize(diff); + } + } else { + free(diff->clauses); + diff->clauses = NULL; + } +} + void ovsdb_condition_clone(struct ovsdb_condition *to, const struct ovsdb_condition *from) diff --git a/ovsdb/condition.h b/ovsdb/condition.h index c794966ce94..95e4c4f2033 100644 --- a/ovsdb/condition.h +++ b/ovsdb/condition.h @@ -58,6 +58,9 @@ bool ovsdb_condition_match_any_clause(const struct ovsdb_datum *, unsigned int index_map[]); int ovsdb_condition_cmp_3way(const struct ovsdb_condition *a, const struct ovsdb_condition *b); +void ovsdb_condition_diff(struct ovsdb_condition *, + const struct ovsdb_condition *, + const struct ovsdb_condition *); void ovsdb_condition_clone(struct ovsdb_condition *to, const struct ovsdb_condition *from); bool ovsdb_condition_is_true(const struct ovsdb_condition *cond); @@ -66,6 +69,12 @@ const struct ovsdb_column ** ovsdb_condition_get_columns(const struct ovsdb_condition *cond, size_t *n_columns); +static inline bool +ovsdb_condition_is_trivial(const struct ovsdb_condition *cond) +{ + return ovsdb_condition_is_true(cond) || ovsdb_condition_is_false(cond); +} + static inline bool ovsdb_condition_empty_or_match_any(const struct ovsdb_datum *row_datum, const struct ovsdb_condition *cnd, diff --git a/ovsdb/dot2pic b/ovsdb/dot2pic index 
2f858e19d5b..3db6444de64 100755 --- a/ovsdb/dot2pic +++ b/ovsdb/dot2pic @@ -17,6 +17,7 @@ import getopt import sys + def dot2pic(src, dst): scale = 1.0 while True: @@ -49,8 +50,8 @@ def dot2pic(src, dst): dst.write("box at %f,%f wid %f height %f\n" % (x, y, width, height)) elif command == 'edge': - tail = words[1] - head = words[2] + # tail = words[1] + # head = words[2] n = int(words[3]) # Extract x,y coordinates. @@ -114,4 +115,3 @@ else: if font_scale: print(".ps %+d" % font_scale) print(".PE") - diff --git a/ovsdb/execution.c b/ovsdb/execution.c index f9b8067d02c..f4cc9e802ba 100644 --- a/ovsdb/execution.c +++ b/ovsdb/execution.c @@ -320,7 +320,7 @@ parse_row(const struct json *json, const struct ovsdb_table *table, } row = ovsdb_row_create(table); - error = ovsdb_row_from_json(row, json, symtab, columns); + error = ovsdb_row_from_json(row, json, symtab, columns, false); if (error) { ovsdb_row_destroy(row); return error; @@ -490,9 +490,11 @@ update_row_cb(const struct ovsdb_row *row, void *ur_) ur->n_matches++; if (!ovsdb_row_equal_columns(row, ur->row, ur->columns)) { + struct ovsdb_row *rw_row; + + ovsdb_txn_row_modify(ur->txn, row, &rw_row, NULL); ovsdb_error_assert(ovsdb_row_update_columns( - ovsdb_txn_row_modify(ur->txn, row), - ur->row, ur->columns, false)); + rw_row, ur->row, ur->columns, false)); } return true; @@ -572,13 +574,27 @@ static bool mutate_row_cb(const struct ovsdb_row *row, void *mr_) { struct mutate_row_cbdata *mr = mr_; + struct ovsdb_row *rw_row; + + /* Not trying to track the row diff here, because user transactions + * may attempt to add duplicates or remove elements that do not exist. 
*/ + ovsdb_txn_row_modify(mr->txn, row, &rw_row, NULL); mr->n_matches++; - *mr->error = ovsdb_mutation_set_execute(ovsdb_txn_row_modify(mr->txn, row), - mr->mutations); + *mr->error = ovsdb_mutation_set_execute(rw_row, mr->mutations); return *mr->error == NULL; } +static bool +count_row_cb(const struct ovsdb_row *row OVS_UNUSED, void *rc) +{ + size_t *row_count = rc; + + (*row_count)++; + + return true; +} + static struct ovsdb_error * ovsdb_execute_mutate(struct ovsdb_execution *x, struct ovsdb_parser *parser, struct json *result) @@ -603,7 +619,18 @@ ovsdb_execute_mutate(struct ovsdb_execution *x, struct ovsdb_parser *parser, error = ovsdb_condition_from_json(table->schema, where, x->symtab, &condition); } - if (!error) { + if (!error && ovsdb_mutation_set_empty(&mutations)) { + /* Special case with no mutations, just return the row count. */ + if (ovsdb_condition_empty(&condition)) { + json_object_put(result, "count", + json_integer_create(hmap_count(&table->rows))); + } else { + size_t row_count = 0; + ovsdb_query(table, &condition, count_row_cb, &row_count); + json_object_put(result, "count", + json_integer_create(row_count)); + } + } else if (!error) { mr.n_matches = 0; mr.txn = x->txn; mr.mutations = &mutations; @@ -764,7 +791,7 @@ ovsdb_execute_wait(struct ovsdb_execution *x, struct ovsdb_parser *parser, row = ovsdb_row_create(table); error = ovsdb_row_from_json(row, rows->array.elems[i], x->symtab, - NULL); + NULL, false); if (error) { ovsdb_row_destroy(row); break; diff --git a/ovsdb/file.c b/ovsdb/file.c index ca80c282356..66ef87a1f16 100644 --- a/ovsdb/file.c +++ b/ovsdb/file.c @@ -23,6 +23,7 @@ #include "bitmap.h" #include "column.h" +#include "cooperative-multitasking.h" #include "log.h" #include "openvswitch/json.h" #include "lockfile.h" @@ -52,7 +53,8 @@ static void ovsdb_file_txn_init(struct ovsdb_file_txn *); static void ovsdb_file_txn_add_row(struct ovsdb_file_txn *, const struct ovsdb_row *old, const struct ovsdb_row *new, - const unsigned long 
int *changed); + const unsigned long int *changed, + bool allow_shallow_copies); /* If set to 'true', file transactions will contain difference between * datums of old and new rows and not the whole new datum for the column. */ @@ -79,8 +81,8 @@ ovsdb_file_column_diff_disable(void) } static struct ovsdb_error * -ovsdb_file_update_row_from_json(struct ovsdb_row *row, bool converting, - bool row_contains_diff, +ovsdb_file_update_row_from_json(struct ovsdb_row *row, struct ovsdb_row *diff, + bool converting, bool row_contains_diff, const struct json *json) { struct ovsdb_table_schema *schema = row->table->schema; @@ -106,16 +108,27 @@ ovsdb_file_update_row_from_json(struct ovsdb_row *row, bool converting, column_name, schema->name); } - error = ovsdb_datum_from_json(&datum, &column->type, node->data, NULL); + if (row_contains_diff) { + /* Diff may violate the type size rules. */ + error = ovsdb_transient_datum_from_json(&datum, &column->type, + node->data); + } else { + error = ovsdb_datum_from_json(&datum, &column->type, + node->data, NULL); + } if (error) { return error; } - if (row_contains_diff - && !ovsdb_datum_is_default(&row->fields[column->index], - &column->type)) { + if (row_contains_diff) { error = ovsdb_datum_apply_diff_in_place( &row->fields[column->index], &datum, &column->type); + if (!error && diff) { + ovs_assert(ovsdb_datum_is_default(&diff->fields[column->index], + &column->type)); + ovsdb_datum_swap(&diff->fields[column->index], &datum); + } + ovsdb_datum_destroy(&datum, &column->type); if (error) { return error; @@ -144,17 +157,20 @@ ovsdb_file_txn_row_from_json(struct ovsdb_txn *txn, struct ovsdb_table *table, ovsdb_txn_row_delete(txn, row); return NULL; } else if (row) { - return ovsdb_file_update_row_from_json(ovsdb_txn_row_modify(txn, row), - converting, row_contains_diff, - json); + struct ovsdb_row *new, *diff = NULL; + + ovsdb_txn_row_modify(txn, row, &new, + row_contains_diff ? 
&diff : NULL); + return ovsdb_file_update_row_from_json(new, diff, converting, + row_contains_diff, json); } else { struct ovsdb_error *error; struct ovsdb_row *new; new = ovsdb_row_create(table); *ovsdb_row_get_uuid_rw(new) = *row_uuid; - error = ovsdb_file_update_row_from_json(new, converting, - row_contains_diff, json); + error = ovsdb_file_update_row_from_json(new, NULL, converting, + false, json); if (error) { ovsdb_row_destroy(new); } else { @@ -269,22 +285,50 @@ ovsdb_convert_table(struct ovsdb_txn *txn, const struct ovsdb_table *src_table, struct ovsdb_table *dst_table) { + const struct ovsdb_column **dst_columns; + struct ovsdb_error *error = NULL; const struct ovsdb_row *src_row; + unsigned long *src_equal; + struct shash_node *node; + size_t n_src_columns; + + n_src_columns = shash_count(&src_table->schema->columns); + src_equal = bitmap_allocate(n_src_columns); + dst_columns = xzalloc(n_src_columns * sizeof *dst_columns); + + SHASH_FOR_EACH (node, &src_table->schema->columns) { + const struct ovsdb_column *src_column = node->data; + + if (src_column->index == OVSDB_COL_UUID || + src_column->index == OVSDB_COL_VERSION) { + continue; + } + + const struct ovsdb_column *dst_column = + shash_find_data(&dst_table->schema->columns, src_column->name); + + if (!dst_column) { + continue; + } + + dst_columns[src_column->index] = dst_column; + + if (ovsdb_type_equals(&src_column->type, &dst_column->type)) { + bitmap_set1(src_equal, src_column->index); + } + } + HMAP_FOR_EACH (src_row, hmap_node, &src_table->rows) { struct ovsdb_row *dst_row = ovsdb_row_create(dst_table); *ovsdb_row_get_uuid_rw(dst_row) = *ovsdb_row_get_uuid(src_row); - struct shash_node *node; + cooperative_multitasking_yield(); + SHASH_FOR_EACH (node, &src_table->schema->columns) { const struct ovsdb_column *src_column = node->data; - if (src_column->index == OVSDB_COL_UUID || - src_column->index == OVSDB_COL_VERSION) { - continue; - } + const struct ovsdb_column *dst_column; - const struct 
ovsdb_column *dst_column - = shash_find_data(&dst_table->schema->columns, - src_column->name); + dst_column = dst_columns[src_column->index]; if (!dst_column) { continue; } @@ -292,19 +336,30 @@ ovsdb_convert_table(struct ovsdb_txn *txn, ovsdb_datum_destroy(&dst_row->fields[dst_column->index], &dst_column->type); - struct ovsdb_error *error = ovsdb_datum_convert( + if (bitmap_is_set(src_equal, src_column->index)) { + /* This column didn't change - no need to convert. */ + ovsdb_datum_clone(&dst_row->fields[dst_column->index], + &src_row->fields[src_column->index]); + continue; + } + + error = ovsdb_datum_convert( &dst_row->fields[dst_column->index], &dst_column->type, &src_row->fields[src_column->index], &src_column->type); if (error) { ovsdb_datum_init_empty(&dst_row->fields[dst_column->index]); ovsdb_row_destroy(dst_row); - return error; + goto exit; } } ovsdb_txn_row_insert(txn, dst_row); } - return NULL; + +exit: + free(dst_columns); + bitmap_free(src_equal); + return error; } /* Copies the data in 'src', converts it into the schema specified in @@ -361,12 +416,19 @@ ovsdb_file_change_cb(const struct ovsdb_row *old, void *ftxn_) { struct ovsdb_file_txn *ftxn = ftxn_; - ovsdb_file_txn_add_row(ftxn, old, new, changed); + ovsdb_file_txn_add_row(ftxn, old, new, changed, true); return true; } +/* Converts the database into transaction JSON representation. + * If 'allow_shallow_copies' is false, makes sure that all the JSON + * objects in the resulted transaction JSON are separately allocated + * objects and not shallow clones of JSON objects already existing + * in the database. Useful when multiple threads are working on the + * same database object. 
*/ struct json * -ovsdb_to_txn_json(const struct ovsdb *db, const char *comment) +ovsdb_to_txn_json(const struct ovsdb *db, const char *comment, + bool allow_shallow_copies) { struct ovsdb_file_txn ftxn; @@ -378,7 +440,8 @@ ovsdb_to_txn_json(const struct ovsdb *db, const char *comment) const struct ovsdb_row *row; HMAP_FOR_EACH (row, hmap_node, &table->rows) { - ovsdb_file_txn_add_row(&ftxn, NULL, row, NULL); + ovsdb_file_txn_add_row(&ftxn, NULL, row, NULL, + allow_shallow_copies); } } @@ -426,7 +489,8 @@ static void ovsdb_file_txn_add_row(struct ovsdb_file_txn *ftxn, const struct ovsdb_row *old, const struct ovsdb_row *new, - const unsigned long int *changed) + const unsigned long int *changed, + bool allow_shallow_copies) { struct json *row; @@ -451,10 +515,20 @@ ovsdb_file_txn_add_row(struct ovsdb_file_txn *ftxn, if (old && use_column_diff) { ovsdb_datum_diff(&datum, &old->fields[idx], &new->fields[idx], type); - column_json = ovsdb_datum_to_json(&datum, type); + if (allow_shallow_copies) { + column_json = ovsdb_datum_to_json(&datum, type); + } else { + column_json = ovsdb_datum_to_json_deep(&datum, type); + } ovsdb_datum_destroy(&datum, type); } else { - column_json = ovsdb_datum_to_json(&new->fields[idx], type); + if (allow_shallow_copies) { + column_json = ovsdb_datum_to_json( + &new->fields[idx], type); + } else { + column_json = ovsdb_datum_to_json_deep( + &new->fields[idx], type); + } } if (!row) { row = json_object_create(); @@ -465,9 +539,12 @@ ovsdb_file_txn_add_row(struct ovsdb_file_txn *ftxn, } if (row) { + ovs_assert(new || old); struct ovsdb_table *table = new ? new->table : old->table; char uuid[UUID_LEN + 1]; + ovs_assert(table); + if (table != ftxn->table) { /* Create JSON object for transaction overall. 
*/ if (!ftxn->json) { diff --git a/ovsdb/file.h b/ovsdb/file.h index be4f6ad27ca..ae90d4fe130 100644 --- a/ovsdb/file.h +++ b/ovsdb/file.h @@ -25,7 +25,8 @@ struct ovsdb_txn; void ovsdb_file_column_diff_disable(void); -struct json *ovsdb_to_txn_json(const struct ovsdb *, const char *comment); +struct json *ovsdb_to_txn_json(const struct ovsdb *, const char *comment, + bool allow_shallow_copies); struct json *ovsdb_file_txn_to_json(const struct ovsdb_txn *); struct json *ovsdb_file_txn_annotate(struct json *, const char *comment); struct ovsdb_error *ovsdb_file_txn_from_json(struct ovsdb *, diff --git a/ovsdb/jsonrpc-server.c b/ovsdb/jsonrpc-server.c index 916a1f414e5..26a53898f0a 100644 --- a/ovsdb/jsonrpc-server.c +++ b/ovsdb/jsonrpc-server.c @@ -21,6 +21,7 @@ #include "bitmap.h" #include "column.h" +#include "cooperative-multitasking.h" #include "openvswitch/dynamic-string.h" #include "monitor.h" #include "openvswitch/json.h" @@ -211,13 +212,116 @@ struct ovsdb_jsonrpc_options * ovsdb_jsonrpc_default_options(const char *target) { struct ovsdb_jsonrpc_options *options = xzalloc(sizeof *options); - options->max_backoff = RECONNECT_DEFAULT_MAX_BACKOFF; - options->probe_interval = (stream_or_pstream_needs_probes(target) - ? RECONNECT_DEFAULT_PROBE_INTERVAL - : 0); + struct jsonrpc_session_options *rpc_opt = &options->rpc; + + rpc_opt->max_backoff = RECONNECT_DEFAULT_MAX_BACKOFF; + rpc_opt->probe_interval = (stream_or_pstream_needs_probes(target) + ? 
RECONNECT_DEFAULT_PROBE_INTERVAL : 0); + rpc_opt->dscp = DSCP_DEFAULT; return options; } +struct ovsdb_jsonrpc_options * +ovsdb_jsonrpc_options_clone(const struct ovsdb_jsonrpc_options *options) +{ + struct ovsdb_jsonrpc_options *clone; + + clone = xmemdup(options, sizeof *options); + clone->role = nullable_xstrdup(options->role); + + return clone; +} + +void +ovsdb_jsonrpc_options_free(struct ovsdb_jsonrpc_options *options) +{ + if (options) { + free(options->role); + free(options); + } +} + +struct json * +ovsdb_jsonrpc_options_to_json(const struct ovsdb_jsonrpc_options *options, + bool jsonrpc_session_only) +{ + struct json *json = json_object_create(); + + json_object_put(json, "max-backoff", + json_integer_create(options->rpc.max_backoff)); + json_object_put(json, "inactivity-probe", + json_integer_create(options->rpc.probe_interval)); + json_object_put(json, "dscp", json_integer_create(options->rpc.dscp)); + + if (jsonrpc_session_only) { + /* Caller is not interested in OVSDB-specific options. 
*/ + return json; + } + + json_object_put(json, "read-only", + json_boolean_create(options->read_only)); + if (options->role) { + json_object_put(json, "role", json_string_create(options->role)); + } + + return json; +} + +void +ovsdb_jsonrpc_options_update_from_json(struct ovsdb_jsonrpc_options *options, + const struct json *json, + bool jsonrpc_session_only) +{ + const struct json *max_backoff, *probe_interval, *read_only, *dscp, *role; + struct ovsdb_parser parser; + struct ovsdb_error *error; + + ovsdb_parser_init(&parser, json, "JSON-RPC options"); + + max_backoff = ovsdb_parser_member(&parser, "max-backoff", + OP_INTEGER | OP_OPTIONAL); + if (max_backoff) { + options->rpc.max_backoff = json_integer(max_backoff); + } + + probe_interval = ovsdb_parser_member(&parser, "inactivity-probe", + OP_INTEGER | OP_OPTIONAL); + if (probe_interval) { + options->rpc.probe_interval = json_integer(probe_interval); + } + + dscp = ovsdb_parser_member(&parser, "dscp", OP_INTEGER | OP_OPTIONAL); + if (dscp) { + options->rpc.dscp = json_integer(dscp); + } + + if (jsonrpc_session_only) { + /* Caller is not interested in OVSDB-specific options. */ + goto exit; + } + + read_only = ovsdb_parser_member(&parser, "read-only", + OP_BOOLEAN | OP_OPTIONAL); + if (read_only) { + options->read_only = json_boolean(read_only); + } + + role = ovsdb_parser_member(&parser, "role", OP_STRING | OP_OPTIONAL); + if (role) { + free(options->role); + options->role = nullable_xstrdup(json_string(role)); + } + +exit: + error = ovsdb_parser_finish(&parser); + if (error) { + char *s = ovsdb_error_to_string_free(error); + + VLOG_WARN("%s", s); + free(s); + } +} + /* Sets 'svr''s current set of remotes to the names in 'new_remotes', with * options in the struct ovsdb_jsonrpc_options supplied as the data values. 
* @@ -237,7 +341,8 @@ ovsdb_jsonrpc_server_set_remotes(struct ovsdb_jsonrpc_server *svr, if (!options) { VLOG_INFO("%s: remote deconfigured", node->name); ovsdb_jsonrpc_server_del_remote(node); - } else if (options->dscp != remote->dscp) { + } else if (options->rpc.dscp != remote->dscp + || !nullable_string_is_equal(options->role, remote->role)) { ovsdb_jsonrpc_server_del_remote(node); } } @@ -266,26 +371,37 @@ ovsdb_jsonrpc_server_add_remote(struct ovsdb_jsonrpc_server *svr, struct pstream *listener; int error; - error = jsonrpc_pstream_open(name, &listener, options->dscp); - if (error && error != EAFNOSUPPORT) { - VLOG_ERR_RL(&rl, "%s: listen failed: %s", name, ovs_strerror(error)); - return NULL; - } + error = jsonrpc_pstream_open(name, &listener, options->rpc.dscp); + switch (error) { + case 0: + case EAFNOSUPPORT: + remote = xmalloc(sizeof *remote); + remote->server = svr; + remote->listener = listener; + ovs_list_init(&remote->sessions); + remote->dscp = options->rpc.dscp; + remote->read_only = options->read_only; + remote->role = nullable_xstrdup(options->role); + shash_add(&svr->remotes, name, remote); + if (!listener) { + /* Not a listener, attempt creation of active jsonrpc session. 
*/ + ovsdb_jsonrpc_session_create(remote, + jsonrpc_session_open(name, true), + svr->read_only || remote->read_only); + } + return remote; - remote = xmalloc(sizeof *remote); - remote->server = svr; - remote->listener = listener; - ovs_list_init(&remote->sessions); - remote->dscp = options->dscp; - remote->read_only = options->read_only; - remote->role = nullable_xstrdup(options->role); - shash_add(&svr->remotes, name, remote); + case EAGAIN: + VLOG_DBG_RL(&rl, "%s: listen failed: " + "DNS resolution in progress or host not found", name); + return NULL; - if (!listener) { - ovsdb_jsonrpc_session_create(remote, jsonrpc_session_open(name, true), - svr->read_only || remote->read_only); + default: + VLOG_ERR_RL(&rl, "%s: listen failed: %s", name, + ovs_strerror(error)); + return NULL; } - return remote; + OVS_NOT_REACHED(); } static void @@ -573,21 +689,14 @@ ovsdb_jsonrpc_session_run(struct ovsdb_jsonrpc_session *s) return jsonrpc_session_is_alive(s->js) ? 0 : ETIMEDOUT; } -static void -ovsdb_jsonrpc_session_set_options(struct ovsdb_jsonrpc_session *session, - const struct ovsdb_jsonrpc_options *options) -{ - jsonrpc_session_set_max_backoff(session->js, options->max_backoff); - jsonrpc_session_set_probe_interval(session->js, options->probe_interval); - jsonrpc_session_set_dscp(session->js, options->dscp); -} - static void ovsdb_jsonrpc_session_run_all(struct ovsdb_jsonrpc_remote *remote) { struct ovsdb_jsonrpc_session *s; LIST_FOR_EACH_SAFE (s, node, &remote->sessions) { + cooperative_multitasking_yield(); + int error = ovsdb_jsonrpc_session_run(s); if (error) { ovsdb_jsonrpc_session_close(s); @@ -700,7 +809,7 @@ ovsdb_jsonrpc_session_set_all_options( struct ovsdb_jsonrpc_session *s; LIST_FOR_EACH (s, node, &remote->sessions) { - ovsdb_jsonrpc_session_set_options(s, options); + jsonrpc_session_set_options(s->js, &options->rpc); } } @@ -1027,7 +1136,7 @@ ovsdb_jsonrpc_session_got_request(struct ovsdb_jsonrpc_session *s, request->id); } else if (!strcmp(request->method, 
"get_schema")) { struct ovsdb *db = ovsdb_jsonrpc_lookup_db(s, request, &reply); - if (!reply) { + if (db && !reply) { reply = jsonrpc_create_reply(ovsdb_schema_to_json(db->schema), request->id); } @@ -1120,6 +1229,8 @@ static void ovsdb_jsonrpc_trigger_create(struct ovsdb_jsonrpc_session *s, struct ovsdb *db, struct jsonrpc_msg *request) { + ovs_assert(db); + /* Check for duplicate ID. */ size_t hash = json_hash(request->id, 0); struct ovsdb_jsonrpc_trigger *t @@ -1134,7 +1245,8 @@ ovsdb_jsonrpc_trigger_create(struct ovsdb_jsonrpc_session *s, struct ovsdb *db, /* Insert into trigger table. */ t = xmalloc(sizeof *t); bool disconnect_all = ovsdb_trigger_init( - &s->up, db, &t->trigger, request, time_msec(), s->read_only, + &s->up, db, &t->trigger, request, time_msec(), + s->read_only || db->read_only, s->remote->role, jsonrpc_session_get_id(s->js)); t->id = json_clone(request->id); hmap_insert(&s->triggers, &t->hmap_node, hash); @@ -1380,6 +1492,8 @@ ovsdb_jsonrpc_monitor_create(struct ovsdb_jsonrpc_session *s, struct ovsdb *db, enum ovsdb_monitor_version version, const struct json *request_id) { + ovs_assert(db); + struct ovsdb_jsonrpc_monitor *m = NULL; struct ovsdb_monitor *dbmon = NULL; struct json *monitor_id, *monitor_requests; diff --git a/ovsdb/jsonrpc-server.h b/ovsdb/jsonrpc-server.h index e0653aa3974..d613cb7c70e 100644 --- a/ovsdb/jsonrpc-server.h +++ b/ovsdb/jsonrpc-server.h @@ -18,6 +18,7 @@ #include #include "openvswitch/types.h" +#include "jsonrpc.h" struct ovsdb; struct shash; @@ -33,14 +34,22 @@ void ovsdb_jsonrpc_server_destroy(struct ovsdb_jsonrpc_server *); /* Options for a remote. */ struct ovsdb_jsonrpc_options { - int max_backoff; /* Maximum reconnection backoff, in msec. */ - int probe_interval; /* Max idle time before probing, in msec. */ + struct jsonrpc_session_options rpc; /* JSON-RPC options. */ bool read_only; /* Only read-only transactions are allowed. 
*/ - int dscp; /* Dscp value for manager connections */ char *role; /* Role, for role-based access controls */ }; -struct ovsdb_jsonrpc_options * -ovsdb_jsonrpc_default_options(const char *target); +struct ovsdb_jsonrpc_options *ovsdb_jsonrpc_default_options( + const char *target); +struct ovsdb_jsonrpc_options *ovsdb_jsonrpc_options_clone( + const struct ovsdb_jsonrpc_options *); +void ovsdb_jsonrpc_options_free(struct ovsdb_jsonrpc_options *); + +struct json *ovsdb_jsonrpc_options_to_json( + const struct ovsdb_jsonrpc_options *, bool jsonrpc_session_only) + OVS_WARN_UNUSED_RESULT; +void ovsdb_jsonrpc_options_update_from_json(struct ovsdb_jsonrpc_options *, + const struct json *, + bool jsonrpc_session_only); void ovsdb_jsonrpc_server_set_remotes(struct ovsdb_jsonrpc_server *, const struct shash *); diff --git a/ovsdb/log.c b/ovsdb/log.c index e42f002464b..fff7c6ba104 100644 --- a/ovsdb/log.c +++ b/ovsdb/log.c @@ -552,6 +552,23 @@ ovsdb_log_truncate(struct ovsdb_log *file) return error; } +/* Removes all the data from the log by moving current offset to zero and + * truncating the file to zero bytes. After this operation the file is empty + * and in a write state. */ +struct ovsdb_error * OVS_WARN_UNUSED_RESULT +ovsdb_log_reset(struct ovsdb_log *file) +{ + ovsdb_error_destroy(file->error); + file->offset = file->prev_offset = 0; + file->error = ovsdb_log_truncate(file); + if (file->error) { + file->state = OVSDB_LOG_WRITE_ERROR; + return ovsdb_error_clone(file->error); + } + file->state = OVSDB_LOG_WRITE; + return NULL; +} + /* Composes a log record for 'json' by filling 'header' with a header line and * 'data' with a data line (each ending with a new-line). To write the record * to a file, write 'header' followed by 'data'. 
diff --git a/ovsdb/log.h b/ovsdb/log.h index 90714ea1319..63e5681a0b6 100644 --- a/ovsdb/log.h +++ b/ovsdb/log.h @@ -66,6 +66,9 @@ struct ovsdb_error *ovsdb_log_read(struct ovsdb_log *, struct json **) OVS_WARN_UNUSED_RESULT; void ovsdb_log_unread(struct ovsdb_log *); +struct ovsdb_error *ovsdb_log_reset(struct ovsdb_log *) + OVS_WARN_UNUSED_RESULT; + void ovsdb_log_compose_record(const struct json *, const char *magic, struct ds *header, struct ds *data); diff --git a/ovsdb/monitor.c b/ovsdb/monitor.c index 191befcae3b..c3bfae3d2a1 100644 --- a/ovsdb/monitor.c +++ b/ovsdb/monitor.c @@ -20,8 +20,10 @@ #include "bitmap.h" #include "column.h" +#include "cooperative-multitasking.h" #include "openvswitch/dynamic-string.h" #include "openvswitch/json.h" +#include "json.h" #include "jsonrpc.h" #include "ovsdb-error.h" #include "ovsdb-parser.h" @@ -55,6 +57,10 @@ struct ovsdb_monitor_table_condition { struct ovsdb_monitor_table *mt; struct ovsdb_condition old_condition; struct ovsdb_condition new_condition; + + /* Condition composed from difference between clauses in old and new. + * Note: Empty diff condition doesn't mean that old == new. */ + struct ovsdb_condition diff_condition; }; /* Backend monitor. 
@@ -258,7 +264,7 @@ ovsdb_monitor_json_cache_flush(struct ovsdb_monitor *dbmon) struct ovsdb_monitor_json_cache_node *node; HMAP_FOR_EACH_POP(node, hmap_node, &dbmon->json_cache) { - json_destroy(node->json); + json_destroy_with_yield(node->json); free(node); } } @@ -274,7 +280,7 @@ ovsdb_monitor_json_cache_destroy(struct ovsdb_monitor *dbmon, = ovsdb_monitor_json_cache_search(dbmon, v, change_set); if (node) { hmap_remove(&dbmon->json_cache, &node->hmap_node); - json_destroy(node->json); + json_destroy_with_yield(node->json); free(node); } } @@ -474,16 +480,30 @@ ovsdb_monitor_add_column(struct ovsdb_monitor *dbmon, enum ovsdb_monitor_selection select, bool monitored) { + struct ovsdb_monitor_change_set *mcs; struct ovsdb_monitor_table *mt; struct ovsdb_monitor_column *c; mt = shash_find_data(&dbmon->tables, table->schema->name); + ovs_assert(mt); /* Check for column duplication. Return duplicated column name. */ if (mt->columns_index_map[column->index] != -1) { return column->name; } + mcs = dbmon->init_change_set; + if (mcs) { + /* A new column is going to be added to the monitor. Existing + * initial change set doesn't have it, so can no longer be used. + * Initial change set is never used by more than one session at + * the same time, so it's safe to destroy it here. */ + ovs_assert(mcs->n_refs == 1); + ovsdb_monitor_json_cache_destroy(dbmon, mcs); + ovsdb_monitor_change_set_destroy(mcs); + dbmon->init_change_set = NULL; + } + if (mt->n_columns >= mt->allocated_columns) { mt->columns = x2nrealloc(mt->columns, &mt->allocated_columns, sizeof *mt->columns); @@ -609,7 +629,10 @@ ovsdb_monitor_untrack_change_set(struct ovsdb_monitor *dbmon, ovs_assert(mcs); if (--mcs->n_refs == 0) { if (mcs == dbmon->init_change_set) { - dbmon->init_change_set = NULL; + /* The initial change set should exist as long as the + * monitor doesn't change. 
*/ + mcs->n_refs++; + return; } else if (mcs == dbmon->new_change_set) { dbmon->new_change_set = NULL; } @@ -710,6 +733,7 @@ ovsdb_monitor_session_condition_destroy( ovsdb_condition_destroy(&mtc->new_condition); ovsdb_condition_destroy(&mtc->old_condition); + ovsdb_condition_destroy(&mtc->diff_condition); shash_delete(&condition->tables, node); free(mtc); } @@ -730,6 +754,7 @@ ovsdb_monitor_table_condition_create( mtc->table = table; ovsdb_condition_init(&mtc->old_condition); ovsdb_condition_init(&mtc->new_condition); + ovsdb_condition_init(&mtc->diff_condition); if (json_cnd) { error = ovsdb_condition_from_json(table->schema, @@ -743,7 +768,7 @@ ovsdb_monitor_table_condition_create( } shash_add(&condition->tables, table->schema->name, mtc); - /* On session startup old == new condition */ + /* On session startup old == new condition, diff is empty. */ ovsdb_condition_clone(&mtc->new_condition, &mtc->old_condition); ovsdb_monitor_session_condition_set_mode(condition); @@ -755,7 +780,8 @@ ovsdb_monitor_get_table_conditions( const struct ovsdb_monitor_table *mt, const struct ovsdb_monitor_session_condition *condition, struct ovsdb_condition **old_condition, - struct ovsdb_condition **new_condition) + struct ovsdb_condition **new_condition, + struct ovsdb_condition **diff_condition) { if (!condition) { return false; @@ -769,6 +795,7 @@ ovsdb_monitor_get_table_conditions( } *old_condition = &mtc->old_condition; *new_condition = &mtc->new_condition; + *diff_condition = &mtc->diff_condition; return true; } @@ -789,6 +816,8 @@ ovsdb_monitor_table_condition_update( struct ovsdb_error *error; struct ovsdb_condition cond = OVSDB_CONDITION_INITIALIZER(&cond); + ovs_assert(mtc); + error = ovsdb_condition_from_json(table->schema, cond_json, NULL, &cond); if (error) { @@ -797,6 +826,9 @@ ovsdb_monitor_table_condition_update( ovsdb_condition_destroy(&mtc->new_condition); ovsdb_condition_clone(&mtc->new_condition, &cond); ovsdb_condition_destroy(&cond); + 
ovsdb_condition_destroy(&mtc->diff_condition); + ovsdb_condition_diff(&mtc->diff_condition, + &mtc->old_condition, &mtc->new_condition); ovsdb_monitor_condition_add_columns(dbmon, table, &mtc->new_condition); @@ -812,11 +844,14 @@ ovsdb_monitor_table_condition_updated(struct ovsdb_monitor_table *mt, shash_find_data(&condition->tables, mt->table->schema->name); if (mtc) { - /* If conditional monitoring - set old condition to new condition */ + /* If conditional monitoring - set old condition to new condition + * and clear the diff. */ if (ovsdb_condition_cmp_3way(&mtc->old_condition, &mtc->new_condition)) { ovsdb_condition_destroy(&mtc->old_condition); ovsdb_condition_clone(&mtc->old_condition, &mtc->new_condition); + ovsdb_condition_destroy(&mtc->diff_condition); + ovsdb_condition_init(&mtc->diff_condition); ovsdb_monitor_session_condition_set_mode(condition); } } @@ -831,29 +866,42 @@ ovsdb_monitor_row_update_type_condition( const struct ovsdb_datum *old, const struct ovsdb_datum *new) { - struct ovsdb_condition *old_condition, *new_condition; + struct ovsdb_condition *old_condition, *new_condition, *diff_condition; enum ovsdb_monitor_selection type = ovsdb_monitor_row_update_type(initial, old, new); if (ovsdb_monitor_get_table_conditions(mt, condition, &old_condition, - &new_condition)) { - bool old_cond = !old ? false - : ovsdb_condition_empty_or_match_any(old, - old_condition, - row_type == OVSDB_MONITOR_ROW ? - mt->columns_index_map : - NULL); - bool new_cond = !new ? false - : ovsdb_condition_empty_or_match_any(new, - new_condition, - row_type == OVSDB_MONITOR_ROW ? - mt->columns_index_map : - NULL); - - if (!old_cond && !new_cond) { + &new_condition, + &diff_condition)) { + unsigned int *index_map = row_type == OVSDB_MONITOR_ROW + ? mt->columns_index_map : NULL; + bool old_cond = false, new_cond = false; + + if (old && old == new + && !ovsdb_condition_empty_or_match_any(old, diff_condition, + index_map)) { + /* Condition changed, but not the data. 
And the row is not + * affected by the condition change. It either matches or + * doesn't match both old and new conditions at the same time. + * In any case, this row should not be part of the update. */ type = OJMS_NONE; + } else { + /* The row changed or the condition change affects this row. + * Need to fully check old and new conditions. */ + if (old) { + old_cond = ovsdb_condition_empty_or_match_any( + old, old_condition, index_map); + } + if (new) { + new_cond = ovsdb_condition_empty_or_match_any( + new, new_condition, index_map); + } + + if (!old_cond && !new_cond) { + type = OJMS_NONE; + } } switch (type) { @@ -1126,6 +1174,8 @@ ovsdb_monitor_compose_update( struct ovsdb_monitor_table *mt = mcst->mt; HMAP_FOR_EACH_SAFE (row, hmap_node, &mcst->rows) { + cooperative_multitasking_yield(); + struct json *row_json; row_json = (*row_update)(mt, condition, OVSDB_MONITOR_ROW, row, initial, changed, mcst->n_columns); @@ -1152,15 +1202,16 @@ ovsdb_monitor_compose_cond_change_update( unsigned long int *changed = xmalloc(bitmap_n_bytes(max_columns)); SHASH_FOR_EACH (node, &dbmon->tables) { + struct ovsdb_condition *old_condition, *new_condition, *diff_condition; struct ovsdb_monitor_table *mt = node->data; - struct ovsdb_row *row; struct json *table_json = NULL; - struct ovsdb_condition *old_condition, *new_condition; + struct ovsdb_row *row; if (!ovsdb_monitor_get_table_conditions(mt, condition, &old_condition, - &new_condition) || + &new_condition, + &diff_condition) || !ovsdb_condition_cmp_3way(old_condition, new_condition)) { /* Nothing to update on this table */ continue; @@ -1170,6 +1221,8 @@ ovsdb_monitor_compose_cond_change_update( HMAP_FOR_EACH (row, hmap_node, &mt->table->rows) { struct json *row_json; + cooperative_multitasking_yield(); + row_json = ovsdb_monitor_compose_row_update2(mt, condition, OVSDB_ROW, row, false, changed, @@ -1239,8 +1292,9 @@ ovsdb_monitor_get_update( /* Pre-serializing the object to avoid doing this * for every client.
*/ - json_serialized = json_serialized_object_create(json); - json_destroy(json); + json_serialized = + json_serialized_object_create_with_yield(json); + json_destroy_with_yield(json); json = json_serialized; } ovsdb_monitor_json_cache_insert(dbmon, version, mcs, @@ -1279,6 +1333,7 @@ ovsdb_monitor_table_add_select(struct ovsdb_monitor *dbmon, struct ovsdb_monitor_table * mt; mt = shash_find_data(&dbmon->tables, table->schema->name); + ovs_assert(mt); mt->select |= select; } @@ -1329,8 +1384,11 @@ ovsdb_monitor_changes_update(const struct ovsdb_row *old, const struct ovsdb_monitor_table *mt, struct ovsdb_monitor_change_set_for_table *mcst) { + ovs_assert(new || old); const struct uuid *uuid = ovsdb_row_get_uuid(new ? new : old); - struct ovsdb_monitor_row *change; + struct ovsdb_monitor_row *change = NULL; + + ovs_assert(uuid); change = ovsdb_monitor_changes_row_find(mcst, uuid); if (!change) { @@ -1660,6 +1718,8 @@ ovsdb_monitor_hash(const struct ovsdb_monitor *dbmon, size_t basis) for (i = 0; i < n; i++) { struct ovsdb_monitor_table *mt = nodes[i]->data; + ovs_assert(mt); + basis = hash_pointer(mt->table, basis); basis = hash_3words(mt->select, mt->n_columns, basis); diff --git a/ovsdb/mutation.c b/ovsdb/mutation.c index cbc71bc4944..79456001917 100644 --- a/ovsdb/mutation.c +++ b/ovsdb/mutation.c @@ -236,7 +236,8 @@ ovsdb_mutation_set_destroy(struct ovsdb_mutation_set *set) enum ovsdb_mutation_scalar_error { ME_OK, ME_DOM, - ME_RANGE + ME_RANGE, + ME_NOTSUP }; struct ovsdb_scalar_mutation { @@ -267,6 +268,9 @@ ovsdb_mutation_scalar_error(enum ovsdb_mutation_scalar_error error, "Result of \"%s\" operation is out of range.", ovsdb_mutator_to_string(mutator)); + case ME_NOTSUP: + return ovsdb_error(NULL, "Operation not supported."); + default: return OVSDB_BUG("unexpected error"); } @@ -514,6 +518,12 @@ div_double(double *x, double y) } } +static int +mod_double(double *x OVS_UNUSED, double y OVS_UNUSED) +{ + return ME_NOTSUP; +} + static const struct 
ovsdb_scalar_mutation add_mutation = { add_int, add_double, OVSDB_M_ADD }; @@ -531,5 +541,5 @@ static const struct ovsdb_scalar_mutation div_mutation = { }; static const struct ovsdb_scalar_mutation mod_mutation = { - mod_int, NULL, OVSDB_M_MOD + mod_int, mod_double, OVSDB_M_MOD }; diff --git a/ovsdb/mutation.h b/ovsdb/mutation.h index 7566ef199d6..05d4a262a98 100644 --- a/ovsdb/mutation.h +++ b/ovsdb/mutation.h @@ -69,4 +69,10 @@ void ovsdb_mutation_set_destroy(struct ovsdb_mutation_set *); struct ovsdb_error *ovsdb_mutation_set_execute( struct ovsdb_row *, const struct ovsdb_mutation_set *) OVS_WARN_UNUSED_RESULT; +static inline bool ovsdb_mutation_set_empty( + const struct ovsdb_mutation_set *ms) +{ + return ms->n_mutations == 0; +} + #endif /* ovsdb/mutation.h */ diff --git a/ovsdb/ovsdb-client.c b/ovsdb/ovsdb-client.c index f1b8d649105..45501911c30 100644 --- a/ovsdb/ovsdb-client.c +++ b/ovsdb/ovsdb-client.c @@ -250,7 +250,7 @@ main(int argc, char *argv[]) parse_options(argc, argv); fatal_ignore_sigpipe(); - daemon_become_new_user(false); + daemon_become_new_user(false, false); if (optind >= argc) { ovs_fatal(0, "missing command name; use --help for help"); } @@ -451,8 +451,9 @@ usage(void) " wait until DATABASE reaches STATE " "(\"added\" or \"connected\" or \"removed\")\n" " in DATBASE on SERVER.\n" - "\n dump [SERVER] [DATABASE]\n" - " dump contents of DATABASE on SERVER to stdout\n" + "\n dump [SERVER] [DATABASE] [TABLE [COLUMN]...]\n" + " dump contents of COLUMNs, TABLE (or all tables) in DATABASE\n" + " on SERVER to stdout\n" "\n backup [SERVER] [DATABASE] > SNAPSHOT\n" " dump database contents in the form of a database file\n" "\n [--force] restore [SERVER] [DATABASE] < SNAPSHOT\n" @@ -473,6 +474,8 @@ usage(void) vlog_usage(); ovs_replay_usage(); printf("\nOther options:\n" + " -t, --timeout=SECS limits ovsdb-client runtime to\n" + " approximately SECS seconds.\n" " -h, --help display this help message\n" " -V, --version display version information\n"); 
exit(EXIT_SUCCESS); @@ -1232,8 +1235,11 @@ parse_monitor_columns(char *arg, const char *server, const char *database, } free(nodes); - add_column(server, ovsdb_table_schema_get_column(table, "_version"), - columns, columns_json); + const struct ovsdb_column *version_column = + ovsdb_table_schema_get_column(table, "_version"); + + ovs_assert(version_column); + add_column(server, version_column, columns, columns_json); } if (!initial || !insert || !delete || !modify) { @@ -1392,7 +1398,7 @@ do_monitor__(struct jsonrpc *rpc, const char *database, daemon_save_fd(STDOUT_FILENO); daemon_save_fd(STDERR_FILENO); - daemonize_start(false); + daemonize_start(false, false); if (get_detach()) { int error; @@ -1840,7 +1846,7 @@ do_dump(struct jsonrpc *rpc, const char *database, struct ovsdb_schema *schema; struct json *transaction; - const struct shash_node *node, **tables; + const struct shash_node *node, **tables = NULL; size_t n_tables; struct ovsdb_table_schema *tschema; const struct shash *columns; @@ -1866,8 +1872,10 @@ do_dump(struct jsonrpc *rpc, const char *database, shash_add(&custom_columns, argv[i], node->data); } } else { - tables = shash_sort(&schema->tables); n_tables = shash_count(&schema->tables); + if (n_tables) { + tables = shash_sort(&schema->tables); + } } /* Construct transaction to retrieve entire database. */ @@ -2276,7 +2284,7 @@ do_lock(struct jsonrpc *rpc, const char *method, const char *lock) getting a reply of the previous request. */ daemon_save_fd(STDOUT_FILENO); - daemonize_start(false); + daemonize_start(false, false); lock_req_init(&lock_req, method, lock); if (get_detach()) { diff --git a/ovsdb/ovsdb-doc b/ovsdb/ovsdb-doc index 10d0c0c1343..2edf487a289 100755 --- a/ovsdb/ovsdb-doc +++ b/ovsdb/ovsdb-doc @@ -14,9 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from datetime import date import getopt -import os import sys import xml.dom.minidom @@ -24,10 +22,13 @@ import ovs.json from ovs.db import error import ovs.db.schema -from build.nroff import * +from ovs_build_helpers.nroff import block_xml_to_nroff +from ovs_build_helpers.nroff import escape_nroff_literal +from ovs_build_helpers.nroff import text_to_nroff argv0 = sys.argv[0] + def typeAndConstraintsToNroff(column): type = column.type.toEnglish(escape_nroff_literal) constraints = column.type.constraintsToEnglish(escape_nroff_literal, @@ -38,6 +39,7 @@ def typeAndConstraintsToNroff(column): type += " (must be unique within table)" return type + def columnGroupToNroff(table, groupXml, documented_columns): introNodes = [] columnNodes = [] @@ -49,7 +51,10 @@ def columnGroupToNroff(table, groupXml, documented_columns): if (columnNodes and not (node.nodeType == node.TEXT_NODE and node.data.isspace())): - raise error.Error("text follows or inside : %s" % node) + raise error.Error( + "text follows or inside : %s" + % node + ) introNodes += [node] summary = [] @@ -65,15 +70,9 @@ def columnGroupToNroff(table, groupXml, documented_columns): if node.hasAttribute('type'): type_string = node.attributes['type'].nodeValue type_json = ovs.json.from_string(str(type_string)) - # py2 -> py3 means str -> bytes and unicode -> str - try: - if type(type_json) in (str, unicode): - raise error.Error("%s %s:%s has invalid 'type': %s" - % (table.name, name, key, type_json)) - except: - if type(type_json) in (bytes, str): - raise error.Error("%s %s:%s has invalid 'type': %s" - % (table.name, name, key, type_json)) + if type(type_json) in (bytes, str): + raise error.Error("%s %s:%s has invalid 'type': %s" + % (table.name, name, key, type_json)) type_ = ovs.db.types.BaseType.from_json(type_json) else: type_ = column.type.value @@ -91,10 +90,11 @@ def columnGroupToNroff(table, groupXml, documented_columns): else: if type_.type != column.type.value.type: type_english = type_.toEnglish() + 
typeNroff += ", containing " if type_english[0] in 'aeiou': - typeNroff += ", containing an %s" % type_english + typeNroff += "an %s" % type_english else: - typeNroff += ", containing a %s" % type_english + typeNroff += "a %s" % type_english constraints = ( type_.constraintsToEnglish(escape_nroff_literal, text_to_nroff)) @@ -121,6 +121,7 @@ def columnGroupToNroff(table, groupXml, documented_columns): raise error.Error("unknown element %s in " % node.tagName) return summary, intro, body + def tableSummaryToNroff(summary, level=0): s = "" for type, name, arg in summary: @@ -132,6 +133,7 @@ def tableSummaryToNroff(summary, level=0): s += ".RE\n" return s + def tableToNroff(schema, tableXml): tableName = tableXml.attributes['name'].nodeValue table = schema.tables[tableName] @@ -156,20 +158,17 @@ def tableToNroff(schema, tableXml): return s + def docsToNroff(schemaFile, xmlFile, erFile, version=None): schema = ovs.db.schema.DbSchema.from_json(ovs.json.from_file(schemaFile)) doc = xml.dom.minidom.parse(xmlFile).documentElement - schemaDate = os.stat(schemaFile).st_mtime - xmlDate = os.stat(xmlFile).st_mtime - d = date.fromtimestamp(max(schemaDate, xmlDate)) - if doc.hasAttribute('name'): manpage = doc.attributes['name'].nodeValue else: manpage = schema.name - if version == None: + if version is None: version = "UNKNOWN" # Putting '\" p as the first line tells "man" that the manpage @@ -194,7 +193,6 @@ def docsToNroff(schemaFile, xmlFile, erFile, version=None): .PP ''' % (manpage, schema.version, version, text_to_nroff(manpage), schema.name) - tables = "" introNodes = [] tableNodes = [] summary = [] @@ -237,8 +235,8 @@ Purpose """ % (name, text_to_nroff(title)) if erFile: - s += """ -.\\" check if in troff mode (TTY) + s += r""" +.\" check if in troff mode (TTY) .if t \{ .bp .SH "TABLE RELATIONSHIPS" @@ -248,8 +246,8 @@ database. Each node represents a table. Tables that are part of the ``root set'' are shown with double borders. 
Each edge leads from the table that contains it and points to the table that its value represents. Edges are labeled with their column names, followed by a -constraint on the number of allowed values: \\fB?\\fR for zero or one, -\\fB*\\fR for zero or more, \\fB+\\fR for one or more. Thick lines +constraint on the number of allowed values: \fB?\fR for zero or one, +\fB*\fR for zero or more, \fB+\fR for one or more. Thick lines represent strong references; thin lines represent weak references. .RS -1in """ @@ -263,6 +261,7 @@ represent strong references; thin lines represent weak references. s += tableToNroff(schema, node) + "\n" return s + def usage(): print("""\ %(argv0)s: ovsdb schema documentation generator @@ -278,6 +277,7 @@ The following options are also available: """ % {'argv0': argv0}) sys.exit(0) + if __name__ == "__main__": try: try: diff --git a/ovsdb/ovsdb-dot.in b/ovsdb/ovsdb-dot.in index 41b986c0ac7..f1eefd49cbc 100755 --- a/ovsdb/ovsdb-dot.in +++ b/ovsdb/ovsdb-dot.in @@ -1,15 +1,13 @@ #! 
@PYTHON3@ -from datetime import date import ovs.db.error import ovs.db.schema import getopt -import os -import re import sys argv0 = sys.argv[0] + def printEdge(tableName, type, baseType, label): if baseType.ref_table_name: if type.n_min == 0: @@ -31,38 +29,42 @@ def printEdge(tableName, type, baseType, label): options['label'] = '"%s%s"' % (label, arity) if baseType.ref_type == 'weak': options['style'] = 'dotted' - print ("\t%s -> %s [%s];" % ( + print("\t%s -> %s [%s];" % ( tableName, baseType.ref_table_name, - ', '.join(['%s=%s' % (k,v) for k,v in options.items()]))) + ', '.join(['%s=%s' % (k, v) for k, v in options.items()]))) + def schemaToDot(schemaFile, arrows): schema = ovs.db.schema.DbSchema.from_json(ovs.json.from_file(schemaFile)) - print ("digraph %s {" % schema.name) - print ('\trankdir=LR;') - print ('\tsize="6.5,4";') - print ('\tmargin="0";') - print ("\tnode [shape=box];") + print("digraph %s {" % schema.name) + print('\trankdir=LR;') + print('\tsize="6.5,4";') + print('\tmargin="0";') + print("\tnode [shape=box];") if not arrows: - print ("\tedge [dir=none, arrowhead=none, arrowtail=none];") + print("\tedge [dir=none, arrowhead=none, arrowtail=none];") for tableName, table in schema.tables.items(): options = {} if table.is_root: options['style'] = 'bold' - print ("\t%s [%s];" % ( + print("\t%s [%s];" % ( tableName, - ', '.join(['%s=%s' % (k,v) for k,v in options.items()]))) + ', '.join(['%s=%s' % (k, v) for k, v in options.items()]))) for columnName, column in table.columns.items(): if column.type.value: - printEdge(tableName, column.type, column.type.key, "%s key" % columnName) - printEdge(tableName, column.type, column.type.value, "%s value" % columnName) + printEdge(tableName, column.type, column.type.key, + "%s key" % columnName) + printEdge(tableName, column.type, column.type.value, + "%s value" % columnName) else: printEdge(tableName, column.type, column.type.key, columnName) - print ("}"); + print("}") + def usage(): - print ("""\ + 
print("""\ %(argv0)s: compiles ovsdb schemas to graphviz format Prints a .dot file that "dot" can render to an entity-relationship diagram usage: %(argv0)s [OPTIONS] SCHEMA @@ -75,12 +77,13 @@ The following options are also available: """ % {'argv0': argv0}) sys.exit(0) + if __name__ == "__main__": try: try: options, args = getopt.gnu_getopt(sys.argv[1:], 'hV', ['no-arrows', - 'help', 'version',]) + 'help', 'version']) except getopt.GetoptError as geo: sys.stderr.write("%s: %s\n" % (argv0, geo.msg)) sys.exit(1) @@ -92,7 +95,7 @@ if __name__ == "__main__": elif key in ['-h', '--help']: usage() elif key in ['-V', '--version']: - print ("ovsdb-dot (Open vSwitch) @VERSION@") + print("ovsdb-dot (Open vSwitch) @VERSION@") else: sys.exit(0) diff --git a/ovsdb/ovsdb-idlc.in b/ovsdb/ovsdb-idlc.in index 5a97a8ea3e1..9a54f06a191 100755 --- a/ovsdb/ovsdb-idlc.in +++ b/ovsdb/ovsdb-idlc.in @@ -362,6 +362,8 @@ struct %(s)s *%(s)s_cursor_data(struct ovsdb_idl_cursor *); void %(s)s_init(struct %(s)s *); void %(s)s_delete(const struct %(s)s *); struct %(s)s *%(s)s_insert(struct ovsdb_idl_txn *); +struct %(s)s *%(s)s_insert_persist_uuid( + struct ovsdb_idl_txn *txn, const struct uuid *uuid); /* Returns true if the tracked column referenced by 'enum %(s)s_column_id' of * the row referenced by 'struct %(s)s *' was updated since the last change @@ -809,6 +811,19 @@ struct %(s)s * return %(s)s_cast(ovsdb_idl_txn_insert(txn, &%(p)stable_%(tl)s, NULL)); } +/* Inserts and returns a new row in the table "%(t)s" in the database + * with open transaction 'txn'. + * + * The new row is assigned the UUID specified in the 'uuid' parameter + * (which cannot be null). ovsdb-server will try to assign the same + * UUID when 'txn' is committed. 
*/ +struct %(s)s * +%(s)s_insert_persist_uuid(struct ovsdb_idl_txn *txn, const struct uuid *uuid) +{ + return %(s)s_cast(ovsdb_idl_txn_insert_persist_uuid( + txn, &%(p)stable_%(tl)s, uuid)); +} + bool %(s)s_is_updated(const struct %(s)s *row, enum %(s)s_column_id column) { diff --git a/ovsdb/ovsdb-server.1.in b/ovsdb/ovsdb-server.1.in index da7a6fd5d54..23b8e6e9cd8 100644 --- a/ovsdb/ovsdb-server.1.in +++ b/ovsdb/ovsdb-server.1.in @@ -12,6 +12,7 @@ ovsdb\-server \- Open vSwitch database server [\fIdatabase\fR]\&... [\fIrelay:schema_name:remote\fR]\&... [\fB\-\-remote=\fIremote\fR]\&... +[\fB\-\-config\-file=\fIfile\fR] [\fB\-\-run=\fIcommand\fR] .so lib/daemon-syn.man .so lib/service-syn.man @@ -44,6 +45,11 @@ If none of database files or relay databases is specified, the default is initialized using, for example, \fBovsdb\-tool\fR's \fBcreate\fR, \fBcreate\-cluster\fR, or \fBjoin\-cluster\fR command. .PP +All types of databases can alternatively be added using a configuration +file provided via \fB\-\-config\-file\fR option. This option is mutually +exclusive with specifying \fIdatabase\fR on the command line. For a detailed +description of the configuration file format see \fBovsdb\fR(7). +.PP This OVSDB implementation supports standalone, active-backup, relay and clustered database service models, as well as database replication. See the Service Models section of \fBovsdb\fR(7) for more information. @@ -105,6 +111,74 @@ It is an error for \fIcolumn\fR to have another type. .IP To connect or listen on multiple connection methods, use multiple \fB\-\-remote\fR options. +.IP +Alternatively, remotes can be specified in a "remotes" section of the +configuration file, if provided using \fB\-\-config\-file\fR option. +\fB\-\-config\-file\fR and \fB\-\-remote\fR options are mutually +exclusive. +. +.IP "\fB\-\-config-file=\fIfile\fR" +Specifies a configuration file for \fBovsdb\-server\fR. This \fIfile\fR +can contain connection methods and databases used by the server. 
+The \fIfile\fR contains a JSON object with two main elements: +.RS +.IP "\fBremotes\fR" +JSON object that contains a set of connection methods in the following format: +"\fItarget\fR": { "\fIoption\fR": \fIvalue\fR, ... }. Where \fItarget\fR +is in the same format as \fIremote\fR in \fB\-\-remote\fR option. +\fIoption\fR can be \fBmax-backoff\fR (integer), \fBinactivity-probe\fR +(integer), \fBread-only\fR (boolean), \fBrole\fR (string) or \fBdscp\fR +(integer) with their allowed \fIvalue\fRs respectively. The meaning of these +\fIoption\fRs is the same as in configuration of \fIremote\fR via a database +row with \fB\-\-remote\fR option. +.IP "\fBdatabases\fR" +JSON object that describes databases that should be added to the +\fBovsdb\-server\fR in the following format: "\fIname\fR":{ "\fIoption\fR": +\fIvalue\fR, ... }. Where \fIname\fR is either a file name of a previously +created and initialized database or a schema name in case of relay +databases. Available \fIoption\fRs are: +.RS +.IP "\fBservice-model\fR (string)" +Describes the service model of this database. One of: \fBstandalone\fR, +\fBclustered\fR, \fBactive-backup\fR or \fBrelay\fR. This option is +required for all types, except for standalone and clustered. For these +databases the service model will be inferred from the file, if not +specified explicitly. \fBovsdb-server\fR will refuse to add a database +if the specified \fBservice-model\fR doesn't match with the provided file. +.IP "\fBsource\fR (JSON object; active-backup or relay)" +Describes the connection method to the active database or to the relay +source. It is a JSON object with exactly one element in the same format +as elements of "\fBremotes\fR", except that \fBread-only\fR and \fBrole\fR +options are not applicable. E.g. 
\fB"source": { "unix:db.sock": { +"inactivity-probe": 10000, "max-backoff": 8000 } }\fR +.IP "\fBbackup\fR (boolean; active-backup only)" +If set to \fBtrue\fR, \fBovsdb-server\fR will use this database as a +backup for the specified \fBsource\fR. Will be served as an active +database otherwise. +.IP "\fBexclude-tables\fR (JSON array of strings; active-backup only)" +List of table names that should be excluded from replication in backup mode, +e.g. \fB"exclude-tables": [ "Table_One", "Table_Two" ]\fR. +.RE +.RE +.IP +Content of the most basic configuration file may look like this: +\fB{ "remotes": { "pssl:6640": {} }, "databases": { "conf.db": {} } }\fR +.IP +Examples of configuration files for different service models can be +found in \fBovsdb\fR(7). +.IP +\fB\-\-config-file\fR option is mutually exclusive with the \fB\-\-remote\fR +as well as with specifying \fIdatabase\fR on a command line. It is also +mutually exclusive with all the \fBActive-Backup Options\fR and all the +\fBRUNTIME MANAGEMENT COMMANDS\fR that can change the configuration of +the server in conflict with the content of the file, i.e. all the commands +that manipulate remotes and databases. Read-only commands can still +be used. +.IP +In case of changes in the \fIfile\fR, users should run the +\fBovsdb-server/reload\fR command with \fBovs-appctl\fR(8) in order for +changes to take effect. +.RE . .IP "\fB\-\-run=\fIcommand\fR]" Ordinarily \fBovsdb\-server\fR runs forever, or until it is told to @@ -178,6 +252,8 @@ allow the syncing options to be specified using command line options, yet start the server, as the default, active server. To switch the running server to backup mode, use \fBovs-appctl(1)\fR to execute the \fBovsdb\-server/connect\-active\-ovsdb\-server\fR command. +.PP +These options are mutually exclusive with the \fB\-\-config\-file\fR. 
.SS "Public Key Infrastructure Options" The options described below for configuring the SSL public key infrastructure accept a special syntax for obtaining their @@ -230,6 +306,8 @@ clients. Adds a remote, as if \fB\-\-remote=\fIremote\fR had been specified on the \fBovsdb\-server\fR command line. (If \fIremote\fR is already a remote, this command succeeds without changing the configuration.) +.IP +Mutually exclusive with the \fB\-\-config\-file\fR option. . .IP "\fBovsdb\-server/remove\-remote \fIremote\fR" Removes the specified \fIremote\fR from the configuration, failing @@ -241,6 +319,8 @@ configuring a \fBdb:\fIdb\fB,\fItable\fB,\fIcolumn\fR remote. (You can remove a database source with \fBovsdb\-server/remove\-remote \fBdb:\fIdb\fB,\fItable\fB,\fIcolumn\fR, but not individual remotes found indirectly through the database.) +.IP +Mutually exclusive with the \fB\-\-config\-file\fR option. . .IP "\fBovsdb\-server/list\-remotes" Outputs a list of the currently configured remotes named on @@ -254,6 +334,8 @@ Adds the \fIdatabase\fR to the running \fBovsdb\-server\fR. \fIdatabase\fR could be a database file or a relay description in the following format: \fIrelay:schema_name:remote\fR. The database file must already have been created and initialized using, for example, \fBovsdb\-tool create\fR. +.IP +Mutually exclusive with the \fB\-\-config\-file\fR option. . .IP "\fBovsdb\-server/remove\-db \fIdatabase\fR" Removes \fIdatabase\fR from the running \fBovsdb\-server\fR. \fIdatabase\fR @@ -268,6 +350,8 @@ Any public key infrastructure options specified through this database (e.g. \fB\-\-private\-key=db:\fIdatabase,\fR... on the command line) will be disabled until another database with the same name is added again (with \fBovsdb\-server/add\-db\fR). +.IP +Mutually exclusive with the \fB\-\-config\-file\fR option. . 
.IP "\fBovsdb\-server/list\-dbs" Outputs a list of the currently configured databases added either through @@ -286,6 +370,9 @@ These commands query and update the role of \fBovsdb\-server\fR within an active-backup pair of servers. See \fBActive-Backup Options\fR, above, and \fBActive-Backup Database Service Model\fR in \fBovsdb\fR(7) for more information. +.PP +All \fBActive-Backup Commands\fR that change the state of \fBovsdb\-server\fR +are mutually exclusive with the \fB\-\-config\-file\fR option. . .IP "\fBovsdb\-server/set\-active\-ovsdb\-server \fIserver" Sets the active \fIserver\fR from which \fBovsdb\-server\fR connects through @@ -324,11 +411,10 @@ Gets the tables that are currently excluded from synchronization. Prints a summary of replication run time information. The \fBstate\fR information is always provided, indicating whether the server is running in the \fIactive\fR or the \fIbackup\fR mode. -When running in backup mode, replication connection status, which -can be either \fIconnecting\fR, \fIreplicating\fR or \fIerror\fR, are shown. -When the connection is in \fIreplicating\fR state, further output shows -the list of databases currently replicating, and the tables that are -excluded. +For all databases with active-backup service model, replication connection +status, which can be either \fIconnecting\fR, \fIreplicating\fR or +\fIerror\fR, are shown. When the connection is in \fIreplicating\fR state, +further output shows the tables that are currently excluded from replication. . .SS "Cluster Commands" These commands support the \fBovsdb\-server\fR clustered service model. @@ -375,8 +461,7 @@ This does not result in a three server cluster that lacks quorum. . .IP "\fBcluster/kick \fIdb server\fR" Start graceful removal of \fIserver\fR from \fIdb\fR's cluster, like -\fBcluster/leave\fR (without \fB\-\-force\fR) except that it can -remove any server, not just this one. +\fBcluster/leave\fR, except that it can remove any server, not just this one. 
.IP \fIserver\fR may be a server ID, as printed by \fBcluster/sid\fR, or the server's local network address as passed to \fBovsdb-tool\fR's diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c index 7a6bfe0a03c..a876f8bcf72 100644 --- a/ovsdb/ovsdb-server.c +++ b/ovsdb/ovsdb-server.c @@ -24,6 +24,7 @@ #include "column.h" #include "command-line.h" +#include "cooperative-multitasking.h" #include "daemon.h" #include "dirs.h" #include "dns-resolve.h" @@ -42,6 +43,7 @@ #include "ovsdb-data.h" #include "ovsdb-types.h" #include "ovsdb-error.h" +#include "ovsdb-parser.h" #include "openvswitch/poll-loop.h" #include "process.h" #include "replication.h" @@ -65,12 +67,6 @@ VLOG_DEFINE_THIS_MODULE(ovsdb_server); -struct db { - char *filename; - struct ovsdb *db; - struct uuid row_uuid; -}; - /* SSL configuration. */ static char *private_key_file; static char *certificate_file; @@ -94,20 +90,92 @@ static unixctl_cb_func ovsdb_server_get_active_ovsdb_server; static unixctl_cb_func ovsdb_server_connect_active_ovsdb_server; static unixctl_cb_func ovsdb_server_disconnect_active_ovsdb_server; static unixctl_cb_func ovsdb_server_set_active_ovsdb_server_probe_interval; +static unixctl_cb_func ovsdb_server_set_relay_source_interval; static unixctl_cb_func ovsdb_server_set_sync_exclude_tables; static unixctl_cb_func ovsdb_server_get_sync_exclude_tables; static unixctl_cb_func ovsdb_server_get_sync_status; static unixctl_cb_func ovsdb_server_get_db_storage_status; +/* Holds the name of the configuration file passed via --config-file. + * Mutually exclusive with command-line and unixctl configuration + * that can otherwise be done via configuration file. */ +static char *config_file_path; +/* UnixCtl command to reload configuration from a configuration file. 
*/ +static unixctl_cb_func ovsdb_server_reload; + +#define SERVICE_MODELS \ + SERVICE_MODEL(UNDEFINED, undefined) \ + SERVICE_MODEL(STANDALONE, standalone) \ + SERVICE_MODEL(CLUSTERED, clustered) \ + SERVICE_MODEL(ACTIVE_BACKUP, active-backup) \ + SERVICE_MODEL(RELAY, relay) + +enum service_model { +#define SERVICE_MODEL(ENUM, NAME) SM_##ENUM, + SERVICE_MODELS +#undef SERVICE_MODEL +}; + +static const char * +service_model_to_string(enum service_model model) +{ + switch (model) { +#define SERVICE_MODEL(ENUM, NAME) \ + case SM_##ENUM: return #NAME; + SERVICE_MODELS +#undef SERVICE_MODEL + default: OVS_NOT_REACHED(); + } +} + +static enum service_model +service_model_from_string(const char *model) +{ +#define SERVICE_MODEL(ENUM, NAME) \ + if (!strcmp(model, #NAME)) { \ + return SM_##ENUM; \ + } + SERVICE_MODELS +#undef SERVICE_MODEL + + VLOG_WARN("Unrecognized database service model: '%s'", model); + + return SM_UNDEFINED; +} + +struct db_config { + enum service_model model; + char *source; /* sync-from for backup or relay source. */ + struct ovsdb_jsonrpc_options *options; /* For 'source' connection. */ + + /* Configuration specific to SM_ACTIVE_BACKUP. */ + struct { + char *sync_exclude; /* Tables to exclude. */ + bool backup; /* If true, the database is read-only and receives + * updates from the 'source'. */ + } ab; +}; + +struct db { + struct ovsdb *db; + char *filename; + struct db_config *config; + struct uuid row_uuid; +}; + struct server_config { - struct sset *remotes; - struct shash *all_dbs; - FILE *config_tmpfile; + struct shash *remotes; + struct shash *all_dbs; /* All the currently serviced databases. + * 'struct db' by a schema name. */ + struct ovsdb_jsonrpc_server *jsonrpc; + + /* Command line + appctl configuration. 
*/ char **sync_from; char **sync_exclude; bool *is_backup; int *replication_probe_interval; - struct ovsdb_jsonrpc_server *jsonrpc; + int *relay_source_probe_interval; + FILE *config_tmpfile; }; static unixctl_cb_func ovsdb_server_add_remote; static unixctl_cb_func ovsdb_server_remove_remote; @@ -121,54 +189,52 @@ static unixctl_cb_func ovsdb_server_tlog_list; static void read_db(struct server_config *, struct db *); static struct ovsdb_error *open_db(struct server_config *, - const char *filename) + const char *filename, + const struct db_config *) OVS_WARN_UNUSED_RESULT; static void add_server_db(struct server_config *); static void remove_db(struct server_config *, struct shash_node *db, char *); static void close_db(struct server_config *, struct db *, char *); +static struct ovsdb_error *update_schema(struct ovsdb *, + const struct ovsdb_schema *, + const struct uuid *txnid, + bool conversion_with_no_data, + void *aux) + OVS_WARN_UNUSED_RESULT; + static void parse_options(int argc, char *argvp[], - struct sset *db_filenames, struct sset *remotes, + struct shash *db_conf, struct shash *remotes, char **unixctl_pathp, char **run_command, char **sync_from, char **sync_exclude, bool *is_backup); OVS_NO_RETURN static void usage(void); +static struct ovsdb_jsonrpc_options *add_remote( + struct shash *remotes, const char *target, + const struct ovsdb_jsonrpc_options *); +static void free_remotes(struct shash *remotes); + static char *reconfigure_remotes(struct ovsdb_jsonrpc_server *, const struct shash *all_dbs, - struct sset *remotes); + struct shash *remotes); static char *reconfigure_ssl(const struct shash *all_dbs); static void report_error_if_changed(char *error, char **last_errorp); static void update_remote_status(const struct ovsdb_jsonrpc_server *jsonrpc, - const struct sset *remotes, + const struct shash *remotes, struct shash *all_dbs); static void update_server_status(struct shash *all_dbs); -static void save_config__(FILE *config_file, const struct sset 
*remotes, - const struct sset *db_filenames, +static void save_config__(FILE *config_file, const struct shash *remotes, + const struct shash *db_conf, const char *sync_from, const char *sync_exclude, bool is_backup); static void save_config(struct server_config *); -static void load_config(FILE *config_file, struct sset *remotes, - struct sset *db_filenames, char **sync_from, +static bool load_config(FILE *config_file, struct shash *remotes, + struct shash *db_conf, char **sync_from, char **sync_exclude, bool *is_backup); -static void -ovsdb_replication_init(const char *sync_from, const char *exclude, - struct shash *all_dbs, const struct uuid *server_uuid, - int probe_interval) -{ - replication_init(sync_from, exclude, server_uuid, probe_interval); - struct shash_node *node; - SHASH_FOR_EACH (node, all_dbs) { - struct db *db = node->data; - if (node->name[0] != '_' && db->db) { - replication_add_local_db(node->name, db->db); - } - } -} - static void log_and_free_error(struct ovsdb_error *error) { @@ -179,11 +245,53 @@ log_and_free_error(struct ovsdb_error *error) } } +static void +ovsdb_server_replication_remove_db(struct db *db) +{ + replication_remove_db(db->db); + db->config->ab.backup = false; +} + +static void +ovsdb_server_replication_run(struct server_config *config) +{ + struct shash_node *node; + bool all_alive = true; + + replication_run(); + + SHASH_FOR_EACH (node, config->all_dbs) { + struct db *db = node->data; + + if (db->config->model == SM_ACTIVE_BACKUP && db->config->ab.backup + && !replication_is_alive(db->db)) { + ovsdb_server_replication_remove_db(db); + all_alive = false; + } + } + + /* If one connection is broken, switch all databases to active, + * if they are configured via the command line / appctl and so have + * shared configuration. 
*/ + if (!config_file_path && !all_alive && *config->is_backup) { + *config->is_backup = false; + + SHASH_FOR_EACH (node, config->all_dbs) { + struct db *db = node->data; + + if (db->config->model == SM_ACTIVE_BACKUP + && db->config->ab.backup) { + ovsdb_server_replication_remove_db(db); + } + } + } +} + static void main_loop(struct server_config *config, struct ovsdb_jsonrpc_server *jsonrpc, struct shash *all_dbs, - struct unixctl_server *unixctl, struct sset *remotes, - struct process *run_process, bool *exiting, bool *is_backup) + struct unixctl_server *unixctl, struct shash *remotes, + struct process *run_process, bool *exiting) { char *remotes_error, *ssl_error; struct shash_node *node; @@ -213,7 +321,7 @@ main_loop(struct server_config *config, * the set of remotes that reconfigure_remotes() uses. */ unixctl_server_run(unixctl); - ovsdb_jsonrpc_server_set_read_only(jsonrpc, *is_backup); + ovsdb_jsonrpc_server_set_read_only(jsonrpc, false); report_error_if_changed( reconfigure_remotes(jsonrpc, all_dbs, remotes), @@ -221,19 +329,12 @@ main_loop(struct server_config *config, report_error_if_changed(reconfigure_ssl(all_dbs), &ssl_error); ovsdb_jsonrpc_server_run(jsonrpc); - if (*is_backup) { - replication_run(); - if (!replication_is_alive()) { - disconnect_active_server(); - *is_backup = false; - } - } - + ovsdb_server_replication_run(config); ovsdb_relay_run(); SHASH_FOR_EACH_SAFE (node, all_dbs) { struct db *db = node->data; - ovsdb_txn_history_run(db->db); + ovsdb_storage_run(db->db->storage); read_db(config, db); /* Run triggers after storage_run and read_db to make sure new raft @@ -276,10 +377,8 @@ main_loop(struct server_config *config, update_server_status(all_dbs); memory_wait(); - if (*is_backup) { - replication_wait(); - } + replication_wait(); ovsdb_relay_wait(); ovsdb_jsonrpc_server_wait(jsonrpc); @@ -309,6 +408,307 @@ main_loop(struct server_config *config, free(remotes_error); } +/* Parsing the relay in format 'relay:DB_NAME:'. 
+ * On success, returns 'true', 'name' is set to DB_NAME, 'remotes' to + * ''. Caller is responsible of freeing 'name' and + * 'remotes'. On failure, returns 'false'. */ +static bool +parse_relay_args(const char *arg, char **name, char **remote) +{ + const char *relay_prefix = "relay:"; + const int relay_prefix_len = strlen(relay_prefix); + bool is_relay; + + is_relay = !strncmp(arg, relay_prefix, relay_prefix_len); + if (!is_relay) { + return false; + } + + *remote = strchr(arg + relay_prefix_len, ':'); + + if (!*remote || (*remote)[0] == '\0') { + *remote = NULL; + return false; + } + arg += relay_prefix_len; + *name = xmemdup0(arg, *remote - arg); + *remote = xstrdup(*remote + 1); + return true; +} + +static void +db_config_destroy(struct db_config *conf) +{ + if (!conf) { + return; + } + + free(conf->source); + ovsdb_jsonrpc_options_free(conf->options); + free(conf->ab.sync_exclude); + free(conf); +} + +static struct db_config * +db_config_clone(const struct db_config *c) +{ + struct db_config *conf = xmemdup(c, sizeof *c); + + conf->source = nullable_xstrdup(c->source); + if (c->options) { + conf->options = ovsdb_jsonrpc_options_clone(c->options); + } + conf->ab.sync_exclude = nullable_xstrdup(c->ab.sync_exclude); + + return conf; +} + +static struct ovsdb_jsonrpc_options * +get_jsonrpc_options(const char *target, enum service_model model) +{ + struct ovsdb_jsonrpc_options *options; + + options = ovsdb_jsonrpc_default_options(target); + if (model == SM_ACTIVE_BACKUP) { + options->rpc.probe_interval = REPLICATION_DEFAULT_PROBE_INTERVAL; + } else if (model == SM_RELAY) { + options->rpc.probe_interval = RELAY_SOURCE_DEFAULT_PROBE_INTERVAL; + } + + return options; +} + +static void +add_database_config(struct shash *db_conf, const char *opt, + const char *sync_from, const char *sync_exclude, + bool active) +{ + struct db_config *conf = xzalloc(sizeof *conf); + char *filename = NULL; + + if (parse_relay_args(opt, &filename, &conf->source)) { + conf->model = 
SM_RELAY; + conf->options = get_jsonrpc_options(conf->source, conf->model); + } else if (sync_from) { + conf->model = SM_ACTIVE_BACKUP; + conf->source = xstrdup(sync_from); + conf->options = get_jsonrpc_options(conf->source, conf->model); + conf->ab.sync_exclude = nullable_xstrdup(sync_exclude); + conf->ab.backup = !active; + filename = xstrdup(opt); + } else { + conf->model = SM_UNDEFINED; /* We'll update once the file is open. */ + filename = xstrdup(opt); + } + + conf = shash_replace_nocopy(db_conf, filename, conf); + if (conf) { + VLOG_WARN("Duplicate database configuration: %s", filename); + db_config_destroy(conf); + } +} + +static void +free_database_configs(struct shash *db_conf) +{ + struct shash_node *node; + + SHASH_FOR_EACH (node, db_conf) { + db_config_destroy(node->data); + } + shash_clear(db_conf); +} + +static bool +service_model_can_convert(enum service_model a, enum service_model b) +{ + ovs_assert(a != SM_UNDEFINED); + + if (a == b) { + return true; + } + + if (b == SM_UNDEFINED) { + return a == SM_STANDALONE || a == SM_CLUSTERED; + } + + /* Conversion can happen only between standalone and active-backup. */ + return (a == SM_STANDALONE && b == SM_ACTIVE_BACKUP) + || (a == SM_ACTIVE_BACKUP && b == SM_STANDALONE); +} + +static void +database_update_config(struct server_config *server_config, + struct db *db, const struct db_config *new_conf) +{ + struct db_config *conf = db->config; + enum service_model model = conf->model; + + /* Stop replicating when transitioning to active or standalone. */ + if (conf->model == SM_ACTIVE_BACKUP && conf->ab.backup + && (new_conf->model == SM_STANDALONE || !new_conf->ab.backup)) { + ovsdb_server_replication_remove_db(db); + } + + db_config_destroy(conf); + conf = db->config = db_config_clone(new_conf); + + if (conf->model == SM_UNDEFINED) { + /* We're operating on the same file, the model is the same. 
*/ + conf->model = model; + } + + if (conf->model == SM_RELAY) { + ovsdb_relay_add_db(db->db, conf->source, update_schema, server_config, + &conf->options->rpc); + } + if (conf->model == SM_ACTIVE_BACKUP && conf->ab.backup) { + const struct uuid *server_uuid; + + server_uuid = ovsdb_jsonrpc_server_get_uuid(server_config->jsonrpc); + replication_set_db(db->db, conf->source, conf->ab.sync_exclude, + server_uuid, &conf->options->rpc); + } +} + +static bool +reconfigure_databases(struct server_config *server_config, + struct shash *db_conf) +{ + struct db_config *cur_conf, *new_conf; + struct shash_node *node, *conf_node; + bool res = true; + struct db *db; + + /* Remove databases that are no longer in the configuration or have + * incompatible configuration. Update compatible ones. */ + SHASH_FOR_EACH_SAFE (node, server_config->all_dbs) { + db = node->data; + + if (node->name[0] == '_') { + /* Skip internal databases. */ + continue; + } + + cur_conf = db->config; + conf_node = shash_find(db_conf, db->filename); + new_conf = conf_node ? conf_node->data : NULL; + + if (!new_conf) { + remove_db(server_config, node, + xasprintf("database %s removed from configuration", + node->name)); + continue; + } + if (!service_model_can_convert(cur_conf->model, new_conf->model)) { + remove_db(server_config, node, + xasprintf("service model changed for database %s", + node->name)); + continue; + } + database_update_config(server_config, db, new_conf); + + db_config_destroy(new_conf); + shash_delete(db_conf, conf_node); + } + + /* Create new databases. 
*/ + SHASH_FOR_EACH (node, db_conf) { + struct ovsdb_error *error = open_db(server_config, + node->name, node->data); + if (error) { + char *s = ovsdb_error_to_string_free(error); + + VLOG_WARN("failed to open database '%s': %s", node->name, s); + free(s); + res = false; + } + db_config_destroy(node->data); + } + shash_clear(db_conf); + + return res; +} + +static bool +reconfigure_ovsdb_server(struct server_config *server_config) +{ + char *sync_from = NULL, *sync_exclude = NULL; + bool is_backup = false; + struct shash remotes; + struct shash db_conf; + bool res = true; + + FILE *file = NULL; + + if (config_file_path) { + file = fopen(config_file_path, "r+b"); + if (!file) { + VLOG_ERR("failed to open configuration file '%s': %s", + config_file_path, ovs_strerror(errno)); + return false; + } else { + VLOG_INFO("loading configuration from '%s'", config_file_path); + } + } else { + file = server_config->config_tmpfile; + } + ovs_assert(file); + + shash_init(&remotes); + shash_init(&db_conf); + + if (!load_config(file, &remotes, &db_conf, + &sync_from, &sync_exclude, &is_backup)) { + if (config_file_path) { + VLOG_WARN("failed to load configuration from %s", + config_file_path); + } else { + VLOG_FATAL("failed to load configuration from a temporary file"); + } + res = false; + goto exit_close; + } + + /* Parsing was successful. Update the server configuration. 
*/ + shash_swap(server_config->remotes, &remotes); + free(*server_config->sync_from); + *server_config->sync_from = sync_from; + free(*server_config->sync_exclude); + *server_config->sync_exclude = sync_exclude; + *server_config->is_backup = is_backup; + + if (!reconfigure_databases(server_config, &db_conf)) { + VLOG_WARN("failed to configure databases"); + res = false; + } + + char *error = reconfigure_remotes(server_config->jsonrpc, + server_config->all_dbs, + server_config->remotes); + if (error) { + VLOG_WARN("failed to configure remotes: %s", error); + res = false; + } else { + error = reconfigure_ssl(server_config->all_dbs); + if (error) { + VLOG_WARN("failed to configure SSL: %s", error); + res = false; + } + } + free(error); + +exit_close: + if (config_file_path) { + fclose(file); + } + free_remotes(&remotes); + free_database_configs(&db_conf); + shash_destroy(&remotes); + shash_destroy(&db_conf); + return res; +} + int main(int argc, char *argv[]) { @@ -316,18 +716,29 @@ main(int argc, char *argv[]) char *run_command = NULL; struct unixctl_server *unixctl; struct ovsdb_jsonrpc_server *jsonrpc; - struct sset remotes, db_filenames; - char *sync_from, *sync_exclude; - bool is_backup; - const char *db_filename; struct process *run_process; bool exiting; int retval; - FILE *config_tmpfile; - struct server_config server_config; + FILE *config_tmpfile = NULL; struct shash all_dbs; struct shash_node *node; int replication_probe_interval = REPLICATION_DEFAULT_PROBE_INTERVAL; + int relay_source_probe_interval = RELAY_SOURCE_DEFAULT_PROBE_INTERVAL; + struct sset db_filenames = SSET_INITIALIZER(&db_filenames); + struct shash db_conf = SHASH_INITIALIZER(&db_conf); + struct shash remotes = SHASH_INITIALIZER(&remotes); + char *sync_from = NULL, *sync_exclude = NULL; + bool is_backup; + + struct server_config server_config = { + .remotes = &remotes, + .all_dbs = &all_dbs, + .sync_from = &sync_from, + .sync_exclude = &sync_exclude, + .is_backup = &is_backup, + 
.replication_probe_interval = &replication_probe_interval, + .relay_source_probe_interval = &relay_source_probe_interval, + }; ovs_cmdl_proctitle_init(argc, argv); set_program_name(argv[0]); @@ -337,64 +748,45 @@ main(int argc, char *argv[]) dns_resolve_init(true); bool active = false; - parse_options(argc, argv, &db_filenames, &remotes, &unixctl_path, + parse_options(argc, argv, &db_conf, &remotes, &unixctl_path, &run_command, &sync_from, &sync_exclude, &active); is_backup = sync_from && !active; - daemon_become_new_user(false); - - /* Create and initialize 'config_tmpfile' as a temporary file to hold - * ovsdb-server's most basic configuration, and then save our initial - * configuration to it. When --monitor is used, this preserves the effects - * of ovs-appctl commands such as ovsdb-server/add-remote (which saves the - * new configuration) across crashes. */ - config_tmpfile = tmpfile(); - if (!config_tmpfile) { - ovs_fatal(errno, "failed to create temporary file"); + daemon_become_new_user(false, false); + + if (!config_file_path) { + /* Create and initialize 'config_tmpfile' as a temporary file to hold + * ovsdb-server's most basic configuration, and then save our initial + * configuration to it. When --monitor is used, this preserves the + * effects of ovs-appctl commands such as ovsdb-server/add-remote + * (which saves the new configuration) across crashes. */ + config_tmpfile = tmpfile(); + if (!config_tmpfile) { + ovs_fatal(errno, "failed to create temporary file"); + } + server_config.config_tmpfile = config_tmpfile; + save_config__(config_tmpfile, &remotes, &db_conf, sync_from, + sync_exclude, is_backup); } - server_config.remotes = &remotes; - server_config.config_tmpfile = config_tmpfile; - - save_config__(config_tmpfile, &remotes, &db_filenames, sync_from, - sync_exclude, is_backup); + free_remotes(&remotes); + free_database_configs(&db_conf); - daemonize_start(false); + daemonize_start(false, false); - /* Load the saved config. 
*/ - load_config(config_tmpfile, &remotes, &db_filenames, &sync_from, - &sync_exclude, &is_backup); - - /* Start ovsdb jsonrpc server. When running as a backup server, - * jsonrpc connections are read only. Otherwise, both read - * and write transactions are allowed. */ - jsonrpc = ovsdb_jsonrpc_server_create(is_backup); + perf_counters_init(); - shash_init(&all_dbs); - server_config.all_dbs = &all_dbs; + /* Start ovsdb jsonrpc server. Both read and write transactions are + * allowed by default, individual remotes and databases will be configured + * as read-only, if necessary. */ + jsonrpc = ovsdb_jsonrpc_server_create(false); server_config.jsonrpc = jsonrpc; - server_config.sync_from = &sync_from; - server_config.sync_exclude = &sync_exclude; - server_config.is_backup = &is_backup; - server_config.replication_probe_interval = &replication_probe_interval; - - perf_counters_init(); - SSET_FOR_EACH (db_filename, &db_filenames) { - struct ovsdb_error *error = open_db(&server_config, db_filename); - if (error) { - char *s = ovsdb_error_to_string_free(error); - ovs_fatal(0, "%s", s); - } - } + shash_init(&all_dbs); add_server_db(&server_config); - char *error = reconfigure_remotes(jsonrpc, &all_dbs, &remotes); - if (!error) { - error = reconfigure_ssl(&all_dbs); - } - if (error) { - ovs_fatal(0, "%s", error); + if (!reconfigure_ovsdb_server(&server_config)) { + ovs_fatal(0, "server configuration failed"); } retval = unixctl_server_create(unixctl_path, &unixctl); @@ -424,7 +816,8 @@ main(int argc, char *argv[]) /* ovsdb-server is usually a long-running process, in which case it * makes plenty of sense to log the version, but --run makes * ovsdb-server more like a command-line tool, so skip it. 
*/ - VLOG_INFO("%s (Open vSwitch) %s", program_name, VERSION); + VLOG_INFO("%s (Open vSwitch) %s", program_name, + VERSION VERSION_SUFFIX); } unixctl_command_register("exit", "", 0, 0, ovsdb_server_exit, &exiting); @@ -435,6 +828,8 @@ main(int argc, char *argv[]) ovsdb_server_memory_trim_on_compaction, NULL); unixctl_command_register("ovsdb-server/reconnect", "", 0, 0, ovsdb_server_reconnect, jsonrpc); + unixctl_command_register("ovsdb-server/reload", "", 0, 0, + ovsdb_server_reload, &server_config); unixctl_command_register("ovsdb-server/add-remote", "REMOTE", 1, 1, ovsdb_server_add_remote, &server_config); @@ -472,12 +867,15 @@ main(int argc, char *argv[]) unixctl_command_register( "ovsdb-server/set-active-ovsdb-server-probe-interval", "", 1, 1, ovsdb_server_set_active_ovsdb_server_probe_interval, &server_config); + unixctl_command_register( + "ovsdb-server/set-relay-source-probe-interval", "", 1, 1, + ovsdb_server_set_relay_source_interval, &server_config); unixctl_command_register("ovsdb-server/set-sync-exclude-tables", "", 0, 1, ovsdb_server_set_sync_exclude_tables, &server_config); unixctl_command_register("ovsdb-server/get-sync-exclude-tables", "", 0, 0, ovsdb_server_get_sync_exclude_tables, - NULL); + &server_config); unixctl_command_register("ovsdb-server/sync-status", "", 0, 0, ovsdb_server_get_sync_status, &server_config); @@ -490,15 +888,8 @@ main(int argc, char *argv[]) unixctl_command_register("ovsdb-server/disable-monitor-cond", "", 0, 0, ovsdb_server_disable_monitor_cond, jsonrpc); - if (is_backup) { - const struct uuid *server_uuid; - server_uuid = ovsdb_jsonrpc_server_get_uuid(jsonrpc); - ovsdb_replication_init(sync_from, sync_exclude, &all_dbs, server_uuid, - replication_probe_interval); - } - main_loop(&server_config, jsonrpc, &all_dbs, unixctl, &remotes, - run_process, &exiting, &is_backup); + run_process, &exiting); SHASH_FOR_EACH_SAFE (node, &all_dbs) { struct db *db = node->data; @@ -507,12 +898,15 @@ main(int argc, char *argv[]) } 
ovsdb_jsonrpc_server_destroy(jsonrpc); shash_destroy(&all_dbs); - sset_destroy(&remotes); - sset_destroy(&db_filenames); + free_remotes(&remotes); + shash_destroy(&remotes); + free_database_configs(&db_conf); + shash_destroy(&db_conf); free(sync_from); free(sync_exclude); unixctl_server_destroy(unixctl); replication_destroy(); + free(config_file_path); if (run_process && process_exited(run_process)) { int status = process_status(run_process); @@ -523,6 +917,7 @@ main(int argc, char *argv[]) } dns_resolve_destroy(); perf_counters_destroy(); + cooperative_multitasking_destroy(); service_stop(); return 0; } @@ -532,7 +927,7 @@ main(int argc, char *argv[]) * * "False negatives" are possible. */ static bool -is_already_open(struct server_config *config OVS_UNUSED, +is_already_open(struct server_config *server_config OVS_UNUSED, const char *filename OVS_UNUSED) { #ifndef _WIN32 @@ -541,11 +936,12 @@ is_already_open(struct server_config *config OVS_UNUSED, if (!stat(filename, &s)) { struct shash_node *node; - SHASH_FOR_EACH (node, config->all_dbs) { + SHASH_FOR_EACH (node, server_config->all_dbs) { struct db *db = node->data; struct stat s2; - if (!stat(db->filename, &s2) + if (db->config->model != SM_RELAY + && !stat(db->filename, &s2) && s.st_dev == s2.st_dev && s.st_ino == s2.st_ino) { return true; @@ -558,13 +954,19 @@ is_already_open(struct server_config *config OVS_UNUSED, } static void -close_db(struct server_config *config, struct db *db, char *comment) +close_db(struct server_config *server_config, struct db *db, char *comment) { if (db) { - ovsdb_jsonrpc_server_remove_db(config->jsonrpc, db->db, comment); - if (db->db->is_relay) { + ovsdb_jsonrpc_server_remove_db(server_config->jsonrpc, + db->db, comment); + if (db->config->model == SM_RELAY) { ovsdb_relay_del_db(db->db); } + if (db->config->model == SM_ACTIVE_BACKUP + && db->config->ab.backup) { + ovsdb_server_replication_remove_db(db); + } + db_config_destroy(db->config); ovsdb_destroy(db->db); 
free(db->filename); free(db); @@ -573,8 +975,11 @@ close_db(struct server_config *config, struct db *db, char *comment) } } -static void -update_schema(struct ovsdb *db, const struct ovsdb_schema *schema, void *aux) +static struct ovsdb_error * OVS_WARN_UNUSED_RESULT +update_schema(struct ovsdb *db, + const struct ovsdb_schema *schema, + const struct uuid *txnid, + bool conversion_with_no_data, void *aux) { struct server_config *config = aux; @@ -586,13 +991,33 @@ update_schema(struct ovsdb *db, const struct ovsdb_schema *schema, void *aux) : xasprintf("database %s connected to storage", db->name))); } - ovsdb_replace(db, ovsdb_create(ovsdb_schema_clone(schema), NULL)); + if (db->schema && conversion_with_no_data) { + struct ovsdb *new_db = NULL; + struct ovsdb_error *error; + + /* If conversion was triggered by the current process, we might + * already have converted version of a database. */ + new_db = ovsdb_trigger_find_and_steal_converted_db(db, txnid); + if (!new_db) { + /* No luck. Converting. */ + error = ovsdb_convert(db, schema, &new_db); + if (error) { + /* Should never happen, because conversion should have been + * checked before writing the schema to the storage. */ + return error; + } + } + ovsdb_replace(db, new_db); + } else { + ovsdb_replace(db, ovsdb_create(ovsdb_schema_clone(schema), NULL)); + } /* Force update to schema in _Server database. */ struct db *dbp = shash_find_data(config->all_dbs, db->name); if (dbp) { dbp->row_uuid = UUID_ZERO; } + return NULL; } static struct ovsdb_error * OVS_WARN_UNUSED_RESULT @@ -600,23 +1025,30 @@ parse_txn(struct server_config *config, struct db *db, const struct ovsdb_schema *schema, const struct json *txn_json, const struct uuid *txnid) { + struct ovsdb_error *error = NULL; + struct ovsdb_txn *txn = NULL; + if (schema) { - /* We're replacing the schema (and the data). Destroy the database - * (first grabbing its storage), then replace it with the new schema. 
- * The transaction must also include the replacement data. + /* We're replacing the schema (and the data). If transaction includes + * replacement data, destroy the database (first grabbing its storage), + * then replace it with the new schema. If not, it's a conversion + * without data specified. In this case, convert the current database + * to a new schema instead. * * Only clustered database schema changes and snapshot installs * go through this path. */ - ovs_assert(txn_json); ovs_assert(ovsdb_storage_is_clustered(db->db->storage)); - struct ovsdb_error *error = ovsdb_schema_check_for_ephemeral_columns( - schema); + error = ovsdb_schema_check_for_ephemeral_columns(schema); + if (error) { + return error; + } + + error = update_schema(db->db, schema, txnid, txn_json == NULL, config); if (error) { return error; } - update_schema(db->db, schema, config); } if (txn_json) { @@ -624,24 +1056,26 @@ parse_txn(struct server_config *config, struct db *db, return ovsdb_error(NULL, "%s: data without schema", db->filename); } - struct ovsdb_txn *txn; - struct ovsdb_error *error; - error = ovsdb_file_txn_from_json(db->db, txn_json, false, &txn); - if (!error) { - ovsdb_txn_set_txnid(txnid, txn); - log_and_free_error(ovsdb_txn_replay_commit(txn)); - } - if (!error && !uuid_is_zero(txnid)) { - db->db->prereq = *txnid; - } if (error) { ovsdb_storage_unread(db->db->storage); return error; } + } else if (schema) { + /* We just performed conversion without data. Transaction history + * was destroyed. Commit a dummy transaction to set the txnid. 
*/ + txn = ovsdb_txn_create(db->db); } - return NULL; + if (txn) { + ovsdb_txn_set_txnid(txnid, txn); + error = ovsdb_txn_replay_commit(txn); + if (!error && !uuid_is_zero(txnid)) { + db->db->prereq = *txnid; + } + ovsdb_txn_history_run(db->db); + } + return error; } static void @@ -685,22 +1119,17 @@ add_db(struct server_config *config, struct db *db) } static struct ovsdb_error * OVS_WARN_UNUSED_RESULT -open_db(struct server_config *config, const char *filename) +open_db(struct server_config *server_config, + const char *filename, const struct db_config *conf) { - const char *relay_prefix = "relay:"; - const char *relay_remotes = NULL; - const int relay_prefix_len = strlen(relay_prefix); struct ovsdb_storage *storage; struct ovsdb_error *error; - bool is_relay; - char *name; - is_relay = !strncmp(filename, relay_prefix, relay_prefix_len); - if (!is_relay) { + if (conf->model != SM_RELAY) { /* If we know that the file is already open, return a good error * message. Otherwise, if the file is open, we'll fail later on with * a harder to interpret file locking error. */ - if (is_already_open(config, filename)) { + if (is_already_open(server_config, filename)) { return ovsdb_error(NULL, "%s: already open", filename); } @@ -708,65 +1137,78 @@ open_db(struct server_config *config, const char *filename) if (error) { return error; } - name = xstrdup(filename); } else { - /* Parsing the relay in format 'relay:DB_NAME:'*/ - relay_remotes = strchr(filename + relay_prefix_len, ':'); + storage = ovsdb_storage_create_unbacked(filename); + } - if (!relay_remotes || relay_remotes[0] == '\0') { - return ovsdb_error(NULL, "%s: invalid syntax", filename); - } - name = xmemdup0(filename, relay_remotes - filename); - storage = ovsdb_storage_create_unbacked(name + relay_prefix_len); - relay_remotes++; /* Skip the ':'. 
*/ + enum service_model model = conf->model; + if (model == SM_UNDEFINED || model == SM_STANDALONE + || model == SM_CLUSTERED) { + /* Check the actual service model from the storage. */ + model = ovsdb_storage_is_clustered(storage) + ? SM_CLUSTERED : SM_STANDALONE; + } + if (conf->model != SM_UNDEFINED && conf->model != model) { + ovsdb_storage_close(storage); + return ovsdb_error(NULL, "%s: database is %s and not %s", + filename, service_model_to_string(model), + service_model_to_string(conf->model)); } struct ovsdb_schema *schema; - if (is_relay || ovsdb_storage_is_clustered(storage)) { + if (model == SM_RELAY || model == SM_CLUSTERED) { schema = NULL; } else { struct json *txn_json; error = ovsdb_storage_read(storage, &schema, &txn_json, NULL); if (error) { ovsdb_storage_close(storage); - free(name); return error; } ovs_assert(schema && !txn_json); } struct db *db = xzalloc(sizeof *db); - db->filename = name; + db->filename = xstrdup(filename); + db->config = db_config_clone(conf); + db->config->model = model; db->db = ovsdb_create(schema, storage); - ovsdb_jsonrpc_server_add_db(config->jsonrpc, db->db); + ovsdb_jsonrpc_server_add_db(server_config->jsonrpc, db->db); /* Enable txn history for clustered and relay modes. It is not enabled for * other modes for now, since txn id is available for clustered and relay * modes only. */ - ovsdb_txn_history_init(db->db, - is_relay || ovsdb_storage_is_clustered(storage)); + ovsdb_txn_history_init(db->db, model == SM_RELAY || model == SM_CLUSTERED); - read_db(config, db); + read_db(server_config, db); error = (db->db->name[0] == '_' ? ovsdb_error(NULL, "%s: names beginning with \"_\" are reserved", db->db->name) - : shash_find(config->all_dbs, db->db->name) + : shash_find(server_config->all_dbs, db->db->name) ? 
ovsdb_error(NULL, "%s: duplicate database name", db->db->name) : NULL); if (error) { char *error_s = ovsdb_error_to_string(error); - close_db(config, db, + close_db(server_config, db, xasprintf("cannot complete opening %s database (%s)", db->db->name, error_s)); free(error_s); return error; } - add_db(config, db); + add_db(server_config, db); + + if (model == SM_RELAY) { + ovsdb_relay_add_db(db->db, conf->source, update_schema, server_config, + &conf->options->rpc); + } + if (model == SM_ACTIVE_BACKUP && conf->ab.backup) { + const struct uuid *server_uuid; - if (is_relay) { - ovsdb_relay_add_db(db->db, relay_remotes, update_schema, config); + server_uuid = ovsdb_jsonrpc_server_get_uuid(server_config->jsonrpc); + replication_set_db(db->db, conf->source, conf->ab.sync_exclude, + server_uuid, &conf->options->rpc); } return NULL; } @@ -790,9 +1232,14 @@ add_server_db(struct server_config *config) /* We don't need txn_history for server_db. */ db->filename = xstrdup(""); + db->config = xzalloc(sizeof *db->config); + db->config->model = SM_UNDEFINED; db->db = ovsdb_create(schema, ovsdb_storage_create_unbacked(NULL)); + db->db->read_only = true; + bool ok OVS_UNUSED = ovsdb_jsonrpc_server_add_db(config->jsonrpc, db->db); ovs_assert(ok); + add_db(config, db); } @@ -928,13 +1375,16 @@ query_db_string(const struct shash *all_dbs, const char *name, } static struct ovsdb_jsonrpc_options * -add_remote(struct shash *remotes, const char *target) +add_remote(struct shash *remotes, const char *target, + const struct ovsdb_jsonrpc_options *options_) { struct ovsdb_jsonrpc_options *options; options = shash_find_data(remotes, target); if (!options) { - options = ovsdb_jsonrpc_default_options(target); + options = options_ + ? 
ovsdb_jsonrpc_options_clone(options_) + : ovsdb_jsonrpc_default_options(target); shash_add(remotes, target, options); } @@ -949,9 +1399,10 @@ free_remotes(struct shash *remotes) SHASH_FOR_EACH (node, remotes) { struct ovsdb_jsonrpc_options *options = node->data; - free(options->role); + + ovsdb_jsonrpc_options_free(options); } - shash_destroy_free_data(remotes); + shash_clear(remotes); } } @@ -972,13 +1423,13 @@ add_manager_options(struct shash *remotes, const struct ovsdb_row *row) return; } - options = add_remote(remotes, target); + options = add_remote(remotes, target, NULL); if (ovsdb_util_read_integer_column(row, "max_backoff", &max_backoff)) { - options->max_backoff = max_backoff; + options->rpc.max_backoff = max_backoff; } if (ovsdb_util_read_integer_column(row, "inactivity_probe", &probe_interval)) { - options->probe_interval = probe_interval; + options->rpc.probe_interval = probe_interval; } if (ovsdb_util_read_bool_column(row, "read_only", &read_only)) { options->read_only = read_only; @@ -990,13 +1441,13 @@ add_manager_options(struct shash *remotes, const struct ovsdb_row *row) options->role = xstrdup(role); } - options->dscp = DSCP_DEFAULT; + options->rpc.dscp = DSCP_DEFAULT; dscp_string = ovsdb_util_read_map_string_column(row, "other_config", "dscp"); if (dscp_string) { int dscp = atoi(dscp_string); if (dscp >= 0 && dscp <= 63) { - options->dscp = dscp; + options->rpc.dscp = dscp; } } } @@ -1032,7 +1483,7 @@ query_db_remotes(const char *name, const struct shash *all_dbs, datum = &row->fields[column->index]; for (i = 0; i < datum->n; i++) { - add_remote(remotes, json_string(datum->keys[i].s)); + add_remote(remotes, json_string(datum->keys[i].s), NULL); } } } else if (column->type.key.type == OVSDB_TYPE_UUID @@ -1071,7 +1522,7 @@ update_remote_row(const struct ovsdb_row *row, struct ovsdb_txn *txn, /* Bad remote spec or incorrect schema. 
*/ return; } - rw_row = ovsdb_txn_row_modify(txn, row); + ovsdb_txn_row_modify(txn, row, &rw_row, NULL); ovsdb_jsonrpc_server_get_remote_status(jsonrpc, target, &status); /* Update status information columns. */ @@ -1180,19 +1631,24 @@ commit_txn(struct ovsdb_txn *txn, const char *name) static void update_remote_status(const struct ovsdb_jsonrpc_server *jsonrpc, - const struct sset *remotes, + const struct shash *remotes, struct shash *all_dbs) { - struct shash_node *node; - SHASH_FOR_EACH (node, all_dbs) { - struct db *db = node->data; + struct shash_node *db_node; + + SHASH_FOR_EACH (db_node, all_dbs) { + struct db *db = db_node->data; + if (!db->db || ovsdb_storage_is_clustered(db->db->storage)) { continue; } struct ovsdb_txn *txn = ovsdb_txn_create(db->db); - const char *remote; - SSET_FOR_EACH (remote, remotes) { + const struct shash_node *remote_node; + + SHASH_FOR_EACH (remote_node, remotes) { + const char *remote = remote_node->name; + update_remote_rows(all_dbs, db, remote, jsonrpc, txn); } commit_txn(txn, "remote status"); @@ -1261,7 +1717,10 @@ update_server_status(struct shash *all_dbs) if (!db || !db->db) { ovsdb_txn_row_delete(txn, row); } else { - update_database_status(ovsdb_txn_row_modify(txn, row), db); + struct ovsdb_row *rw_row; + + ovsdb_txn_row_modify(txn, row, &rw_row, NULL); + update_database_status(rw_row, db); } } @@ -1299,23 +1758,27 @@ update_server_status(struct shash *all_dbs) /* Reconfigures ovsdb-server's remotes based on information in the database. */ static char * reconfigure_remotes(struct ovsdb_jsonrpc_server *jsonrpc, - const struct shash *all_dbs, struct sset *remotes) + const struct shash *all_dbs, struct shash *remotes) { struct ds errors = DS_EMPTY_INITIALIZER; struct shash resolved_remotes; - const char *name; + struct shash_node *node; /* Configure remotes. 
*/ shash_init(&resolved_remotes); - SSET_FOR_EACH (name, remotes) { + SHASH_FOR_EACH (node, remotes) { + const struct ovsdb_jsonrpc_options *options = node->data; + const char *name = node->name; + if (!strncmp(name, "db:", 3)) { query_db_remotes(name, all_dbs, &resolved_remotes, &errors); } else { - add_remote(&resolved_remotes, name); + add_remote(&resolved_remotes, name, options); } } ovsdb_jsonrpc_server_set_remotes(jsonrpc, &resolved_remotes); free_remotes(&resolved_remotes); + shash_destroy(&resolved_remotes); return errors.string; } @@ -1361,17 +1824,47 @@ report_error_if_changed(char *error, char **last_errorp) } } +static bool +check_config_file_on_unixctl(struct unixctl_conn *conn) +{ + struct ds ds = DS_EMPTY_INITIALIZER; + + if (!config_file_path) { + return false; + } + + ds_put_format(&ds, "Update the %s and use ovsdb-server/reload instead", + config_file_path); + unixctl_command_reply_error(conn, ds_cstr(&ds)); + ds_destroy(&ds); + + return true; +} + static void ovsdb_server_set_active_ovsdb_server(struct unixctl_conn *conn, int argc OVS_UNUSED, const char *argv[], void *config_) { struct server_config *config = config_; + struct shash_node *node; - if (*config->sync_from) { - free(*config->sync_from); + if (check_config_file_on_unixctl(conn)) { + return; } + + free(*config->sync_from); *config->sync_from = xstrdup(argv[1]); + + SHASH_FOR_EACH (node, config->all_dbs) { + struct db *db = node->data; + + if (db->config->model == SM_ACTIVE_BACKUP) { + free(db->config->source); + db->config->source = xstrdup(argv[1]); + } + } + save_config(config); unixctl_command_reply(conn, NULL); @@ -1395,20 +1888,43 @@ ovsdb_server_connect_active_ovsdb_server(struct unixctl_conn *conn, void *config_) { struct server_config *config = config_; + struct shash_node *node; char *msg = NULL; - if ( !*config->sync_from) { + if (check_config_file_on_unixctl(conn)) { + return; + } + + if (!*config->sync_from) { msg = "Unable to connect: active server is not specified.\n"; } 
else { const struct uuid *server_uuid; server_uuid = ovsdb_jsonrpc_server_get_uuid(config->jsonrpc); - ovsdb_replication_init(*config->sync_from, *config->sync_exclude, - config->all_dbs, server_uuid, - *config->replication_probe_interval); - if (!*config->is_backup) { - *config->is_backup = true; - save_config(config); + + SHASH_FOR_EACH (node, config->all_dbs) { + struct db *db = node->data; + struct db_config *conf = db->config; + + /* This command also converts standalone databases to AB. */ + if (conf->model == SM_STANDALONE) { + conf->model = SM_ACTIVE_BACKUP; + conf->source = xstrdup(*config->sync_from); + conf->options = ovsdb_jsonrpc_default_options(conf->source); + conf->options->rpc.probe_interval = + *config->replication_probe_interval; + conf->ab.sync_exclude = + nullable_xstrdup(*config->sync_exclude); + conf->ab.backup = false; + } + + if (conf->model == SM_ACTIVE_BACKUP && !conf->ab.backup) { + replication_set_db(db->db, conf->source, conf->ab.sync_exclude, + server_uuid, &conf->options->rpc); + conf->ab.backup = true; + } } + *config->is_backup = true; + save_config(config); } unixctl_command_reply(conn, msg); } @@ -1420,8 +1936,20 @@ ovsdb_server_disconnect_active_ovsdb_server(struct unixctl_conn *conn, void *config_) { struct server_config *config = config_; + struct shash_node *node; + + if (check_config_file_on_unixctl(conn)) { + return; + } + + SHASH_FOR_EACH (node, config->all_dbs) { + struct db *db = node->data; + struct db_config *conf = db->config; - disconnect_active_server(); + if (conf->model == SM_ACTIVE_BACKUP && conf->ab.backup) { + ovsdb_server_replication_remove_db(db); + } + } *config->is_backup = false; save_config(config); unixctl_command_reply(conn, NULL); @@ -1434,19 +1962,76 @@ ovsdb_server_set_active_ovsdb_server_probe_interval(struct unixctl_conn *conn, void *config_) { struct server_config *config = config_; - + struct shash_node *node; int probe_interval; - if (str_to_int(argv[1], 10, &probe_interval)) { - 
*config->replication_probe_interval = probe_interval; - save_config(config); - if (*config->is_backup) { - replication_set_probe_interval(probe_interval); + + if (check_config_file_on_unixctl(conn)) { + return; + } + + if (!str_to_int(argv[1], 10, &probe_interval)) { + unixctl_command_reply_error( + conn, "Invalid probe interval, integer value expected"); + return; + } + + const struct uuid *server_uuid; + server_uuid = ovsdb_jsonrpc_server_get_uuid(config->jsonrpc); + + *config->replication_probe_interval = probe_interval; + + SHASH_FOR_EACH (node, config->all_dbs) { + struct db *db = node->data; + struct db_config *conf = db->config; + + if (conf->model == SM_ACTIVE_BACKUP) { + conf->options->rpc.probe_interval = probe_interval; + if (conf->ab.backup) { + replication_set_db(db->db, conf->source, conf->ab.sync_exclude, + server_uuid, &conf->options->rpc); + } } - unixctl_command_reply(conn, NULL); - } else { - unixctl_command_reply( + } + + save_config(config); + unixctl_command_reply(conn, NULL); +} + +static void +ovsdb_server_set_relay_source_interval(struct unixctl_conn *conn, + int argc OVS_UNUSED, + const char *argv[], + void *config_) +{ + struct server_config *config = config_; + struct shash_node *node; + int probe_interval; + + if (check_config_file_on_unixctl(conn)) { + return; + } + + if (!str_to_int(argv[1], 10, &probe_interval)) { + unixctl_command_reply_error( conn, "Invalid probe interval, integer value expected"); + return; + } + + *config->relay_source_probe_interval = probe_interval; + + SHASH_FOR_EACH (node, config->all_dbs) { + struct db *db = node->data; + struct db_config *conf = db->config; + + if (conf->model == SM_RELAY) { + conf->options->rpc.probe_interval = probe_interval; + } } + + ovsdb_relay_set_probe_interval(probe_interval); + save_config(config); + + unixctl_command_reply(conn, NULL); } static void @@ -1456,21 +2041,40 @@ ovsdb_server_set_sync_exclude_tables(struct unixctl_conn *conn, void *config_) { struct server_config *config 
= config_; + struct shash_node *node; - char *err = set_excluded_tables(argv[1], true); - if (!err) { - free(*config->sync_exclude); - *config->sync_exclude = xstrdup(argv[1]); - save_config(config); - if (*config->is_backup) { - const struct uuid *server_uuid; - server_uuid = ovsdb_jsonrpc_server_get_uuid(config->jsonrpc); - ovsdb_replication_init(*config->sync_from, *config->sync_exclude, - config->all_dbs, server_uuid, - *config->replication_probe_interval); + if (check_config_file_on_unixctl(conn)) { + return; + } + + char *err = parse_excluded_tables(argv[1]); + if (err) { + goto exit; + } + + const struct uuid *server_uuid; + server_uuid = ovsdb_jsonrpc_server_get_uuid(config->jsonrpc); + + free(*config->sync_exclude); + *config->sync_exclude = xstrdup(argv[1]); + + SHASH_FOR_EACH (node, config->all_dbs) { + struct db *db = node->data; + struct db_config *conf = db->config; + + if (conf->model == SM_ACTIVE_BACKUP) { + free(conf->ab.sync_exclude); + conf->ab.sync_exclude = xstrdup(argv[1]); + if (conf->ab.backup) { + replication_set_db(db->db, conf->source, conf->ab.sync_exclude, + server_uuid, &conf->options->rpc); + } } - err = set_excluded_tables(argv[1], false); } + + save_config(config); + +exit: unixctl_command_reply(conn, err); free(err); } @@ -1479,11 +2083,11 @@ static void ovsdb_server_get_sync_exclude_tables(struct unixctl_conn *conn, int argc OVS_UNUSED, const char *argv[] OVS_UNUSED, - void *arg_ OVS_UNUSED) + void *config_) { - char *reply = get_excluded_tables(); - unixctl_command_reply(conn, reply); - free(reply); + struct server_config *config = config_; + + unixctl_command_reply(conn, *config->sync_exclude); } static void @@ -1600,6 +2204,8 @@ ovsdb_server_memory_trim_on_compaction(struct unixctl_conn *conn, const char *argv[], void *arg OVS_UNUSED) { + bool old_trim_memory = trim_memory; + static bool have_logged = false; const char *command = argv[1]; #if !HAVE_DECL_MALLOC_TRIM @@ -1615,8 +2221,11 @@ 
ovsdb_server_memory_trim_on_compaction(struct unixctl_conn *conn, unixctl_command_reply_error(conn, "invalid argument"); return; } - VLOG_INFO("memory trimming after compaction %s.", - trim_memory ? "enabled" : "disabled"); + if (!have_logged || (trim_memory != old_trim_memory)) { + have_logged = true; + VLOG_INFO("memory trimming after compaction %s.", + trim_memory ? "enabled" : "disabled"); + } unixctl_command_reply(conn, NULL); } @@ -1632,6 +2241,28 @@ ovsdb_server_reconnect(struct unixctl_conn *conn, int argc OVS_UNUSED, unixctl_command_reply(conn, NULL); } +/* "ovsdb-server/reload": makes ovsdb-server open a configuration file on + * 'config_file_path', read it and sync the runtime configuration with it. */ +static void +ovsdb_server_reload(struct unixctl_conn *conn, int argc OVS_UNUSED, + const char *argv[] OVS_UNUSED, void *config_) +{ + struct server_config *config = config_; + + if (!config_file_path) { + unixctl_command_reply_error(conn, + "Configuration file was not specified on command line"); + return; + } + + if (!reconfigure_ovsdb_server(config)) { + unixctl_command_reply_error(conn, + "Configuration failed. See the log file for details."); + } else { + unixctl_command_reply(conn, NULL); + } +} + /* "ovsdb-server/add-remote REMOTE": adds REMOTE to the set of remotes that * ovsdb-server services. */ static void @@ -1646,12 +2277,16 @@ ovsdb_server_add_remote(struct unixctl_conn *conn, int argc OVS_UNUSED, const struct db *db; char *retval; + if (check_config_file_on_unixctl(conn)) { + return; + } + retval = (strncmp("db:", remote, 3) ? 
NULL : parse_db_column(config->all_dbs, remote, &db, &table, &column)); if (!retval) { - if (sset_add(config->remotes, remote)) { + if (add_remote(config->remotes, remote, NULL)) { save_config(config); } unixctl_command_reply(conn, NULL); @@ -1668,11 +2303,15 @@ ovsdb_server_remove_remote(struct unixctl_conn *conn, int argc OVS_UNUSED, const char *argv[], void *config_) { struct server_config *config = config_; - struct sset_node *node; + struct ovsdb_jsonrpc_options *options; + + if (check_config_file_on_unixctl(conn)) { + return; + } - node = sset_find(config->remotes, argv[1]); - if (node) { - sset_delete(config->remotes, node); + options = shash_find_and_delete(config->remotes, argv[1]); + if (options) { + ovsdb_jsonrpc_options_free(options); save_config(config); unixctl_command_reply(conn, NULL); } else { @@ -1685,15 +2324,15 @@ static void ovsdb_server_list_remotes(struct unixctl_conn *conn, int argc OVS_UNUSED, const char *argv[] OVS_UNUSED, void *remotes_) { - struct sset *remotes = remotes_; - const char **list, **p; + const struct shash *remotes = remotes_; + const struct shash_node **list; struct ds s; ds_init(&s); - list = sset_sort(remotes); - for (p = list; *p; p++) { - ds_put_format(&s, "%s\n", *p); + list = shash_sort(remotes); + for (size_t i = 0; i < shash_count(remotes); i++) { + ds_put_format(&s, "%s\n", list[i]->name); } free(list); @@ -1709,22 +2348,30 @@ ovsdb_server_add_database(struct unixctl_conn *conn, int argc OVS_UNUSED, { struct server_config *config = config_; const char *filename = argv[1]; + const struct shash_node *node; + struct shash db_conf; + + if (check_config_file_on_unixctl(conn)) { + return; + } + + shash_init(&db_conf); + add_database_config(&db_conf, filename, *config->sync_from, + *config->sync_exclude, !config->is_backup); + ovs_assert(shash_count(&db_conf) == 1); + node = shash_first(&db_conf); - char *error = ovsdb_error_to_string_free(open_db(config, filename)); + char *error = 
ovsdb_error_to_string_free(open_db(config, + node->name, node->data)); if (!error) { save_config(config); - if (*config->is_backup) { - const struct uuid *server_uuid; - server_uuid = ovsdb_jsonrpc_server_get_uuid(config->jsonrpc); - ovsdb_replication_init(*config->sync_from, *config->sync_exclude, - config->all_dbs, server_uuid, - *config->replication_probe_interval); - } unixctl_command_reply(conn, NULL); } else { unixctl_command_reply_error(conn, error); free(error); } + db_config_destroy(node->data); + shash_destroy(&db_conf); } static void @@ -1736,13 +2383,6 @@ remove_db(struct server_config *config, struct shash_node *node, char *comment) shash_delete(config->all_dbs, node); save_config(config); - if (*config->is_backup) { - const struct uuid *server_uuid; - server_uuid = ovsdb_jsonrpc_server_get_uuid(config->jsonrpc); - ovsdb_replication_init(*config->sync_from, *config->sync_exclude, - config->all_dbs, server_uuid, - *config->replication_probe_interval); - } } static void @@ -1752,6 +2392,10 @@ ovsdb_server_remove_database(struct unixctl_conn *conn, int argc OVS_UNUSED, struct server_config *config = config_; struct shash_node *node; + if (check_config_file_on_unixctl(conn)) { + return; + } + node = shash_find(config->all_dbs, argv[1]); if (!node) { unixctl_command_reply_error(conn, "Failed to find the database."); @@ -1878,13 +2522,34 @@ ovsdb_server_get_sync_status(struct unixctl_conn *conn, int argc OVS_UNUSED, const char *argv[] OVS_UNUSED, void *config_) { struct server_config *config = config_; - bool is_backup = *config->is_backup; struct ds ds = DS_EMPTY_INITIALIZER; + bool any_backup = false; + + const struct shash_node **db_nodes = shash_sort(config->all_dbs); + + for (size_t i = 0; i < shash_count(config->all_dbs); i++) { + const struct db *db = db_nodes[i]->data; + + if (db->config->model != SM_ACTIVE_BACKUP) { + continue; + } - ds_put_format(&ds, "state: %s\n", is_backup ? 
"backup" : "active"); + any_backup = true; + + ds_put_format(&ds, "database: %s\n", db->db->name); + ds_put_format(&ds, "state: %s\n", + db->config->ab.backup ? "backup" : "active"); + if (db->config->ab.backup) { + ds_put_and_free_cstr(&ds, replication_status(db->db)); + } + if (i + 1 < shash_count(config->all_dbs)) { + ds_put_char(&ds, '\n'); + } + } + free(db_nodes); - if (is_backup) { - ds_put_and_free_cstr(&ds, replication_status()); + if (!any_backup) { + ds_put_cstr(&ds, "state: active\n"); } unixctl_command_reply(conn, ds_cstr(&ds)); @@ -1928,7 +2593,7 @@ ovsdb_server_get_db_storage_status(struct unixctl_conn *conn, static void parse_options(int argc, char *argv[], - struct sset *db_filenames, struct sset *remotes, + struct shash *db_conf, struct shash *remotes, char **unixctl_pathp, char **run_command, char **sync_from, char **sync_exclude, bool *active) { @@ -1943,6 +2608,8 @@ parse_options(int argc, char *argv[], OPT_ACTIVE, OPT_NO_DBS, OPT_FILE_COLUMN_DIFF, + OPT_FILE_NO_DATA_CONVERSION, + OPT_CONFIG_FILE, VLOG_OPTION_ENUMS, DAEMON_OPTION_ENUMS, SSL_OPTION_ENUMS, @@ -1968,6 +2635,9 @@ parse_options(int argc, char *argv[], {"active", no_argument, NULL, OPT_ACTIVE}, {"no-dbs", no_argument, NULL, OPT_NO_DBS}, {"disable-file-column-diff", no_argument, NULL, OPT_FILE_COLUMN_DIFF}, + {"disable-file-no-data-conversion", no_argument, NULL, + OPT_FILE_NO_DATA_CONVERSION}, + {"config-file", required_argument, NULL, OPT_CONFIG_FILE}, {NULL, 0, NULL, 0}, }; char *short_options = ovs_cmdl_long_options_to_short_options(long_options); @@ -1975,8 +2645,8 @@ parse_options(int argc, char *argv[], *sync_from = NULL; *sync_exclude = NULL; - sset_init(db_filenames); - sset_init(remotes); + shash_init(db_conf); + shash_init(remotes); for (;;) { int c; @@ -1987,7 +2657,7 @@ parse_options(int argc, char *argv[], switch (c) { case OPT_REMOTE: - sset_add(remotes, optarg); + add_remote(remotes, optarg, NULL); break; case OPT_UNIXCTL: @@ -2045,7 +2715,7 @@ parse_options(int argc, 
char *argv[], break; case OPT_SYNC_EXCLUDE: { - char *err = set_excluded_tables(optarg, false); + char *err = parse_excluded_tables(optarg); if (err) { ovs_fatal(0, "%s", err); } @@ -2064,6 +2734,15 @@ parse_options(int argc, char *argv[], ovsdb_file_column_diff_disable(); break; + case OPT_FILE_NO_DATA_CONVERSION: + ovsdb_no_data_conversion_disable(); + break; + + case OPT_CONFIG_FILE: + config_file_path = abs_file_name(ovs_dbdir(), optarg); + add_default_db = false; + break; + case '?': exit(EXIT_FAILURE); @@ -2075,12 +2754,29 @@ parse_options(int argc, char *argv[], argc -= optind; argv += optind; - if (argc > 0) { + + if (config_file_path) { + if (*sync_from || *sync_exclude || *active) { + ovs_fatal(0, "--config-file is mutually exclusive with " + "--sync-from, --sync-exclude and --active"); + } + if (shash_count(remotes)) { + ovs_fatal(0, "--config-file is mutually exclusive with --remote"); + } + if (argc > 0) { + ovs_fatal(0, "Databases should be specified in a config file"); + } + } else if (argc > 0) { for (int i = 0; i < argc; i++) { - sset_add(db_filenames, argv[i]); + add_database_config(db_conf, argv[i], *sync_from, *sync_exclude, + *active); } } else if (add_default_db) { - sset_add_and_free(db_filenames, xasprintf("%s/conf.db", ovs_dbdir())); + char *filename = xasprintf("%s/conf.db", ovs_dbdir()); + + add_database_config(db_conf, filename, *sync_from, *sync_exclude, + *active); + free(filename); } } @@ -2095,6 +2791,12 @@ usage(void) printf("\nJSON-RPC options (may be specified any number of times):\n" " --remote=REMOTE connect or listen to REMOTE\n"); stream_usage("JSON-RPC", true, true, true); + printf("\nConfiguration file:\n" + " --config-file PATH Use configuration file as a source of\n" + " database and JSON-RPC configuration.\n" + " Mutually exclusive with the DATABASE,\n" + " JSON-RPC and Syncing options.\n" + " Assumes --no-dbs.\n"); daemon_usage(); vlog_usage(); replication_usage(); @@ -2102,6 +2804,7 @@ usage(void) printf("\nOther 
options:\n" " --run COMMAND run COMMAND as subprocess then exit\n" " --unixctl=SOCKET override default control socket name\n" + " --no-dbs do not add default database\n" " --disable-file-column-diff\n" " don't use column diff in database file\n" " -h, --help display this help message\n" @@ -2122,11 +2825,72 @@ sset_to_json(const struct sset *sset) return array; } +static struct json * +remotes_to_json(const struct shash *remotes) +{ + const struct shash_node *node; + struct json *json; + + json = json_object_create(); + SHASH_FOR_EACH (node, remotes) { + json_object_put(json, node->name, + ovsdb_jsonrpc_options_to_json(node->data, false)); + } + return json; +} + +static struct json * +db_config_to_json(const struct db_config *conf) +{ + struct json *json; + + json = json_object_create(); + + if (conf->model != SM_UNDEFINED) { + json_object_put(json, "service-model", + json_string_create( + service_model_to_string(conf->model))); + } + + if (conf->source) { + struct json *source = json_object_create(); + + json_object_put(source, conf->source, + ovsdb_jsonrpc_options_to_json(conf->options, true)); + json_object_put(json, "source", source); + } + + if (conf->model == SM_ACTIVE_BACKUP) { + if (conf->ab.sync_exclude) { + struct sset set = SSET_INITIALIZER(&set); + + sset_from_delimited_string(&set, conf->ab.sync_exclude, " ,"); + json_object_put(json, "exclude-tables", sset_to_json(&set)); + sset_destroy(&set); + } + json_object_put(json, "backup", json_boolean_create(conf->ab.backup)); + } + return json; +} + +static struct json * +databases_to_json(const struct shash *db_conf) +{ + const struct shash_node *node; + struct json *json; + + json = json_object_create(); + SHASH_FOR_EACH (node, db_conf) { + json_object_put(json, node->name, db_config_to_json(node->data)); + } + return json; +} + /* Truncates and replaces the contents of 'config_file' by a representation of - * 'remotes' and 'db_filenames'. 
*/ + * 'remotes', 'db_conf' and a few global replication paramaters. */ static void -save_config__(FILE *config_file, const struct sset *remotes, - const struct sset *db_filenames, const char *sync_from, +save_config__(FILE *config_file, const struct shash *remotes, + const struct shash *db_conf, const char *sync_from, const char *sync_exclude, bool is_backup) { struct json *obj; @@ -2138,8 +2902,9 @@ save_config__(FILE *config_file, const struct sset *remotes, } obj = json_object_create(); - json_object_put(obj, "remotes", sset_to_json(remotes)); - json_object_put(obj, "db_filenames", sset_to_json(db_filenames)); + json_object_put(obj, "remotes", remotes_to_json(remotes)); + json_object_put(obj, "databases", databases_to_json(db_conf)); + if (sync_from) { json_object_put(obj, "sync_from", json_string_create(sync_from)); } @@ -2165,58 +2930,232 @@ save_config__(FILE *config_file, const struct sset *remotes, static void save_config(struct server_config *config) { - struct sset db_filenames; struct shash_node *node; + struct shash db_conf; + + if (config_file_path) { + return; + } - sset_init(&db_filenames); + shash_init(&db_conf); SHASH_FOR_EACH (node, config->all_dbs) { struct db *db = node->data; + if (node->name[0] != '_') { - sset_add(&db_filenames, db->filename); + shash_add(&db_conf, db->filename, db->config); } } - save_config__(config->config_tmpfile, config->remotes, &db_filenames, + save_config__(config->config_tmpfile, config->remotes, &db_conf, *config->sync_from, *config->sync_exclude, *config->is_backup); - sset_destroy(&db_filenames); + shash_destroy(&db_conf); } -static void -sset_from_json(struct sset *sset, const struct json *array) +static bool +remotes_from_json(struct shash *remotes, const struct json *json) { - size_t i; + struct ovsdb_jsonrpc_options *options; + const struct shash_node *node; + const struct shash *object; + + free_remotes(remotes); - sset_clear(sset); + ovs_assert(json); + if (json->type == JSON_NULL) { + return true; + } + if 
(json->type != JSON_OBJECT) { + VLOG_WARN("config: 'remotes' is not a JSON object"); + return false; + } - ovs_assert(array->type == JSON_ARRAY); - for (i = 0; i < array->array.n; i++) { - const struct json *elem = array->array.elems[i]; - sset_add(sset, json_string(elem)); + object = json_object(json); + SHASH_FOR_EACH (node, object) { + options = ovsdb_jsonrpc_default_options(node->name); + shash_add(remotes, node->name, options); + + json = node->data; + if (json->type == JSON_OBJECT) { + ovsdb_jsonrpc_options_update_from_json(options, node->data, false); + } else if (json->type != JSON_NULL) { + VLOG_WARN("%s: JSON-RPC options are not a JSON object or null", + node->name); + free_remotes(remotes); + return false; + } } + + return true; } -/* Clears and replaces 'remotes' and 'dbnames' by a configuration read from - * 'config_file', which must have been previously written by save_config(). */ -static void -load_config(FILE *config_file, struct sset *remotes, struct sset *db_filenames, - char **sync_from, char **sync_exclude, bool *is_backup) +static struct db_config * +db_config_from_json(const char *name, const struct json *json) +{ + const struct json *model, *source, *sync_exclude, *backup; + struct db_config *conf = xzalloc(sizeof *conf); + struct ovsdb_parser parser; + struct ovsdb_error *error; + + conf->model = SM_UNDEFINED; + + ovs_assert(json); + if (json->type == JSON_NULL) { + return conf; + } + + ovsdb_parser_init(&parser, json, "database %s", name); + + model = ovsdb_parser_member(&parser, "service-model", + OP_STRING | OP_OPTIONAL); + if (model) { + conf->model = service_model_from_string(json_string(model)); + if (conf->model == SM_UNDEFINED) { + ovsdb_parser_raise_error(&parser, + "'%s' is not a valid service model", json_string(model)); + } + } + + if (conf->model == SM_ACTIVE_BACKUP) { + backup = ovsdb_parser_member(&parser, "backup", OP_BOOLEAN); + conf->ab.backup = backup ? 
json_boolean(backup) : false; + + sync_exclude = ovsdb_parser_member(&parser, "exclude-tables", + OP_ARRAY | OP_OPTIONAL); + if (sync_exclude) { + const struct json_array *exclude = json_array(sync_exclude); + struct sset set = SSET_INITIALIZER(&set); + + for (size_t i = 0; i < exclude->n; i++) { + if (exclude->elems[i]->type != JSON_STRING) { + ovsdb_parser_raise_error(&parser, + "'exclude-tables' must contain strings"); + break; + } + sset_add(&set, json_string(exclude->elems[i])); + } + conf->ab.sync_exclude = sset_join(&set, ",", ""); + sset_destroy(&set); + } + } + + if (conf->model == SM_ACTIVE_BACKUP || conf->model == SM_RELAY) { + enum ovsdb_parser_types type = OP_OBJECT; + + if (conf->model == SM_ACTIVE_BACKUP && !conf->ab.backup) { + /* Active database doesn't have to have a source. */ + type |= OP_OPTIONAL; + } + source = ovsdb_parser_member(&parser, "source", type); + + if (source && shash_count(json_object(source)) != 1) { + ovsdb_parser_raise_error(&parser, + "'source' should be an object with exactly one element"); + } else if (source) { + const struct shash_node *node = shash_first(json_object(source)); + const struct json *options; + + ovs_assert(node); + conf->source = xstrdup(node->name); + options = node->data; + + conf->options = get_jsonrpc_options(conf->source, conf->model); + + if (options->type == JSON_OBJECT) { + ovsdb_jsonrpc_options_update_from_json(conf->options, + options, true); + } else if (options->type != JSON_NULL) { + ovsdb_parser_raise_error(&parser, + "JSON-RPC options is not a JSON object or null"); + } + } + } + + error = ovsdb_parser_finish(&parser); + if (error) { + char *s = ovsdb_error_to_string_free(error); + + VLOG_WARN("%s", s); + free(s); + db_config_destroy(conf); + return NULL; + } + + return conf; +} + + +static bool +databases_from_json(struct shash *db_conf, const struct json *json) +{ + const struct shash_node *node; + const struct shash *object; + + free_database_configs(db_conf); + + ovs_assert(json); + if 
(json->type == JSON_NULL) { + return true; + } + if (json->type != JSON_OBJECT) { + VLOG_WARN("config: 'databases' is not a JSON object or null"); + } + + object = json_object(json); + SHASH_FOR_EACH (node, object) { + struct db_config *conf = db_config_from_json(node->name, node->data); + + if (conf) { + shash_add(db_conf, node->name, conf); + } else { + free_database_configs(db_conf); + return false; + } + } + return true; +} + +/* Clears and replaces 'remotes' and 'db_conf' by a configuration read from + * 'config_file', which must have been previously written by save_config() + * or provided by the user with --config-file. + * Returns 'true', if parsing was successful, 'false' otherwise. */ +static bool +load_config(FILE *config_file, struct shash *remotes, + struct shash *db_conf, char **sync_from, + char **sync_exclude, bool *is_backup) { struct json *json; if (fseek(config_file, 0, SEEK_SET) != 0) { - VLOG_FATAL("seek failed in temporary file (%s)", ovs_strerror(errno)); + VLOG_WARN("config: file seek failed (%s)", ovs_strerror(errno)); + return false; } json = json_from_stream(config_file); if (json->type == JSON_STRING) { - VLOG_FATAL("reading json failed (%s)", json_string(json)); + VLOG_WARN("config: reading JSON failed (%s)", json_string(json)); + json_destroy(json); + return false; + } + if (json->type != JSON_OBJECT) { + VLOG_WARN("configuration in a file must be a JSON object"); + json_destroy(json); + return false; } - ovs_assert(json->type == JSON_OBJECT); - sset_from_json(remotes, shash_find_data(json_object(json), "remotes")); - sset_from_json(db_filenames, - shash_find_data(json_object(json), "db_filenames")); + if (!remotes_from_json(remotes, + shash_find_data(json_object(json), "remotes"))) { + VLOG_WARN("config: failed to parse 'remotes'"); + json_destroy(json); + return false; + } + if (!databases_from_json(db_conf, shash_find_data(json_object(json), + "databases"))) { + VLOG_WARN("config: failed to parse 'databases'"); + 
free_remotes(remotes); + json_destroy(json); + return false; + } struct json *string; string = shash_find_data(json_object(json), "sync_from"); @@ -2227,7 +3166,9 @@ load_config(FILE *config_file, struct sset *remotes, struct sset *db_filenames, free(*sync_exclude); *sync_exclude = string ? xstrdup(json_string(string)) : NULL; - *is_backup = json_boolean(shash_find_data(json_object(json), "is_backup")); + struct json *boolean = shash_find_data(json_object(json), "is_backup"); + *is_backup = boolean ? json_boolean(boolean) : false; json_destroy(json); + return true; } diff --git a/ovsdb/ovsdb-tool.c b/ovsdb/ovsdb-tool.c index df2e373c3cd..facd680ff3f 100644 --- a/ovsdb/ovsdb-tool.c +++ b/ovsdb/ovsdb-tool.c @@ -304,7 +304,7 @@ do_create_cluster(struct ovs_cmdl_context *ctx) struct ovsdb *ovsdb = ovsdb_file_read(src_file_name, false); char *comment = xasprintf("created from %s", src_file_name); - data = ovsdb_to_txn_json(ovsdb, comment); + data = ovsdb_to_txn_json(ovsdb, comment, true); free(comment); schema = ovsdb_schema_clone(ovsdb->schema); ovsdb_destroy(ovsdb); @@ -359,7 +359,8 @@ write_standalone_db(const char *file_name, const char *comment, error = ovsdb_log_write_and_free(log, ovsdb_schema_to_json(db->schema)); if (!error) { - error = ovsdb_log_write_and_free(log, ovsdb_to_txn_json(db, comment)); + error = ovsdb_log_write_and_free(log, + ovsdb_to_txn_json(db, comment, true)); } ovsdb_log_close(log); @@ -1005,7 +1006,8 @@ raft_header_to_standalone_log(const struct raft_header *h, } static void -raft_record_to_standalone_log(const struct raft_record *r, +raft_record_to_standalone_log(const char *db_file_name, + const struct raft_record *r, struct ovsdb_log *db_log_data) { if (r->type == RAFT_REC_ENTRY) { @@ -1017,7 +1019,40 @@ raft_record_to_standalone_log(const struct raft_record *r, if (pa->n != 2) { ovs_fatal(0, "Incorrect raft record array length"); } + + struct json *schema_json = pa->elems[0]; struct json *data_json = pa->elems[1]; + + if 
(schema_json->type != JSON_NULL) { + /* This is a database conversion record. Reset the log and + * write the new schema. */ + struct ovsdb_schema *schema; + + check_ovsdb_error(ovsdb_schema_from_json(schema_json, &schema)); + + if (data_json->type == JSON_NULL) { + /* We have a conversion request with no data. There is no + * other way as to read back what we have and convert. */ + struct ovsdb *old_db, *new_db; + + check_ovsdb_error(ovsdb_log_commit_block(db_log_data)); + + old_db = ovsdb_file_read(db_file_name, false); + check_ovsdb_error(ovsdb_convert(old_db, schema, &new_db)); + ovsdb_destroy(old_db); + + pa->elems[1] = ovsdb_to_txn_json( + new_db, "converted by ovsdb-tool", true); + ovsdb_destroy(new_db); + + json_destroy(data_json); + data_json = pa->elems[1]; + } + + ovsdb_schema_destroy(schema); + check_ovsdb_error(ovsdb_log_reset(db_log_data)); + check_ovsdb_error(ovsdb_log_write(db_log_data, schema_json)); + } if (data_json->type != JSON_NULL) { check_ovsdb_error(ovsdb_log_write(db_log_data, data_json)); } @@ -1059,6 +1094,7 @@ do_show_log_cluster(struct ovsdb_log *log) free(s); } + json_destroy(json); putchar('\n'); } @@ -1635,7 +1671,8 @@ do_compare_versions(struct ovs_cmdl_context *ctx) } static void -do_convert_to_standalone(struct ovsdb_log *log, struct ovsdb_log *db_log_data) +do_convert_to_standalone(const char *db_file_name, + struct ovsdb_log *log, struct ovsdb_log *db_log_data) { for (unsigned int i = 0; ; i++) { struct json *json; @@ -1652,7 +1689,7 @@ do_convert_to_standalone(struct ovsdb_log *log, struct ovsdb_log *db_log_data) } else { struct raft_record r; check_ovsdb_error(raft_record_from_json(&r, json)); - raft_record_to_standalone_log(&r, db_log_data); + raft_record_to_standalone_log(db_file_name, &r, db_log_data); raft_record_uninit(&r); } json_destroy(json); @@ -1675,7 +1712,7 @@ do_cluster_standalone(struct ovs_cmdl_context *ctx) if (strcmp(ovsdb_log_get_magic(log), RAFT_MAGIC) != 0) { ovs_fatal(0, "Database is not clustered 
db.\n"); } - do_convert_to_standalone(log, db_log_data); + do_convert_to_standalone(db_file_name, log, db_log_data); check_ovsdb_error(ovsdb_log_commit_block(db_log_data)); ovsdb_log_close(db_log_data); ovsdb_log_close(log); diff --git a/ovsdb/ovsdb-util.c b/ovsdb/ovsdb-util.c index 303191dc87d..ec453789010 100644 --- a/ovsdb/ovsdb-util.c +++ b/ovsdb/ovsdb-util.c @@ -291,9 +291,15 @@ ovsdb_util_write_string_string_column(struct ovsdb_row *row, size_t i; column = ovsdb_table_schema_get_column(row->table->schema, column_name); + if (!column) { + VLOG_WARN("No %s column present in the %s table", + column_name, row->table->schema->name); + goto unwind; + } datum = ovsdb_util_get_datum(row, column_name, OVSDB_TYPE_STRING, OVSDB_TYPE_STRING, UINT_MAX); if (!datum) { +unwind: for (i = 0; i < n; i++) { free(keys[i]); free(values[i]); diff --git a/ovsdb/ovsdb.c b/ovsdb/ovsdb.c index 8cbefbe3d21..298616a64d0 100644 --- a/ovsdb/ovsdb.c +++ b/ovsdb/ovsdb.c @@ -39,10 +39,13 @@ #include "transaction.h" #include "transaction-forward.h" #include "trigger.h" +#include "unixctl.h" #include "openvswitch/vlog.h" VLOG_DEFINE_THIS_MODULE(ovsdb); +size_t n_weak_refs = 0; + struct ovsdb_schema * ovsdb_schema_create(const char *name, const char *version, const char *cksum) { @@ -175,6 +178,39 @@ ovsdb_is_valid_version(const char *s) return ovsdb_parse_version(s, &version); } +/* If set to 'true', database schema conversion operations in the storage + * may not contain the converted data, only the schema. Currently affects + * only the clustered storage. 
*/ +static bool use_no_data_conversion = true; + +static void +ovsdb_no_data_conversion_enable(struct unixctl_conn *conn, int argc OVS_UNUSED, + const char *argv[] OVS_UNUSED, + void *arg OVS_UNUSED) +{ + use_no_data_conversion = true; + unixctl_command_reply(conn, NULL); +} + +void +ovsdb_no_data_conversion_disable(void) +{ + if (!use_no_data_conversion) { + return; + } + use_no_data_conversion = false; + unixctl_command_register("ovsdb/file/no-data-conversion-enable", "", + 0, 0, ovsdb_no_data_conversion_enable, NULL); +} + +/* Returns true if the database storage allows conversion records without + * data specified. */ +bool +ovsdb_conversion_with_no_data_supported(const struct ovsdb *db) +{ + return use_no_data_conversion && ovsdb_storage_is_clustered(db->storage); +} + /* Returns the number of tables in 'schema''s root set. */ static size_t root_set_size(const struct ovsdb_schema *schema) @@ -428,6 +464,8 @@ ovsdb_create(struct ovsdb_schema *schema, struct ovsdb_storage *storage) db->n_atoms = 0; + db->read_only = false; + db->is_relay = false; ovs_list_init(&db->txn_forward_new); hmap_init(&db->txn_forward_sent); @@ -546,6 +584,8 @@ ovsdb_get_memory_usage(const struct ovsdb *db, struct simap *usage) if (db->storage) { ovsdb_storage_get_memory_usage(db->storage, usage); } + + simap_put(usage, "n-weak-refs", n_weak_refs); } struct ovsdb_table * @@ -585,7 +625,9 @@ compaction_thread(void *aux) struct json *data; VLOG_DBG("%s: Compaction thread started.", state->db->name); - data = ovsdb_to_txn_json(state->db, "compacting database online"); + data = ovsdb_to_txn_json(state->db, "compacting database online", + /* Do not allow shallow copies to avoid races. */ + false); state->data = json_serialized_object_create(data); json_destroy(data); @@ -633,7 +675,8 @@ ovsdb_snapshot(struct ovsdb *db, bool trim_memory OVS_UNUSED) if (!applied_index) { /* Parallel compaction is not supported for standalone databases. 
*/ state = xzalloc(sizeof *state); - state->data = ovsdb_to_txn_json(db, "compacting database online"); + state->data = ovsdb_to_txn_json(db, + "compacting database online", true); state->schema = ovsdb_schema_to_json(db->schema); } else if (ovsdb_snapshot_ready(db)) { xpthread_join(db->snap_state->thread, NULL); @@ -708,5 +751,8 @@ ovsdb_replace(struct ovsdb *dst, struct ovsdb *src) dst->rbac_role = ovsdb_get_table(dst, "RBAC_Role"); + /* Get statistics from the new database. */ + dst->n_atoms = src->n_atoms; + ovsdb_destroy(src); } diff --git a/ovsdb/ovsdb.h b/ovsdb/ovsdb.h index d05e7c64a69..325900bc6d3 100644 --- a/ovsdb/ovsdb.h +++ b/ovsdb/ovsdb.h @@ -114,6 +114,9 @@ struct ovsdb { size_t n_atoms; /* Total number of ovsdb atoms in the database. */ + bool read_only; /* If 'true', JSON-RPC clients are not allowed to change + * the data. */ + /* Relay mode. */ bool is_relay; /* True, if database is in relay mode. */ /* List that holds transactions waiting to be forwarded to the server. */ @@ -125,9 +128,16 @@ struct ovsdb { struct ovsdb_compaction_state *snap_state; }; +/* Total number of 'weak reference' objects in all databases + * and transactions. 
*/ +extern size_t n_weak_refs; + struct ovsdb *ovsdb_create(struct ovsdb_schema *, struct ovsdb_storage *); void ovsdb_destroy(struct ovsdb *); +void ovsdb_no_data_conversion_disable(void); +bool ovsdb_conversion_with_no_data_supported(const struct ovsdb *); + void ovsdb_get_memory_usage(const struct ovsdb *, struct simap *usage); struct ovsdb_table *ovsdb_get_table(const struct ovsdb *, const char *); diff --git a/ovsdb/raft-rpc.c b/ovsdb/raft-rpc.c index dd14d81091f..27c3aad99c4 100644 --- a/ovsdb/raft-rpc.c +++ b/ovsdb/raft-rpc.c @@ -283,6 +283,9 @@ raft_vote_request_to_jsonrpc(const struct raft_vote_request *rq, json_object_put(args, "leadership_transfer", json_boolean_create(true)); } + if (rq->is_prevote) { + json_object_put(args, "is_prevote", json_boolean_create(true)); + } } static void @@ -294,6 +297,8 @@ raft_vote_request_from_jsonrpc(struct ovsdb_parser *p, rq->last_log_term = raft_parse_required_uint64(p, "last_log_term"); rq->leadership_transfer = raft_parse_optional_boolean(p, "leadership_transfer") == 1; + rq->is_prevote + = raft_parse_optional_boolean(p, "is_prevote") == 1; } static void @@ -305,6 +310,9 @@ raft_format_vote_request(const struct raft_vote_request *rq, struct ds *s) if (rq->leadership_transfer) { ds_put_cstr(s, " leadership_transfer=true"); } + if (rq->is_prevote) { + ds_put_cstr(s, " is_prevote=true"); + } } /* raft_vote_reply. 
*/ @@ -326,6 +334,9 @@ raft_vote_reply_to_jsonrpc(const struct raft_vote_reply *rpy, { raft_put_uint64(args, "term", rpy->term); json_object_put_format(args, "vote", UUID_FMT, UUID_ARGS(&rpy->vote)); + if (rpy->is_prevote) { + json_object_put(args, "is_prevote", json_boolean_create(true)); + } } static void @@ -334,6 +345,7 @@ raft_vote_reply_from_jsonrpc(struct ovsdb_parser *p, { rpy->term = raft_parse_required_uint64(p, "term"); rpy->vote = raft_parse_required_uuid(p, "vote"); + rpy->is_prevote = raft_parse_optional_boolean(p, "is_prevote") == 1; } static void @@ -341,6 +353,9 @@ raft_format_vote_reply(const struct raft_vote_reply *rpy, struct ds *s) { ds_put_format(s, " term=%"PRIu64, rpy->term); ds_put_format(s, " vote="SID_FMT, SID_ARGS(&rpy->vote)); + if (rpy->is_prevote) { + ds_put_cstr(s, " is_prevote=true"); + } } /* raft_add_server_request */ @@ -1007,8 +1022,10 @@ raft_rpc_get_vote(const union raft_rpc *rpc) case RAFT_RPC_BECOME_LEADER: return NULL; - case RAFT_RPC_VOTE_REPLY: - return &raft_vote_reply_cast(rpc)->vote; + case RAFT_RPC_VOTE_REPLY: { + const struct raft_vote_reply *rpy = raft_vote_reply_cast(rpc); + return rpy->is_prevote ? NULL : &rpy->vote; + } default: OVS_NOT_REACHED(); diff --git a/ovsdb/raft-rpc.h b/ovsdb/raft-rpc.h index 221f24d0012..7677c35b4e0 100644 --- a/ovsdb/raft-rpc.h +++ b/ovsdb/raft-rpc.h @@ -125,12 +125,15 @@ struct raft_vote_request { uint64_t last_log_index; /* Index of candidate's last log entry. */ uint64_t last_log_term; /* Term of candidate's last log entry. */ bool leadership_transfer; /* True to override minimum election timeout. */ + bool is_prevote; /* True: pre-vote; False: real vote (default). */ }; struct raft_vote_reply { struct raft_rpc_common common; uint64_t term; /* Current term, for candidate to update itself. */ struct uuid vote; /* Server ID of vote. */ + bool is_prevote; /* Copy of the is_prevote from the request, + * primarily for validation. 
*/ }; struct raft_add_server_request { diff --git a/ovsdb/raft.c b/ovsdb/raft.c index b2361b1737a..9c3c351b5be 100644 --- a/ovsdb/raft.c +++ b/ovsdb/raft.c @@ -22,6 +22,7 @@ #include #include +#include "cooperative-multitasking.h" #include "hash.h" #include "jsonrpc.h" #include "lockfile.h" @@ -80,6 +81,7 @@ enum raft_failure_test { FT_STOP_RAFT_RPC, FT_TRANSFER_LEADERSHIP, FT_TRANSFER_LEADERSHIP_AFTER_SEND_APPEND_REQ, + FT_TRANSFER_LEADERSHIP_AFTER_STARTING_TO_ADD, }; static enum raft_failure_test failure_test; @@ -279,6 +281,7 @@ struct raft { /* Used for joining a cluster. */ bool joining; /* Attempting to join the cluster? */ struct sset remote_addresses; /* Addresses to try to find other servers. */ +#define RAFT_JOIN_TIMEOUT_MS 1000 long long int join_timeout; /* Time to re-send add server request. */ /* Used for leaving a cluster. */ @@ -305,6 +308,11 @@ struct raft { /* Candidates only. Reinitialized at start of election. */ int n_votes; /* Number of votes for me. */ + bool prevote_passed; /* Indicates if it passed pre-vote phase. + * Pre-vote mechanism is introduced in raft + * paper section 9.6. We implement it as a + * sub-state of candidate to minimize the + * change and keep backward compatibility. */ /* Followers and candidates only. 
*/ bool candidate_retrying; /* The earlier election timed-out and we are @@ -372,12 +380,14 @@ static void raft_become_follower(struct raft *); static void raft_reset_election_timer(struct raft *); static void raft_reset_ping_timer(struct raft *); static void raft_send_heartbeats(struct raft *); -static void raft_start_election(struct raft *, bool leadership_transfer); +static void raft_start_election(struct raft *, bool is_prevote, + bool leadership_transfer); static bool raft_truncate(struct raft *, uint64_t new_end); static void raft_get_servers_from_log(struct raft *, enum vlog_level); static void raft_get_election_timer_from_log(struct raft *); static bool raft_handle_write_error(struct raft *, struct ovsdb_error *); +static bool raft_has_uncommitted_configuration(const struct raft *); static void raft_run_reconfigure(struct raft *); @@ -987,10 +997,13 @@ raft_reset_election_timer(struct raft *raft) raft->election_timeout = raft->election_base + duration; } +#define RAFT_TIMER_THRESHOLD(t) (t / 3) + static void raft_reset_ping_timer(struct raft *raft) { - raft->ping_timeout = time_msec() + raft->election_timer / 3; + raft->ping_timeout = + time_msec() + RAFT_TIMER_THRESHOLD(raft->election_timer); } static void @@ -1005,8 +1018,13 @@ raft_conn_update_probe_interval(struct raft *raft, struct raft_conn *r_conn) * inactivity probe follower will just try to initiate election * indefinitely staying in 'candidate' role. And the leader will continue * to send heartbeats to the dead connection thinking that remote server - * is still part of the cluster. */ - int probe_interval = raft->election_timer + ELECTION_RANGE_MSEC; + * is still part of the cluster. + * + * While joining, the real value of the election timeout is not known to + * this server, so using the maximum. */ + int probe_interval = (raft->joining ? 
ELECTION_MAX_MSEC + : raft->election_timer) + + ELECTION_RANGE_MSEC; jsonrpc_session_set_probe_interval(r_conn->js, probe_interval); } @@ -1069,10 +1087,11 @@ raft_open(struct ovsdb_log *log, struct raft **raftp) /* If there's only one server, start an election right away so that the * cluster bootstraps quickly. */ if (hmap_count(&raft->servers) == 1) { - raft_start_election(raft, false); + /* No pre-vote needed since we are the only one. */ + raft_start_election(raft, false, false); } } else { - raft->join_timeout = time_msec() + 1000; + raft->join_timeout = time_msec() + RAFT_JOIN_TIMEOUT_MS; } raft_reset_ping_timer(raft); @@ -1250,10 +1269,30 @@ raft_transfer_leadership(struct raft *raft, const char *reason) return; } - struct raft_server *s; + struct raft_server **servers, *s; + uint64_t threshold = 0; + size_t n = 0, start, i; + + servers = xmalloc(hmap_count(&raft->servers) * sizeof *servers); + HMAP_FOR_EACH (s, hmap_node, &raft->servers) { - if (!uuid_equals(&raft->sid, &s->sid) - && s->phase == RAFT_PHASE_STABLE) { + if (uuid_equals(&raft->sid, &s->sid) + || s->phase != RAFT_PHASE_STABLE) { + continue; + } + if (s->match_index > threshold) { + threshold = s->match_index; + } + servers[n++] = s; + } + + start = n ? 
random_range(n) : 0; + +retry: + for (i = 0; i < n; i++) { + s = servers[(start + i) % n]; + + if (s->match_index >= threshold) { struct raft_conn *conn = raft_find_conn_by_sid(raft, &s->sid); if (!conn) { continue; @@ -1269,7 +1308,10 @@ raft_transfer_leadership(struct raft *raft, const char *reason) .term = raft->term, } }; - raft_send_to_conn(raft, &rpc, conn); + + if (!raft_send_to_conn(raft, &rpc, conn)) { + continue; + } raft_record_note(raft, "transfer leadership", "transferring leadership to %s because %s", @@ -1277,6 +1319,23 @@ raft_transfer_leadership(struct raft *raft, const char *reason) break; } } + + if (n && i == n && threshold) { + if (threshold > raft->commit_index) { + /* Failed to transfer to servers with the highest 'match_index'. + * Try other servers that are not behind the majority. */ + threshold = raft->commit_index; + } else { + /* Try any other server. It is safe, because they either have all + * the append requests queued up for them before the leadership + * transfer message or their connection is broken and we will not + * transfer anyway. */ + threshold = 0; + } + goto retry; + } + + free(servers); } /* Send a RemoveServerRequest to the rest of the servers in the cluster. @@ -1360,10 +1419,12 @@ void raft_take_leadership(struct raft *raft) { if (raft->role != RAFT_LEADER) { - raft_start_election(raft, true); + raft_start_election(raft, false, true); } } +static void raft_run_cb(void *arg); + /* Closes everything owned by 'raft' that might be visible outside the process: * network connections, commands, etc. This is part of closing 'raft'; it is * also used if 'raft' has failed in an unrecoverable way. */ @@ -1390,6 +1451,8 @@ raft_close__(struct raft *raft) LIST_FOR_EACH_SAFE (conn, list_node, &raft->conns) { raft_conn_close(conn); } + + cooperative_multitasking_remove(&raft_run_cb, raft); } /* Closes and frees 'raft'. 
@@ -1766,12 +1829,12 @@ raft_set_term(struct raft *raft, uint64_t term, const struct uuid *vote) return true; } -static void +static bool raft_accept_vote(struct raft *raft, struct raft_server *s, const struct uuid *vote) { if (uuid_equals(&s->vote, vote)) { - return; + return false; } if (!uuid_is_zero(&s->vote)) { static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1); @@ -1785,13 +1848,18 @@ raft_accept_vote(struct raft *raft, struct raft_server *s, s->vote = *vote; if (uuid_equals(vote, &raft->sid) && ++raft->n_votes > hmap_count(&raft->servers) / 2) { - raft_become_leader(raft); + return true; } + return false; } static void -raft_start_election(struct raft *raft, bool leadership_transfer) +raft_start_election(struct raft *raft, bool is_prevote, + bool leadership_transfer) { + /* Leadership transfer doesn't use pre-vote. */ + ovs_assert(!is_prevote || !leadership_transfer); + if (raft->leaving) { return; } @@ -1801,7 +1869,7 @@ raft_start_election(struct raft *raft, bool leadership_transfer) return; } - if (!raft_set_term(raft, raft->term + 1, &raft->sid)) { + if (!is_prevote && !raft_set_term(raft, raft->term + 1, &raft->sid)) { return; } @@ -1809,26 +1877,33 @@ raft_start_election(struct raft *raft, bool leadership_transfer) raft->leader_sid = UUID_ZERO; raft->role = RAFT_CANDIDATE; - /* If there was no leader elected since last election, we know we are - * retrying now. */ - raft->candidate_retrying = !raft->had_leader; - raft->had_leader = false; + raft->prevote_passed = !is_prevote; + + if (is_prevote || leadership_transfer) { + /* If there was no leader elected since last election, we know we are + * retrying now. 
*/ + raft->candidate_retrying = !raft->had_leader; + raft->had_leader = false; + + raft->election_start = time_msec(); + raft->election_won = 0; + } raft->n_votes = 0; - raft->election_start = time_msec(); - raft->election_won = 0; raft->leadership_transfer = leadership_transfer; static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); if (!VLOG_DROP_INFO(&rl)) { long long int now = time_msec(); + char *comment = is_prevote ? "pre-vote" : "vote"; if (now >= raft->election_timeout) { VLOG_INFO("term %"PRIu64": %lld ms timeout expired, " - "starting election", - raft->term, now - raft->election_base); + "starting election (%s)", + raft->term, now - raft->election_base, comment); } else { - VLOG_INFO("term %"PRIu64": starting election", raft->term); + VLOG_INFO("term %"PRIu64": starting election (%s)", + raft->term, comment); } } raft_reset_election_timer(raft); @@ -1853,6 +1928,7 @@ raft_start_election(struct raft *raft, bool leadership_transfer) ? raft->entries[raft->log_end - raft->log_start - 1].term : raft->snap.term), .leadership_transfer = leadership_transfer, + .is_prevote = is_prevote, }, }; if (failure_test != FT_DONT_SEND_VOTE_REQUEST) { @@ -1861,7 +1937,13 @@ raft_start_election(struct raft *raft, bool leadership_transfer) } /* Vote for ourselves. */ - raft_accept_vote(raft, me, &raft->sid); + if (raft_accept_vote(raft, me, &raft->sid)) { + /* We just started vote, so it shouldn't be accepted yet unless this is + * a one-node cluster. In such case we don't do pre-vote, and become + * leader immediately. 
*/ + ovs_assert(!is_prevote); + raft_become_leader(raft); + } } static void @@ -2041,10 +2123,10 @@ raft_run(struct raft *raft) raft_reset_election_timer(raft); } else { raft_become_follower(raft); - raft_start_election(raft, false); + raft_start_election(raft, true, false); } } else { - raft_start_election(raft, false); + raft_start_election(raft, hmap_count(&raft->servers) > 1, false); } } @@ -2054,7 +2136,7 @@ raft_run(struct raft *raft) } if (raft->joining && time_msec() >= raft->join_timeout) { - raft->join_timeout = time_msec() + 1000; + raft->join_timeout = time_msec() + RAFT_JOIN_TIMEOUT_MS; LIST_FOR_EACH (conn, list_node, &raft->conns) { raft_send_add_server_request(raft, conn); } @@ -2088,6 +2170,13 @@ raft_run(struct raft *raft) raft_reset_ping_timer(raft); } + uint64_t interval = raft->joining + ? RAFT_JOIN_TIMEOUT_MS + : RAFT_TIMER_THRESHOLD(raft->election_timer); + cooperative_multitasking_set( + &raft_run_cb, (void *) raft, time_msec(), + interval + interval / 10, "raft_run"); + /* Do this only at the end; if we did it as soon as we set raft->left or * raft->failed in handling the RemoveServerReply, then it could easily * cause references to freed memory in RPC sessions, etc. 
*/ @@ -2096,6 +2185,14 @@ raft_run(struct raft *raft) } } +static void +raft_run_cb(void *arg) +{ + struct raft *raft = (struct raft *) arg; + + raft_run(raft); +} + static void raft_wait_session(struct jsonrpc_session *js) { @@ -2210,12 +2307,55 @@ raft_get_eid(const struct raft *raft, uint64_t index) return &raft->snap.eid; } -const struct uuid * +static const struct uuid * raft_current_eid(const struct raft *raft) { return raft_get_eid(raft, raft->log_end - 1); } +bool +raft_precheck_prereq(const struct raft *raft, const struct uuid *prereq) +{ + if (!uuid_equals(raft_current_eid(raft), prereq)) { + VLOG_DBG("%s: prerequisites (" UUID_FMT ") " + "do not match current eid (" UUID_FMT ")", + __func__, UUID_ARGS(prereq), + UUID_ARGS(raft_current_eid(raft))); + return false; + } + + /* Incomplete commands on a leader will not change the leader's current + * 'eid' on commit as they are already part of the leader's log. */ + if (raft->role == RAFT_LEADER) { + return true; + } + + /* Having incomplete commands on a follower means that the leader has + * these commands and they will change the prerequisites once added to + * the leader's log. + * + * There is a chance that all these commands will actually fail and the + * record with current prerequisites will in fact succeed, but, since + * these are our own commands, the chances are low. */ + struct raft_command *cmd; + HMAP_FOR_EACH (cmd, hmap_node, &raft->commands) { + /* Skip commands that are already part of the log (have non-zero + * index) and ones that do not carry any data (have zero 'eid'), + * as they can't change prerequisites. + * + * Database will not re-run triggers unless the data changes or + * one of the data-carrying triggers completes. So, pre-check must + * not fail if there are no outstanding data-carrying commands. 
*/ + if (!cmd->index && !uuid_is_zero(&cmd->eid)) { + VLOG_DBG("%s: follower still has an incomplete command " + UUID_FMT, __func__, UUID_ARGS(&cmd->eid)); + return false; + } + } + + return true; +} + static struct raft_command * raft_command_create_completed(enum raft_command_status status) { @@ -2649,15 +2789,22 @@ raft_become_follower(struct raft *raft) * new configuration. Our AppendEntries processing will properly update * the server configuration later, if necessary. * + * However, since we're sending replies about a failure to add, those new + * servers has to be cleaned up. Otherwise, they will stuck in a 'CATCHUP' + * phase in case this server regains leadership before they join through + * the current new leader. They are not yet in 'raft->servers', so not + * part of the shared configuration. + * * Also we do not complete commands here, as they can still be completed * if their log entries have already been replicated to other servers. * If the entries were actually committed according to the new leader, our * AppendEntries processing will complete the corresponding commands. */ struct raft_server *s; - HMAP_FOR_EACH (s, hmap_node, &raft->add_servers) { + HMAP_FOR_EACH_POP (s, hmap_node, &raft->add_servers) { raft_send_add_server_reply__(raft, &s->sid, s->address, false, RAFT_SERVER_LOST_LEADERSHIP); + raft_server_destroy(s); } if (raft->remove_server) { raft_send_remove_server_reply__(raft, &raft->remove_server->sid, @@ -2721,6 +2868,13 @@ raft_send_heartbeats(struct raft *raft) raft_reset_ping_timer(raft); } +static void +raft_join_complete(struct raft *raft) +{ + raft->joining = false; + raft_update_probe_intervals(raft); +} + /* Initializes the fields in 's' that represent the leader's view of the * server. 
*/ static void @@ -2758,6 +2912,18 @@ raft_become_leader(struct raft *raft) raft_reset_election_timer(raft); raft_reset_ping_timer(raft); + if (raft->joining) { + /* It is possible that the server committing this one to the list of + * servers lost leadership before the entry is committed but after + * it was already replicated to majority of servers. In this case + * other servers will recognize this one as a valid cluster member + * and may transfer leadership to it and vote for it. This way + * we're becoming a cluster leader without receiving reply for a + * join request and will commit addition of this server ourselves. */ + VLOG_INFO_RL(&rl, "elected as leader while joining"); + raft_join_complete(raft); + } + struct raft_server *s; HMAP_FOR_EACH (s, hmap_node, &raft->servers) { raft_server_init_leader(raft, s); @@ -2916,12 +3082,12 @@ raft_update_commit_index(struct raft *raft, uint64_t new_commit_index) } while (raft->commit_index < new_commit_index) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); uint64_t index = ++raft->commit_index; const struct raft_entry *e = raft_get_entry(raft, index); if (raft_entry_has_data(e)) { struct raft_command *cmd = raft_find_command_by_eid(raft, &e->eid); - static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); if (cmd) { if (!cmd->index && raft->role == RAFT_LEADER) { @@ -2965,6 +3131,35 @@ raft_update_commit_index(struct raft *raft, uint64_t new_commit_index) * reallocate raft->entries, which would invalidate 'e', so * this case must be last, after the one for 'e->data'. 
*/ raft_run_reconfigure(raft); + } else if (e->servers && !raft_has_uncommitted_configuration(raft)) { + struct ovsdb_error *error; + struct raft_server *s; + struct hmap servers; + + error = raft_servers_from_json(e->servers, &servers); + ovs_assert(!error); + HMAP_FOR_EACH (s, hmap_node, &servers) { + struct raft_server *server = raft_find_server(raft, &s->sid); + + if (server && server->phase == RAFT_PHASE_COMMITTING) { + /* This server lost leadership while committing + * server 's', but it was committed later by a + * new leader. */ + server->phase = RAFT_PHASE_STABLE; + } + + if (raft->joining && uuid_equals(&s->sid, &raft->sid)) { + /* Leadership change happened before previous leader + * could commit the change of a servers list, but it + * was replicated and a new leader committed it. */ + VLOG_INFO_RL(&rl, + "added to configuration without reply " + "(eid: "UUID_FMT", commit index: %"PRIu64")", + UUID_ARGS(&e->eid), index); + raft_join_complete(raft); + } + } + raft_servers_destroy(&servers); } } @@ -3673,6 +3868,10 @@ raft_handle_vote_request__(struct raft *raft, return false; } + if (rq->is_prevote) { + return true; + } + /* Record a vote for the peer. */ if (!raft_set_term(raft, raft->term, &rq->common.sid)) { return false; @@ -3685,7 +3884,7 @@ raft_handle_vote_request__(struct raft *raft, static void raft_send_vote_reply(struct raft *raft, const struct uuid *dst, - const struct uuid *vote) + const struct uuid *vote, bool is_prevote) { union raft_rpc rpy = { .vote_reply = { @@ -3695,6 +3894,7 @@ raft_send_vote_reply(struct raft *raft, const struct uuid *dst, }, .term = raft->term, .vote = *vote, + .is_prevote = is_prevote, }, }; raft_send(raft, &rpy); @@ -3705,7 +3905,9 @@ raft_handle_vote_request(struct raft *raft, const struct raft_vote_request *rq) { if (raft_handle_vote_request__(raft, rq)) { - raft_send_vote_reply(raft, &rq->common.sid, &raft->vote); + raft_send_vote_reply(raft, &rq->common.sid, + rq->is_prevote ? 
&rq->common.sid : &raft->vote, + rq->is_prevote); } } @@ -3723,7 +3925,14 @@ raft_handle_vote_reply(struct raft *raft, struct raft_server *s = raft_find_peer(raft, &rpy->common.sid); if (s) { - raft_accept_vote(raft, s, &rpy->vote); + if (raft_accept_vote(raft, s, &rpy->vote)) { + if (raft->prevote_passed) { + raft_become_leader(raft); + } else { + /* Start the real election. */ + raft_start_election(raft, false, false); + } + } } } @@ -3877,6 +4086,10 @@ raft_handle_add_server_request(struct raft *raft, "to cluster "CID_FMT, s->nickname, SID_ARGS(&s->sid), rq->address, CID_ARGS(&raft->cid)); raft_send_append_request(raft, s, 0, "initialize new server"); + + if (failure_test == FT_TRANSFER_LEADERSHIP_AFTER_STARTING_TO_ADD) { + failure_test = FT_TRANSFER_LEADERSHIP; + } } static void @@ -3891,7 +4104,7 @@ raft_handle_add_server_reply(struct raft *raft, } if (rpy->success) { - raft->joining = false; + raft_join_complete(raft); /* It is tempting, at this point, to check that this server is part of * the current configuration. 
However, this is not necessarily the @@ -4357,7 +4570,7 @@ raft_handle_become_leader(struct raft *raft, VLOG_INFO("received leadership transfer from %s in term %"PRIu64, raft_get_nickname(raft, &rq->common.sid, buf, sizeof buf), rq->term); - raft_start_election(raft, true); + raft_start_election(raft, false, true); } } @@ -4865,6 +5078,7 @@ raft_get_election_timer_from_log(struct raft *raft) break; } } + raft_update_probe_intervals(raft); } static void @@ -5002,6 +5216,8 @@ raft_unixctl_failure_test(struct unixctl_conn *conn OVS_UNUSED, } else if (!strcmp(test, "transfer-leadership-after-sending-append-request")) { failure_test = FT_TRANSFER_LEADERSHIP_AFTER_SEND_APPEND_REQ; + } else if (!strcmp(test, "transfer-leadership-after-starting-to-add")) { + failure_test = FT_TRANSFER_LEADERSHIP_AFTER_STARTING_TO_ADD; } else if (!strcmp(test, "transfer-leadership")) { failure_test = FT_TRANSFER_LEADERSHIP; } else if (!strcmp(test, "clear")) { diff --git a/ovsdb/raft.h b/ovsdb/raft.h index 403ed3dd732..5833aaf23b2 100644 --- a/ovsdb/raft.h +++ b/ovsdb/raft.h @@ -26,7 +26,8 @@ * ========== * * Based on Diego Ongaro's Ph.D. thesis, "Consensus: Bridging Theory and - * Practice", available at https://ramcloud.stanford.edu/~ongaro/thesis.pdf. + * Practice", available at + * https://github.com/ongardie/dissertation/blob/master/stanford.pdf. * References to sections, pages, and figures are from this thesis. 
Quotations * in comments also come from this work, in accordance with its license notice, * reproduced below: @@ -188,5 +189,5 @@ struct ovsdb_error *raft_store_snapshot(struct raft *, void raft_take_leadership(struct raft *); void raft_transfer_leadership(struct raft *, const char *reason); -const struct uuid *raft_current_eid(const struct raft *); +bool raft_precheck_prereq(const struct raft *, const struct uuid *prereq); #endif /* lib/raft.h */ diff --git a/ovsdb/relay.c b/ovsdb/relay.c index 9ff6ed8f393..71a5b8e1cec 100644 --- a/ovsdb/relay.c +++ b/ovsdb/relay.c @@ -127,7 +127,8 @@ static struct ovsdb_cs_ops relay_cs_ops = { void ovsdb_relay_add_db(struct ovsdb *db, const char *remote, schema_change_callback schema_change_cb, - void *schema_change_aux) + void *schema_change_aux, + const struct jsonrpc_session_options *options) { struct relay_ctx *ctx; @@ -138,6 +139,7 @@ ovsdb_relay_add_db(struct ovsdb *db, const char *remote, ctx = shash_find_data(&relay_dbs, db->name); if (ctx) { ovsdb_cs_set_remote(ctx->cs, remote, true); + ovsdb_cs_set_jsonrpc_options(ctx->cs, options); VLOG_DBG("%s: relay source set to '%s'", db->name, remote); return; } @@ -152,10 +154,23 @@ ovsdb_relay_add_db(struct ovsdb *db, const char *remote, shash_add(&relay_dbs, db->name, ctx); ovsdb_cs_set_leader_only(ctx->cs, false); ovsdb_cs_set_remote(ctx->cs, remote, true); + ovsdb_cs_set_jsonrpc_options(ctx->cs, options); VLOG_DBG("added database: %s, %s", db->name, remote); } +/* Updates the probe interval for all relay connections to the specified + * value. 
*/ +void +ovsdb_relay_set_probe_interval(int probe_interval) +{ + struct shash_node *node; + SHASH_FOR_EACH (node, &relay_dbs) { + struct relay_ctx *ctx = node->data; + ovsdb_cs_set_probe_interval(ctx->cs, probe_interval); + } +} + void ovsdb_relay_del_db(struct ovsdb *db) { @@ -301,6 +316,8 @@ static void ovsdb_relay_parse_update(struct relay_ctx *ctx, const struct ovsdb_cs_update_event *update) { + struct ovsdb_error *error = NULL; + if (!ctx->db) { return; } @@ -308,15 +325,27 @@ ovsdb_relay_parse_update(struct relay_ctx *ctx, if (update->monitor_reply && ctx->new_schema) { /* There was a schema change. Updating a database with a new schema * before processing monitor reply with the new data. */ - ctx->schema_change_cb(ctx->db, ctx->new_schema, - ctx->schema_change_aux); + error = ctx->schema_change_cb(ctx->db, ctx->new_schema, &UUID_ZERO, + false, ctx->schema_change_aux); + if (error) { + /* Should never happen, but handle this case anyway. */ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + char *s = ovsdb_error_to_string_free(error); + + VLOG_ERR_RL(&rl, "%s", s); + free(s); + + ovsdb_cs_flag_inconsistency(ctx->cs); + return; + } ovsdb_schema_destroy(ctx->new_schema); ctx->new_schema = NULL; } struct ovsdb_cs_db_update *du; - struct ovsdb_error *error = ovsdb_cs_parse_db_update(update->table_updates, - update->version, &du); + + error = ovsdb_cs_parse_db_update(update->table_updates, + update->version, &du); if (!error) { if (update->clear) { error = ovsdb_relay_clear(ctx->db); @@ -386,6 +415,7 @@ ovsdb_relay_run(void) } ovsdb_cs_event_destroy(event); } + ovsdb_txn_history_run(ctx->db); } } diff --git a/ovsdb/relay.h b/ovsdb/relay.h index 390ea70c827..19cd3ef602a 100644 --- a/ovsdb/relay.h +++ b/ovsdb/relay.h @@ -19,20 +19,33 @@ #include +#include "reconnect.h" + struct json; +struct jsonrpc_session_options; struct ovsdb; struct ovsdb_schema; +struct uuid; + +#define RELAY_SOURCE_DEFAULT_PROBE_INTERVAL RECONNECT_DEFAULT_PROBE_INTERVAL 
-typedef void (*schema_change_callback)(struct ovsdb *, - const struct ovsdb_schema *, void *aux); +typedef struct ovsdb_error *(*schema_change_callback)( + struct ovsdb *, + const struct ovsdb_schema *, + const struct uuid *, + bool conversion_with_no_data, + void *aux); void ovsdb_relay_add_db(struct ovsdb *, const char *remote, schema_change_callback schema_change_cb, - void *schema_change_aux); + void *schema_change_aux, + const struct jsonrpc_session_options *); void ovsdb_relay_del_db(struct ovsdb *); void ovsdb_relay_run(void); void ovsdb_relay_wait(void); +void ovsdb_relay_set_probe_interval(int probe_interval); + bool ovsdb_relay_is_connected(struct ovsdb *); #endif /* OVSDB_RELAY_H */ diff --git a/ovsdb/replication.c b/ovsdb/replication.c index 477c69d701b..56720cb105d 100644 --- a/ovsdb/replication.c +++ b/ovsdb/replication.c @@ -38,16 +38,7 @@ VLOG_DEFINE_THIS_MODULE(replication); -static char *sync_from; static struct uuid server_uuid; -static struct jsonrpc_session *session; -static unsigned int session_seqno = UINT_MAX; - -static struct jsonrpc_msg *create_monitor_request(struct ovsdb_schema *); -static void add_monitored_table(struct ovsdb_table_schema *table, - struct json *monitor_requests); - -static struct ovsdb_error *reset_database(struct ovsdb *db); static struct ovsdb_error *process_notification(struct json *, struct ovsdb *); static struct ovsdb_error *process_table_update(struct json *table_update, @@ -55,27 +46,6 @@ static struct ovsdb_error *process_table_update(struct json *table_update, struct ovsdb *database, struct ovsdb_txn *txn); -/* Maps from db name to sset of table names. */ -static struct shash excluded_tables = SHASH_INITIALIZER(&excluded_tables); - -static void excluded_tables_clear(void); -static void excluded_tables_add(const char *database, const char *table); -static bool excluded_tables_find(const char *database, const char *table); - - -/* Keep track of request IDs of all outstanding OVSDB requests. 
*/ -static struct hmap request_ids = HMAP_INITIALIZER(&request_ids); - -struct request_ids_hmap_node { - struct hmap_node hmap; - struct json *request_id; - struct ovsdb *db; /* associated database */ -}; -void request_ids_add(const struct json *id, struct ovsdb *db); -bool request_ids_lookup_and_free(const struct json *id, struct ovsdb **db); -static void request_ids_destroy(void); -void request_ids_clear(void); - enum ovsdb_replication_state { RPL_S_INIT, RPL_S_SERVER_ID_REQUESTED, @@ -85,168 +55,216 @@ enum ovsdb_replication_state { RPL_S_REPLICATING, RPL_S_ERR /* Error, no longer replicating. */ }; -static enum ovsdb_replication_state state; - struct replication_db { struct ovsdb *db; + bool schema_version_higher; /* Points to the schema received from the active server if * the local db schema version is higher. NULL otherwise. */ struct ovsdb_schema *active_db_schema; + + char *sync_from; + char *excluded_tables_str; + struct sset excluded_tables; + + struct json *request_id; /* Id of the outstanding OVSDB request. */ + + struct jsonrpc_session *session; + unsigned int session_seqno; + + enum ovsdb_replication_state state; }; static bool is_replication_possible(struct ovsdb_schema *local_db_schema, struct ovsdb_schema *active_db_schema); +static struct jsonrpc_msg *create_monitor_request(struct replication_db *, + struct ovsdb_schema *); +static void add_monitored_table(struct ovsdb_table_schema *table, + struct json *monitor_requests); + + /* All DBs known to ovsdb-server. The actual replication dbs are stored * in 'replication dbs', which is a subset of all dbs and remote dbs whose * schema matches. 
*/ -static struct shash local_dbs = SHASH_INITIALIZER(&local_dbs); -static struct shash *replication_dbs; +static struct shash replication_dbs = SHASH_INITIALIZER(&replication_dbs); + +static void replication_db_destroy(struct replication_db *); +static struct ovsdb_error *reset_database(struct replication_db *); -static struct shash *replication_dbs_create(void); -static void replication_dbs_destroy(void); /* Find 'struct ovsdb' by name within 'replication_dbs' */ static struct replication_db *find_db(const char *db_name); + +static char *set_excluded_tables(struct replication_db *, const char *excluded) + OVS_WARN_UNUSED_RESULT; + +static void request_id_set(struct replication_db *, const struct json *id); +static void request_id_clear(struct replication_db *); +static bool request_id_compare_and_free(struct replication_db *, + const struct json *id); void -replication_init(const char *sync_from_, const char *exclude_tables, - const struct uuid *server, int probe_interval) +replication_set_db(struct ovsdb *db, const char *sync_from, + const char *exclude_tables, const struct uuid *server, + const struct jsonrpc_session_options *options) { - free(sync_from); - sync_from = xstrdup(sync_from_); - /* Caller should have verified that the 'exclude_tables' is - * parseable. An error here is unexpected. */ - ovs_assert(!set_excluded_tables(exclude_tables, false)); + struct replication_db *rdb = find_db(db->name); - replication_dbs_destroy(); + if (uuid_is_zero(&server_uuid)) { + /* Keep a copy of local server uuid. 
*/ + server_uuid = *server; + } else { + ovs_assert(uuid_equals(&server_uuid, server)); + } - shash_clear(&local_dbs); - if (session) { - jsonrpc_session_close(session); + ovs_assert(sync_from); + + if (rdb + && nullable_string_is_equal(rdb->excluded_tables_str, exclude_tables) + && nullable_string_is_equal(rdb->sync_from, sync_from)) { + jsonrpc_session_set_options(rdb->session, options); + return; } - session = jsonrpc_session_open(sync_from, true); - session_seqno = UINT_MAX; + if (!rdb) { + rdb = xzalloc(sizeof *rdb); + rdb->db = db; + sset_init(&rdb->excluded_tables); + rdb->schema_version_higher = false; + shash_add(&replication_dbs, db->name, rdb); + } else { + replication_db_destroy(rdb); + } + + rdb->sync_from = xstrdup(sync_from); + rdb->excluded_tables_str = nullable_xstrdup(exclude_tables); + /* Caller should have verified that the 'exclude_tables' is + * parseable. An error here is unexpected. */ + ovs_assert(!set_excluded_tables(rdb, exclude_tables)); - jsonrpc_session_set_probe_interval(session, probe_interval); + rdb->session = jsonrpc_session_open(rdb->sync_from, true); + rdb->session_seqno = UINT_MAX; - /* Keep a copy of local server uuid. */ - server_uuid = *server; + jsonrpc_session_set_options(rdb->session, options); - state = RPL_S_INIT; + rdb->state = RPL_S_INIT; + rdb->db->read_only = true; } void -replication_add_local_db(const char *database, struct ovsdb *db) +replication_remove_db(const struct ovsdb *db) { - shash_add_assert(&local_dbs, database, db); + struct replication_db *rdb; + + rdb = shash_find_and_delete(&replication_dbs, db->name); + if (rdb) { + replication_db_destroy(rdb); + free(rdb); + } } static void -send_schema_requests(const struct json *result) +send_schema_request(struct replication_db *rdb) { - for (size_t i = 0; i < result->array.n; i++) { - const struct json *name = result->array.elems[i]; - if (name->type == JSON_STRING) { - /* Send one schema request for each remote DB. 
*/ - const char *db_name = json_string(name); - struct replication_db *rdb = find_db(db_name); - if (rdb) { - struct jsonrpc_msg *request = - jsonrpc_create_request( - "get_schema", - json_array_create_1( - json_string_create(db_name)), - NULL); - - request_ids_add(request->id, rdb->db); - jsonrpc_session_send(session, request); - } - } - } + struct jsonrpc_msg *request = + jsonrpc_create_request( + "get_schema", + json_array_create_1(json_string_create(rdb->db->name)), + NULL); + + request_id_set(rdb, request->id); + jsonrpc_session_send(rdb->session, request); } -void -replication_run(void) +static void +replication_run_db(struct replication_db *rdb) { - if (!session) { + if (!rdb->session) { return; } - jsonrpc_session_run(session); + jsonrpc_session_run(rdb->session); - for (int i = 0; jsonrpc_session_is_connected(session) && i < 50; i++) { + for (int i = 0; i < 50; i++) { struct jsonrpc_msg *msg; unsigned int seqno; - seqno = jsonrpc_session_get_seqno(session); - if (seqno != session_seqno || state == RPL_S_INIT) { - session_seqno = seqno; - request_ids_clear(); + if (!jsonrpc_session_is_connected(rdb->session)) { + break; + } + + seqno = jsonrpc_session_get_seqno(rdb->session); + if (seqno != rdb->session_seqno || rdb->state == RPL_S_INIT) { + rdb->session_seqno = seqno; + request_id_clear(rdb); + struct jsonrpc_msg *request; request = jsonrpc_create_request("get_server_id", json_array_create_empty(), NULL); - request_ids_add(request->id, NULL); - jsonrpc_session_send(session, request); + request_id_set(rdb, request->id); + jsonrpc_session_send(rdb->session, request); - state = RPL_S_SERVER_ID_REQUESTED; - VLOG_DBG("send server ID request."); + rdb->state = RPL_S_SERVER_ID_REQUESTED; + VLOG_DBG("%s: send server ID request.", rdb->db->name); } - msg = jsonrpc_session_recv(session); + msg = jsonrpc_session_recv(rdb->session); if (!msg) { continue; } - if (msg->type == JSONRPC_NOTIFY && state != RPL_S_ERR + if (msg->type == JSONRPC_NOTIFY && rdb->state != 
RPL_S_ERR && !strcmp(msg->method, "update")) { if (msg->params->type == JSON_ARRAY && msg->params->array.n == 2 && msg->params->array.elems[0]->type == JSON_STRING) { char *db_name = msg->params->array.elems[0]->string; - struct replication_db *rdb = find_db(db_name); - if (rdb) { + + if (!strcmp(db_name, rdb->db->name)) { struct ovsdb_error *error; error = process_notification(msg->params->array.elems[1], rdb->db); if (error) { ovsdb_error_assert(error); - state = RPL_S_ERR; + rdb->state = RPL_S_ERR; } + } else { + VLOG_WARN("%s: received update for unexpected database %s", + rdb->db->name, db_name); + rdb->state = RPL_S_ERR; } } } else if (msg->type == JSONRPC_REPLY) { - struct replication_db *rdb; - struct ovsdb *db; - if (!request_ids_lookup_and_free(msg->id, &db)) { - VLOG_WARN("received unexpected reply"); + if (!request_id_compare_and_free(rdb, msg->id)) { + VLOG_WARN("%s: received unexpected reply.", rdb->db->name); goto next; } - switch (state) { + switch (rdb->state) { case RPL_S_SERVER_ID_REQUESTED: { struct uuid uuid; if (msg->result->type != JSON_STRING || !uuid_from_string(&uuid, json_string(msg->result))) { struct ovsdb_error *error; error = ovsdb_error("get_server_id failed", - "Server ID is not valid UUID"); + "%s: Server ID is not valid UUID", + rdb->db->name); ovsdb_error_assert(error); - state = RPL_S_ERR; + rdb->state = RPL_S_ERR; break; } if (uuid_equals(&uuid, &server_uuid)) { struct ovsdb_error *error; error = ovsdb_error("Server ID check failed", - "Self replicating is not allowed"); + "%s: Self replicating is not allowed", + rdb->db->name); ovsdb_error_assert(error); - state = RPL_S_ERR; + rdb->state = RPL_S_ERR; break; } @@ -254,25 +272,32 @@ replication_run(void) request = jsonrpc_create_request("list_dbs", json_array_create_empty(), NULL); - request_ids_add(request->id, NULL); - jsonrpc_session_send(session, request); + request_id_set(rdb, request->id); + jsonrpc_session_send(rdb->session, request); - replication_dbs_destroy(); - 
replication_dbs = replication_dbs_create(); - state = RPL_S_DB_REQUESTED; + rdb->state = RPL_S_DB_REQUESTED; break; } case RPL_S_DB_REQUESTED: if (msg->result->type != JSON_ARRAY) { struct ovsdb_error *error; error = ovsdb_error("list_dbs failed", - "list_dbs response is not array"); + "%s: list_dbs response is not array", + rdb->db->name); + ovsdb_error_assert(error); + rdb->state = RPL_S_ERR; + } else if (!json_array_contains_string(msg->result, + rdb->db->name)) { + struct ovsdb_error *error; + error = ovsdb_error("list_dbs failed", + "%s: database name is not in the list", + rdb->db->name); ovsdb_error_assert(error); - state = RPL_S_ERR; + rdb->state = RPL_S_ERR; } else { - send_schema_requests(msg->result); - VLOG_DBG("Send schema requests"); - state = RPL_S_SCHEMA_REQUESTED; + send_schema_request(rdb); + VLOG_DBG("%s: send schema request.", rdb->db->name); + rdb->state = RPL_S_SCHEMA_REQUESTED; } break; @@ -283,19 +308,22 @@ replication_run(void) error = ovsdb_schema_from_json(msg->result, &schema); if (error) { ovsdb_error_assert(error); - state = RPL_S_ERR; + rdb->state = RPL_S_ERR; + break; } - rdb = find_db(schema->name); - if (!rdb) { + if (strcmp(rdb->db->name, schema->name)) { /* Unexpected schema. */ - VLOG_WARN("unexpected schema %s", schema->name); - state = RPL_S_ERR; + VLOG_WARN("%s: unexpected schema %s.", + rdb->db->name, schema->name); + rdb->state = RPL_S_ERR; + ovsdb_schema_destroy(schema); + break; } else if (!ovsdb_schema_equal(schema, rdb->db->schema)) { /* Schmea version mismatch. 
*/ - VLOG_INFO("Schema version mismatch, checking if %s can " - "still be replicated or not.", - schema->name); + VLOG_INFO("%s: Schema version mismatch, checking if %s can" + " still be replicated or not.", + rdb->db->name, schema->name); if (is_replication_possible(rdb->db->schema, schema)) { VLOG_INFO("%s can be replicated.", schema->name); rdb->schema_version_higher = true; @@ -305,68 +333,48 @@ replication_run(void) rdb->active_db_schema = schema; } else { VLOG_INFO("%s cannot be replicated.", schema->name); - struct replication_db *r = - shash_find_and_delete(replication_dbs, - schema->name); - if (r->active_db_schema) { - ovsdb_schema_destroy(r->active_db_schema); - } - free(r); + rdb->state = RPL_S_ERR; ovsdb_schema_destroy(schema); + break; } } else { ovsdb_schema_destroy(schema); } - /* After receiving schemas, reset the local databases that - * will be monitored and send out monitor requests for them. */ - if (hmap_is_empty(&request_ids)) { - struct shash_node *node; - - if (shash_is_empty(replication_dbs)) { - VLOG_WARN("Nothing to replicate."); - state = RPL_S_ERR; - } else { - SHASH_FOR_EACH (node, replication_dbs) { - rdb = node->data; - struct jsonrpc_msg *request = - create_monitor_request( - rdb->schema_version_higher ? - rdb->active_db_schema : rdb->db->schema); - - request_ids_add(request->id, rdb->db); - jsonrpc_session_send(session, request); - VLOG_DBG("Send monitor requests"); - state = RPL_S_MONITOR_REQUESTED; - } - } - } + /* Send out a monitor request. */ + struct jsonrpc_msg *request = + create_monitor_request(rdb, rdb->schema_version_higher + ? rdb->active_db_schema + : rdb->db->schema); + + request_id_set(rdb, request->id); + jsonrpc_session_send(rdb->session, request); + VLOG_DBG("%s: send monitor request.", rdb->db->name); + rdb->state = RPL_S_MONITOR_REQUESTED; break; } case RPL_S_MONITOR_REQUESTED: { /* Reply to monitor requests. */ struct ovsdb_error *error; - VLOG_INFO("Monitor request received. 
Resetting the database"); + VLOG_INFO("%s: Monitor reply received. " + "Resetting the database.", rdb->db->name); /* Resetting the database here has few risks. If the * process_notification() fails, the database is completely * lost locally. In case that node becomes active, then * there is a chance of complete data loss in the active/standy * cluster. */ - error = reset_database(db); + error = reset_database(rdb); if (!error) { - error = process_notification(msg->result, db); + error = process_notification(msg->result, rdb->db); } if (error) { ovsdb_error_assert(error); - state = RPL_S_ERR; + rdb->state = RPL_S_ERR; } else { - /* Transition to replicating state after receiving - * all replies of "monitor" requests. */ - if (hmap_is_empty(&request_ids)) { - VLOG_DBG("Listening to monitor updates"); - state = RPL_S_REPLICATING; - } + VLOG_DBG("%s: Listening to monitor updates.", + rdb->db->name); + rdb->state = RPL_S_REPLICATING; } break; } @@ -386,24 +394,40 @@ replication_run(void) } } +void +replication_run(void) +{ + struct shash_node *node; + + SHASH_FOR_EACH (node, &replication_dbs) { + replication_run_db(node->data); + } +} + void replication_wait(void) { - if (session) { - jsonrpc_session_wait(session); - jsonrpc_session_recv_wait(session); + struct shash_node *node; + + SHASH_FOR_EACH (node, &replication_dbs) { + struct replication_db *rdb = node->data; + + if (rdb->session) { + jsonrpc_session_wait(rdb->session); + jsonrpc_session_recv_wait(rdb->session); + } } } -/* Parse 'excluded' to rebuild 'excluded_tables'. If 'dryrun' is false, the - * current set of excluded tables will be wiped out, regardless of whether - * 'excluded' can be parsed. If 'dryrun' is true, only parses 'excluded' and +/* Parse 'excluded' to rebuild 'rdb->excluded_tables'. If 'rdb' is not NULL, + * the current set of excluded tables will be wiped out, regardless of whether + * 'excluded' can be parsed. 
If 'rdb' is NULL, only parses 'excluded' and * reports any errors, without modifying the list of exclusions. * - * On error, returns the error string, which the caller is - * responsible for freeing. Returns NULL otherwise. */ -char * OVS_WARN_UNUSED_RESULT -set_excluded_tables(const char *excluded, bool dryrun) + * On error, returns the error string, which the caller is responsible for + * freeing. Returns NULL otherwise. */ +static char * OVS_WARN_UNUSED_RESULT +set_excluded_tables__(struct replication_db *rdb, const char *excluded) { struct sset set = SSET_INITIALIZER(&set); char *err = NULL; @@ -411,17 +435,22 @@ set_excluded_tables(const char *excluded, bool dryrun) if (excluded) { const char *longname; - if (!dryrun) { - /* Can only add to an empty shash. */ - excluded_tables_clear(); + if (rdb) { + /* Can only add to an empty set. */ + sset_clear(&rdb->excluded_tables); } sset_from_delimited_string(&set, excluded, " ,"); SSET_FOR_EACH (longname, &set) { + if (rdb && !strchr(longname, ':')) { + sset_add(&rdb->excluded_tables, longname); + continue; + } + char *database = xstrdup(longname), *table = NULL; strtok_r(database, ":", &table); - if (table && !dryrun) { - excluded_tables_add(database, table); + if (table && rdb && !strcmp(rdb->db->name, database)) { + sset_add(&rdb->excluded_tables, table); } free(database); @@ -434,120 +463,74 @@ set_excluded_tables(const char *excluded, bool dryrun) done: sset_destroy(&set); - if (err && !dryrun) { + if (err && rdb) { /* On error, destroy the partially built 'excluded_tables'. 
*/ - excluded_tables_clear(); + sset_clear(&rdb->excluded_tables); } return err; } char * OVS_WARN_UNUSED_RESULT -get_excluded_tables(void) +parse_excluded_tables(const char *excluded) { - struct shash_node *node; - struct sset set = SSET_INITIALIZER(&set); - - SHASH_FOR_EACH (node, &excluded_tables) { - const char *database = node->name; - const char *table; - struct sset *tables = node->data; - - SSET_FOR_EACH (table, tables) { - sset_add_and_free(&set, xasprintf("%s:%s", database, table)); - } - } - - /* Output the table list in an sorted order, so that - * the output string will not depend on the hash function - * that used to implement the hmap data structure. This is - * only useful for writting unit tests. */ - const char **sorted = sset_sort(&set); - struct ds ds = DS_EMPTY_INITIALIZER; - size_t i; - for (i = 0; i < sset_count(&set); i++) { - ds_put_format(&ds, "%s,", sorted[i]); - } - - ds_chomp(&ds, ','); - - free(sorted); - sset_destroy(&set); - - return ds_steal_cstr(&ds); + return set_excluded_tables__(NULL, excluded); } -static void -excluded_tables_clear(void) +static char * OVS_WARN_UNUSED_RESULT +set_excluded_tables(struct replication_db *rdb, const char *excluded) { - struct shash_node *node; - SHASH_FOR_EACH (node, &excluded_tables) { - struct sset *tables = node->data; - sset_destroy(tables); - } - - shash_clear_free_data(&excluded_tables); + return set_excluded_tables__(rdb, excluded); } -static void -excluded_tables_add(const char *database, const char *table) +char * OVS_WARN_UNUSED_RESULT +get_excluded_tables(const struct ovsdb *db) { - struct sset *tables = shash_find_data(&excluded_tables, database); + const struct replication_db *rdb = find_db(db->name); - if (!tables) { - tables = xmalloc(sizeof *tables); - sset_init(tables); - shash_add(&excluded_tables, database, tables); + if (!rdb) { + return xstrdup(""); } - sset_add(tables, table); -} + struct sset set = SSET_INITIALIZER(&set); + const char *table; + char *result; -static bool 
-excluded_tables_find(const char *database, const char *table) -{ - struct sset *tables = shash_find_data(&excluded_tables, database); - return tables && sset_contains(tables, table); -} + SSET_FOR_EACH (table, &rdb->excluded_tables) { + sset_add_and_free(&set, xasprintf("%s:%s", rdb->db->name, table)); + } -void -disconnect_active_server(void) -{ - jsonrpc_session_close(session); - session = NULL; + result = sset_join(&set, ",", ""); + sset_destroy(&set); + + return result; } void replication_destroy(void) { - excluded_tables_clear(); - shash_destroy(&excluded_tables); + struct shash_node *node; - if (sync_from) { - free(sync_from); - sync_from = NULL; + SHASH_FOR_EACH (node, &replication_dbs) { + replication_db_destroy(node->data); } - - request_ids_destroy(); - replication_dbs_destroy(); - - shash_destroy(&local_dbs); + shash_destroy_free_data(&replication_dbs); } static struct replication_db * find_db(const char *db_name) { - return shash_find_data(replication_dbs, db_name); + return shash_find_data(&replication_dbs, db_name); } static struct ovsdb_error * -reset_database(struct ovsdb *db) +reset_database(struct replication_db *rdb) { - struct ovsdb_txn *txn = ovsdb_txn_create(db); + struct ovsdb_txn *txn = ovsdb_txn_create(rdb->db); struct shash_node *table_node; - SHASH_FOR_EACH (table_node, &db->tables) { + SHASH_FOR_EACH (table_node, &rdb->db->tables) { /* Delete all rows if the table is not excluded. */ - if (!excluded_tables_find(db->schema->name, table_node->name)) { + if (!sset_contains(&rdb->excluded_tables, table_node->name)) { struct ovsdb_table *table = table_node->data; struct ovsdb_row *row; HMAP_FOR_EACH_SAFE (row, hmap_node, &table->rows) { @@ -565,7 +548,7 @@ reset_database(struct ovsdb *db) * Caller is responsible for disposing 'request'. 
*/ static struct jsonrpc_msg * -create_monitor_request(struct ovsdb_schema *schema) +create_monitor_request(struct replication_db *rdb, struct ovsdb_schema *schema) { struct jsonrpc_msg *request; struct json *monitor; @@ -579,7 +562,7 @@ create_monitor_request(struct ovsdb_schema *schema) struct ovsdb_table_schema *table = nodes[j]->data; /* Monitor all tables not excluded. */ - if (!excluded_tables_find(db_name, table->name)) { + if (!sset_contains(&rdb->excluded_tables, table->name)) { add_monitored_table(table, monitor_request); } } @@ -689,114 +672,77 @@ process_table_update(struct json *table_update, const char *table_name, return NULL; } -void -request_ids_add(const struct json *id, struct ovsdb *db) +static void +request_id_set(struct replication_db *rdb, const struct json *id) { - struct request_ids_hmap_node *node = xmalloc(sizeof *node); + ovs_assert(!rdb->request_id); + rdb->request_id = json_clone(id); +} - node->request_id = json_clone(id); - node->db = db; - hmap_insert(&request_ids, &node->hmap, json_hash(id, 0)); +static void +request_id_clear(struct replication_db *rdb) +{ + json_destroy(rdb->request_id); + rdb->request_id = NULL; } -/* Look up 'id' from 'request_ids', if found, remove the found id from - * 'request_ids' and free its memory. If not found, 'request_ids' does - * not change. Sets '*db' to the database for the request (NULL if not - * found). +/* Compare 'id' with sent 'request_id'. If it matches, clear the current + * 'request_id'. If it doesn't match, 'request_id' does not change. * - * Return true if 'id' is found, false otherwise. + * Return true if 'id' matches, false otherwise. 
*/ -bool -request_ids_lookup_and_free(const struct json *id, struct ovsdb **db) +static bool +request_id_compare_and_free(struct replication_db *rdb, const struct json *id) { - struct request_ids_hmap_node *node; - - HMAP_FOR_EACH_WITH_HASH (node, hmap, json_hash(id, 0), &request_ids) { - if (json_equal(id, node->request_id)) { - hmap_remove(&request_ids, &node->hmap); - *db = node->db; - json_destroy(node->request_id); - free(node); - return true; - } + if (rdb->request_id && json_equal(id, rdb->request_id)) { + request_id_clear(rdb); + return true; } - - *db = NULL; return false; } static void -request_ids_destroy(void) +replication_db_destroy(struct replication_db *rdb) { - struct request_ids_hmap_node *node; - - HMAP_FOR_EACH_POP (node, hmap, &request_ids) { - json_destroy(node->request_id); - free(node); + if (!rdb) { + return; } - hmap_destroy(&request_ids); -} -void -request_ids_clear(void) -{ - request_ids_destroy(); - hmap_init(&request_ids); -} + free(rdb->sync_from); + rdb->sync_from = NULL; -static struct shash * -replication_dbs_create(void) -{ - struct shash *new = xmalloc(sizeof *new); - shash_init(new); + free(rdb->excluded_tables_str); + rdb->excluded_tables_str = NULL; + sset_destroy(&rdb->excluded_tables); - struct shash_node *node; - SHASH_FOR_EACH (node, &local_dbs) { - struct replication_db *repl_db = xmalloc(sizeof *repl_db); - repl_db->db = node->data; - repl_db->schema_version_higher = false; - repl_db->active_db_schema = NULL; - shash_add(new, node->name, repl_db); - } + request_id_clear(rdb); - return new; -} - -static void -replication_dbs_destroy(void) -{ - if (!replication_dbs) { - return; + if (rdb->session) { + jsonrpc_session_close(rdb->session); + rdb->session = NULL; } - struct shash_node *node; - - SHASH_FOR_EACH_SAFE (node, replication_dbs) { - hmap_remove(&replication_dbs->map, &node->node); - struct replication_db *rdb = node->data; - if (rdb->active_db_schema) { - ovsdb_schema_destroy(rdb->active_db_schema); - } - free(rdb); 
- free(node->name); - free(node); + if (rdb->active_db_schema) { + ovsdb_schema_destroy(rdb->active_db_schema); + rdb->active_db_schema = NULL; } - hmap_destroy(&replication_dbs->map); - free(replication_dbs); - replication_dbs = NULL; + rdb->schema_version_higher = false; + rdb->db->read_only = false; } /* Return true if replication just started or is ongoing. * Return false if the connection failed, or the replication * was not able to start. */ bool -replication_is_alive(void) +replication_is_alive(const struct ovsdb *db) { - if (session) { - return jsonrpc_session_is_alive(session) && state != RPL_S_ERR; + const struct replication_db *rdb = find_db(db->name); + + if (!rdb || !rdb->session) { + return false; } - return false; + return jsonrpc_session_is_alive(rdb->session) && rdb->state != RPL_S_ERR; } /* Return the last error reported on a connection by 'session'. The @@ -806,60 +752,59 @@ replication_is_alive(void) * Return a negative value if replication session has error, or the * replication was not able to start. */ int -replication_get_last_error(void) +replication_get_last_error(const struct ovsdb *db) { + const struct replication_db *rdb = find_db(db->name); int err = 0; - if (session) { - err = jsonrpc_session_get_last_error(session); + if (rdb && rdb->session) { + err = jsonrpc_session_get_last_error(rdb->session); if (!err) { - err = (state == RPL_S_ERR) ? ENOENT : 0; + err = (rdb->state == RPL_S_ERR) ? 
ENOENT : 0; } } return err; } -char * -replication_status(void) +char * OVS_WARN_UNUSED_RESULT +replication_status(const struct ovsdb *db) { - bool alive = session && jsonrpc_session_is_alive(session); + const struct replication_db *rdb = find_db(db->name); + + if (!rdb) { + return xasprintf("%s is not configured for replication", db->name); + } + + bool alive = rdb->session && jsonrpc_session_is_alive(rdb->session); struct ds ds = DS_EMPTY_INITIALIZER; if (alive) { - switch(state) { + switch (rdb->state) { case RPL_S_INIT: case RPL_S_SERVER_ID_REQUESTED: case RPL_S_DB_REQUESTED: case RPL_S_SCHEMA_REQUESTED: case RPL_S_MONITOR_REQUESTED: - ds_put_format(&ds, "connecting: %s", sync_from); + ds_put_format(&ds, "connecting: %s", rdb->sync_from); break; case RPL_S_REPLICATING: { - struct shash_node *node; - - ds_put_format(&ds, "replicating: %s\n", sync_from); - ds_put_cstr(&ds, "database:"); - SHASH_FOR_EACH (node, replication_dbs) { - ds_put_format(&ds, " %s,", node->name); - } - ds_chomp(&ds, ','); + ds_put_format(&ds, "replicating: %s\n", rdb->sync_from); - if (!shash_is_empty(&excluded_tables)) { - ds_put_char(&ds, '\n'); + if (!sset_is_empty(&rdb->excluded_tables)) { ds_put_cstr(&ds, "exclude: "); - ds_put_and_free_cstr(&ds, get_excluded_tables()); + ds_put_and_free_cstr(&ds, get_excluded_tables(db)); } break; } case RPL_S_ERR: - ds_put_format(&ds, "Replication to (%s) failed\n", sync_from); + ds_put_format(&ds, "Replication to (%s) failed", rdb->sync_from); break; default: OVS_NOT_REACHED(); } } else { - ds_put_format(&ds, "not connected to %s", sync_from); + ds_put_format(&ds, "not connected to %s", rdb->sync_from); } return ds_steal_cstr(&ds); } @@ -913,10 +858,12 @@ is_replication_possible(struct ovsdb_schema *local_db_schema, } void -replication_set_probe_interval(int probe_interval) +replication_set_probe_interval(const struct ovsdb *db, int probe_interval) { - if (session) { - jsonrpc_session_set_probe_interval(session, probe_interval); + const struct 
replication_db *rdb = find_db(db->name); + + if (rdb && rdb->session) { + jsonrpc_session_set_probe_interval(rdb->session, probe_interval); } } diff --git a/ovsdb/replication.h b/ovsdb/replication.h index 6d1be820f3d..38886b6be9b 100644 --- a/ovsdb/replication.h +++ b/ovsdb/replication.h @@ -20,47 +20,47 @@ #include struct ovsdb; +struct jsonrpc_session_options; /* Replication module runs when OVSDB server runs in the backup mode. * * API Usage *=========== * - * - replication_init() needs to be called whenever OVSDB server switches into + * - replication_set_db() needs to be called whenever database switches into * the backup mode. * - * - replication_add_local_db() should be called immediately after to add all - * known database that OVSDB server owns, one at a time. + * - replication_remove_db() needs to be called whenever backup database + * switches into an active mode. * * - replication_destroy() should be called when OVSDB server shutdown to * reclaim resources. * * - replication_run(), replication_wait(), replication_is_alive() and * replication_get_last_error() should be call within the main loop - * whenever OVSDB server runs in the backup mode. + * whenever OVSDB has backup databases. * - * - set_excluded_tables(), get_excluded_tables(), disconnect_active_server() - * and replication_usage() are support functions used mainly by unixctl - * commands. + * - parse_excluded_tables(), get_excluded_tables() and replication_usage() + * are support functions used mainly by unixctl commands. 
*/ #define REPLICATION_DEFAULT_PROBE_INTERVAL 60000 -void replication_init(const char *sync_from, const char *exclude_tables, - const struct uuid *server, int probe_interval); +void replication_set_db(struct ovsdb *, const char *sync_from, + const char *exclude_tables, const struct uuid *server, + const struct jsonrpc_session_options *); +void replication_remove_db(const struct ovsdb *); + void replication_run(void); void replication_wait(void); void replication_destroy(void); void replication_usage(void); -void replication_add_local_db(const char *databse, struct ovsdb *db); -bool replication_is_alive(void); -int replication_get_last_error(void); -char *replication_status(void); -void replication_set_probe_interval(int); +bool replication_is_alive(const struct ovsdb *); +int replication_get_last_error(const struct ovsdb *); +char *replication_status(const struct ovsdb *); +void replication_set_probe_interval(const struct ovsdb *, int probe_interval); -char *set_excluded_tables(const char *excluded, bool dryrun) - OVS_WARN_UNUSED_RESULT; -char *get_excluded_tables(void) OVS_WARN_UNUSED_RESULT; -void disconnect_active_server(void); +char *parse_excluded_tables(const char *excluded) OVS_WARN_UNUSED_RESULT; +char *get_excluded_tables(const struct ovsdb *) OVS_WARN_UNUSED_RESULT; #endif /* ovsdb/replication.h */ diff --git a/ovsdb/row.c b/ovsdb/row.c index 3f0bb8acf12..6b52509a91c 100644 --- a/ovsdb/row.c +++ b/ovsdb/row.c @@ -21,8 +21,9 @@ #include "openvswitch/dynamic-string.h" #include "openvswitch/json.h" -#include "ovsdb-error.h" #include "openvswitch/shash.h" +#include "ovsdb-error.h" +#include "ovsdb.h" #include "sort.h" #include "table.h" #include "util.h" @@ -78,6 +79,7 @@ ovsdb_weak_ref_clone(struct ovsdb_weak_ref *src) ovsdb_type_clone(&weak->type, &src->type); weak->column_idx = src->column_idx; weak->by_key = src->by_key; + n_weak_refs++; return weak; } @@ -130,6 +132,7 @@ ovsdb_weak_ref_destroy(struct ovsdb_weak_ref *weak) } 
ovsdb_type_destroy(&weak->type); free(weak); + n_weak_refs--; } struct ovsdb_row * @@ -299,12 +302,14 @@ ovsdb_row_columns_to_string(const struct ovsdb_row *row, struct ovsdb_error * ovsdb_row_from_json(struct ovsdb_row *row, const struct json *json, struct ovsdb_symbol_table *symtab, - struct ovsdb_column_set *included) + struct ovsdb_column_set *included, bool is_diff) { struct ovsdb_table_schema *schema = row->table->schema; struct ovsdb_error *error; struct shash_node *node; + ovs_assert(!is_diff || !symtab); + if (json->type != JSON_OBJECT) { return ovsdb_syntax_error(json, NULL, "row must be JSON object"); } @@ -321,8 +326,13 @@ ovsdb_row_from_json(struct ovsdb_row *row, const struct json *json, column_name, schema->name); } - error = ovsdb_datum_from_json(&datum, &column->type, node->data, - symtab); + if (is_diff) { + error = ovsdb_transient_datum_from_json(&datum, &column->type, + node->data); + } else { + error = ovsdb_datum_from_json(&datum, &column->type, node->data, + symtab); + } if (error) { return error; } @@ -396,7 +406,10 @@ ovsdb_row_set_add_row(struct ovsdb_row_set *set, const struct ovsdb_row *row) set->rows = x2nrealloc(set->rows, &set->allocated_rows, sizeof *set->rows); } - set->rows[set->n_rows++] = row; + + if (set->rows) { + set->rows[set->n_rows++] = row; + } } struct json * diff --git a/ovsdb/row.h b/ovsdb/row.h index ff91288fed3..6f5e58acb3f 100644 --- a/ovsdb/row.h +++ b/ovsdb/row.h @@ -114,7 +114,8 @@ void ovsdb_row_columns_to_string(const struct ovsdb_row *, struct ovsdb_error *ovsdb_row_from_json(struct ovsdb_row *, const struct json *, struct ovsdb_symbol_table *, - struct ovsdb_column_set *included) + struct ovsdb_column_set *included, + bool is_diff) OVS_WARN_UNUSED_RESULT; struct json *ovsdb_row_to_json(const struct ovsdb_row *, const struct ovsdb_column_set *include); @@ -129,6 +130,7 @@ ovsdb_row_get_uuid(const struct ovsdb_row *row) static inline struct uuid * ovsdb_row_get_uuid_rw(struct ovsdb_row *row) { + 
ovsdb_datum_unshare(&row->fields[OVSDB_COL_UUID], &ovsdb_type_uuid); return &row->fields[OVSDB_COL_UUID].keys[0].uuid; } @@ -141,6 +143,7 @@ ovsdb_row_get_version(const struct ovsdb_row *row) static inline struct uuid * ovsdb_row_get_version_rw(struct ovsdb_row *row) { + ovsdb_datum_unshare(&row->fields[OVSDB_COL_VERSION], &ovsdb_type_uuid); return &row->fields[OVSDB_COL_VERSION].keys[0].uuid; } diff --git a/ovsdb/storage.c b/ovsdb/storage.c index e8f95ce6428..c5aec545944 100644 --- a/ovsdb/storage.c +++ b/ovsdb/storage.c @@ -623,7 +623,7 @@ ovsdb_storage_store_snapshot(struct ovsdb_storage *storage, struct ovsdb_write * OVS_WARN_UNUSED_RESULT ovsdb_storage_write_schema_change(struct ovsdb_storage *storage, - const struct json *schema, + const struct ovsdb_schema *schema, const struct json *data, const struct uuid *prereq, struct uuid *resultp) @@ -633,13 +633,23 @@ ovsdb_storage_write_schema_change(struct ovsdb_storage *storage, if (storage->error) { w->error = ovsdb_error_clone(storage->error); } else if (storage->raft) { - struct json *txn_json = json_array_create_2(json_clone(schema), - json_clone(data)); - w->command = raft_command_execute(storage->raft, txn_json, - prereq, &result); - json_destroy(txn_json); + /* Clustered storage doesn't support ephemeral columns. 
*/ + w->error = ovsdb_schema_check_for_ephemeral_columns(schema); + if (!w->error) { + struct json *schema_json, *txn_json; + + schema_json = ovsdb_schema_to_json(schema); + txn_json = json_array_create_2(schema_json, json_clone(data)); + w->command = raft_command_execute(storage->raft, txn_json, + prereq, &result); + json_destroy(txn_json); + } } else if (storage->log) { - w->error = ovsdb_storage_store_snapshot__(storage, schema, data, 0); + struct json *schema_json = ovsdb_schema_to_json(schema); + + w->error = ovsdb_storage_store_snapshot__(storage, schema_json, + data, 0); + json_destroy(schema_json); } else { /* When 'error' and 'command' are both null, it indicates that the * command is complete. This is fine since this unbacked storage drops @@ -651,11 +661,12 @@ ovsdb_storage_write_schema_change(struct ovsdb_storage *storage, return w; } -const struct uuid * -ovsdb_storage_peek_last_eid(struct ovsdb_storage *storage) +bool +ovsdb_storage_precheck_prereq(const struct ovsdb_storage *storage, + const struct uuid *prereq) { if (!storage->raft) { - return NULL; + return true; } - return raft_current_eid(storage->raft); + return raft_precheck_prereq(storage->raft, prereq); } diff --git a/ovsdb/storage.h b/ovsdb/storage.h index a1fdaa564e4..7079ea261f8 100644 --- a/ovsdb/storage.h +++ b/ovsdb/storage.h @@ -85,7 +85,7 @@ struct ovsdb_error *ovsdb_storage_store_snapshot(struct ovsdb_storage *storage, struct ovsdb_write *ovsdb_storage_write_schema_change( struct ovsdb_storage *, - const struct json *schema, const struct json *data, + const struct ovsdb_schema *, const struct json *data, const struct uuid *prereq, struct uuid *result) OVS_WARN_UNUSED_RESULT; @@ -96,6 +96,9 @@ struct ovsdb_storage *ovsdb_storage_open_standalone(const char *filename, bool rw); struct ovsdb_schema *ovsdb_storage_read_schema(struct ovsdb_storage *); -const struct uuid *ovsdb_storage_peek_last_eid(struct ovsdb_storage *); +/* Checks that there is a chance for a record with specified 
prerequisites + * to be successfully written to the storage. */ +bool ovsdb_storage_precheck_prereq(const struct ovsdb_storage *, + const struct uuid *prereq); #endif /* ovsdb/storage.h */ diff --git a/ovsdb/table.c b/ovsdb/table.c index 66071ce2f88..3e89ddd44a0 100644 --- a/ovsdb/table.c +++ b/ovsdb/table.c @@ -368,7 +368,8 @@ ovsdb_table_execute_insert(struct ovsdb_txn *txn, const struct uuid *row_uuid, struct ovsdb_row *row = ovsdb_row_create(table); - struct ovsdb_error *error = ovsdb_row_from_json(row, json_row, NULL, NULL); + struct ovsdb_error *error = ovsdb_row_from_json(row, json_row, + NULL, NULL, false); if (!error) { *ovsdb_row_get_uuid_rw(row) = *row_uuid; ovsdb_txn_row_insert(txn, row); @@ -411,11 +412,13 @@ ovsdb_table_execute_update(struct ovsdb_txn *txn, const struct uuid *row_uuid, struct ovsdb_column_set columns = OVSDB_COLUMN_SET_INITIALIZER; struct ovsdb_row *update = ovsdb_row_create(table); struct ovsdb_error *error = ovsdb_row_from_json(update, json_row, - NULL, &columns); + NULL, &columns, xor); if (!error && (xor || !ovsdb_row_equal_columns(row, update, &columns))) { - error = ovsdb_row_update_columns(ovsdb_txn_row_modify(txn, row), - update, &columns, xor); + struct ovsdb_row *rw_row; + + ovsdb_txn_row_modify(txn, row, &rw_row, NULL); + error = ovsdb_row_update_columns(rw_row, update, &columns, xor); } ovsdb_column_set_destroy(&columns); diff --git a/ovsdb/transaction.c b/ovsdb/transaction.c index bb997b45b5d..65eca647837 100644 --- a/ovsdb/transaction.c +++ b/ovsdb/transaction.c @@ -34,6 +34,7 @@ #include "storage.h" #include "table.h" #include "uuid.h" +#include "util.h" VLOG_DEFINE_THIS_MODULE(transaction); @@ -71,6 +72,8 @@ struct ovsdb_txn_table { * 'new'. * * - A row modified by a transaction will have non-null 'old' and 'new'. + * It may have non-null 'diff' as well in this case, but it is not + * necessary. * * - 'old' and 'new' both null indicates that a row was added then deleted * within a single transaction. 
Most of the time we instead delete the @@ -82,6 +85,7 @@ struct ovsdb_txn_row { struct hmap_node hmap_node; /* In ovsdb_txn_table's txn_rows hmap. */ struct ovsdb_row *old; /* The old row. */ struct ovsdb_row *new; /* The new row. */ + struct ovsdb_row *diff; /* The difference between old and new. */ size_t n_refs; /* Number of remaining references. */ /* These members are the same as the corresponding members of 'old' or @@ -154,6 +158,7 @@ ovsdb_txn_row_abort(struct ovsdb_txn *txn OVS_UNUSED, { struct ovsdb_row *old = txn_row->old; struct ovsdb_row *new = txn_row->new; + struct ovsdb_row *diff = txn_row->diff; ovsdb_txn_row_prefree(txn_row); if (!old) { @@ -183,6 +188,7 @@ ovsdb_txn_row_abort(struct ovsdb_txn *txn OVS_UNUSED, } ovsdb_row_destroy(new); + ovsdb_row_destroy(diff); free(txn_row); return NULL; @@ -249,7 +255,10 @@ find_or_make_txn_row(struct ovsdb_txn *txn, const struct ovsdb_table *table, if (!txn_row) { const struct ovsdb_row *row = ovsdb_table_get_row(table, uuid); if (row) { - txn_row = ovsdb_txn_row_modify(txn, row)->txn_row; + struct ovsdb_row *rw_row; + + ovsdb_txn_row_modify(txn, row, &rw_row, NULL); + txn_row = rw_row->txn_row; } } return txn_row; @@ -321,7 +330,8 @@ update_row_ref_count(struct ovsdb_txn *txn, struct ovsdb_txn_row *r) const struct ovsdb_column *column = node->data; struct ovsdb_error *error; - if (bitmap_is_set(r->changed, column->index)) { + if (bitmap_is_set(r->changed, column->index) + && ovsdb_type_has_strong_refs(&column->type)) { if (r->old && !r->new) { error = ovsdb_txn_adjust_row_refs( txn, r->old, column, @@ -337,12 +347,13 @@ update_row_ref_count(struct ovsdb_txn *txn, struct ovsdb_txn_row *r) return error; } } else if (r->old && r->new) { - struct ovsdb_datum added, removed; + struct ovsdb_datum added, removed, *diff; + diff = r->diff ? 
&r->diff->fields[column->index] : NULL; ovsdb_datum_added_removed(&added, &removed, &r->old->fields[column->index], &r->new->fields[column->index], - &column->type); + diff, &column->type); error = ovsdb_txn_adjust_row_refs( txn, r->old, column, &removed, -1); @@ -431,6 +442,9 @@ delete_garbage_row(struct ovsdb_txn *txn, struct ovsdb_txn_row *txn_row) return NULL; } + ovsdb_row_destroy(txn_row->diff); + txn_row->diff = NULL; + row = txn_row->new; txn_row->new = NULL; hmap_remove(&txn_row->table->rows, &row->hmap_node); @@ -542,6 +556,7 @@ ovsdb_txn_row_commit(struct ovsdb_txn *txn OVS_UNUSED, txn_row->new->n_refs = txn_row->n_refs; } ovsdb_row_destroy(txn_row->old); + ovsdb_row_destroy(txn_row->diff); free(txn_row); return NULL; @@ -576,6 +591,7 @@ ovsdb_txn_update_weak_refs(struct ovsdb_txn *txn OVS_UNUSED, dst_row = CONST_CAST(struct ovsdb_row *, ovsdb_table_get_row(weak->dst_table, &weak->dst)); + ovs_assert(dst_row); ovs_assert(!ovsdb_row_find_weak_ref(dst_row, weak)); hmap_insert(&dst_row->dst_refs, &weak->dst_node, ovsdb_weak_ref_hash(weak)); @@ -587,7 +603,7 @@ ovsdb_txn_update_weak_refs(struct ovsdb_txn *txn OVS_UNUSED, } static void -add_weak_ref(struct ovsdb_txn_row *txn_row, const struct ovsdb_row *dst_, +add_weak_ref(const struct ovsdb_row *src, const struct ovsdb_row *dst_, struct ovs_list *ref_list, const union ovsdb_atom *key, const union ovsdb_atom *value, bool by_key, const struct ovsdb_column *column) @@ -595,13 +611,13 @@ add_weak_ref(struct ovsdb_txn_row *txn_row, const struct ovsdb_row *dst_, struct ovsdb_row *dst = CONST_CAST(struct ovsdb_row *, dst_); struct ovsdb_weak_ref *weak; - if (txn_row->new == dst) { + if (src == dst) { return; } weak = xzalloc(sizeof *weak); - weak->src_table = txn_row->new->table; - weak->src = *ovsdb_row_get_uuid(txn_row->new); + weak->src_table = src->table; + weak->src = *ovsdb_row_get_uuid(src); weak->dst_table = dst->table; weak->dst = *ovsdb_row_get_uuid(dst); ovsdb_type_clone(&weak->type, &column->type); @@ 
-613,10 +629,12 @@ add_weak_ref(struct ovsdb_txn_row *txn_row, const struct ovsdb_row *dst_, weak->column_idx = column->index; hmap_node_nullify(&weak->dst_node); ovs_list_push_back(ref_list, &weak->src_node); + + n_weak_refs++; } static void -find_and_add_weak_ref(struct ovsdb_txn_row *txn_row, +find_and_add_weak_ref(const struct ovsdb_row *src, const union ovsdb_atom *key, const union ovsdb_atom *value, const struct ovsdb_column *column, @@ -628,7 +646,7 @@ find_and_add_weak_ref(struct ovsdb_txn_row *txn_row, : ovsdb_table_get_row(column->type.value.uuid.refTable, &value->uuid); if (row) { - add_weak_ref(txn_row, row, ref_list, key, value, by_key, column); + add_weak_ref(src, row, ref_list, key, value, by_key, column); } else if (not_found) { if (uuid_is_zero(by_key ? &key->uuid : &value->uuid)) { *zero = true; @@ -637,11 +655,36 @@ find_and_add_weak_ref(struct ovsdb_txn_row *txn_row, } } +static void +find_and_add_weak_refs(const struct ovsdb_row *src, + const struct ovsdb_datum *datum, + const struct ovsdb_column *column, + struct ovs_list *ref_list, + struct ovsdb_datum *not_found, bool *zero) +{ + unsigned int i; + + if (ovsdb_base_type_is_weak_ref(&column->type.key)) { + for (i = 0; i < datum->n; i++) { + find_and_add_weak_ref(src, &datum->keys[i], + datum->values ? 
&datum->values[i] : NULL, + column, true, ref_list, not_found, zero); + } + } + + if (ovsdb_base_type_is_weak_ref(&column->type.value)) { + for (i = 0; i < datum->n; i++) { + find_and_add_weak_ref(src, &datum->keys[i], &datum->values[i], + column, false, ref_list, not_found, zero); + } + } +} + static struct ovsdb_error * OVS_WARN_UNUSED_RESULT assess_weak_refs(struct ovsdb_txn *txn, struct ovsdb_txn_row *txn_row) { struct ovsdb_weak_ref *weak; - struct ovsdb_table *table; + struct ovsdb_table *table = txn_row->table; struct shash_node *node; if (txn_row->old && !txn_row->new) { @@ -663,6 +706,15 @@ assess_weak_refs(struct ovsdb_txn *txn, struct ovsdb_txn_row *txn_row) ovs_assert(ovs_list_is_empty(&weak->src_node)); ovs_list_insert(&src_txn_row->deleted_refs, &weak->src_node); } + + /* Creating refs that needs to be removed on commit. */ + SHASH_FOR_EACH (node, &table->schema->columns) { + const struct ovsdb_column *column = node->data; + struct ovsdb_datum *datum = &txn_row->old->fields[column->index]; + + find_and_add_weak_refs(txn_row->old, datum, column, + &txn_row->deleted_refs, NULL, NULL); + } } if (!txn_row->new) { @@ -673,14 +725,17 @@ assess_weak_refs(struct ovsdb_txn *txn, struct ovsdb_txn_row *txn_row) return NULL; } - table = txn_row->table; SHASH_FOR_EACH (node, &table->schema->columns) { const struct ovsdb_column *column = node->data; struct ovsdb_datum *datum = &txn_row->new->fields[column->index]; struct ovsdb_datum added, removed, deleted_refs; - unsigned int orig_n, i; + unsigned int orig_n; bool zero = false; + if (!ovsdb_type_has_weak_refs(&column->type)) { + continue; + } + orig_n = datum->n; /* Collecting all key-value pairs that references deleted rows. */ @@ -696,39 +751,34 @@ assess_weak_refs(struct ovsdb_txn *txn, struct ovsdb_txn_row *txn_row) ovsdb_datum_sort_unique(&deleted_refs, &column->type); /* Removing elements that references deleted rows. 
*/ - ovsdb_datum_subtract(datum, &column->type, - &deleted_refs, &column->type); + if (deleted_refs.n) { + ovsdb_datum_subtract(datum, &column->type, + &deleted_refs, &column->type); + } ovsdb_datum_destroy(&deleted_refs, &column->type); /* Generating the difference between old and new data. */ - if (txn_row->old) { - ovsdb_datum_added_removed(&added, &removed, - &txn_row->old->fields[column->index], - datum, &column->type); - } else { - ovsdb_datum_init_empty(&removed); - ovsdb_datum_clone(&added, datum); + ovsdb_datum_init_empty(&added); + ovsdb_datum_init_empty(&removed); + if (datum->n != orig_n + || bitmap_is_set(txn_row->changed, column->index)) { + if (txn_row->old) { + struct ovsdb_datum *diff; + + diff = txn_row->diff && datum->n == orig_n + ? &txn_row->diff->fields[column->index] : NULL; + ovsdb_datum_added_removed(&added, &removed, + &txn_row->old->fields[column->index], + datum, diff, &column->type); + } else { + ovsdb_datum_clone(&added, datum); + } } /* Checking added data and creating new references. */ ovsdb_datum_init_empty(&deleted_refs); - if (ovsdb_base_type_is_weak_ref(&column->type.key)) { - for (i = 0; i < added.n; i++) { - find_and_add_weak_ref(txn_row, &added.keys[i], - added.values ? &added.values[i] : NULL, - column, true, &txn_row->added_refs, - &deleted_refs, &zero); - } - } - - if (ovsdb_base_type_is_weak_ref(&column->type.value)) { - for (i = 0; i < added.n; i++) { - find_and_add_weak_ref(txn_row, &added.keys[i], - &added.values[i], - column, false, &txn_row->added_refs, - &deleted_refs, &zero); - } - } + find_and_add_weak_refs(txn_row->new, &added, column, + &txn_row->added_refs, &deleted_refs, &zero); if (deleted_refs.n) { /* Removing all the references that doesn't point to valid rows. */ ovsdb_datum_sort_unique(&deleted_refs, &column->type); @@ -741,28 +791,16 @@ assess_weak_refs(struct ovsdb_txn *txn, struct ovsdb_txn_row *txn_row) /* Creating refs that needs to be removed on commit. 
This includes * both: the references that got directly removed from the datum and * references removed due to deletion of a referenced row. */ - if (ovsdb_base_type_is_weak_ref(&column->type.key)) { - for (i = 0; i < removed.n; i++) { - find_and_add_weak_ref(txn_row, &removed.keys[i], - removed.values - ? &removed.values[i] : NULL, - column, true, &txn_row->deleted_refs, - NULL, NULL); - } - } - - if (ovsdb_base_type_is_weak_ref(&column->type.value)) { - for (i = 0; i < removed.n; i++) { - find_and_add_weak_ref(txn_row, &removed.keys[i], - &removed.values[i], - column, false, &txn_row->deleted_refs, - NULL, NULL); - } - } + find_and_add_weak_refs(txn_row->new, &removed, column, + &txn_row->deleted_refs, NULL, NULL); ovsdb_datum_destroy(&removed, &column->type); if (datum->n != orig_n) { bitmap_set1(txn_row->changed, column->index); + /* Can no longer rely on the previous diff. */ + ovsdb_row_destroy(txn_row->diff); + txn_row->diff = NULL; + if (datum->n < column->type.n_min) { const struct uuid *row_uuid = ovsdb_row_get_uuid(txn_row->new); if (zero && !txn_row->old) { @@ -1161,6 +1199,7 @@ ovsdb_txn_destroy_cloned(struct ovsdb_txn *txn) if (r->new) { ovsdb_row_destroy(r->new); } + ovs_assert(!r->diff); hmap_remove(&t->txn_rows, &r->hmap_node); free(r); } @@ -1238,24 +1277,20 @@ struct ovsdb_txn_progress { bool ovsdb_txn_precheck_prereq(const struct ovsdb *db) { - const struct uuid *eid = ovsdb_storage_peek_last_eid(db->storage); - if (!eid) { - return true; - } - return uuid_equals(&db->prereq, eid); + return ovsdb_storage_precheck_prereq(db->storage, &db->prereq); } struct ovsdb_txn_progress * ovsdb_txn_propose_schema_change(struct ovsdb *db, - const struct json *schema, - const struct json *data) + const struct ovsdb_schema *schema, + const struct json *data, + struct uuid *txnid) { struct ovsdb_txn_progress *progress = xzalloc(sizeof *progress); progress->storage = db->storage; - struct uuid next; struct ovsdb_write *write = ovsdb_storage_write_schema_change( - 
db->storage, schema, data, &db->prereq, &next); + db->storage, schema, data, &db->prereq, txnid); if (!ovsdb_write_is_complete(write)) { progress->write = write; } else { @@ -1422,7 +1457,8 @@ ovsdb_txn_create_txn_table(struct ovsdb_txn *txn, struct ovsdb_table *table) static struct ovsdb_txn_row * ovsdb_txn_row_create(struct ovsdb_txn *txn, struct ovsdb_table *table, - const struct ovsdb_row *old_, struct ovsdb_row *new) + const struct ovsdb_row *old_, struct ovsdb_row *new, + struct ovsdb_row *diff) { const struct ovsdb_row *row = old_ ? old_ : new; struct ovsdb_row *old = CONST_CAST(struct ovsdb_row *, old_); @@ -1436,6 +1472,7 @@ ovsdb_txn_row_create(struct ovsdb_txn *txn, struct ovsdb_table *table, txn_row->table = row->table; txn_row->old = old; txn_row->new = new; + txn_row->diff = diff; txn_row->n_refs = old ? old->n_refs : 0; txn_row->serial = serial - 1; @@ -1448,6 +1485,9 @@ ovsdb_txn_row_create(struct ovsdb_txn *txn, struct ovsdb_table *table, if (new) { new->txn_row = txn_row; } + if (diff) { + diff->txn_row = txn_row; + } txn_table = ovsdb_txn_create_txn_table(txn, table); hmap_insert(&txn_table->txn_rows, &txn_row->hmap_node, @@ -1456,24 +1496,38 @@ ovsdb_txn_row_create(struct ovsdb_txn *txn, struct ovsdb_table *table, return txn_row; } -struct ovsdb_row * -ovsdb_txn_row_modify(struct ovsdb_txn *txn, const struct ovsdb_row *ro_row_) +void +ovsdb_txn_row_modify(struct ovsdb_txn *txn, const struct ovsdb_row *ro_row_, + struct ovsdb_row **rw_row, struct ovsdb_row **diff) { struct ovsdb_row *ro_row = CONST_CAST(struct ovsdb_row *, ro_row_); + ovs_assert(ro_row); + ovs_assert(rw_row); + if (ro_row->txn_row) { ovs_assert(ro_row == ro_row->txn_row->new); - return ro_row; + *rw_row = ro_row; + if (diff) { + *diff = ro_row->txn_row->diff; + } else { + /* Caller will modify the row without updating the diff. + * Destroy the existing diff, if any, so it will not be + * used for this row anymore. Modification will always + * return NULL from this point on. 
*/ + ovsdb_row_destroy(ro_row->txn_row->diff); + ro_row->txn_row->diff = NULL; + } } else { struct ovsdb_table *table = ro_row->table; - struct ovsdb_row *rw_row; - - rw_row = ovsdb_row_clone(ro_row); - rw_row->n_refs = ro_row->n_refs; - ovsdb_txn_row_create(txn, table, ro_row, rw_row); - hmap_replace(&table->rows, &ro_row->hmap_node, &rw_row->hmap_node); - return rw_row; + *rw_row = ovsdb_row_clone(ro_row); + (*rw_row)->n_refs = ro_row->n_refs; + if (diff) { + *diff = ovsdb_row_create(table); + } + ovsdb_txn_row_create(txn, table, ro_row, *rw_row, diff ? *diff : NULL); + hmap_replace(&table->rows, &ro_row->hmap_node, &(*rw_row)->hmap_node); } } @@ -1485,7 +1539,7 @@ ovsdb_txn_row_insert(struct ovsdb_txn *txn, struct ovsdb_row *row) uuid_generate(ovsdb_row_get_version_rw(row)); - ovsdb_txn_row_create(txn, table, NULL, row); + ovsdb_txn_row_create(txn, table, NULL, row, NULL); hmap_insert(&table->rows, &row->hmap_node, hash); } @@ -1501,9 +1555,11 @@ ovsdb_txn_row_delete(struct ovsdb_txn *txn, const struct ovsdb_row *row_) hmap_remove(&table->rows, &row->hmap_node); if (!txn_row) { - ovsdb_txn_row_create(txn, table, row, NULL); + ovsdb_txn_row_create(txn, table, row, NULL, NULL); } else { ovs_assert(txn_row->new == row); + ovsdb_row_destroy(txn_row->diff); + txn_row->diff = NULL; if (txn_row->old) { txn_row->new = NULL; } else { diff --git a/ovsdb/transaction.h b/ovsdb/transaction.h index 6b5bb7f24b2..f659838dc81 100644 --- a/ovsdb/transaction.h +++ b/ovsdb/transaction.h @@ -21,6 +21,8 @@ struct json; struct ovsdb; +struct ovsdb_row; +struct ovsdb_schema; struct ovsdb_table; struct uuid; @@ -41,15 +43,17 @@ struct ovsdb_error *ovsdb_txn_propose_commit_block(struct ovsdb_txn *, void ovsdb_txn_complete(struct ovsdb_txn *); struct ovsdb_txn_progress *ovsdb_txn_propose_schema_change( - struct ovsdb *, const struct json *schema, const struct json *data); + struct ovsdb *, const struct ovsdb_schema *, + const struct json *data, struct uuid *txnid); bool 
ovsdb_txn_progress_is_complete(const struct ovsdb_txn_progress *); const struct ovsdb_error *ovsdb_txn_progress_get_error( const struct ovsdb_txn_progress *); void ovsdb_txn_progress_destroy(struct ovsdb_txn_progress *); -struct ovsdb_row *ovsdb_txn_row_modify(struct ovsdb_txn *, - const struct ovsdb_row *); +void ovsdb_txn_row_modify(struct ovsdb_txn *, const struct ovsdb_row *, + struct ovsdb_row **row_new, + struct ovsdb_row **row_diff); void ovsdb_txn_row_insert(struct ovsdb_txn *, struct ovsdb_row *); void ovsdb_txn_row_delete(struct ovsdb_txn *, const struct ovsdb_row *); diff --git a/ovsdb/trigger.c b/ovsdb/trigger.c index 7d3003bca32..8c00fec181f 100644 --- a/ovsdb/trigger.c +++ b/ovsdb/trigger.c @@ -20,6 +20,7 @@ #include #include +#include "cooperative-multitasking.h" #include "file.h" #include "openvswitch/json.h" #include "jsonrpc.h" @@ -31,6 +32,7 @@ #include "transaction-forward.h" #include "openvswitch/vlog.h" #include "util.h" +#include "uuid.h" VLOG_DEFINE_THIS_MODULE(trigger); @@ -52,6 +54,7 @@ ovsdb_trigger_init(struct ovsdb_session *session, struct ovsdb *db, trigger->db = db; ovs_list_push_back(&trigger->db->triggers, &trigger->node); trigger->request = request; + trigger->converted_db = NULL; trigger->reply = NULL; trigger->progress = NULL; trigger->txn_forward = NULL; @@ -69,6 +72,7 @@ ovsdb_trigger_destroy(struct ovsdb_trigger *trigger) ovsdb_txn_progress_destroy(trigger->progress); ovsdb_txn_forward_destroy(trigger->db, trigger->txn_forward); ovs_list_remove(&trigger->node); + ovsdb_destroy(trigger->converted_db); jsonrpc_msg_destroy(trigger->request); jsonrpc_msg_destroy(trigger->reply); free(trigger->role); @@ -143,6 +147,30 @@ ovsdb_trigger_prereplace_db(struct ovsdb_trigger *trigger) } } +/* Find among incomplete triggers one that caused database conversion + * with specified transaction ID. 
*/ +struct ovsdb * +ovsdb_trigger_find_and_steal_converted_db(const struct ovsdb *db, + const struct uuid *txnid) +{ + struct ovsdb *converted_db = NULL; + struct ovsdb_trigger *t; + + if (uuid_is_zero(txnid)) { + return NULL; + } + + LIST_FOR_EACH_SAFE (t, node, &db->triggers) { + if (t->db == db && t->converted_db + && uuid_equals(&t->conversion_txnid, txnid)) { + converted_db = t->converted_db; + t->converted_db = NULL; + break; + } + } + return converted_db; +} + bool ovsdb_trigger_run(struct ovsdb *db, long long int now) { @@ -154,6 +182,8 @@ ovsdb_trigger_run(struct ovsdb *db, long long int now) bool disconnect_all = false; LIST_FOR_EACH_SAFE (t, node, &db->triggers) { + cooperative_multitasking_yield(); + if (run_triggers || now - t->created >= t->timeout_msec || t->progress || t->txn_forward) { @@ -200,7 +230,6 @@ ovsdb_trigger_try(struct ovsdb_trigger *t, long long int now) ovs_assert(!t->progress); struct ovsdb_txn *txn = NULL; - struct ovsdb *newdb = NULL; if (!strcmp(t->request->method, "transact")) { if (!ovsdb_txn_precheck_prereq(t->db)) { return false; @@ -252,6 +281,14 @@ ovsdb_trigger_try(struct ovsdb_trigger *t, long long int now) return false; } + if (t->read_only) { + trigger_convert_error( + t, ovsdb_error("not allowed", "conversion is not allowed " + "for read-only database %s", + t->db->schema->name)); + return false; + } + /* Validate parameters. */ const struct json *params = t->request->params; if (params->type != JSON_ARRAY || params->array.n != 2) { @@ -272,21 +309,28 @@ ovsdb_trigger_try(struct ovsdb_trigger *t, long long int now) new_schema->name, t->db->schema->name); } if (!error) { - error = ovsdb_convert(t->db, new_schema, &newdb); + ovsdb_destroy(t->converted_db); + error = ovsdb_convert(t->db, new_schema, &t->converted_db); } - ovsdb_schema_destroy(new_schema); if (error) { + ovsdb_schema_destroy(new_schema); trigger_convert_error(t, error); return false; } - /* Make the new copy into a transaction log record. 
*/ - struct json *txn_json = ovsdb_to_txn_json( - newdb, "converted by ovsdb-server"); + struct json *txn_json; + if (ovsdb_conversion_with_no_data_supported(t->db)) { + txn_json = json_null_create(); + } else { + /* Make the new copy into a transaction log record. */ + txn_json = ovsdb_to_txn_json( + t->converted_db, "converted by ovsdb-server", true); + } /* Propose the change. */ t->progress = ovsdb_txn_propose_schema_change( - t->db, new_schema_json, txn_json); + t->db, new_schema, txn_json, &t->conversion_txnid); + ovsdb_schema_destroy(new_schema); json_destroy(txn_json); t->reply = jsonrpc_create_reply(json_object_create(), t->request->id); @@ -307,13 +351,13 @@ ovsdb_trigger_try(struct ovsdb_trigger *t, long long int now) ovsdb_txn_progress_destroy(t->progress); t->progress = NULL; ovsdb_trigger_complete(t); - if (newdb) { - ovsdb_replace(t->db, newdb); + if (t->converted_db) { + ovsdb_replace(t->db, t->converted_db); + t->converted_db = NULL; return true; } return false; } - ovsdb_destroy(newdb); /* Fall through to the general handling for the "committing" state. We * abort the transaction--if and when it eventually commits, we'll read diff --git a/ovsdb/trigger.h b/ovsdb/trigger.h index d060c72e5c7..87ff4d0531b 100644 --- a/ovsdb/trigger.h +++ b/ovsdb/trigger.h @@ -17,6 +17,7 @@ #define OVSDB_TRIGGER_H 1 #include "openvswitch/list.h" +#include "openvswitch/uuid.h" struct ovsdb; @@ -54,6 +55,8 @@ struct ovsdb_trigger { struct ovs_list node; struct ovsdb_session *session; /* Session that owns this trigger. */ struct ovsdb *db; /* Database on which trigger acts. */ + struct ovsdb *converted_db; /* Result of the 'convert' request. */ + struct uuid conversion_txnid; /* txnid of the conversion request. */ struct jsonrpc_msg *request; /* Database request. */ struct jsonrpc_msg *reply; /* Result (null if none yet). 
*/ struct ovsdb_txn_progress *progress; @@ -77,6 +80,10 @@ void ovsdb_trigger_cancel(struct ovsdb_trigger *, const char *reason); void ovsdb_trigger_prereplace_db(struct ovsdb_trigger *); +struct ovsdb *ovsdb_trigger_find_and_steal_converted_db( + const struct ovsdb *, const struct uuid *) + OVS_WARN_UNUSED_RESULT; + bool ovsdb_trigger_run(struct ovsdb *, long long int now); void ovsdb_trigger_wait(struct ovsdb *, long long int now); diff --git a/python/.gitignore b/python/.gitignore index 60ace6f05b5..ad5486af838 100644 --- a/python/.gitignore +++ b/python/.gitignore @@ -1,2 +1,3 @@ dist/ *.egg-info +setup.py diff --git a/python/TODO.rst b/python/TODO.rst index 3a53489f128..acc5461e2f2 100644 --- a/python/TODO.rst +++ b/python/TODO.rst @@ -32,3 +32,10 @@ Python Bindings To-do List * Support write-only-changed monitor mode (equivalent of OVSDB_IDL_WRITE_CHANGED_ONLY). + +* socket_util: + + * Add equivalent functions to inet_parse_passive, parse_sockaddr_components, + et al. to better support using async dns. The reconnect code will + currently log a warning when inet_parse_active() returns w/o yet having + resolved an address, but will continue to connect and eventually succeed. diff --git a/python/automake.mk b/python/automake.mk index d00911828c6..d0523870d67 100644 --- a/python/automake.mk +++ b/python/automake.mk @@ -16,6 +16,7 @@ ovs_pyfiles = \ python/ovs/compat/sortedcontainers/sorteddict.py \ python/ovs/compat/sortedcontainers/sortedset.py \ python/ovs/daemon.py \ + python/ovs/dns_resolve.py \ python/ovs/db/__init__.py \ python/ovs/db/custom_index.py \ python/ovs/db/data.py \ @@ -55,6 +56,7 @@ ovs_pyfiles = \ ovs_pytests = \ python/ovs/tests/test_decoders.py \ + python/ovs/tests/test_dns_resolve.py \ python/ovs/tests/test_filter.py \ python/ovs/tests/test_kv.py \ python/ovs/tests/test_list.py \ @@ -64,34 +66,33 @@ ovs_pytests = \ # These python files are used at build time but not runtime, # so they are not installed.
EXTRA_DIST += \ - python/build/__init__.py \ - python/build/extract_ofp_fields.py \ - python/build/nroff.py \ - python/build/soutil.py + python/ovs_build_helpers/__init__.py \ + python/ovs_build_helpers/extract_ofp_fields.py \ + python/ovs_build_helpers/nroff.py \ + python/ovs_build_helpers/soutil.py # PyPI support. EXTRA_DIST += \ python/ovs/compat/sortedcontainers/LICENSE \ python/README.rst \ - python/setup.py \ python/test_requirements.txt # C extension support. EXTRA_DIST += python/ovs/_json.c -PYFILES = $(ovs_pyfiles) python/ovs/dirs.py $(ovstest_pyfiles) $(ovs_pytests) +PYFILES = $(ovs_pyfiles) python/ovs/dirs.py python/setup.py $(ovstest_pyfiles) $(ovs_pytests) EXTRA_DIST += $(PYFILES) PYCOV_CLEAN_FILES += $(PYFILES:.py=.py,cover) FLAKE8_PYFILES += \ - $(filter-out python/ovs/compat/% python/ovs/dirs.py,$(PYFILES)) \ - python/build/__init__.py \ - python/build/extract_ofp_fields.py \ - python/build/nroff.py \ - python/build/soutil.py \ + $(filter-out python/ovs/compat/% python/ovs/dirs.py python/setup.py,$(PYFILES)) \ + python/ovs_build_helpers/__init__.py \ + python/ovs_build_helpers/extract_ofp_fields.py \ + python/ovs_build_helpers/nroff.py \ + python/ovs_build_helpers/soutil.py \ python/ovs/dirs.py.template \ - python/setup.py + python/setup.py.template nobase_pkgdata_DATA = $(ovs_pyfiles) $(ovstest_pyfiles) ovs-install-data-local: @@ -110,11 +111,14 @@ ovs-install-data-local: $(INSTALL_DATA) python/ovs/dirs.py.tmp $(DESTDIR)$(pkgdatadir)/python/ovs/dirs.py rm python/ovs/dirs.py.tmp -python-sdist: $(srcdir)/python/ovs/version.py $(ovs_pyfiles) python/ovs/dirs.py - (cd python/ && $(PYTHON3) setup.py sdist) +.PHONY: python-sdist +python-sdist: $(srcdir)/python/ovs/version.py $(ovs_pyfiles) python/ovs/dirs.py python/setup.py + cd python/ && $(PYTHON3) -m build --sdist + +.PHONY: pypi-upload +pypi-upload: python-sdist + twine upload python/dist/ovs-$(VERSION).tar.gz -pypi-upload: $(srcdir)/python/ovs/version.py $(ovs_pyfiles) python/ovs/dirs.py - (cd 
python/ && $(PYTHON3) setup.py sdist upload) install-data-local: ovs-install-data-local UNINSTALL_LOCAL += ovs-uninstall-local @@ -124,8 +128,8 @@ ovs-uninstall-local: ALL_LOCAL += $(srcdir)/python/ovs/version.py $(srcdir)/python/ovs/version.py: config.status $(AM_V_GEN)$(ro_shell) > $(@F).tmp && \ - echo 'VERSION = "$(VERSION)"' >> $(@F).tmp && \ - if cmp -s $(@F).tmp $@; then touch $@; rm $(@F).tmp; else mv $(@F).tmp $@; fi + echo 'VERSION = "$(VERSION)$(VERSION_SUFFIX)"' >> $(@F).tmp && \ + if cmp -s $(@F).tmp $@; then touch $@; else cp $(@F).tmp $@; fi; rm $(@F).tmp ALL_LOCAL += $(srcdir)/python/ovs/dirs.py $(srcdir)/python/ovs/dirs.py: python/ovs/dirs.py.template @@ -142,6 +146,15 @@ $(srcdir)/python/ovs/dirs.py: python/ovs/dirs.py.template EXTRA_DIST += python/ovs/dirs.py.template CLEANFILES += python/ovs/dirs.py +ALL_LOCAL += $(srcdir)/python/setup.py +$(srcdir)/python/setup.py: python/setup.py.template config.status + $(AM_V_GEN)sed \ + -e 's,[@]VERSION[@],$(VERSION),g' \ + < $(srcdir)/python/setup.py.template > $(@F).tmp && \ + if cmp -s $(@F).tmp $@; then touch $@; else cp $(@F).tmp $@; fi; rm $(@F).tmp +EXTRA_DIST += python/setup.py.template +CLEANFILES += python/setup.py + EXTRA_DIST += python/TODO.rst $(srcdir)/python/ovs/flow/ofp_fields.py: $(srcdir)/build-aux/gen_ofp_field_decoders include/openvswitch/meta-flow.h diff --git a/python/ovs/db/custom_index.py b/python/ovs/db/custom_index.py index 587caf5e3e1..3fa03d3c959 100644 --- a/python/ovs/db/custom_index.py +++ b/python/ovs/db/custom_index.py @@ -90,14 +90,21 @@ def index_create(self, name): index = self.indexes[name] = MultiColumnIndex(name) return index + def __getitem__(self, key): + return self.data[key][-1] + def __setitem__(self, key, item): - self.data[key] = item + try: + self.data[key].append(item) + except KeyError: + self.data[key] = [item] for index in self.indexes.values(): index.add(item) def __delitem__(self, key): - val = self.data[key] - del self.data[key] + val = 
self.data[key].pop() + if len(self.data[key]) == 0: + del self.data[key] for index in self.indexes.values(): index.remove(val) diff --git a/python/ovs/db/idl.py b/python/ovs/db/idl.py index 8e31e02d791..c8cc543465c 100644 --- a/python/ovs/db/idl.py +++ b/python/ovs/db/idl.py @@ -35,9 +35,9 @@ ROW_UPDATE = "update" ROW_DELETE = "delete" -OVSDB_UPDATE = 0 -OVSDB_UPDATE2 = 1 -OVSDB_UPDATE3 = 2 +OVSDB_UPDATE = "update" +OVSDB_UPDATE2 = "update2" +OVSDB_UPDATE3 = "update3" CLUSTERED = "clustered" RELAY = "relay" @@ -77,7 +77,7 @@ def __contains__(self, item): return item in self.keys() -class Monitor(enum.IntEnum): +class Monitor(enum.Enum): monitor = OVSDB_UPDATE monitor_cond = OVSDB_UPDATE2 monitor_cond_since = OVSDB_UPDATE3 @@ -85,9 +85,9 @@ class Monitor(enum.IntEnum): class ConditionState(object): def __init__(self): - self._ack_cond = None + self._ack_cond = [True] self._req_cond = None - self._new_cond = [True] + self._new_cond = None def __iter__(self): return iter([self._new_cond, self._req_cond, self._ack_cond]) @@ -299,6 +299,7 @@ def __init__(self, remote, schema_helper, probe_interval=None, self._server_schema_request_id = None self._server_monitor_request_id = None self._db_change_aware_request_id = None + self._monitor_cancel_request_id = None self._server_db_name = '_Server' self._server_db_table = 'Database' self.server_tables = None @@ -464,23 +465,22 @@ def run(self): self.__parse_update(msg.params[2], OVSDB_UPDATE3) self.last_id = msg.params[1] elif (msg.type == ovs.jsonrpc.Message.T_NOTIFY - and msg.method == "update2" - and len(msg.params) == 2): - # Database contents changed. - self.__parse_update(msg.params[1], OVSDB_UPDATE2) - elif (msg.type == ovs.jsonrpc.Message.T_NOTIFY - and msg.method == "update" + and msg.method in (OVSDB_UPDATE, OVSDB_UPDATE2) and len(msg.params) == 2): # Database contents changed. 
if msg.params[0] == str(self.server_monitor_uuid): - self.__parse_update(msg.params[1], OVSDB_UPDATE, + self.__parse_update(msg.params[1], msg.method, tables=self.server_tables) self.change_seqno = previous_change_seqno if not self.__check_server_db(): self.force_reconnect() break else: - self.__parse_update(msg.params[1], OVSDB_UPDATE) + self.__parse_update(msg.params[1], msg.method) + elif self.handle_monitor_canceled(msg): + break + elif self.handle_monitor_cancel_reply(msg): + break elif (msg.type == ovs.jsonrpc.Message.T_REPLY and self._monitor_request_id is not None and self._monitor_request_id == msg.id): @@ -494,6 +494,7 @@ def run(self): if not msg.result[0]: self.__clear() self.__parse_update(msg.result[2], OVSDB_UPDATE3) + self.last_id = msg.result[1] elif self.state == self.IDL_S_DATA_MONITOR_COND_REQUESTED: self.__clear() self.__parse_update(msg.result, OVSDB_UPDATE2) @@ -534,7 +535,7 @@ def run(self): # Reply to our "monitor" of _Server request. try: self._server_monitor_request_id = None - self.__parse_update(msg.result, OVSDB_UPDATE, + self.__parse_update(msg.result, OVSDB_UPDATE2, tables=self.server_tables) self.change_seqno = previous_change_seqno if self.__check_server_db(): @@ -573,6 +574,11 @@ def run(self): elif msg.type == ovs.jsonrpc.Message.T_NOTIFY and msg.id == "echo": # Reply to our echo request. Ignore it. 
pass + elif (msg.type == ovs.jsonrpc.Message.T_ERROR and + self.state == self.IDL_S_SERVER_MONITOR_REQUESTED and + msg.id == self._server_monitor_request_id): + self._server_monitor_request_id = None + self.__send_monitor_request() elif (msg.type == ovs.jsonrpc.Message.T_ERROR and self.state == ( self.IDL_S_DATA_MONITOR_COND_SINCE_REQUESTED) and @@ -615,6 +621,33 @@ def run(self): return initial_change_seqno != self.change_seqno + def handle_monitor_canceled(self, msg): + if msg.type != msg.T_NOTIFY: + return False + if msg.method != "monitor_canceled": + return False + + if msg.params[0] == str(self.uuid): + params = [str(self.server_monitor_uuid)] + elif msg.params[0] == str(self.server_monitor_uuid): + params = [str(self.uuid)] + else: + return False + + mc_msg = ovs.jsonrpc.Message.create_request("monitor_cancel", params) + self._monitor_cancel_request_id = mc_msg.id + self.send_request(mc_msg) + self.restart_fsm() + return True + + def handle_monitor_cancel_reply(self, msg): + if msg.type != msg.T_REPLY: + return False + if msg.id != self._monitor_cancel_request_id: + return False + self._monitor_cancel_request_id = None + return True + def compose_cond_change(self): if not self.cond_changed: return @@ -879,7 +912,7 @@ def __send_server_monitor_request(self): monitor_request = {"columns": columns} monitor_requests[table.name] = [monitor_request] msg = ovs.jsonrpc.Message.create_request( - 'monitor', [self._server_db.name, + 'monitor_cond', [self._server_db.name, str(self.server_monitor_uuid), monitor_requests]) self._server_monitor_request_id = msg.id @@ -980,7 +1013,9 @@ def __process_update2(self, table, uuid, row_update): if not row: raise error.Error('Modify non-existing row') + del table.rows[uuid] old_row = self.__apply_diff(table, row, row_update['modify']) + table.rows[uuid] = row return Notice(ROW_UPDATE, row, Row(self, table, uuid, old_row)) else: raise error.Error(' unknown operation', @@ -1011,9 +1046,10 @@ def __process_update(self, table, uuid, 
old, new): op = ROW_UPDATE vlog.warn("cannot add existing row %s to table %s" % (uuid, table.name)) + del table.rows[uuid] + changed |= self.__row_update(table, row, new) - if op == ROW_CREATE: - table.rows[uuid] = row + table.rows[uuid] = row if changed: return Notice(ROW_CREATE, row) else: @@ -1025,9 +1061,11 @@ def __process_update(self, table, uuid, old, new): # XXX rate-limit vlog.warn("cannot modify missing row %s in table %s" % (uuid, table.name)) + else: + del table.rows[uuid] + changed |= self.__row_update(table, row, new) - if op == ROW_CREATE: - table.rows[uuid] = row + table.rows[uuid] = row if changed: return Notice(op, row, Row.from_json(self, table, uuid, old)) return False @@ -1223,7 +1261,7 @@ class Row(object): d["a"] = "b" row.mycolumn = d """ - def __init__(self, idl, table, uuid, data): + def __init__(self, idl, table, uuid, data, persist_uuid=False): # All of the explicit references to self.__dict__ below are required # to set real attributes with invoking self.__getattr__(). self.__dict__["uuid"] = uuid @@ -1278,6 +1316,10 @@ def __init__(self, idl, table, uuid, data): # in the dictionary are all None. self.__dict__["_prereqs"] = {} + # Indicates if the specified 'uuid' should be used as the row uuid + # or let the server generate it. + self.__dict__["_persist_uuid"] = persist_uuid + def __lt__(self, other): if not isinstance(other, Row): return NotImplemented @@ -1666,6 +1708,8 @@ def __init__(self, idl): self._inserted_rows = {} # Map from UUID to _InsertedRow + self._operations = [] + def add_comment(self, comment): """Appends 'comment' to the comments that will be passed to the OVSDB server when this transaction is committed. (The comment will be @@ -1801,7 +1845,7 @@ def commit(self): "rows": [rows]}) # Add updates. 
- any_updates = False + any_updates = bool(self._operations) for row in self._txn_rows.values(): if row._changes is None: if row._table.is_root: @@ -1816,7 +1860,11 @@ def commit(self): op = {"table": row._table.name} if row._data is None: op["op"] = "insert" - op["uuid-name"] = _uuid_name_from_uuid(row.uuid) + if row._persist_uuid: + op["uuid"] = str(row.uuid) + else: + op["uuid-name"] = _uuid_name_from_uuid(row.uuid) + any_updates = True op_index = len(operations) - 1 @@ -1932,6 +1980,8 @@ def commit(self): operations.append({"op": "comment", "comment": "\n".join(self._comments)}) + operations += self._operations + # Dry run? if self.dry_run: operations.append({"op": "abort"}) @@ -1950,6 +2000,21 @@ def commit(self): self.__disassemble() return self._status + def add_op(self, op): + """Add a raw OVSDB operation to the transaction + + This can be useful for re-using the existing Idl connection to take + actions that are difficult or expensive to do with the Idl itself, e.g. + bulk deleting rows from the server without downloading them into a + local cache. + + All ops are applied after any other operations in the transaction. + + :param op: An "op" for an OVSDB "transact" request (rfc 7047 Sec 5.2) + :type op: dict + """ + self._operations.append(op) + def commit_block(self): """Attempts to commit this transaction, blocking until the commit either succeeds or fails. Returns the final commit status, which may @@ -2056,20 +2121,22 @@ def _write(self, row, column, datum): row._mutations['_removes'].pop(column.name, None) row._changes[column.name] = datum.copy() - def insert(self, table, new_uuid=None): + def insert(self, table, new_uuid=None, persist_uuid=False): """Inserts and returns a new row in 'table', which must be one of the ovs.db.schema.TableSchema objects in the Idl's 'tables' dict. The new row is assigned a provisional UUID. 
If 'uuid' is None then one is randomly generated; otherwise 'uuid' should specify a randomly - generated uuid.UUID not otherwise in use. ovsdb-server will assign a - different UUID when 'txn' is committed, but the IDL will replace any - uses of the provisional UUID in the data to be to be committed by the - UUID assigned by ovsdb-server.""" + generated uuid.UUID not otherwise in use. If 'persist_uuid' is true + and 'new_uuid' is specified, IDL requests the ovsdb-server to assign + the same UUID, otherwise ovsdb-server will assign a different UUID when + 'txn' is committed and the IDL will replace any uses of the provisional + UUID in the data to be committed by the UUID assigned by + ovsdb-server.""" assert self._status == Transaction.UNCOMMITTED if new_uuid is None: new_uuid = uuid.uuid4() - row = Row(self.idl, table, new_uuid, None) + row = Row(self.idl, table, new_uuid, None, persist_uuid=persist_uuid) table.rows[row.uuid] = row self._txn_rows[row.uuid] = row return row diff --git a/python/ovs/dns_resolve.py b/python/ovs/dns_resolve.py new file mode 100644 index 00000000000..41546ad5ca4 --- /dev/null +++ b/python/ovs/dns_resolve.py @@ -0,0 +1,286 @@ +# Copyright (c) 2023 Red Hat, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import collections +import enum +import functools +import ipaddress +import os +import time +import typing + +try: + import unbound # type: ignore +except ImportError: + pass + +import ovs.vlog + +vlog = ovs.vlog.Vlog("dns_resolve") + + +class ReqState(enum.Enum): + INVALID = 0 + PENDING = 1 + GOOD = 2 + ERROR = 3 + + +class DNSRequest: + def __init__(self, name: str): + self.name: str = name + self.state: ReqState = ReqState.INVALID + self.time: typing.Optional[float] = None + # set by DNSResolver._callback + self.result: typing.Optional[str] = None + self.ttl: typing.Optional[float] = None + + @property + def expired(self): + return time.time() > self.time + self.ttl + + @property + def is_valid(self): + return self.state == ReqState.GOOD and not self.expired + + def __str__(self): + return (f"DNSRequest(name={self.name}, state={self.state}, " + f"time={self.time}, result={self.result})") + + +class DefaultReqDict(collections.defaultdict): + def __init__(self): + super().__init__(DNSRequest) + + def __missing__(self, key): + ret = self.default_factory(key) + self[key] = ret + return ret + + +class UnboundException(Exception): + def __init__(self, message, errno): + try: + msg = f"{message}: {unbound.ub_strerror(errno)}" + except NameError: + msg = message + super().__init__(msg) + + +def dns_enabled(func): + @functools.wraps(func) + def wrapper(self, *args, **kwargs): + if self.dns_enabled: + return func(self, *args, **kwargs) + vlog.err("DNS support requires the python unbound library") + return wrapper + + +class DNSResolver: + def __init__(self, is_daemon: bool = False): + """Create a resolver instance + + If is_daemon is true, set the resolver to handle requests + asynchronously. 
The following environment variables are processed: + + OVS_UNBOUND_CONF: The filename for an unbound.conf file + OVS_RESOLV_CONF: A filename to override the system default resolv.conf + OVS_HOSTS_FILE: A filename to override the system default hosts file + + In the event that the unbound library is missing or fails to initialize + DNS lookup support will be disabled and the resolve() method will + return None. + """ + self._is_daemon = is_daemon + try: + self._ctx = unbound.ub_ctx() + self.dns_enabled = True + except Exception: + # The unbound docs mention that this could throw an exception + # but do not specify what exception that is. This can also + # happen with a missing unbound library. + self.dns_enabled = False + vlog.err("Failed to initialize the unbound library") + return + + # NOTE(twilson) This cache, like the C version, can grow without bound + # and has no cleanup or aging mechanism. Given our usage patterns, this + # should not be a problem. But this should not be used to resolve an + # unbounded list of addresses in a long-running daemon. + self._requests = DefaultReqDict() + + self._ub_call(self._set_unbound_conf) + + # NOTE(twilson) The C version disables DNS in this case.
I didn't do + # that here since it could still be useful to resolve addresses from + # /etc/hosts even w/o resolv.conf + self._ub_call(self._set_resolv_conf) + self._ub_call(self._set_hosts_file) + + self._ctx.set_async(True) # Sets threaded behavior for resolve_async() + + def _ub_call(self, fn, *args, **kwargs): + """Convert UnboundExceptions into vlog warnings""" + try: + return fn(*args, **kwargs) + except UnboundException as e: + vlog.warn(e) + + @dns_enabled + def _set_unbound_conf(self): + ub_cfg = os.getenv("OVS_UNBOUND_CONF") + if ub_cfg: + retval = self._ctx.config(ub_cfg) + if retval != 0: + raise UnboundException( + "Failed to set libunbound context config", retval) + + @dns_enabled + def _set_resolv_conf(self): + filename = os.getenv("OVS_RESOLV_CONF") + # The C lib checks that the file exists and also sets filename to + # /etc/resolv.conf on non-Windows, but resolvconf already does this. + retval = self._ctx.resolvconf(filename) + if retval != 0: + location = filename or "system default nameserver" + raise UnboundException(location, retval) + + @dns_enabled + def _set_hosts_file(self): + # The C lib doesn't have the ability to set a hosts file, but it is + # useful to have, especially for writing tests that don't rely on + # network connectivity. hosts(None) uses /etc/hosts. 
+ filename = os.getenv("OVS_HOSTS_FILE") + retval = self._ctx.hosts(filename) + if retval != 0: + location = filename or "system default hosts file" + raise UnboundException(location, retval) + + @dns_enabled + def _callback(self, req: DNSRequest, err: int, result): + if err != 0 or (result.qtype == unbound.RR_TYPE_AAAA + and not result.havedata): + req.state = ReqState.ERROR + vlog.warn(f"{req.name}: failed to resolve") + return + if result.qtype == unbound.RR_TYPE_A and not result.havedata: + self._resolve_async(req, unbound.RR_TYPE_AAAA) + return + try: + ip_str = next(iter(result.data.as_raw_data())) + ip = ipaddress.ip_address(ip_str) # test if IP is valid + # NOTE (twilson) For some reason, accessing result data outside of + # _callback causes a segfault. So just grab and store what we need. + req.result = str(ip) + req.ttl = result.ttl + req.state = ReqState.GOOD + req.time = time.time() + except (ValueError, StopIteration): + req.state = ReqState.ERROR + vlog.err(f"{req.name}: failed to resolve") + + @dns_enabled + def _resolve_sync(self, name: str) -> typing.Optional[str]: + for qtype in (unbound.RR_TYPE_A, unbound.RR_TYPE_AAAA): + err, result = self._ctx.resolve(name, qtype) + if err != 0: + return None + if not result.havedata: + continue + try: + ip = ipaddress.ip_address( + next(iter(result.data.as_raw_data()))) + except (ValueError, StopIteration): + return None + return str(ip) + + return None + + @dns_enabled + def _resolve_async(self, req: DNSRequest, qtype) -> None: + err, _ = self._ctx.resolve_async(req.name, req, self._callback, + qtype) + if err != 0: + req.state = ReqState.ERROR + return None + + req.state = ReqState.PENDING + return None + + @dns_enabled + def resolve(self, name: str) -> typing.Optional[str]: + """Resolve a host name to an IP address + + If the resolver is set to handle requests asynchronously, resolve() + should be recalled until it returns a non-None result. Errors will be + logged. 
+ + :param name: The host name to resolve + :returns: The IP address or None on error or not (yet) found + """ + if not self._is_daemon: + return self._resolve_sync(name) + retval = self._ctx.process() + if retval != 0: + vlog.err(f"dns-resolve error: {unbound.ub_strerror(retval)}") + return None + req = self._requests[name] # Creates a DNSRequest if not found + if req.is_valid: + return req.result + elif req.state != ReqState.PENDING: + self._resolve_async(req, unbound.RR_TYPE_A) + return None + + +_global_resolver: typing.Optional[DNSResolver] = None + + +def init(is_daemon: bool = False) -> DNSResolver: + """Initialize a global DNSResolver + + See DNSResolver.__init__ for more details + """ + global _global_resolver + _global_resolver = DNSResolver(is_daemon) + return _global_resolver + + +def resolve(name: str) -> typing.Optional[str]: + """Resolve a host name to an IP address + + If a DNSResolver instance has not been instantiated, or if it has been + created with is_daemon=False, resolve() will synchronously resolve the + hostname. If DNSResolver has been initialized with is_daemon=True, it + will instead resolve asynchronously and resolve() will return None until + the hostname has been resolved. + + :param name: The host name to resolve + :returns: The IP address or None on error or not (yet) found + """ + if _global_resolver is None: + init() + + # mypy doesn't understand that init() sets _global_resolver, so ignore type + return _global_resolver.resolve(name) # type: ignore + + +def destroy(): + """Destroy the global DNSResolver + + This destroys the global DNSResolver instance and any outstanding + asynchronous requests.
+ """ + global _global_resolver + del _global_resolver + _global_resolver = None # noqa: F841 diff --git a/python/ovs/fatal_signal.py b/python/ovs/fatal_signal.py index cb2e99e87d4..16a7e78a03f 100644 --- a/python/ovs/fatal_signal.py +++ b/python/ovs/fatal_signal.py @@ -16,6 +16,7 @@ import os import signal import sys +import threading import ovs.vlog @@ -112,29 +113,29 @@ def _unlink(file_): def _signal_handler(signr, _): _call_hooks(signr) - # Re-raise the signal with the default handling so that the program - # termination status reflects that we were killed by this signal. - signal.signal(signr, signal.SIG_DFL) - os.kill(os.getpid(), signr) - def _atexit_handler(): _call_hooks(0) -recurse = False +mutex = threading.Lock() def _call_hooks(signr): - global recurse - if recurse: + global mutex + if not mutex.acquire(blocking=False): return - recurse = True for hook, cancel, run_at_exit in _hooks: if signr != 0 or run_at_exit: hook() + if signr != 0: + # Re-raise the signal with the default handling so that the program + # termination status reflects that we were killed by this signal. + signal.signal(signr, signal.SIG_DFL) + os.kill(os.getpid(), signr) + _inited = False @@ -150,7 +151,9 @@ def _init(): signal.SIGALRM] for signr in signals: - if signal.getsignal(signr) == signal.SIG_DFL: + handler = signal.getsignal(signr) + if (handler == signal.SIG_DFL or + handler == signal.default_int_handler): signal.signal(signr, _signal_handler) atexit.register(_atexit_handler) @@ -165,7 +168,6 @@ def signal_alarm(timeout): if sys.platform == "win32": import time - import threading class Alarm (threading.Thread): def __init__(self, timeout): diff --git a/python/ovs/flow/kv.py b/python/ovs/flow/kv.py index cceb95e4387..f7d7be0cf1e 100644 --- a/python/ovs/flow/kv.py +++ b/python/ovs/flow/kv.py @@ -85,12 +85,17 @@ class KVDecoders(object): reason, the default_free decoder, must return both the key and value to be stored. 
+ Globally defined "strict" variable controls what to do when decoders do not + contain a valid decoder for a key and a default function is not provided. + If set to True (default), a ParseError is raised. + If set to False, the value will be decoded as a string. + Args: decoders (dict): Optional; A dictionary of decoders indexed by keyword. - default (callable): Optional; A decoder used if a match is not found in - configured decoders. If not provided, the default behavior is to - try to decode the value into an integer and, if that fails, - just return the string as-is. + default (callable): Optional; A function to use if a match is not + found in configured decoders. If not provided, the default behavior + depends on "strict". The function must accept the key and a value + and return the decoded (key, value) tuple back. default_free (callable): Optional; The decoder used if a match is not found in configured decoders and it's a free value (e.g: a value without a key) Defaults to returning the free value as @@ -98,10 +103,19 @@ class KVDecoders(object): The callable must accept a string and return a key-value pair. """ - def __init__(self, decoders=None, default=None, default_free=None): - self._decoders = decoders or dict() - self._default = default or decode_default + strict = True + + def __init__(self, decoders=None, default=None, default_free=None, + ignore_case=False): + if not decoders: + self._decoders = dict() + elif ignore_case: + self._decoders = {k.lower(): v for k, v in decoders.items()} + else: + self._decoders = decoders + self._default = default self._default_free = default_free or self._default_free_decoder + self._ignore_case = ignore_case def decode(self, keyword, value_str): """Decode a keyword and value. @@ -114,7 +128,11 @@ def decode(self, keyword, value_str): The key (str) and value(any) to be stored.
""" - decoder = self._decoders.get(keyword) + decoder = None + if self._ignore_case: + decoder = self._decoders.get(keyword.lower()) + else: + decoder = self._decoders.get(keyword) if decoder: result = decoder(value_str) if isinstance(result, KeyValue): @@ -126,9 +144,14 @@ def decode(self, keyword, value_str): return keyword, value else: if value_str: - return keyword, self._default(value_str) - else: - return self._default_free(keyword) + if self._default: + return self._default(keyword, value_str) + if self.strict: + raise ParseError( + "Cannot parse key {}: No decoder found".format(keyword) + ) + return keyword, decode_default(value_str) + return self._default_free(keyword) @staticmethod def _default_free_decoder(key): @@ -308,7 +331,26 @@ def decode_nested_kv(decoders, value): return {kv.key: kv.value for kv in parser.kv()} -def nested_kv_decoder(decoders=None): +def decode_nested_kv_list(decoders, value): + """A key-value decoder that extracts nested key-value pairs and returns + them in a list of dictionary. + + Args: + decoders (KVDecoders): The KVDecoders to use. + value (str): The value string to decode. + """ + if not value: + # Mark as flag + return True + + parser = KVParser(value, decoders) + parser.parse() + return [{kv.key: kv.value} for kv in parser.kv()] + + +def nested_kv_decoder(decoders=None, is_list=False): """Helper function that creates a nested kv decoder with given KVDecoders.""" + if is_list: + return functools.partial(decode_nested_kv_list, decoders) return functools.partial(decode_nested_kv, decoders) diff --git a/python/ovs/flow/list.py b/python/ovs/flow/list.py index b1e9e3fcaa6..bc466ef89f0 100644 --- a/python/ovs/flow/list.py +++ b/python/ovs/flow/list.py @@ -31,7 +31,12 @@ def decode(self, index, value_str): value_str (str): The value string to decode. 
""" if index < 0 or index >= len(self._decoders): - return self._default_decoder(index, value_str) + if self._default_decoder: + return self._default_decoder(index, value_str) + else: + raise ParseError( + f"Cannot decode element {index} in list: {value_str}" + ) try: key = self._decoders[index][0] diff --git a/python/ovs/flow/odp.py b/python/ovs/flow/odp.py index 87a3bae2f9a..572dbebe98f 100644 --- a/python/ovs/flow/odp.py +++ b/python/ovs/flow/odp.py @@ -204,6 +204,7 @@ def _action_decoders_args(): """Generate the arguments for the action KVDecoders.""" _decoders = { "drop": decode_flag, + "meter": decode_int, "lb_output": decode_int, "trunc": decode_int, "recirc": decode_int, @@ -225,7 +226,7 @@ def _action_decoders_args(): KVDecoders( { "probability": decode_int, - "collector_sed_id": decode_int, + "collector_set_id": decode_int, "obs_domain_id": decode_int, "obs_point_id": decode_int, "output_port": decode_default, @@ -303,6 +304,21 @@ def _action_decoders_args(): ), "pop_nsh": decode_flag, "tnl_pop": decode_int, + "pop_mpls": KVDecoders({"eth_type": decode_int}), + **dict.fromkeys( + ["push_mpls", "add_mpls"], + nested_kv_decoder( + KVDecoders( + { + "label": decode_int, + "tc": decode_int, + "ttl": decode_int, + "bos": decode_int, + "eth_type": decode_int, + } + ) + ), + ), "ct_clear": decode_flag, "ct": nested_kv_decoder( KVDecoders( @@ -319,46 +335,68 @@ def _action_decoders_args(): ) ), **ODPFlow._tnl_action_decoder_args(), - } - - _decoders["clone"] = nested_kv_decoder( - KVDecoders(decoders=_decoders, default_free=decode_free_output) - ) - - return { - **_decoders, - "sample": nested_kv_decoder( + "hash": nested_kv_decoder( KVDecoders( { - "sample": (lambda x: float(x.strip("%"))), - "actions": nested_kv_decoder( - KVDecoders( - decoders=_decoders, - default_free=decode_free_output, - ) - ), + "l4": decode_int, + "sym_l4": decode_int, } ) ), - "check_pkt_len": nested_kv_decoder( + "psample": nested_kv_decoder( KVDecoders( { - "size": decode_int, - "gt": 
nested_kv_decoder( - KVDecoders( - decoders=_decoders, - default_free=decode_free_output, - ) - ), - "le": nested_kv_decoder( - KVDecoders( - decoders=_decoders, - default_free=decode_free_output, - ) - ), + "group": decode_int, + "cookie": decode_default, } ) - ), + ) + } + + _decoders["sample"] = nested_kv_decoder( + KVDecoders( + { + "sample": (lambda x: float(x.strip("%"))), + "actions": nested_kv_decoder( + KVDecoders( + decoders=_decoders, + default_free=decode_free_output, + ), + is_list=True, + ), + } + ) + ) + + _decoders["clone"] = nested_kv_decoder( + KVDecoders(decoders=_decoders, default_free=decode_free_output), + is_list=True, + ) + + _decoders["check_pkt_len"] = nested_kv_decoder( + KVDecoders( + { + "size": decode_int, + "gt": nested_kv_decoder( + KVDecoders( + decoders=_decoders, + default_free=decode_free_output, + ), + is_list=True, + ), + "le": nested_kv_decoder( + KVDecoders( + decoders=_decoders, + default_free=decode_free_output, + ), + is_list=True, + ), + } + ) + ) + + return { + **_decoders, } @staticmethod @@ -412,7 +450,7 @@ def _tnl_action_decoder_args(): { "src": decode_int, "dst": decode_int, - "dsum": Mask16, + "csum": Mask16, } ) ), @@ -457,6 +495,14 @@ def _tnl_action_decoder_args(): } ) ), + "srv6": nested_kv_decoder( + KVDecoders( + { + "segments_left": decode_int, + "segs": decode_default, + } + ) + ), } ) ), @@ -499,8 +545,8 @@ def _field_decoders_args(): "src": IPMask, "dst": IPMask, "proto": Mask8, - "tcp_src": Mask16, - "tcp_dst": Mask16, + "tp_src": Mask16, + "tp_dst": Mask16, } ) ), @@ -541,6 +587,8 @@ def _field_decoders_args(): "vxlan": nested_kv_decoder( KVDecoders( { + "flags": decode_int, + "vni": decode_int, "gbp": nested_kv_decoder( KVDecoders( { @@ -548,7 +596,7 @@ def _field_decoders_args(): "flags": Mask8, } ) - ) + ), } ) ), diff --git a/python/ovs/flow/ofp.py b/python/ovs/flow/ofp.py index 0bc110c576e..f011b0460e4 100644 --- a/python/ovs/flow/ofp.py +++ b/python/ovs/flow/ofp.py @@ -30,8 +30,7 @@ 
decode_move_field, decode_dec_ttl, decode_chk_pkt_larger, - decode_zone, - decode_exec, + decode_field_or_int, decode_learn, ) @@ -105,9 +104,6 @@ def __init__(self, ofp_string, id=None): ValueError if the string is malformed. ParseError if an error in parsing occurs. """ - if " reply " in ofp_string: - return None - sections = list() parts = ofp_string.split("actions=") if len(parts) != 2: @@ -174,12 +170,13 @@ def _gen_info_decoders(): args = { "table": decode_int, "duration": decode_time, - "n_packet": decode_int, + "n_packets": decode_int, "n_bytes": decode_int, "cookie": decode_int, "idle_timeout": decode_time, "hard_timeout": decode_time, "hard_age": decode_time, + "idle_age": decode_time, } return KVDecoders(args) @@ -243,10 +240,12 @@ def _gen_action_decoders(): **OFPFlow._fw_action_decoders_args(), **OFPFlow._control_action_decoders_args(), **OFPFlow._other_action_decoders_args(), + **OFPFlow._instruction_action_decoders_args(), } clone_actions = OFPFlow._clone_actions_decoders_args(actions) actions.update(clone_actions) - return KVDecoders(actions, default_free=decode_free_output) + return KVDecoders(actions, default_free=decode_free_output, + ignore_case=True) @staticmethod def _output_actions_decoders_args(): @@ -272,6 +271,8 @@ def _encap_actions_decoders_args(): "pop_vlan": decode_flag, "strip_vlan": decode_flag, "push_vlan": decode_default, + "pop_mpls": decode_int, + "push_mpls": decode_int, "decap": decode_flag, "encap": decode_encap, } @@ -286,8 +287,8 @@ def _field_action_decoders_args(): "set_mpls_ttl", "mod_nw_tos", "mod_nw_ecn", - "mod_tcp_src", - "mod_tcp_dst", + "mod_tp_src", + "mod_tp_dst", ] return { "load": decode_load_field, @@ -299,9 +300,15 @@ def _field_action_decoders_args(): "mod_dl_src": EthMask, "mod_nw_dst": IPMask, "mod_nw_src": IPMask, + "mod_nw_ttl": decode_int, + "mod_vlan_vid": decode_int, + "set_vlan_vid": decode_int, + "mod_vlan_pcp": decode_int, + "set_vlan_pcp": decode_int, "dec_ttl": decode_dec_ttl, "dec_mpls_ttl": 
decode_flag, "dec_nsh_ttl": decode_flag, + "delete_field": decode_field, "check_pkt_larger": decode_chk_pkt_larger, **{field: decode_default for field in field_default_decoders}, } @@ -323,12 +330,11 @@ def _fw_action_decoders_args(): KVDecoders( { "commit": decode_flag, - "zone": decode_zone, + "zone": decode_field_or_int, "table": decode_int, "nat": decode_nat, "force": decode_flag, - "exec": functools.partial( - decode_exec, + "exec": nested_kv_decoder( KVDecoders( { **OFPFlow._encap_actions_decoders_args(), @@ -336,12 +342,22 @@ def _fw_action_decoders_args(): **OFPFlow._meta_action_decoders_args(), } ), + is_list=True, ), "alg": decode_default, } ) ), "ct_clear": decode_flag, + "fin_timeout": nested_kv_decoder( + KVDecoders( + { + "idle_timeout": decode_time, + "hard_timeout": decode_time, + } + ) + ), + # learn moved to _clone actions. } @staticmethod @@ -382,21 +398,14 @@ def _clone_actions_decoders_args(action_decoders): actions. """ return { - "learn": decode_learn( - { - **action_decoders, - "fin_timeout": nested_kv_decoder( - KVDecoders( - { - "idle_timeout": decode_time, - "hard_timeout": decode_time, - } - ) - ), - } + "learn": decode_learn(action_decoders), + "clone": nested_kv_decoder( + KVDecoders(action_decoders, default_free=decode_free_output, + ignore_case=True), is_list=True ), - "clone": functools.partial( - decode_exec, KVDecoders(action_decoders) + "write_actions": nested_kv_decoder( + KVDecoders(action_decoders, default_free=decode_free_output, + ignore_case=True), is_list=True ), } @@ -417,8 +426,8 @@ def _other_action_decoders_args(): { "probability": decode_int, "collector_set_id": decode_int, - "obs_domain_id": decode_int, - "obs_point_id": decode_int, + "obs_domain_id": decode_field_or_int, + "obs_point_id": decode_field_or_int, "sampling_port": decode_default, "ingress": decode_flag, "egress": decode_flag, @@ -426,3 +435,15 @@ def _other_action_decoders_args(): ) ), } + + @staticmethod + def _instruction_action_decoders_args(): + 
"""Generate the decoder arguments for instruction actions + (see man(7) ovs-actions).""" + return { + "meter": decode_int, + "clear_actions": decode_flag, + # write_actions moved to _clone actions + "write_metadata": decode_mask(64), + "goto_table": decode_int, + } diff --git a/python/ovs/flow/ofp_act.py b/python/ovs/flow/ofp_act.py index acb16cd9a62..73727428a90 100644 --- a/python/ovs/flow/ofp_act.py +++ b/python/ovs/flow/ofp_act.py @@ -1,17 +1,20 @@ """Defines decoders for OpenFlow actions. """ - -import functools - from ovs.flow.decoders import ( decode_default, decode_time, decode_flag, decode_int, ) -from ovs.flow.kv import nested_kv_decoder, KVDecoders, KeyValue, KVParser +from ovs.flow.kv import ( + nested_kv_decoder, + KVDecoders, + KeyValue, + KVParser, + ParseError, +) from ovs.flow.list import nested_list_decoder, ListDecoders -from ovs.flow.ofp_fields import field_decoders +from ovs.flow.ofp_fields import field_decoders, field_aliases def decode_output(value): @@ -20,7 +23,9 @@ def decode_output(value): Does not support field specification. """ if len(value.split(",")) > 1: - return nested_kv_decoder()(value) + return nested_kv_decoder( + KVDecoders({"port": decode_default, "max_len": decode_int}) + )(value) try: return {"port": int(value)} except ValueError: @@ -30,7 +35,7 @@ def decode_output(value): def decode_controller(value): """Decodes the controller action.""" if not value: - return KeyValue("output", "controller") + return KeyValue("output", {"port": "CONTROLLER"}) else: # Try controller:max_len try: @@ -41,7 +46,18 @@ def decode_controller(value): except ValueError: pass # controller(key[=val], ...) 
- return nested_kv_decoder()(value) + return nested_kv_decoder( + KVDecoders( + { + "max_len": decode_int, + "reason": decode_default, + "id": decode_int, + "userdata": decode_default, + "pause": decode_flag, + "meter_id": decode_int, + } + ) + )(value) def decode_bundle_load(value): @@ -141,6 +157,12 @@ def decode_field(value): man page: http://www.openvswitch.org/support/dist-docs/ovs-actions.7.txt.""" parts = value.strip("]\n\r").split("[") + if ( + parts[0] not in field_decoders.keys() + and parts[0] not in field_aliases.keys() + ): + raise ParseError("Field not supported: {}".format(parts[0])) + result = { "field": parts[0], } @@ -224,9 +246,9 @@ def decode_chk_pkt_larger(value): return {"pkt_len": pkt_len, "dst": dst} -# CT decoders -def decode_zone(value): - """Decodes the value of the 'zone' keyword (part of the ct action).""" +def decode_field_or_int(value): + """Decodes a value that can be either a subfield specification or an + integer.""" try: return int(value, 0) except ValueError: @@ -234,19 +256,6 @@ def decode_zone(value): return decode_field(value) -def decode_exec(action_decoders, value): - """Decodes the value of the 'exec' keyword (part of the ct action). - - Args: - decode_actions (KVDecoders): The decoders to be used to decode the - nested exec. - value (string): The string to be decoded. - """ - exec_parser = KVParser(value, action_decoders) - exec_parser.parse() - return [{kv.key: kv.value} for kv in exec_parser.kv()] - - def decode_learn(action_decoders): """Create the decoder to be used to decode the 'learn' action. @@ -269,31 +278,36 @@ def decode_learn(action_decoders): action decoding. """ - def decode_learn_field(decoder, value): - """Generates a decoder to be used for the 'field' argument of the - 'learn' action. - - The field can hold a value that should be decoded, either as a field, - or as a the value (see man(7) ovs-actions). - - Args: - decoder (callable): The decoder. 
+ def learn_field_decoding_kv(key, value): + """Decodes a key, value pair from the learn action. + The key must be a decodable field. The value can be either a value + in the format defined for the field or another field. """ - if value in field_decoders.keys(): - # It's a field - return value - else: - return decoder(value) - - learn_field_decoders = { - field: functools.partial(decode_learn_field, decoder) - for field, decoder in field_decoders.items() - } + key_field = decode_field(key) + try: + return key, decode_field(value) + except ParseError: + return key, field_decoders.get(key_field.get("field"))(value) + + def learn_field_decoding_free(key): + """Decodes the free fields found in the learn action. + Free fields indicate that the field is to be copied from the original. + In order to express that in a dictionary, return the fieldspec as + value. So, the free field NXM_OF_IP_SRC[], is encoded as: + "NXM_OF_IP_SRC[]": { + "field": "NXM_OF_IP_SRC" + } + That way we also ensure the actual free key is correct.
+ """ + key_field = decode_field(key) + return key, key_field + learn_decoders = { **action_decoders, - **learn_field_decoders, "idle_timeout": decode_time, "hard_timeout": decode_time, + "fin_idle_timeout": decode_time, + "fin_hard_timeout": decode_time, "priority": decode_int, "cookie": decode_int, "send_flow_rem": decode_flag, @@ -303,4 +317,10 @@ def decode_learn_field(decoder, value): "result_dst": decode_field, } - return functools.partial(decode_exec, KVDecoders(learn_decoders)) + learn_decoder = KVDecoders( + learn_decoders, + default=learn_field_decoding_kv, + default_free=learn_field_decoding_free, + ) + + return nested_kv_decoder(learn_decoder, is_list=True) diff --git a/python/ovs/jsonrpc.py b/python/ovs/jsonrpc.py index d5127268aab..d9fe27aec64 100644 --- a/python/ovs/jsonrpc.py +++ b/python/ovs/jsonrpc.py @@ -377,7 +377,7 @@ def __init__(self, reconnect, rpc, remotes): self.stream = None self.pstream = None self.seqno = 0 - if type(remotes) != list: + if type(remotes) is not list: remotes = [remotes] self.remotes = remotes random.shuffle(self.remotes) diff --git a/python/ovs/socket_util.py b/python/ovs/socket_util.py index 7b41dc44bf1..a26298b75ca 100644 --- a/python/ovs/socket_util.py +++ b/python/ovs/socket_util.py @@ -13,12 +13,14 @@ # limitations under the License. 
import errno +import ipaddress import os import os.path import random import socket import sys +from ovs import dns_resolve import ovs.fatal_signal import ovs.poller import ovs.vlog @@ -216,7 +218,7 @@ def is_valid_ipv4_address(address): return True -def inet_parse_active(target, default_port): +def _inet_parse_active(target, default_port): address = target.split(":") if len(address) >= 2: host_name = ":".join(address[0:-1]).lstrip('[').rstrip(']') @@ -229,9 +231,24 @@ def inet_parse_active(target, default_port): host_name = address[0] if not host_name: raise ValueError("%s: bad peer name format" % target) + try: + host_name = str(ipaddress.ip_address(host_name)) + except ValueError: + host_name = dns_resolve.resolve(host_name) + if not host_name: + raise ValueError("%s: bad peer name format" % target) return (host_name, port) +def inet_parse_active(target, default_port, raises=True): + try: + return _inet_parse_active(target, default_port) + except ValueError: + if raises: + raise + return ("", default_port) + + def inet_create_socket_active(style, address): try: is_addr_inet = is_valid_ipv4_address(address[0]) @@ -262,7 +279,7 @@ def inet_connect_active(sock, address, family, dscp): def inet_open_active(style, target, default_port, dscp): - address = inet_parse_active(target, default_port) + address = inet_parse_active(target, default_port, raises=False) family, sock = inet_create_socket_active(style, address) if sock is None: return family, sock diff --git a/python/ovs/stream.py b/python/ovs/stream.py index ac5b0fd0c64..dbb6b2e1f77 100644 --- a/python/ovs/stream.py +++ b/python/ovs/stream.py @@ -620,7 +620,7 @@ def open(name): raise Exception('Unknown connection string') try: - sock.listen(10) + sock.listen(64) except socket.error as e: vlog.err("%s: listen: %s" % (name, os.strerror(e.error))) sock.close() @@ -784,7 +784,7 @@ def needs_probes(): @staticmethod def _open(suffix, dscp): - address = ovs.socket_util.inet_parse_active(suffix, 0) + address = 
ovs.socket_util.inet_parse_active(suffix, 0, raises=False) family, sock = ovs.socket_util.inet_create_socket_active( socket.SOCK_STREAM, address) if sock is None: @@ -824,7 +824,8 @@ def connect(self): self.socket.do_handshake() except ssl.SSLWantReadError: return errno.EAGAIN - except ssl.SSLSyscallError as e: + except (ssl.SSLSyscallError, ssl.SSLZeroReturnError, + ssl.SSLEOFError, OSError) as e: return ovs.socket_util.get_exception_errno(e) return 0 diff --git a/python/ovs/tests/test_dns_resolve.py b/python/ovs/tests/test_dns_resolve.py new file mode 100644 index 00000000000..0698e8f77d9 --- /dev/null +++ b/python/ovs/tests/test_dns_resolve.py @@ -0,0 +1,280 @@ +import contextlib +import ipaddress +import sys +import time +from unittest import mock + +import pytest + +from ovs import dns_resolve +from ovs import socket_util + + +skip_no_unbound = pytest.mark.skipif("unbound" not in dns_resolve.__dict__, + reason="Unbound not installed") + +HOSTS = [("192.0.2.1", "fake.ip4.domain", "192.0.2.1"), + ("2001:db8:2::1", "fake.ip6.domain", "2001:db8:2::1"), + ("192.0.2.2", "fake.both.domain", "192.0.2.2"), + ("2001:db8:2::2", "fake.both.domain", "192.0.2.2")] + + +def _tmp_file(path, content): + path.write_text(content) + assert content == path.read_text() + return path + + +@pytest.fixture(params=[False, True], ids=["not_daemon", "daemon"]) +def resolver_factory(monkeypatch, tmp_path, hosts_file, request): + # Allow delaying the instantiation of the DNSResolver + def resolver_factory(): + with monkeypatch.context() as m: + m.setenv("OVS_HOSTS_FILE", str(hosts_file)) + # Test with both is_daemon False and True + resolver = dns_resolve.init(request.param) + assert resolver._is_daemon == request.param + return resolver + + return resolver_factory + + +@contextlib.contextmanager +def DNSResolver(*args, **kwargs): + """Clean up after returning a dns_resolver.DNSResolver""" + resolver = dns_resolve.init(*args, **kwargs) + try: + yield resolver + finally: + 
dns_resolve.destroy() + assert dns_resolve._global_resolver is None + + +@pytest.fixture +def unbound_conf(tmp_path): + path = tmp_path / "unbound.conf" + content = """ + server: + verbosity: 1 + """ + return _tmp_file(path, content) + + +@pytest.fixture +def resolv_conf(tmp_path): + path = tmp_path / "resolv.conf" + content = "nameserver 127.0.0.1" + return _tmp_file(path, content) + + +@pytest.fixture +def hosts_file(tmp_path): + path = tmp_path / "hosts" + content = "\n".join(f"{ip}\t{host}" for ip, host, _ in HOSTS) + return _tmp_file(path, content) + + +@pytest.fixture +def missing_file(tmp_path): + f = tmp_path / "missing_file" + assert not f.exists() + return f + + +@pytest.fixture(params=[False, True], ids=["with unbound", "without unbound"]) +def missing_unbound(monkeypatch, request): + if request.param: + if "unbound" in dns_resolve.__dict__: + monkeypatch.setitem(sys.modules, 'unbound', None) + monkeypatch.delitem(dns_resolve.__dict__, "unbound") + elif "unbound" not in dns_resolve.__dict__: + pytest.skip("Unbound not installed") + return request.param + + +def test_missing_unbound(missing_unbound, resolver_factory): + resolver = resolver_factory() # Dont fail even w/o unbound + assert resolver.dns_enabled == (not missing_unbound) + + +def test_DNSRequest_defaults(): + req = dns_resolve.DNSRequest(HOSTS[0][1]) + assert HOSTS[0][1] == req.name + assert req.state == dns_resolve.ReqState.INVALID + assert req.time == req.result == req.ttl is None + assert str(req) + + +def _resolve(resolver, host, fn=dns_resolve.resolve): + """Handle sync/async lookups, giving up if more than 1 second has passed""" + + timeout = 1 + start = time.time() + name = fn(host) + if resolver and resolver._is_daemon: + while name is None: + name = fn(host) + if name: + break + time.sleep(0.01) + end = time.time() + if end - start > timeout: + break + if name: + return name + raise LookupError(f"{host} not found") + + +@pytest.mark.parametrize("ip,host,expected", HOSTS) +def 
test_resolve_addresses(missing_unbound, resolver_factory, ip, host, + expected): + resolver = resolver_factory() + if missing_unbound: + with pytest.raises(LookupError): + _resolve(resolver, host) + else: + result = _resolve(resolver, host) + assert ipaddress.ip_address(expected) == ipaddress.ip_address(result) + + +@pytest.mark.parametrize("ip,host,expected", HOSTS) +def test_resolve_without_init(monkeypatch, missing_unbound, ip, host, expected, + hosts_file): + # make sure we don't have a global resolver + dns_resolve.destroy() + with monkeypatch.context() as m: + m.setenv("OVS_HOSTS_FILE", str(hosts_file)) + if missing_unbound: + with pytest.raises(LookupError): + _resolve(None, host) + else: + res = _resolve(None, host) + assert dns_resolve._global_resolver is not None + assert dns_resolve._global_resolver._is_daemon is False + assert ipaddress.ip_address(expected) == ipaddress.ip_address(res) + + +def test_resolve_unknown_host(missing_unbound, resolver_factory): + resolver = resolver_factory() + with pytest.raises(LookupError): + _resolve(resolver, "fake.notadomain") + + +@skip_no_unbound +def test_resolve_process_error(): + with DNSResolver(True) as resolver: + with mock.patch.object(resolver._ctx, "process", return_value=-1): + assert resolver.resolve("fake.domain") is None + + +@skip_no_unbound +def test_resolve_resolve_error(): + with DNSResolver(False) as resolver: + with mock.patch.object(resolver._ctx, "resolve", + return_value=(-1, None)): + assert resolver.resolve("fake.domain") is None + + +@skip_no_unbound +def test_resolve_resolve_async_error(): + with DNSResolver(True) as resolver: + with mock.patch.object(resolver._ctx, "resolve_async", + return_value=(-1, None)): + with pytest.raises(LookupError): + _resolve(resolver, "fake.domain") + + +@pytest.mark.parametrize("file,raises", + [(None, False), + ("missing_file", dns_resolve.UnboundException), + ("unbound_conf", False)]) +def test_set_unbound_conf(monkeypatch, missing_unbound, resolver_factory, 
+ request, file, raises): + if file: + file = str(request.getfixturevalue(file)) + monkeypatch.setenv("OVS_UNBOUND_CONF", file) + resolver = resolver_factory() # Doesn't raise + if missing_unbound: + assert resolver._set_unbound_conf() is None + return + with mock.patch.object(resolver._ctx, "config", + side_effect=resolver._ctx.config) as c: + if raises: + with pytest.raises(raises): + resolver._set_unbound_conf() + else: + resolver._set_unbound_conf() + if file: + c.assert_called_once_with(file) + else: + c.assert_not_called() + + +@pytest.mark.parametrize("file,raises", + [(None, False), + ("missing_file", dns_resolve.UnboundException), + ("resolv_conf", False)]) +def test_resolv_conf(monkeypatch, missing_unbound, resolver_factory, request, + file, raises): + if file: + file = str(request.getfixturevalue(file)) + monkeypatch.setenv("OVS_RESOLV_CONF", file) + resolver = resolver_factory() # Doesn't raise + if missing_unbound: + assert resolver._set_resolv_conf() is None + return + with mock.patch.object(resolver._ctx, "resolvconf", + side_effect=resolver._ctx.resolvconf) as c: + if raises: + with pytest.raises(raises): + resolver._set_resolv_conf() + else: + resolver._set_resolv_conf() + c.assert_called_once_with(file) + + +@pytest.mark.parametrize("file,raises", + [(None, False), + ("missing_file", dns_resolve.UnboundException), + ("hosts_file", False)]) +def test_hosts(monkeypatch, missing_unbound, resolver_factory, request, file, + raises): + if file: + file = str(request.getfixturevalue(file)) + monkeypatch.setenv("OVS_HOSTS_FILE", file) + resolver = resolver_factory() # Doesn't raise + if missing_unbound: + assert resolver._set_hosts_file() is None + return + with mock.patch.object(resolver._ctx, "hosts", + side_effect=resolver._ctx.hosts) as c: + if raises: + with pytest.raises(raises): + resolver._set_hosts_file() + else: + resolver._set_hosts_file() + c.assert_called_once_with(file) + + +def test_UnboundException(missing_unbound): + with 
pytest.raises(dns_resolve.UnboundException): + raise dns_resolve.UnboundException("Fake exception", -1) + + +@skip_no_unbound +@pytest.mark.parametrize("ip,host,expected", HOSTS) +def test_inet_parse_active(resolver_factory, ip, host, expected): + resolver = resolver_factory() + + def fn(name): + # Return the same thing _resolve() would so we can call + # this multiple times for the is_daemon=True case + return socket_util.inet_parse_active(f"{name}:6640", 6640, + raises=False)[0] or None + + # parsing IPs still works + IP = _resolve(resolver, ip, fn) + assert ipaddress.ip_address(ip) == ipaddress.ip_address(IP) + # parsing hosts works + IP = _resolve(resolver, host, fn) + assert ipaddress.ip_address(IP) == ipaddress.ip_address(expected) diff --git a/python/ovs/tests/test_kv.py b/python/ovs/tests/test_kv.py index c5b66de887b..76887498a57 100644 --- a/python/ovs/tests/test_kv.py +++ b/python/ovs/tests/test_kv.py @@ -1,6 +1,9 @@ import pytest -from ovs.flow.kv import KVParser, KeyValue +from ovs.flow.kv import KVParser, KVDecoders, KeyValue +from ovs.flow.decoders import decode_default + +decoders = KVDecoders(default=lambda k, v: (k, decode_default(v))) @pytest.mark.parametrize( @@ -9,7 +12,7 @@ ( ( "cookie=0x0, duration=147566.365s, table=0, n_packets=39, n_bytes=2574, idle_age=65534, hard_age=65534", # noqa: E501 - None, + decoders, ), [ KeyValue("cookie", 0), @@ -24,7 +27,7 @@ ( ( "load:0x4->NXM_NX_REG13[],load:0x9->NXM_NX_REG11[],load:0x8->NXM_NX_REG12[],load:0x1->OXM_OF_METADATA[],load:0x1->NXM_NX_REG14[],mod_dl_src:0a:58:a9:fe:00:02,resubmit(,8)", # noqa: E501 - None, + decoders, ), [ KeyValue("load", "0x4->NXM_NX_REG13[]"), @@ -36,20 +39,17 @@ KeyValue("resubmit", ",8"), ], ), + (("l1(l2(l3(l4())))", decoders), [KeyValue("l1", "l2(l3(l4()))")]), ( - ("l1(l2(l3(l4())))", None), - [KeyValue("l1", "l2(l3(l4()))")] - ), - ( - ("l1(l2(l3(l4()))),foo:bar", None), + ("l1(l2(l3(l4()))),foo:bar", decoders), [KeyValue("l1", "l2(l3(l4()))"), KeyValue("foo", "bar")], ), 
( - ("enqueue:1:2,output=2", None), + ("enqueue:1:2,output=2", decoders), [KeyValue("enqueue", "1:2"), KeyValue("output", 2)], ), ( - ("value_to_reg(100)->someReg[10],foo:bar", None), + ("value_to_reg(100)->someReg[10],foo:bar", decoders), [ KeyValue("value_to_reg", "(100)->someReg[10]"), KeyValue("foo", "bar"), diff --git a/python/ovs/tests/test_odp.py b/python/ovs/tests/test_odp.py index 715be386940..d514e9be32d 100644 --- a/python/ovs/tests/test_odp.py +++ b/python/ovs/tests/test_odp.py @@ -13,6 +13,32 @@ ) +def do_test_section(input_string, section, expected): + flow = ODPFlow(input_string) + kv_list = flow.section(section).data + + assert len(expected) == len(kv_list) + + for i in range(len(expected)): + assert expected[i].key == kv_list[i].key + assert expected[i].value == kv_list[i].value + + # Assert positions relative to action string are OK. + pos = flow.section(section).pos + string = flow.section(section).string + + kpos = kv_list[i].meta.kpos + kstr = kv_list[i].meta.kstring + vpos = kv_list[i].meta.vpos + vstr = kv_list[i].meta.vstring + assert string[kpos : kpos + len(kstr)] == kstr + if vpos != -1: + assert string[vpos : vpos + len(vstr)] == vstr + + # Assert string meta is correct. + assert input_string[pos : pos + len(string)] == string + + @pytest.mark.parametrize( "input_string,expected", [ @@ -109,26 +135,7 @@ ], ) def test_odp_fields(input_string, expected): - odp = ODPFlow(input_string) - match = odp.match_kv - for i in range(len(expected)): - assert expected[i].key == match[i].key - assert expected[i].value == match[i].value - - # Assert positions relative to action string are OK. - mpos = odp.section("match").pos - mstring = odp.section("match").string - - kpos = match[i].meta.kpos - kstr = match[i].meta.kstring - vpos = match[i].meta.vpos - vstr = match[i].meta.vstring - assert mstring[kpos : kpos + len(kstr)] == kstr - if vpos != -1: - assert mstring[vpos : vpos + len(vstr)] == vstr - - # Assert mstring meta is correct. 
- assert input_string[mpos : mpos + len(mstring)] == mstring + do_test_section(input_string, "match", expected) @pytest.mark.parametrize( @@ -453,24 +460,56 @@ def test_odp_fields(input_string, expected): ], ), ( - "actions:clone(1)" ",clone(clone(push_vlan(vid=12,pcp=0),2),1)", + "actions:tnl_push(header(srv6(segments_left=1,segs(2001:cafe::90,2001:cafe::91))))", # noqa: E501 [ - KeyValue("clone", {"output": {"port": 1}}), KeyValue( - "clone", + "tnl_push", { - "output": {"port": 1}, - "clone": { - "push_vlan": { - "vid": 12, - "pcp": 0, - }, - "output": {"port": 2}, - }, + "header": { + "srv6": { + "segments_left": 1, + "segs": "2001:cafe::90,2001:cafe::91", + } + } }, ), ], ), + ( + "actions:clone(1),clone(clone(push_vlan(vid=12,pcp=0),2),1)", + [ + KeyValue("clone", [{"output": {"port": 1}}]), + KeyValue( + "clone", + [ + { + "clone": [ + { + "push_vlan": { + "vid": 12, + "pcp": 0, + }, + }, + {"output": {"port": 2}}, + ] + }, + {"output": {"port": 1}}, + ], + ), + ], + ), + ( + "actions:clone(recirc(0x1),recirc(0x2))", + [ + KeyValue( + "clone", + [ + {"recirc": 1}, + {"recirc": 2}, + ], + ), + ], + ), ( "actions: check_pkt_len(size=200,gt(4),le(5))" ",check_pkt_len(size=200,gt(drop),le(5))" @@ -480,48 +519,70 @@ def test_odp_fields(input_string, expected): "check_pkt_len", { "size": 200, - "gt": {"output": {"port": 4}}, - "le": {"output": {"port": 5}}, + "gt": [{"output": {"port": 4}}], + "le": [{"output": {"port": 5}}], }, ), KeyValue( "check_pkt_len", { "size": 200, - "gt": {"drop": True}, - "le": {"output": {"port": 5}}, + "gt": [{"drop": True}], + "le": [{"output": {"port": 5}}], }, ), KeyValue( "check_pkt_len", { "size": 200, - "gt": {"ct": {"nat": True}}, - "le": {"drop": True}, + "gt": [{"ct": {"nat": True}}], + "le": [{"drop": True}], }, ), ], ), + ( + "actions:check_pkt_len(size=200,gt(check_pkt_len(size=400,gt(4),le(2))),le(check_pkt_len(size=100,gt(1),le(drop))))", # noqa: E501 + [ + KeyValue( + "check_pkt_len", + { + "size": 200, + "gt": [ + { + 
"check_pkt_len": { + "size": 400, + "gt": [{"output": {"port": 4}}], + "le": [{"output": {"port": 2}}], + } + } + ], + "le": [ + { + "check_pkt_len": { + "size": 100, + "gt": [{"output": {"port": 1}}], + "le": [{"drop": True}], + } + } + ], + }, + ) + ], + ), + ( + "actions:meter(1),hash(l4(0))", + [ + KeyValue("meter", 1), + KeyValue( + "hash", + { + "l4": 0, + } + ), + ], + ), ], ) def test_odp_actions(input_string, expected): - odp = ODPFlow(input_string) - actions = odp.actions_kv - for i in range(len(expected)): - assert expected[i].key == actions[i].key - assert expected[i].value == actions[i].value - - # Assert positions relative to action string are OK. - apos = odp.section("actions").pos - astring = odp.section("actions").string - - kpos = actions[i].meta.kpos - kstr = actions[i].meta.kstring - vpos = actions[i].meta.vpos - vstr = actions[i].meta.vstring - assert astring[kpos : kpos + len(kstr)] == kstr - if vpos != -1: - assert astring[vpos : vpos + len(vstr)] == vstr - - # Assert astring meta is correct. - assert input_string[apos : apos + len(astring)] == astring + do_test_section(input_string, "actions", expected) diff --git a/python/ovs/tests/test_ofp.py b/python/ovs/tests/test_ofp.py index 7a93b2fd453..d098520cae0 100644 --- a/python/ovs/tests/test_ofp.py +++ b/python/ovs/tests/test_ofp.py @@ -2,10 +2,36 @@ import pytest from ovs.flow.ofp import OFPFlow -from ovs.flow.kv import KeyValue +from ovs.flow.kv import KeyValue, ParseError from ovs.flow.decoders import EthMask, IPMask, decode_mask +def do_test_section(input_string, section, expected): + flow = OFPFlow(input_string) + kv_list = flow.section(section).data + + assert len(expected) == len(kv_list) + + for i in range(len(expected)): + assert expected[i].key == kv_list[i].key + assert expected[i].value == kv_list[i].value + + # Assert positions relative to action string are OK. 
+ pos = flow.section(section).pos + string = flow.section(section).string + + kpos = kv_list[i].meta.kpos + kstr = kv_list[i].meta.kstring + vpos = kv_list[i].meta.vpos + vstr = kv_list[i].meta.vstring + assert string[kpos : kpos + len(kstr)] == kstr + if vpos != -1: + assert string[vpos : vpos + len(vstr)] == vstr + + # Assert string meta is correct. + assert input_string[pos : pos + len(string)] == string + + @pytest.mark.parametrize( "input_string,expected", [ @@ -22,10 +48,25 @@ ( "actions=controller,controller:200", [ - KeyValue("output", "controller"), + KeyValue("output", {"port": "CONTROLLER"}), KeyValue("controller", {"max_len": 200}), ], ), + ( + "actions=controller(max_len=123,reason=no_match,id=456,userdata=00.00.00.12.00.00.00.00,meter_id=12)", # noqa: E501 + [ + KeyValue( + "controller", + { + "max_len": 123, + "reason": "no_match", + "id": 456, + "userdata": "00.00.00.12.00.00.00.00", + "meter_id": 12, + } + ), + ], + ), ( "actions=enqueue(foo,42),enqueue:foo:42,enqueue(bar,4242)", [ @@ -331,12 +372,12 @@ {"table": 69}, {"delete_learned": True}, {"cookie": 3664728752}, - {"OXM_OF_METADATA[]": True}, + {"OXM_OF_METADATA[]": {"field": "OXM_OF_METADATA"}}, {"eth_type": 2048}, - {"NXM_OF_IP_SRC[]": True}, + {"NXM_OF_IP_SRC[]": {"field": "NXM_OF_IP_SRC"}}, {"ip_dst": IPMask("172.30.204.105/32")}, {"nw_proto": 6}, - {"NXM_OF_TCP_SRC[]": "NXM_OF_TCP_DST[]"}, + {"NXM_OF_TCP_SRC[]": {"field": "NXM_OF_TCP_DST"}}, { "load": { "value": 1, @@ -509,26 +550,116 @@ ), ], ), + ( + "actions=POP_VLAN,push_vlan:0x8100,NORMAL,clone(MOD_NW_SRC:192.168.1.1,resubmit(,10))", # noqa: E501 + [ + KeyValue("POP_VLAN", True), + KeyValue("push_vlan", 0x8100), + KeyValue("output", {"port": "NORMAL"}), + KeyValue( + "clone", + [ + {"MOD_NW_SRC": netaddr.IPAddress("192.168.1.1")}, + {"resubmit": {"port": "", "table": 10}}, + ] + ), + ], + ), + ( + "actions=MOD_NW_SRC:192.168.1.1,CONTROLLER,CONTROLLER:123", + [ + KeyValue("MOD_NW_SRC", netaddr.IPAddress("192.168.1.1")), + 
KeyValue("output", {"port": "CONTROLLER"}), + KeyValue("CONTROLLER", {"max_len": 123}), + ], + ), + ( + "actions=LOCAL,clone(myport,CONTROLLER)", + [ + KeyValue("output", {"port": "LOCAL"}), + KeyValue( + "clone", + [ + {"output": {"port": "myport"}}, + {"output": {"port": "CONTROLLER"}}, + ] + ), + ], + ), + ( + "actions=LOCAL,clone(sample(probability=123))", + [ + KeyValue("output", {"port": "LOCAL"}), + KeyValue( + "clone", + [ + {"sample": { + "probability": 123, + }}, + ] + ), + ], + ), + ( + "actions=doesnotexist(1234)", + ParseError, + ), + ( + "actions=learn(eth_type=nofield)", + ParseError, + ), + ( + "actions=learn(nofield=eth_type)", + ParseError, + ), + ( + "nofield=0x123 actions=drop", + ParseError, + ), + ( + "actions=load:0x12334->NOFILED", + ParseError, + ), ], ) def test_act(input_string, expected): - ofp = OFPFlow(input_string) - actions = ofp.actions_kv - for i in range(len(expected)): - assert expected[i].key == actions[i].key - assert expected[i].value == actions[i].value + if isinstance(expected, type): + with pytest.raises(expected): + OFPFlow(input_string) + return - # Assert positions relative to action string are OK. - apos = ofp.section("actions").pos - astring = ofp.section("actions").string + do_test_section(input_string, "actions", expected) - kpos = actions[i].meta.kpos - kstr = actions[i].meta.kstring - vpos = actions[i].meta.vpos - vstr = actions[i].meta.vstring - assert astring[kpos : kpos + len(kstr)] == kstr - if vpos != -1: - assert astring[vpos : vpos + len(vstr)] == vstr - # Assert astring meta is correct. 
- assert input_string[apos : apos + len(astring)] == astring +@pytest.mark.parametrize( + "input_string,expected", + [ + ( + "cookie=0x35f946ead8d8f9e4, duration=97746.271s, table=0, n_packets=12, n_bytes=254, idle_age=117, priority=4,in_port=1", # noqa: E501 + ( + [ + KeyValue("cookie", 0x35f946ead8d8f9e4), + KeyValue("duration", 97746.271), + KeyValue("table", 0), + KeyValue("n_packets", 12), + KeyValue("n_bytes", 254), + KeyValue("idle_age", 117), + ], + [ + KeyValue("priority", 4), + KeyValue("in_port", 1) + ], + ), + ), + ], +) +def test_key(input_string, expected): + if isinstance(expected, type): + with pytest.raises(expected): + OFPFlow(input_string) + return + + input_string += " actions=drop" + + do_test_section(input_string, "info", expected[0]) + do_test_section(input_string, "match", expected[1]) diff --git a/python/ovs/unixctl/__init__.py b/python/ovs/unixctl/__init__.py index 8ee31294339..b05f3df7203 100644 --- a/python/ovs/unixctl/__init__.py +++ b/python/ovs/unixctl/__init__.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import enum import sys import ovs.util @@ -19,6 +20,13 @@ commands = {} +@enum.unique +# FIXME: Use @enum.verify(enum.NAMED_FLAGS) from Python 3.11 when available. 
+class UnixctlOutputFormat(enum.IntFlag): + TEXT = 1 << 0 + JSON = 1 << 1 + + class _UnixctlCommand(object): def __init__(self, usage, min_args, max_args, callback, aux): self.usage = usage diff --git a/python/ovs/unixctl/client.py b/python/ovs/unixctl/client.py index 8283f99bbfc..8a6fcb1b985 100644 --- a/python/ovs/unixctl/client.py +++ b/python/ovs/unixctl/client.py @@ -14,6 +14,7 @@ import os +import ovs.json import ovs.jsonrpc import ovs.stream import ovs.util @@ -41,10 +42,10 @@ def transact(self, command, argv): return error, None, None if reply.error is not None: - return 0, str(reply.error), None + return 0, reply.error, None else: assert reply.result is not None - return 0, None, str(reply.result) + return 0, None, reply.result def close(self): self._conn.close() diff --git a/python/ovs/unixctl/server.py b/python/ovs/unixctl/server.py index 5f9b3e7393b..9a58a38d52d 100644 --- a/python/ovs/unixctl/server.py +++ b/python/ovs/unixctl/server.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import argparse import copy import errno import os @@ -35,6 +36,7 @@ def __init__(self, rpc): assert isinstance(rpc, ovs.jsonrpc.Connection) self._rpc = rpc self._request_id = None + self._fmt = ovs.unixctl.UnixctlOutputFormat.TEXT def run(self): self._rpc.run() @@ -63,10 +65,29 @@ def run(self): return error def reply(self, body): - self._reply_impl(True, body) + assert body is None or isinstance(body, str) + + if body is None: + body = "" + + if self._fmt == ovs.unixctl.UnixctlOutputFormat.JSON: + body = { + "reply-format": "plain", + "reply": body + } + + return self._reply_impl_json(True, body) + + def reply_json(self, body): + self._reply_impl_json(True, body) def reply_error(self, body): - self._reply_impl(False, body) + assert body is None or isinstance(body, str) + + if body is None: + body = "" + + return self._reply_impl_json(False, body) # Called only by unixctl classes. 
def _close(self): @@ -78,18 +99,11 @@ def _wait(self, poller): if not self._rpc.get_backlog(): self._rpc.recv_wait(poller) - def _reply_impl(self, success, body): + def _reply_impl_json(self, success, body): assert isinstance(success, bool) - assert body is None or isinstance(body, str) assert self._request_id is not None - if body is None: - body = "" - - if body and not body.endswith("\n"): - body += "\n" - if success: reply = Message.create_reply(body, self._request_id) else: @@ -136,6 +150,25 @@ def _unixctl_version(conn, unused_argv, version): conn.reply(version) +def _unixctl_set_options(conn, argv, unused_aux): + assert isinstance(conn, UnixctlConnection) + + parser = argparse.ArgumentParser() + parser.add_argument("--format", default="text", + choices=[fmt.name.lower() + for fmt in ovs.unixctl.UnixctlOutputFormat], + type=str.lower) + + try: + args = parser.parse_args(args=argv) + except argparse.ArgumentError as e: + conn.reply_error(str(e)) + return + + conn._fmt = ovs.unixctl.UnixctlOutputFormat[args.format.upper()] + conn.reply(None) + + class UnixctlServer(object): def __init__(self, listener): assert isinstance(listener, ovs.stream.PassiveStream) @@ -210,48 +243,7 @@ def create(path, version=None): ovs.unixctl.command_register("version", "", 0, 0, _unixctl_version, version) - return 0, UnixctlServer(listener) - - -class UnixctlClient(object): - def __init__(self, conn): - assert isinstance(conn, ovs.jsonrpc.Connection) - self._conn = conn - - def transact(self, command, argv): - assert isinstance(command, str) - assert isinstance(argv, list) - for arg in argv: - assert isinstance(arg, str) - - request = Message.create_request(command, argv) - error, reply = self._conn.transact_block(request) - - if error: - vlog.warn("error communicating with %s: %s" - % (self._conn.name, os.strerror(error))) - return error, None, None - - if reply.error is not None: - return 0, str(reply.error), None - else: - assert reply.result is not None - return 0, None, 
str(reply.result) - - def close(self): - self._conn.close() - self.conn = None - - @staticmethod - def create(path): - assert isinstance(path, str) + ovs.unixctl.command_register("set-options", "[--format text|json]", 1, + 2, _unixctl_set_options, None) - unix = "unix:%s" % ovs.util.abs_file_name(ovs.dirs.RUNDIR, path) - error, stream = ovs.stream.Stream.open_block( - ovs.stream.Stream.open(unix)) - - if error: - vlog.warn("failed to connect to %s" % path) - return error, None - - return 0, UnixctlClient(ovs.jsonrpc.Connection(stream)) + return 0, UnixctlServer(listener) diff --git a/python/build/__init__.py b/python/ovs_build_helpers/__init__.py similarity index 100% rename from python/build/__init__.py rename to python/ovs_build_helpers/__init__.py diff --git a/python/build/extract_ofp_fields.py b/python/ovs_build_helpers/extract_ofp_fields.py similarity index 100% rename from python/build/extract_ofp_fields.py rename to python/ovs_build_helpers/extract_ofp_fields.py diff --git a/python/build/nroff.py b/python/ovs_build_helpers/nroff.py similarity index 100% rename from python/build/nroff.py rename to python/ovs_build_helpers/nroff.py diff --git a/python/build/soutil.py b/python/ovs_build_helpers/soutil.py similarity index 100% rename from python/build/soutil.py rename to python/ovs_build_helpers/soutil.py diff --git a/python/setup.py b/python/setup.py.template similarity index 84% rename from python/setup.py rename to python/setup.py.template index 27684c40469..e7d59f2ca3f 100644 --- a/python/setup.py +++ b/python/setup.py.template @@ -23,24 +23,16 @@ import setuptools -VERSION = "unknown" +VERSION = "@VERSION@" -try: - # Try to set the version from the generated ovs/version.py - exec(open("ovs/version.py").read()) -except IOError: - print("Ensure version.py is created by running make python/ovs/version.py", - file=sys.stderr) - sys.exit(-1) - -try: - # Try to open generated ovs/dirs.py. 
However, in this case we - # don't need to exec() - open("ovs/dirs.py") -except IOError: - print("Ensure dirs.py is created by running make python/ovs/dirs.py", - file=sys.stderr) - sys.exit(-1) +for x in ("version.py", "dirs.py"): + try: + # Try to open generated ovs/{version,dirs}.py + open(f"ovs/{x}") + except IOError: + print(f"Ensure {x} is created by running make python/ovs/{x}", + file=sys.stderr) + sys.exit(-1) ext_errors = (CCompilerError, ExecError, PlatformError) if sys.platform == 'win32': @@ -99,8 +91,7 @@ def build_extension(self, ext): 'Topic :: System :: Networking', 'License :: OSI Approved :: Apache Software License', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', ], ext_modules=[setuptools.Extension("ovs._json", sources=["ovs/_json.c"], @@ -110,7 +101,8 @@ def build_extension(self, ext): cmdclass={'build_ext': try_build_ext}, install_requires=['sortedcontainers'], extras_require={':sys_platform == "win32"': ['pywin32 >= 1.0'], - 'flow': ['netaddr', 'pyparsing']}, + 'flow': ['netaddr', 'pyparsing'], + 'dns': ['unbound']}, ) try: diff --git a/python/test_requirements.txt b/python/test_requirements.txt index 6aaee13e3fe..a1424506b64 100644 --- a/python/test_requirements.txt +++ b/python/test_requirements.txt @@ -1,3 +1,7 @@ -pytest netaddr +packaging +pyftpdlib pyparsing +pytest +scapy +tftpy diff --git a/rhel/openvswitch-fedora.spec.in b/rhel/openvswitch-fedora.spec.in index 67268cb7833..f129bc64625 100644 --- a/rhel/openvswitch-fedora.spec.in +++ b/rhel/openvswitch-fedora.spec.in @@ -26,8 +26,10 @@ %bcond_without libcapng # To enable DPDK support, specify '--with dpdk' when building %bcond_with dpdk -# To enable AF_XDP support, specify '--with afxdp' when building -%bcond_with afxdp +# To disable AF_XDP support, specify '--without afxdp' when building +%bcond_without afxdp +# To control the USDT support +%bcond_without usdt # If 
there is a need to automatically enable the package after installation, # specify the "--with autoenable" @@ -71,11 +73,14 @@ BuildRequires: libcap-ng libcap-ng-devel %endif %if %{with dpdk} BuildRequires: libpcap-devel numactl-devel -BuildRequires: dpdk-devel >= 21.11 +BuildRequires: dpdk-devel >= 23.11 Provides: %{name}-dpdk = %{version}-%{release} %endif %if %{with afxdp} -BuildRequires: libbpf-devel numactl-devel +BuildRequires: libxdp-devel libbpf-devel numactl-devel +%endif +%if %{with usdt} +BuildRequires: libbpf-devel systemtap-sdt-devel %endif BuildRequires: unbound unbound-devel @@ -113,7 +118,7 @@ Summary: Open vSwitch python3 bindings License: ASL 2.0 BuildArch: noarch Requires: python3 -Suggests: python3-netaddr python3-pyparsing +Suggests: python3-netaddr python3-pyparsing python3-unbound %{?python_provide:%python_provide python3-openvswitch = %{version}-%{release}} %description -n python3-openvswitch @@ -171,11 +176,17 @@ This package provides IPsec tunneling support for OVS tunnels. %endif %if %{with afxdp} --enable-afxdp \ +%else + --disable-afxdp \ +%endif +%if %{with usdt} + --enable-usdt-probes \ %endif --enable-ssl \ --disable-static \ --enable-shared \ --with-pkidir=%{_sharedstatedir}/openvswitch/pki \ + --with-version-suffix=-%{release} \ PYTHON3=%{__python3} build-aux/dpdkstrip.py \ @@ -238,8 +249,6 @@ rm -rf $RPM_BUILD_ROOT/%{_datadir}/openvswitch/python/ install -d -m 0755 $RPM_BUILD_ROOT/%{_sharedstatedir}/openvswitch -touch $RPM_BUILD_ROOT%{_sysconfdir}/openvswitch/conf.db -touch $RPM_BUILD_ROOT%{_sysconfdir}/openvswitch/.conf.db.~lock~ touch $RPM_BUILD_ROOT%{_sysconfdir}/openvswitch/system-id.conf install -p -m 644 -D selinux/openvswitch-custom.pp \ @@ -328,6 +337,21 @@ if [ $1 -eq 1 ]; then fi %endif +# Ensure that /etc/openvswitch/conf.db links to /var/lib/openvswitch, +# moving an existing file if there is one. +# +# Ditto for .conf.db.~lock~. 
+for base in conf.db .conf.db.~lock~; do + new=/var/lib/openvswitch/$base + old=/etc/openvswitch/$base + if test -f $old && test ! -e $new; then + mv $old $new + fi + if test ! -e $old && test ! -h $old; then + ln -s $new $old + fi +done + %if 0%{?systemd_post:1} # This may not enable openvswitch service or do daemon-reload. %systemd_post %{name}.service @@ -383,6 +407,7 @@ fi %{_bindir}/ovs-pcap %{_bindir}/ovs-tcpdump %{_bindir}/ovs-tcpundump +%{_datadir}/openvswitch/scripts/usdt/* %{_mandir}/man8/ovs-test.8* %{_mandir}/man8/ovs-vlan-test.8* %{_mandir}/man8/ovs-l3ping.8* @@ -413,8 +438,8 @@ fi %endif %dir %{_sysconfdir}/openvswitch %{_sysconfdir}/openvswitch/default.conf -%config %ghost %{_sysconfdir}/openvswitch/conf.db -%ghost %{_sysconfdir}/openvswitch/.conf.db.~lock~ +%config %ghost %{_sharedstatedir}/openvswitch/conf.db +%ghost %{_sharedstatedir}/openvswitch/.conf.db.~lock~ %config %ghost %{_sysconfdir}/openvswitch/system-id.conf %config(noreplace) %{_sysconfdir}/sysconfig/openvswitch %defattr(-,root,root) @@ -486,7 +511,11 @@ fi %{_prefix}/lib/udev/rules.d/91-vfio.rules %endif %doc NOTICE README.rst NEWS rhel/README.RHEL.rst -/var/lib/openvswitch +%if %{with dpdk} +%attr(750,openvswitch,hugetlbfs) /var/lib/openvswitch +%else +%attr(750,openvswitch,openvswitch) /var/lib/openvswitch +%endif %attr(750,root,root) /var/log/openvswitch %ghost %attr(755,root,root) %{_rundir}/openvswitch %ghost %attr(644,root,root) %{_rundir}/openvswitch.useropts diff --git a/rhel/usr_lib_systemd_system_ovsdb-server.service b/rhel/usr_lib_systemd_system_ovsdb-server.service index 49dc06e38c2..558632320cc 100644 --- a/rhel/usr_lib_systemd_system_ovsdb-server.service +++ b/rhel/usr_lib_systemd_system_ovsdb-server.service @@ -29,3 +29,4 @@ ExecStop=/usr/share/openvswitch/scripts/ovs-ctl --no-ovs-vswitchd stop ExecReload=/usr/share/openvswitch/scripts/ovs-ctl --no-ovs-vswitchd \ ${OVS_USER_OPT} \ --no-monitor restart $OPTIONS +TimeoutSec=300 diff --git a/tests/.gitignore 
b/tests/.gitignore index 83b1cb3b489..3a8c4597564 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -3,6 +3,7 @@ /Makefile.in /atconfig /atlocal +/clang-analyzer-results/ /idltest.c /idltest.h /idltest.ovsidl diff --git a/tests/alb.at b/tests/alb.at index 922185d61d8..32dc40a1b66 100644 --- a/tests/alb.at +++ b/tests/alb.at @@ -2,10 +2,6 @@ AT_BANNER([PMD Auto Load Balance]) m4_divert_push([PREPARE_TESTS]) -get_log_next_line_num () { - LINENUM=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1)) -} - m4_divert_pop([PREPARE_TESTS]) m4_define([DUMMY_NUMA], [--dummy-numa="0,0"]) diff --git a/tests/appctl.py b/tests/appctl.py index b85b364fac5..5f4b2754a33 100644 --- a/tests/appctl.py +++ b/tests/appctl.py @@ -37,6 +37,19 @@ def connect_to_target(target): return client +def reply_to_string(reply, fmt=ovs.unixctl.UnixctlOutputFormat.TEXT, + fmt_flags={}): + if fmt == ovs.unixctl.UnixctlOutputFormat.TEXT: + body = str(reply) + else: + body = ovs.json.to_string(reply, **fmt_flags) + + if body and not body.endswith("\n"): + body += "\n" + + return body + + def main(): parser = argparse.ArgumentParser(description="Python Implementation of" " ovs-appctl.") @@ -49,25 +62,51 @@ def main(): help="Arguments to the command.") parser.add_argument("-T", "--timeout", metavar="SECS", help="wait at most SECS seconds for a response") + parser.add_argument("-f", "--format", metavar="FMT", + help="Output format.", default="text", + choices=[fmt.name.lower() + for fmt in ovs.unixctl.UnixctlOutputFormat], + type=str.lower) + parser.add_argument("--pretty", action="store_true", + help="Format the output in a more readable fashion." 
+ " Requires: --format json.") args = parser.parse_args() + if (args.format != ovs.unixctl.UnixctlOutputFormat.JSON.name.lower() + and args.pretty): + ovs.util.ovs_fatal(0, "--pretty is supported with --format json only") + signal_alarm(int(args.timeout) if args.timeout else None) ovs.vlog.Vlog.init() target = args.target + format = ovs.unixctl.UnixctlOutputFormat[args.format.upper()] + format_flags = dict(pretty=True) if args.pretty else {} client = connect_to_target(target) + + if format != ovs.unixctl.UnixctlOutputFormat.TEXT: + err_no, error, _ = client.transact( + "set-options", ["--format", args.format]) + + if err_no: + ovs.util.ovs_fatal(err_no, "%s: transaction error" % target) + elif error is not None: + sys.stderr.write(reply_to_string(error)) + ovs.util.ovs_error(0, "%s: server returned an error" % target) + sys.exit(2) + err_no, error, result = client.transact(args.command, args.argv) client.close() if err_no: ovs.util.ovs_fatal(err_no, "%s: transaction error" % target) elif error is not None: - sys.stderr.write(error) + sys.stderr.write(reply_to_string(error)) ovs.util.ovs_error(0, "%s: server returned an error" % target) sys.exit(2) else: assert result is not None - sys.stdout.write(result) + sys.stdout.write(reply_to_string(result, format, format_flags)) if __name__ == '__main__': diff --git a/tests/atlocal.in b/tests/atlocal.in index e02248f6f82..8565a0bae9f 100644 --- a/tests/atlocal.in +++ b/tests/atlocal.in @@ -2,9 +2,13 @@ HAVE_OPENSSL='@HAVE_OPENSSL@' OPENSSL_SUPPORTS_SNI='@OPENSSL_SUPPORTS_SNI@' HAVE_UNBOUND='@HAVE_UNBOUND@' +HAVE_BACKTRACE='@HAVE_BACKTRACE@' +HAVE_UNWIND='@HAVE_UNWIND@' EGREP='@EGREP@' PYTHON3='@PYTHON3@' CFLAGS='@CFLAGS@' +HAVE_TCA_HTB_RATE64='@HAVE_TCA_HTB_RATE64@' +HAVE_TCA_POLICE_PKTRATE64='@HAVE_TCA_POLICE_PKTRATE64@' # PYTHONCOERCECLOCALE=0 disables the Unicode compatibility warning on # stderr that breaks almost any Python3 test (PEP 0538) @@ -172,23 +176,15 @@ fi # Set HAVE_TC find_command tc -# When HAVE_TC=yes, 
check if the current tc supports adding pps filter -SUPPORT_TC_INGRESS_PPS="no" -if test $HAVE_TC="yes"; then - ip link add veth0 type veth peer name veth1 - tc qdisc add dev veth0 handle ffff: ingress - if tc filter add dev veth0 parent ffff: u32 match u32 0 0 police pkts_rate 100 pkts_burst 10; then - SUPPORT_TC_INGRESS_PPS="yes" - fi - ip link del veth0 -fi - # Set HAVE_TCPDUMP find_command tcpdump # Set HAVE_LFTP find_command lftp +# Set HAVE_ETHTOOL +find_command ethtool + CURL_OPT="-g -v --max-time 1 --retry 2 --retry-delay 1 --connect-timeout 1" # Determine whether "diff" supports "normal" diffs. (busybox diff does not.) @@ -198,16 +194,6 @@ else DIFF_SUPPORTS_NORMAL_FORMAT=no fi -# Check whether UB Sanitizer is being used. -case "$CFLAGS" in -*fsanitize=undefined*) - TESTS_WITH_UBSAN=yes - ;; -*) - TESTS_WITH_UBSAN=no - ;; -esac - # Turn off proxies. unset http_proxy unset https_proxy @@ -231,30 +217,47 @@ export OVS_CTL_TIMEOUT # # We disable leak detection because otherwise minor leaks that don't # matter break everything. -ASAN_OPTIONS=detect_leaks=0:abort_on_error=true:log_path=asan:$ASAN_OPTIONS +ASAN_OPTIONS=detect_leaks=0:abort_on_error=true:log_path=sanitizers:$ASAN_OPTIONS export ASAN_OPTIONS # Add some default flags for UndefinedBehaviorSanitizer, if it was used # for the build. -UBSAN_OPTIONS=print_stacktrace=1:halt_on_error=true:log_path=ubsan:$UBSAN_OPTIONS +UBSAN_OPTIONS=print_stacktrace=1:halt_on_error=true:log_path=sanitizers:$UBSAN_OPTIONS export UBSAN_OPTIONS # Check whether Python test requirements are available. 
REQUIREMENT_PATH=$abs_top_srcdir/python/test_requirements.txt $PYTHON3 -c ' import os import pathlib -import pkg_resources import sys +PACKAGING = True +try: + from packaging import requirements + from importlib import metadata +except ModuleNotFoundError: + PACKAGING = False + import pkg_resources + with pathlib.Path(os.path.join(os.getenv("REQUIREMENT_PATH"))).open() as reqs: - for req in pkg_resources.parse_requirements(reqs): - try: - pkg_resources.require(str(req)) - except pkg_resources.DistributionNotFound: - sys.exit(2) + if PACKAGING: + for req in reqs.readlines(): + try: + r = requirements.Requirement(req.strip()) + if metadata.version(r.name) not in r.specifier: + raise metadata.PackageNotFoundError + except metadata.PackageNotFoundError: + sys.exit(2) + else: + for req in pkg_resources.parse_requirements(reqs): + try: + pkg_resources.require(str(req)) + except pkg_resources.DistributionNotFound: + sys.exit(2) ' case $? in 0) HAVE_PYTEST=yes ;; 2) HAVE_PYTEST=no ;; - *) echo "$0: unexpected error probing Python unit test requirements" >&2 ;; + *) HAVE_PYTEST=no + echo "$0: unexpected error probing Python unit test requirements" >&2 ;; esac diff --git a/tests/automake.mk b/tests/automake.mk index d509cf93504..edfc2cb3359 100644 --- a/tests/automake.mk +++ b/tests/automake.mk @@ -19,6 +19,8 @@ EXTRA_DIST += \ $(OVSDB_CLUSTER_TESTSUITE) \ tests/atlocal.in \ $(srcdir)/package.m4 \ + $(srcdir)/tests/test-dpparse.py \ + $(srcdir)/tests/test-ofparse.py \ $(srcdir)/tests/testsuite \ $(srcdir)/tests/testsuite.patch @@ -108,7 +110,8 @@ TESTSUITE_AT = \ tests/mcast-snooping.at \ tests/packet-type-aware.at \ tests/nsh.at \ - tests/drop-stats.at + tests/drop-stats.at \ + tests/learning-switch.at EXTRA_DIST += $(FUZZ_REGRESSION_TESTS) FUZZ_REGRESSION_TESTS = \ @@ -143,10 +146,6 @@ $(srcdir)/tests/fuzz-regression-list.at: tests/automake.mk echo "TEST_FUZZ_REGRESSION([$$basename])"; \ done > $@.tmp && mv $@.tmp $@ -EXTRA_DIST += $(MFEX_AUTOVALIDATOR_TESTS) 
-MFEX_AUTOVALIDATOR_TESTS = \ - tests/mfex_fuzzy.py - OVSDB_CLUSTER_TESTSUITE_AT = \ tests/ovsdb-cluster-testsuite.at \ tests/ovsdb-execution.at \ @@ -184,7 +183,8 @@ SYSTEM_TESTSUITE_AT = \ SYSTEM_OFFLOADS_TESTSUITE_AT = \ tests/system-common-macros.at \ tests/system-offloads-traffic.at \ - tests/system-offloads-testsuite.at + tests/system-offloads-testsuite.at \ + tests/system-offloads-testsuite-macros.at SYSTEM_DPDK_TESTSUITE_AT = \ tests/system-common-macros.at \ @@ -211,8 +211,7 @@ AUTOTEST_PATH = utilities:vswitchd:ovsdb:vtep:tests:ipsec:$(PTHREAD_WIN32_DIR_DL check-local: set $(SHELL) '$(TESTSUITE)' -C tests AUTOTEST_PATH=$(AUTOTEST_PATH); \ "$$@" $(TESTSUITEFLAGS) || \ - (test -z "$$(find $(TESTSUITE_DIR) -name 'asan.*')" && \ - test -z "$$(find $(TESTSUITE_DIR) -name 'ubsan.*')" && \ + (test -z "$$(find $(TESTSUITE_DIR) -name 'sanitizers.*')" && \ test X'$(RECHECK)' = Xyes && "$$@" --recheck) # Python Coverage support. @@ -451,10 +450,12 @@ tests_ovstest_SOURCES = \ tests/test-barrier.c \ tests/test-bundle.c \ tests/test-byte-order.c \ + tests/test-byteq.c \ tests/test-classifier.c \ tests/test-ccmap.c \ tests/test-cmap.c \ tests/test-conntrack.c \ + tests/test-cooperative-multitasking.c \ tests/test-csum.c \ tests/test-flows.c \ tests/test-hash.c \ @@ -474,6 +475,7 @@ tests_ovstest_SOURCES = \ tests/test-packets.c \ tests/test-random.c \ tests/test-rcu.c \ + tests/test-rculist.c \ tests/test-reconnect.c \ tests/test-rstp.c \ tests/test-sflow.c \ @@ -497,7 +499,8 @@ endif if LINUX tests_ovstest_SOURCES += \ tests/test-netlink-conntrack.c \ - tests/test-netlink-policy.c + tests/test-netlink-policy.c \ + tests/test-psample.c endif tests_ovstest_LDADD = lib/libopenvswitch.la @@ -516,12 +519,15 @@ tests_test_type_props_SOURCES = tests/test-type-props.c CHECK_PYFILES = \ tests/appctl.py \ tests/flowgen.py \ - tests/mfex_fuzzy.py \ + tests/genpkts.py \ tests/ovsdb-monitor-sort.py \ + tests/system-dpdk-find-device.py \ tests/test-daemon.py \ + 
tests/test-dpparse.py \ tests/test-json.py \ tests/test-jsonrpc.py \ tests/test-l7.py \ + tests/test-ofparse.py \ tests/test-ovsdb.py \ tests/test-reconnect.py \ tests/test-stream.py \ diff --git a/tests/checkpatch.at b/tests/checkpatch.at index fdcdb846e1c..34971c514ca 100755 --- a/tests/checkpatch.at +++ b/tests/checkpatch.at @@ -1,14 +1,21 @@ AT_BANNER([checkpatch]) OVS_START_SHELL_HELPERS -# try_checkpatch PATCH [ERRORS] +# try_checkpatch PATCH [ERRORS] [checkpatch-args] # # Runs checkpatch, if installed, on the given PATCH, expecting the # specified set of ERRORS (and warnings). try_checkpatch() { # Take the patch to test from $1. Remove an initial four-space indent # from it and, if it is just headers with no body, add a null body. + # If it does not have a 'Subject', add a valid one. echo "$1" | sed 's/^ //' > test.patch + if grep 'Subject\:' test.patch >/dev/null 2>&1; then : + else + sed -i'' -e '1i\ +Subject: Patch this is. +' test.patch + fi if grep '---' expout >/dev/null 2>&1; then : else printf '\n---\n' >> test.patch @@ -22,11 +29,11 @@ try_checkpatch() { fi if test -s expout; then - AT_CHECK([$PYTHON3 $top_srcdir/utilities/checkpatch.py -q test.patch], + AT_CHECK([$PYTHON3 $top_srcdir/utilities/checkpatch.py $3 -q test.patch], [1], [stdout]) AT_CHECK([sed '/^Lines checked:/,$d' stdout], [0], [expout]) else - AT_CHECK([$PYTHON3 $top_srcdir/utilities/checkpatch.py -q test.patch]) + AT_CHECK([$PYTHON3 $top_srcdir/utilities/checkpatch.py $3 -q test.patch]) fi } OVS_END_SHELL_HELPERS @@ -346,6 +353,31 @@ try_checkpatch \ if (--mcs->n_refs==0) { " +try_checkpatch \ + "COMMON_PATCH_HEADER + +char *string; + +char **list; + +char ***ptr_list; + " + +try_checkpatch \ + "COMMON_PATCH_HEADER + +char** list; + " \ + "WARNING: Line lacks whitespace around operator + #8 FILE: A.c:1: + char** list; + " + +try_checkpatch \ + "COMMON_PATCH_HEADER + +char*** list; + " \ + "WARNING: Line lacks whitespace around operator + #8 FILE: A.c:1: + char*** list; + " + 
AT_CLEANUP AT_SETUP([checkpatch - check misuse APIs]) @@ -560,3 +592,45 @@ try_checkpatch \ " AT_CLEANUP + +AT_SETUP([checkpatch - subject]) +try_checkpatch \ + "Author: A + Commit: A + Subject: netdev: invalid case and dot ending + + Signed-off-by: A" \ + "WARNING: The subject summary should start with a capital. + WARNING: The subject summary should end with a dot. + Subject: netdev: invalid case and dot ending" + +try_checkpatch \ + "Author: A + Commit: A + Subject: netdev: This is a way to long commit summary and therefor it should report a WARNING! + + Signed-off-by: A" \ + "WARNING: The subject, ': ', is over 70 characters, i.e., 85. + Subject: netdev: This is a way to long commit summary and therefor it should report a WARNING!" + +AT_CLEANUP + +AT_SETUP([checkpatch - ignore committer as signoff]) +try_checkpatch \ + "Author: A + Commit: B + Subject: netdev: Subject. + + Signed-off-by: A" \ + "ERROR: Committer B needs to sign off." + +try_checkpatch \ + "Author: A + Commit: B + Subject: netdev: Subject. 
+ + Signed-off-by: A" \ + "" \ + "--skip-committer-signoff" + +AT_CLEANUP diff --git a/tests/classifier.at b/tests/classifier.at index f652b59837b..93a13f32b13 100644 --- a/tests/classifier.at +++ b/tests/classifier.at @@ -65,6 +65,94 @@ Datapath actions: 2 OVS_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([flow classifier - lookup segmentation - final stage]) +OVS_VSWITCHD_START +add_of_ports br0 1 2 3 +AT_DATA([flows.txt], [dnl +table=0 in_port=1 priority=33,tcp,tp_dst=80,tcp_flags=+psh,action=output(2) +table=0 in_port=1 priority=0,ip,action=drop +table=0 in_port=2 priority=16,icmp6,nw_ttl=255,icmp_type=135,icmp_code=0,nd_target=1000::1 ,action=output(1) +table=0 in_port=2 priority=0,ip,action=drop +table=0 in_port=3 action=resubmit(,1) +table=1 in_port=3 priority=45,ct_state=+trk+rpl,ct_nw_proto=6,ct_tp_src=3/0x1,tcp,tp_dst=80,tcp_flags=+psh,action=output(2) +table=1 in_port=3 priority=10,ip,action=drop +]) +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +AT_CHECK([ovs-appctl ofproto/trace br0 'in_port=1,dl_src=50:54:00:00:00:05,dl_dst=50:54:00:00:00:07,dl_type=0x0800,nw_src=192.168.0.1,nw_dst=192.168.0.2,nw_proto=6,nw_tos=0,nw_ttl=128,tp_src=8,tp_dst=80,tcp_flags=syn'], [0], [stdout]) +AT_CHECK([tail -2 stdout], [0], + [Megaflow: recirc_id=0,eth,tcp,in_port=1,nw_frag=no,tp_dst=80,tcp_flags=-psh +Datapath actions: drop +]) +AT_CHECK([ovs-appctl ofproto/trace br0 'in_port=1,dl_src=50:54:00:00:00:05,dl_dst=50:54:00:00:00:07,dl_type=0x0800,nw_src=192.168.0.1,nw_dst=192.168.0.2,nw_proto=6,nw_tos=0,nw_ttl=128,tp_src=8,tp_dst=80,tcp_flags=syn|ack'], [0], [stdout]) +AT_CHECK([tail -2 stdout], [0], + [Megaflow: recirc_id=0,eth,tcp,in_port=1,nw_frag=no,tp_dst=80,tcp_flags=-psh +Datapath actions: drop +]) +AT_CHECK([ovs-appctl ofproto/trace br0 'in_port=1,dl_src=50:54:00:00:00:05,dl_dst=50:54:00:00:00:07,dl_type=0x0800,nw_src=192.168.0.1,nw_dst=192.168.0.2,nw_proto=6,nw_tos=0,nw_ttl=128,tp_src=8,tp_dst=80,tcp_flags=ack|psh'], [0], [stdout]) +AT_CHECK([tail -2 stdout], [0], + 
[Megaflow: recirc_id=0,eth,tcp,in_port=1,nw_frag=no,tp_dst=80,tcp_flags=+psh +Datapath actions: 2 +]) +AT_CHECK([ovs-appctl ofproto/trace br0 'in_port=1,dl_src=50:54:00:00:00:05,dl_dst=50:54:00:00:00:07,dl_type=0x0800,nw_src=192.168.0.1,nw_dst=192.168.0.2,nw_proto=6,nw_tos=0,nw_ttl=128,tp_src=8,tp_dst=80'], [0], [stdout]) +AT_CHECK([tail -2 stdout], [0], + [Megaflow: recirc_id=0,eth,tcp,in_port=1,nw_frag=no,tp_dst=80,tcp_flags=-psh +Datapath actions: drop +]) +AT_CHECK([ovs-appctl ofproto/trace br0 'in_port=1,dl_src=50:54:00:00:00:05,dl_dst=50:54:00:00:00:07,dl_type=0x0800,nw_src=192.168.0.1,nw_dst=192.168.0.2,nw_proto=6,nw_tos=0,nw_ttl=128,tp_src=8,tp_dst=79'], [0], [stdout]) +AT_CHECK([tail -2 stdout], [0], + [Megaflow: recirc_id=0,eth,tcp,in_port=1,nw_frag=no,tp_dst=0x40/0xfff0,tcp_flags=-psh +Datapath actions: drop +]) + +dnl Having both the port and the tcp flags in the resulting megaflow below +dnl is redundant, but that is how ports trie logic is implemented. +AT_CHECK([ovs-appctl ofproto/trace br0 'in_port=1,dl_src=50:54:00:00:00:05,dl_dst=50:54:00:00:00:07,dl_type=0x0800,nw_src=192.168.0.1,nw_dst=192.168.0.2,nw_proto=6,nw_tos=0,nw_ttl=128,tp_src=8,tp_dst=81'], [0], [stdout]) +AT_CHECK([tail -2 stdout], [0], + [Megaflow: recirc_id=0,eth,tcp,in_port=1,nw_frag=no,tp_dst=81,tcp_flags=-psh +Datapath actions: drop +]) + +dnl nd_target is redundant in the megaflow below and it is also not relevant +dnl for an icmp reply. Datapath may discard that match, but it is OK as long +dnl as we have prerequisites (icmp_type) in the match as well. 
+AT_CHECK([ovs-appctl ofproto/trace br0 "in_port=2,eth_src=f6:d2:b0:19:5e:7b,eth_dst=d2:49:19:91:78:fe,dl_type=0x86dd,ipv6_src=1000::3,ipv6_dst=1000::4,nw_proto=58,nw_ttl=255,icmpv6_type=128,icmpv6_code=0"], [0], [stdout]) +AT_CHECK([tail -2 stdout], [0], + [Megaflow: recirc_id=0,eth,icmp6,in_port=2,nw_ttl=255,nw_frag=no,icmp_type=0x80/0xfc,nd_target=:: +Datapath actions: drop +]) + +AT_CHECK([ovs-appctl ofproto/trace br0 "in_port=2,eth_src=f6:d2:b0:19:5e:7b,eth_dst=d2:49:19:91:78:fe,dl_type=0x86dd,ipv6_src=1000::3,ipv6_dst=1000::4,nw_proto=58,nw_ttl=255,icmpv6_type=135,icmpv6_code=0"], [0], [stdout]) +AT_CHECK([tail -2 stdout], [0], + [Megaflow: recirc_id=0,eth,icmp6,in_port=2,nw_ttl=255,nw_frag=no,icmp_type=0x87/0xff,icmp_code=0x0/0xff,nd_target=:: +Datapath actions: drop +]) +AT_CHECK([ovs-appctl ofproto/trace br0 "in_port=2,eth_src=f6:d2:b0:19:5e:7b,eth_dst=d2:49:19:91:78:fe,dl_type=0x86dd,ipv6_src=1000::3,ipv6_dst=1000::4,nw_proto=58,nw_ttl=255,icmpv6_type=135,icmpv6_code=0,nd_target=1000::1"], [0], [stdout]) +AT_CHECK([tail -2 stdout], [0], + [Megaflow: recirc_id=0,eth,icmp6,in_port=2,nw_ttl=255,nw_frag=no,icmp_type=0x87/0xff,icmp_code=0x0/0xff,nd_target=1000::1 +Datapath actions: 1 +]) +AT_CHECK([ovs-appctl ofproto/trace br0 "in_port=2,eth_src=f6:d2:b0:19:5e:7b,eth_dst=d2:49:19:91:78:fe,dl_type=0x86dd,ipv6_src=1000::3,ipv6_dst=1000::4,nw_proto=58,nw_ttl=255,icmpv6_type=135,icmpv6_code=0,nd_target=1000::2"], [0], [stdout]) +AT_CHECK([tail -2 stdout], [0], + [Megaflow: recirc_id=0,eth,icmp6,in_port=2,nw_ttl=255,nw_frag=no,icmp_type=0x87/0xff,icmp_code=0x0/0xff,nd_target=1000::2 +Datapath actions: drop +]) + +dnl Check that ports' mask doesn't affect ct ports. 
+AT_CHECK([ovs-appctl ofproto/trace br0 'in_port=3,ct_state=trk|rpl,ct_nw_proto=6,ct_tp_src=3,dl_src=50:54:00:00:00:05,dl_dst=50:54:00:00:00:07,dl_type=0x0800,nw_src=192.168.0.1,nw_dst=192.168.0.2,nw_proto=6,nw_tos=0,nw_ttl=128,tp_src=8,tp_dst=80,tcp_flags=psh'], [0], [stdout]) +AT_CHECK([tail -2 stdout], [0], + [Megaflow: recirc_id=0,ct_state=+rpl+trk,ct_nw_proto=6,ct_tp_src=0x1/0x1,eth,tcp,in_port=3,nw_frag=no,tp_dst=80,tcp_flags=+psh +Datapath actions: 2 +]) +AT_CHECK([ovs-appctl ofproto/trace br0 'in_port=3,ct_state=trk|rpl,ct_nw_proto=6,ct_tp_src=3,dl_src=50:54:00:00:00:05,dl_dst=50:54:00:00:00:07,dl_type=0x0800,nw_src=192.168.0.1,nw_dst=192.168.0.2,nw_proto=6,nw_tos=0,nw_ttl=128,tp_src=8,tp_dst=79,tcp_flags=psh'], [0], [stdout]) +AT_CHECK([tail -2 stdout], [0], + [Megaflow: recirc_id=0,ct_state=+rpl+trk,ct_nw_proto=6,ct_tp_src=0x1/0x1,eth,tcp,in_port=3,nw_frag=no,tp_dst=0x40/0xfff0,tcp_flags=+psh +Datapath actions: drop +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + AT_BANNER([flow classifier prefix lookup]) AT_SETUP([flow classifier - prefix lookup]) OVS_VSWITCHD_START @@ -188,6 +276,13 @@ for src in 0 1 2 3 4 5 6 7; do AT_CHECK([ovs-appctl ofproto/trace br0 "in_port=1,dl_type=0x0800,nw_src=10.0.0.$src,nw_dst=10.0.0.$dst"], [0], [stdout]) AT_CHECK_UNQUOTED([tail -1 stdout], [0], [Datapath actions: $out ]) + dnl Check detailed output for conjunctive match. + if test $out = 3; then + AT_CHECK_UNQUOTED([cat stdout | grep conj\\. | sort], [0], [dnl + -> conj. priority=100,ip,nw_dst=10.0.0.$dst + -> conj. 
priority=100,ip,nw_src=10.0.0.$src +]) + fi done done OVS_VSWITCHD_STOP @@ -330,6 +425,98 @@ ovs-ofctl: "conjunction" actions may be used along with "note" but not any other OVS_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([conjunctive match with same priority]) +OVS_VSWITCHD_START +add_of_ports br0 1 2 +AT_DATA([flows.txt], [dnl +conj_id=1,actions=2 +conj_id=2,actions=drop + +priority=10,ip,ip_dst=10.0.0.1,actions=conjunction(1,1/2) +priority=10,ip,ip_src=10.0.0.2,actions=conjunction(1,2/2) +priority=10,ip,ip_dst=10.0.0.3,actions=conjunction(2,1/2) +priority=10,ip,in_port=1,actions=conjunction(2,2/2) +]) +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) +# Check that "priority=10,ip,in_port=1,actions=conjunction(2,2/2)" is +# correctly excluded from the output. +AT_CHECK([ovs-appctl ofproto/trace br0 "in_port=1,dl_type=0x0800,nw_dst=10.0.0.1,nw_src=10.0.0.2" | grep conj\\. | sort], [0], [dnl + -> conj. priority=10,ip,nw_dst=10.0.0.1 + -> conj. priority=10,ip,nw_src=10.0.0.2 +]) +OVS_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([conjunctive match with metadata]) +OVS_VSWITCHD_START +AT_CHECK([ovs-ofctl add-tlv-map br0 "{class=0xffff,type=0,len=4}->tun_metadata0"]) +AT_CHECK([ovs-ofctl add-tlv-map br0 "{class=0xffff,type=1,len=8}->tun_metadata1"]) +AT_DATA([flows.txt], [dnl +conj_id=7,actions=drop + +priority=5,tun_metadata0=0x1,actions=conjunction(7,1/2) +priority=5,tun_metadata1=0x2,actions=conjunction(7,2/2) +]) +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) +# Check that tunnel metadata is included in the output. +AT_CHECK([ovs-appctl ofproto/trace br0 "tun_metadata0=0x1,tun_metadata1=0x2,in_port=br0" | grep conj\\. | sort], [0], [dnl + -> conj. priority=5,tun_metadata0=0x1 + -> conj. 
priority=5,tun_metadata1=0x2 +]) +OVS_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([conjunctive match with or without port map]) +OVS_VSWITCHD_START +add_of_ports br0 1 2 +AT_DATA([flows.txt], [dnl +conj_id=1,actions=drop +conj_id=2,actions=drop + +priority=10,ip,actions=conjunction(1,1/2),conjunction(2,1/2) +priority=10,in_port=p1,actions=conjunction(1,2/2) +priority=10,in_port=p2,actions=conjunction(1,2/2) +]) +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) +AT_CHECK([ovs-appctl ofproto/trace br0 "ip,in_port=p1" --names | grep conj\\. | sort], [0], [dnl + -> conj. priority=10,in_port=p1 + -> conj. priority=10,ip +]) +AT_CHECK([ovs-appctl ofproto/trace br0 "ip,in_port=p2" | grep conj\\. | sort], [0], [dnl + -> conj. priority=10,in_port=2 + -> conj. priority=10,ip +]) +OVS_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([conjunctive match with resubmit]) +OVS_VSWITCHD_START +add_of_ports br0 1 2 +AT_DATA([flows.txt], [dnl +conj_id=1,actions=resubmit(,2) +priority=10,ip,actions=conjunction(1,1/2) +priority=10,in_port=p1,actions=conjunction(1,2/2) +priority=10,in_port=p2,actions=conjunction(1,2/2) + +table=2,conj_id=7,actions=resubmit(,3) +table=2,priority=20,ip,actions=conjunction(7,1/2) +table=2,priority=20,in_port=p1,actions=conjunction(7,2/2) +table=2,priority=20,in_port=p2,actions=conjunction(7,2/2) + +table=3,actions=drop +]) +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) +# Check that conj_flows are reset for each table and that they are output +# exactly once. +AT_CHECK([ovs-appctl ofproto/trace br0 "ip,in_port=p1" --names | grep conj\\. | sort], [0], [dnl + -> conj. priority=10,in_port=p1 + -> conj. priority=10,ip + -> conj. priority=20,in_port=p1 + -> conj. priority=20,ip +]) +OVS_VSWITCHD_STOP +AT_CLEANUP + # Flow classifier a packet with excess of padding. 
AT_SETUP([flow classifier - packet with extra padding]) OVS_VSWITCHD_START diff --git a/tests/daemon.at b/tests/daemon.at index d7981f9d23a..6cb8b98883d 100644 --- a/tests/daemon.at +++ b/tests/daemon.at @@ -78,12 +78,9 @@ AT_CLEANUP AT_SETUP([daemon --monitor]) AT_SKIP_IF([test "$IS_WIN32" = "yes"]) -# This test intentionally causes SIGSEGV, so make Address Sanitizer ignore it. +# This test intentionally causes SIGSEGV, so make sanitizers ignore it. ASAN_OPTIONS=$ASAN_OPTIONS:handle_segv=0; export ASAN_OPTIONS - -# Skip it if UB Sanitizer is being used. There's no way to disable the -# SEGV check at runtime. -AT_SKIP_IF([test $TESTS_WITH_UBSAN = yes]) +UBSAN_OPTIONS=$UBSAN_OPTIONS:handle_segv=0; export UBSAN_OPTIONS # Start the daemon and wait for the pidfile to get created. on_exit 'kill $(cat *.pid)' @@ -150,12 +147,9 @@ AT_CLEANUP AT_SETUP([daemon --detach --monitor]) AT_SKIP_IF([test "$IS_WIN32" = "yes"]) -# This test intentionally causes SIGSEGV, so make Address Sanitizer ignore it. +# This test intentionally causes SIGSEGV, so make sanitizers ignore it. ASAN_OPTIONS=$ASAN_OPTIONS:handle_segv=0; export ASAN_OPTIONS - -# Skip it if UB Sanitizer is being used. There's no way to disable the -# SEGV check at runtime. -AT_SKIP_IF([test $TESTS_WITH_UBSAN = yes]) +UBSAN_OPTIONS=$UBSAN_OPTIONS:handle_segv=0; export UBSAN_OPTIONS on_exit 'kill $(cat *.pid)' @@ -234,3 +228,52 @@ OVS_WAIT_UNTIL([sc query ovsdb-server | grep STATE | grep STOPPED > /dev/null 2> AT_CHECK([sc delete ovsdb-server], [0], [[[SC]] DeleteService SUCCESS ]) AT_CLEANUP + +AT_SETUP([backtrace without monitor]) +AT_SKIP_IF([test "$HAVE_BACKTRACE" = "no" && test "$HAVE_UNWIND" = "no"]) +AT_SKIP_IF([test "$IS_WIN32" = "yes"]) + +# This test intentionally causes SIGSEGV, so make sanitizers ignore it. 
+ASAN_OPTIONS=$ASAN_OPTIONS:handle_segv=0; export ASAN_OPTIONS +UBSAN_OPTIONS=$UBSAN_OPTIONS:handle_segv=0; export UBSAN_OPTIONS + +AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --no-db \ + --log-file --verbose=DBG], [0], [ignore], [ignore]) +OVS_WAIT_UNTIL([test -s ovsdb-server.pid]) +child=$(cat ovsdb-server.pid) + +AT_CAPTURE_FILE([ovsdb-server.log]) + +AT_CHECK([kill -SEGV $child]) + +OVS_WAIT_UNTIL([grep -q "^SIGSEGV detected, backtrace:" ovsdb-server.log]) + +AT_CLEANUP + +AT_SETUP([backtrace with monitor]) +AT_SKIP_IF([test "$HAVE_BACKTRACE" = "no" && test "$HAVE_UNWIND" = "no"]) +AT_SKIP_IF([test "$IS_WIN32" = "yes"]) + +# This test intentionally causes SIGSEGV, so make sanitizers ignore it. +ASAN_OPTIONS=$ASAN_OPTIONS:handle_segv=0; export ASAN_OPTIONS +UBSAN_OPTIONS=$UBSAN_OPTIONS:handle_segv=0; export UBSAN_OPTIONS + +on_exit 'kill $(cat *.pid)' + +AT_CHECK([ovsdb-server --detach --monitor --no-chdir --pidfile --no-db \ + --log-file --verbose=DBG], [0], [ignore], [ignore]) +OVS_WAIT_UNTIL([test -s ovsdb-server.pid]) +child=$(cat ovsdb-server.pid) + +AT_CAPTURE_FILE([ovsdb-server.log]) + +AT_CHECK([kill -SEGV $child]) + +OVS_WAIT_UNTIL([grep -q "backtrace(monitor)|WARN|SIGSEGV detected, backtrace:" ovsdb-server.log]) +OVS_WAIT_UNTIL([grep -q "daemon_unix(monitor)|ERR|1 crashes: pid .* died, killed (Segmentation fault)" ovsdb-server.log]) + +# Wait until a new process is started before exiting, so it will be +# stopped correctly. 
+OVS_WAIT_UNTIL([test -s ovsdb-server.pid && test $(cat ovsdb-server.pid) != $child]) +OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +AT_CLEANUP diff --git a/tests/dpctl.at b/tests/dpctl.at index 7454a51ec6b..a87f67f9870 100644 --- a/tests/dpctl.at +++ b/tests/dpctl.at @@ -135,3 +135,25 @@ AT_CHECK([ovs-appctl dpctl/dump-flows dummy@br0 | sort], [0], [dnl AT_CHECK([ovs-appctl dpctl/del-dp dummy@br0]) OVS_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([dpctl - ct-set-limits ct-get-limits ct-del-limits]) +OVS_VSWITCHD_START +AT_CHECK([ovs-appctl dpctl/ct-get-limits], [0], [default limit=0 +]) +AT_CHECK([ovs-appctl dpctl/ct-get-limits zone=], [0], [default limit=0 +]) +AT_CHECK([ovs-appctl dpctl/ct-get-limits zone=,], [0], [default limit=0 +]) +AT_CHECK([ovs-appctl dpctl/ct-get-limits zone=x], [2], [], + [ovs-vswitchd: invalid zone (Invalid argument) +ovs-appctl: ovs-vswitchd: server returned an error +]) +AT_CHECK([ovs-appctl dpctl/ct-del-limits zone=]) +AT_CHECK([ovs-appctl dpctl/ct-set-limits zone=0,limit=0]) +AT_CHECK([ovs-appctl dpctl/ct-get-limits zone=0], [0], [default limit=0 +zone=0,limit=0,count=0 +]) +AT_CHECK([ovs-appctl dpctl/ct-del-limits zone=0]) + +OVS_VSWITCHD_STOP +AT_CLEANUP \ No newline at end of file diff --git a/tests/dpif-netdev.at b/tests/dpif-netdev.at index 3179e1645d8..bdc24cc3071 100644 --- a/tests/dpif-netdev.at +++ b/tests/dpif-netdev.at @@ -6,8 +6,8 @@ m4_divert_push([PREPARE_TESTS]) # that vary from one run to another (e.g., timing and bond actions). 
strip_timers () { sed ' - s/duration:[0-9]*\.[0-9]*/duration:0.0/ - s/used:[0-9]*\.[0-9]*/used:0.0/ + s/duration:[0-9\.][0-9\.]*/duration:0.0/ + s/used:[0-9\.][0-9\.]*/used:0.0/ ' } @@ -15,7 +15,7 @@ strip_xout () { sed ' s/mega_ufid:[-0-9a-f]* // s/ufid:[-0-9a-f]* // - s/used:[0-9]*\.[0-9]*/used:0.0/ + s/used:[0-9\.][0-9\.]*/used:0.0/ s/actions:.*/actions: / s/packets:[0-9]*/packets:0/ s/bytes:[0-9]*/bytes:0/ @@ -26,7 +26,7 @@ strip_xout_keep_actions () { sed ' s/mega_ufid:[-0-9a-f]* // s/ufid:[-0-9a-f]* // - s/used:[0-9]*\.[0-9]*/used:0.0/ + s/used:[0-9\.][0-9\.]*/used:0.0/ s/packets:[0-9]*/packets:0/ s/bytes:[0-9]*/bytes:0/ ' | sort @@ -51,7 +51,7 @@ filter_hw_packet_netdev_dummy () { filter_flow_dump () { grep 'flow_dump ' | sed ' s/.*flow_dump // - s/used:[0-9]*\.[0-9]*/used:0.0/ + s/used:[0-9\.][0-9\.]*/used:0.0/ ' | sort | uniq } @@ -72,13 +72,13 @@ ovs-appctl time/warp 5000 AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:01,dst=50:54:00:00:02:00),eth_type(0x0800),ipv4(src=10.0.0.1,dst=10.0.0.2,proto=6,tos=0,ttl=64,frag=no),tcp(src=8,dst=9),tcp_flags(ack)']) OVS_WAIT_UNTIL([grep "miss upcall" ovs-vswitchd.log]) AT_CHECK([grep -A 1 'miss upcall' ovs-vswitchd.log | tail -n 1], [0], [dnl -skb_priority(0),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:01,dst=50:54:00:00:02:00),eth_type(0x0800),ipv4(src=10.0.0.1,dst=10.0.0.2,proto=6,tos=0,ttl=64,frag=no),tcp(src=8,dst=9),tcp_flags(ack) +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:01,dst=50:54:00:00:02:00),eth_type(0x0800),ipv4(src=10.0.0.1,dst=10.0.0.2,proto=6,tos=0,ttl=64,frag=no),tcp(src=8,dst=9),tcp_flags(ack) ]) AT_CHECK([ovs-appctl netdev-dummy/receive p1 
'in_port(1),eth(src=50:54:00:00:00:05,dst=50:54:00:00:06:00),eth_type(0x0800),ipv4(src=10.0.0.5,dst=10.0.0.6,proto=6,tos=0,ttl=64,frag=no),tcp(src=8,dst=9),tcp_flags(ack)' --len 1024]) OVS_WAIT_UNTIL([test `grep -c "miss upcall" ovs-vswitchd.log` -ge 2]) AT_CHECK([grep -A 1 'miss upcall' ovs-vswitchd.log | tail -n 1], [0], [dnl -skb_priority(0),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:05,dst=50:54:00:00:06:00),eth_type(0x0800),ipv4(src=10.0.0.5,dst=10.0.0.6,proto=6,tos=0,ttl=64,frag=no),tcp(src=8,dst=9),tcp_flags(ack) +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:05,dst=50:54:00:00:06:00),eth_type(0x0800),ipv4(src=10.0.0.5,dst=10.0.0.6,proto=6,tos=0,ttl=64,frag=no),tcp(src=8,dst=9),tcp_flags(ack) ]) OVS_VSWITCHD_STOP AT_CLEANUP @@ -139,7 +139,7 @@ m4_define([DPIF_NETDEV_MISS_FLOW_INSTALL], OVS_WAIT_UNTIL([grep "miss upcall" ovs-vswitchd.log]) AT_CHECK([grep -A 1 'miss upcall' ovs-vswitchd.log | tail -n 1], [0], [dnl -skb_priority(0),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0) +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0) ]) AT_CHECK([filter_flow_install < ovs-vswitchd.log | strip_xout], [0], [dnl recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(frag=no), actions: @@ -152,11 +152,11 @@ 
recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50: OVS_WAIT_UNTIL([test `grep -c "miss upcall" ovs-vswitchd.log` -ge 2]) AT_CHECK([grep -A 1 'miss upcall' ovs-vswitchd.log | tail -n 1], [0], [dnl -skb_priority(0),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0) +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0) ]) AT_CHECK([filter_flow_install < ovs-vswitchd.log | strip_xout], [0], [dnl +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(-new-est-rel-rpl-inv-trk-snat-dnat),ct_zone(0),ct_mark(0),ct_label(0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0), actions: recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(frag=no), actions: -skb_priority(0),skb_mark(0),ct_state(-new-est-rel-rpl-inv-trk-snat-dnat),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0), actions: ]) OVS_VSWITCHD_STOP @@ -187,7 +187,7 @@ m4_define([DPIF_NETDEV_FLOW_PUT_MODIFY], OVS_WAIT_UNTIL([grep "miss upcall" ovs-vswitchd.log]) AT_CHECK([grep -A 1 'miss upcall' ovs-vswitchd.log | tail -n 1], [0], [dnl 
-skb_priority(0),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),packet_type(ns=0,id=0),eth(src=00:06:07:08:09:0a,dst=00:01:02:03:04:05),eth_type(0x8100),vlan(vid=1000,pcp=5),encap(eth_type(0x0800),ipv4(src=127.0.0.1,dst=127.0.0.1,proto=0,tos=0,ttl=64,frag=no)) +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),packet_type(ns=0,id=0),eth(src=00:06:07:08:09:0a,dst=00:01:02:03:04:05),eth_type(0x8100),vlan(vid=1000,pcp=5),encap(eth_type(0x0800),ipv4(src=127.0.0.1,dst=127.0.0.1,proto=0,tos=0,ttl=64,frag=no)) ]) ovs-appctl revalidator/wait # Dump the datapath flow to see that it goes to p2 ("actions:2"). @@ -236,11 +236,11 @@ m4_define([DPIF_NETDEV_MISS_FLOW_DUMP], OVS_WAIT_UNTIL([grep "miss upcall" ovs-vswitchd.log]) AT_CHECK([grep -A 1 'miss upcall' ovs-vswitchd.log | tail -n 1], [0], [dnl -skb_priority(0),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0) +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0) ]) ovs-appctl revalidator/wait AT_CHECK([filter_flow_dump < ovs-vswitchd.log | strip_xout], [0], [dnl -skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),recirc_id(0),dp_hash(0/0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2/0.0.0.0,dst=10.0.0.1/0.0.0.0,proto=1/0,tos=0/0,ttl=64/0,frag=no),icmp(type=8/0,code=0/0), packets:0, bytes:0, used:never, actions: 
+recirc_id(0),dp_hash(0/0),skb_priority(0/0),in_port(1),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2/0.0.0.0,dst=10.0.0.1/0.0.0.0,proto=1/0,tos=0/0,ttl=64/0,frag=no),icmp(type=8/0,code=0/0), packets:0, bytes:0, used:never, actions: ]) # Now, the same again without megaflows. @@ -252,12 +252,12 @@ skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label OVS_WAIT_UNTIL([test `grep -c "miss upcall" ovs-vswitchd.log` -ge 2]) AT_CHECK([grep -A 1 'miss upcall' ovs-vswitchd.log | tail -n 1], [0], [dnl -skb_priority(0),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0) +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0) ]) ovs-appctl revalidator/wait AT_CHECK([filter_flow_dump < ovs-vswitchd.log | strip_xout], [0], [dnl -skb_priority(0),skb_mark(0),ct_state(0/0xff),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0), packets:0, bytes:0, used:never, actions: -skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),recirc_id(0),dp_hash(0/0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2/0.0.0.0,dst=10.0.0.1/0.0.0.0,proto=1/0,tos=0/0,ttl=64/0,frag=no),icmp(type=8/0,code=0/0), packets:0, bytes:0, used:never, 
actions: +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(0/0xff),ct_zone(0),ct_mark(0),ct_label(0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0), packets:0, bytes:0, used:never, actions: +recirc_id(0),dp_hash(0/0),skb_priority(0/0),in_port(1),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2/0.0.0.0,dst=10.0.0.1/0.0.0.0,proto=1/0,tos=0/0,ttl=64/0,frag=no),icmp(type=8/0,code=0/0), packets:0, bytes:0, used:never, actions: ]) OVS_VSWITCHD_STOP @@ -423,7 +423,7 @@ m4_define([DPIF_NETDEV_FLOW_HW_OFFLOAD], OVS_WAIT_UNTIL([grep "miss upcall" ovs-vswitchd.log]) AT_CHECK([grep -A 1 'miss upcall' ovs-vswitchd.log | tail -n 1], [0], [dnl -skb_priority(0),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),packet_type(ns=0,id=0),eth(src=00:06:07:08:09:0a,dst=00:01:02:03:04:05),eth_type(0x0800),ipv4(src=127.0.0.1,dst=127.0.0.1,proto=0,tos=0,ttl=64,frag=no) +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),packet_type(ns=0,id=0),eth(src=00:06:07:08:09:0a,dst=00:01:02:03:04:05),eth_type(0x0800),ipv4(src=127.0.0.1,dst=127.0.0.1,proto=0,tos=0,ttl=64,frag=no) ]) # Check that flow successfully offloaded. 
OVS_WAIT_UNTIL([grep "succeed to add netdev flow" ovs-vswitchd.log]) @@ -489,7 +489,7 @@ m4_define([DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS], OVS_WAIT_UNTIL([grep "miss upcall" ovs-vswitchd.log]) AT_CHECK([grep -A 1 'miss upcall' ovs-vswitchd.log | tail -n 1], [0], [dnl -skb_priority(0),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),dnl +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),dnl packet_type(ns=0,id=0),eth(src=00:06:07:08:09:0a,dst=00:01:02:03:04:05),eth_type(0x8100),vlan(vid=99,pcp=7),encap(eth_type(0x0800),ipv4(src=127.0.0.1,dst=127.0.0.1,proto=17,tos=0,ttl=64,frag=no),udp(src=81,dst=82)) ]) # Check that flow successfully offloaded. @@ -566,7 +566,7 @@ m4_define([DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS_VID_ARP], OVS_WAIT_UNTIL([grep "miss upcall" ovs-vswitchd.log]) AT_CHECK([grep -A 1 'miss upcall' ovs-vswitchd.log | tail -n 1], [0], [dnl -skb_priority(0),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),dnl +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),dnl packet_type(ns=0,id=0),eth(src=00:06:07:08:09:0a,dst=00:01:02:03:04:05),eth_type(0x8100),vlan(vid=99,pcp=7),encap(eth_type(0x0806),arp(sip=127.0.0.1,tip=127.0.0.1,op=1,sha=00:0b:0c:0d:0e:0f,tha=00:00:00:00:00:00)) ]) # Check that flow successfully offloaded. 
@@ -636,6 +636,38 @@ OVS_VSWITCHD_STOP(["/flow: in_port is not an exact match/d /failed to put/d"]) AT_CLEANUP +AT_SETUP([dpif-netdev - check dpctl/add-flow wider ip match]) +OVS_VSWITCHD_START( + [add-port br0 p1 \ + -- set interface p1 type=dummy options:pstream=punix:$OVS_RUNDIR/p0.sock \ + -- set bridge br0 datapath-type=dummy]) + +AT_CHECK([ovs-appctl revalidator/pause]) +AT_CHECK([ovs-appctl dpctl/add-flow "in_port(1),eth_type(0x0800),ipv4(src=0.0.0.0/192.0.0.0,dst=0.0.0.0/192.0.0.0,frag=no)" "3"]) +AT_CHECK([ovs-appctl dpctl/add-flow "in_port(1),eth_type(0x0800),ipv4(src=192.1.1.1/0.0.0.0,dst=49.1.1.1/0.0.0.0,frag=no)" "3"]) +AT_CHECK([ovs-appctl revalidator/resume]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([dpif-netdev - check tx packet checksum offloading]) +OVS_VSWITCHD_START( + [add-port br0 p1 \ + -- set interface p1 type=dummy options:pstream=punix:$OVS_RUNDIR/p0.sock \ + -- set bridge br0 datapath-type=dummy \ + other-config:datapath-id=1234 fail-mode=secure]) + +AT_CHECK([ovs-vsctl get interface p1 status | sed -n 's/^{\(.*\).*}$/\1/p'], [0], [dnl +tx_geneve_tso_offload="false", tx_ip_csum_offload="false", tx_out_ip_csum_offload="false", tx_out_udp_csum_offload="false", tx_sctp_csum_offload="false", tx_tcp_csum_offload="false", tx_tcp_seg_offload="false", tx_udp_csum_offload="false", tx_vxlan_tso_offload="false" +], []) + +AT_CHECK([ovs-vsctl get interface br0 status | sed -n 's/^{\(.*\).*}$/\1/p'], [0], [dnl +tx_geneve_tso_offload="false", tx_ip_csum_offload="false", tx_out_ip_csum_offload="false", tx_out_udp_csum_offload="false", tx_sctp_csum_offload="false", tx_tcp_csum_offload="false", tx_tcp_seg_offload="false", tx_udp_csum_offload="false", tx_vxlan_tso_offload="false" +], []) + +OVS_VSWITCHD_STOP +AT_CLEANUP + # SEND_UDP_PKTS([p_name], [p_ofport]) # # Sends 128 packets to port 'p_name' with different UDP destination ports. 
@@ -702,3 +734,423 @@ AT_CHECK([test `ovs-vsctl get Interface p2 statistics:tx_q0_packets` -gt 0 -a dn OVS_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([userspace offload - ip csum offload]) +OVS_VSWITCHD_START( + [add-br br1 -- set bridge br1 datapath-type=dummy -- \ + add-port br1 p1 -- \ + set Interface p1 type=dummy -- \ + add-port br1 p2 -- \ + set Interface p2 type=dummy --]) + +# Modify the ip_dst addr to force changing the IP csum. +AT_CHECK([ovs-ofctl add-flow br1 in_port=p1,actions=mod_nw_dst:192.168.1.1,output:p2]) + +flow_s="\ + eth_src=8a:bf:7e:2f:05:84,eth_dst=0a:8f:39:4f:e0:73,dl_type=0x0800,\ + nw_src=192.168.123.2,nw_dst=192.168.123.1,nw_proto=6,nw_ttl=64,nw_frag=no,\ + tp_src=54392,tp_dst=5201,tcp_flags=ack" + +good_frame=$(ovs-ofctl compose-packet --bare "${flow_s}") + +# Check if no offload remains ok. +AT_CHECK([ovs-vsctl set Interface p2 options:tx_pcap=p2.pcap]) +AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum=false]) +AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum_set_good=false]) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 ${good_frame}]) + +# Checksum should change to 0x990 with ip_dst changed to 192.168.1.1 +# by the datapath while processing the packet. +flow_expected=$(echo "${flow_s}" | sed 's/192.168.123.1/192.168.1.1/g') +good_expected=$(ovs-ofctl compose-packet --bare "${flow_expected}") +AT_CHECK([ovs-pcap p2.pcap > p2.pcap.txt 2>&1]) +AT_CHECK_UNQUOTED([tail -n 1 p2.pcap.txt], [0], [${good_expected} +]) + +# Check if packets entering the datapath with csum offloading +# enabled gets the csum updated properly by egress handling +# in the datapath and not by the netdev. 
+AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum=false]) +AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum_set_good=true]) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 ${good_frame}]) +AT_CHECK([ovs-pcap p2.pcap > p2.pcap.txt 2>&1]) +AT_CHECK_UNQUOTED([tail -n 1 p2.pcap.txt], [0], [${good_expected} +]) + +# Check if packets entering the datapath with csum offloading +# enabled gets the csum updated properly by netdev and not +# by the datapath. +AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum=true]) +AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum_set_good=true]) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 ${good_frame} +]) +AT_CHECK([ovs-pcap p2.pcap > p2.pcap.txt 2>&1]) +AT_CHECK_UNQUOTED([tail -n 1 p2.pcap.txt], [0], [${good_expected} +]) + +# Push a packet with bad csum and offloading disabled to check +# if the datapath updates the csum, but does not fix the issue. +bad_frame=$(ovs-ofctl compose-packet --bare --bad-csum "${flow_s}") +AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum=false]) +AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum_set_good=false]) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 ${bad_frame}]) +AT_CHECK([ovs-pcap p2.pcap > p2.pcap.txt 2>&1]) +bad_expected=$(ovs-ofctl compose-packet --bare --bad-csum "${flow_expected}") +AT_CHECK_UNQUOTED([tail -n 1 p2.pcap.txt], [0], [${bad_expected} +]) + +# Push a packet with bad csum and offloading enabled to check +# if the driver updates and fixes the csum. +AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum=true]) +AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum_set_good=true]) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 ${bad_frame}]) +AT_CHECK([ovs-pcap p2.pcap > p2.pcap.txt 2>&1]) +AT_CHECK_UNQUOTED([tail -n 1 p2.pcap.txt], [0], [${good_expected} +]) +OVS_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([userspace offload - tso]) +OVS_VSWITCHD_START( + [set Open_vSwitch . 
other_config:userspace-tso-enable=true -- \ + add-br br1 -- set bridge br1 datapath-type=dummy -- \ + add-port br1 p1 -- \ + set Interface p1 type=dummy -- \ + add-port br1 p2 -- \ + set Interface p2 type=dummy]) + +dnl Simple passthrough rule. +AT_CHECK([ovs-ofctl add-flow br1 in_port=p1,actions=output:p2]) + +flow_s="in_port(1),eth(src=8a:bf:7e:2f:05:84,dst=0a:8f:39:4f:e0:73),eth_type(0x0800), \ + ipv4(src=192.168.123.2,dst=192.168.123.1,proto=6,tos=1,ttl=64,frag=no), \ + tcp(src=54392,dst=5201),tcp_flags(ack)" + +flow_s_v6="in_port(1),eth(src=8a:bf:7e:2f:05:84,dst=0a:8f:39:4f:e0:73),eth_type(0x86dd), \ + ipv6(src=2001:cafe::88,dst=2001:cafe::92,proto=6), \ + tcp(src=54392,dst=5201),tcp_flags(ack)" + +dnl Send from tso to no-tso. +AT_CHECK([ovs-vsctl set Interface p2 options:tx_pcap=p2.pcap -- \ + set Interface p1 options:ol_ip_csum=true -- \ + set Interface p1 options:ol_ip_csum_set_good=false -- \ + set Interface p1 options:ol_tso_segsz=500]) + +AT_CHECK([ovs-appctl netdev-dummy/receive p1 "${flow_s}" --len 2054]) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 "${flow_s_v6}" --len 2074]) + +dnl Send from tso to tso. 
+AT_CHECK([ovs-vsctl set Interface p2 options:ol_ip_csum=true -- \ + set Interface p2 options:ol_ip_csum_set_good=false -- \ + set Interface p2 options:ol_tso_segsz=500]) + +AT_CHECK([ovs-appctl netdev-dummy/receive p1 "${flow_s}" --len 2054]) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 "${flow_s_v6}" --len 2074]) + +dnl Check that first we have: +dnl - 4x IPv4 500 byte payloads +dnl - 4x IPv6 500 byte payloads +dnl - one IPv4 2000 byte payload, and +dnl - one IPv6 2000 byte payload +zero500=$(printf '0%.0s' $(seq 1000)) +AT_CHECK_UNQUOTED([ovs-pcap p2.pcap], [0], [dnl +[0a8f394fe0738abf7e2f058408004501021c0000000040060187c0a87b02c0a87b01]dnl +[d47814510000000000000000501000004dc20000${zero500}] +[0a8f394fe0738abf7e2f058408004501021c0001000040060186c0a87b02c0a87b01]dnl +[d4781451000001f400000000501000004bce0000${zero500}] +[0a8f394fe0738abf7e2f058408004501021c0002000040060185c0a87b02c0a87b01]dnl +[d4781451000003e8000000005010000049da0000${zero500}] +[0a8f394fe0738abf7e2f058408004501021c0003000040060184c0a87b02c0a87b01]dnl +[d4781451000005dc000000005010000047e60000${zero500}] +[0a8f394fe0738abf7e2f058486dd60000000020806002001cafe0000000000000000000000]dnl +[882001cafe000000000000000000000092d4781451000000000000000050100000edfd0000]dnl +[${zero500}] +[0a8f394fe0738abf7e2f058486dd60000000020806002001cafe0000000000000000000000]dnl +[882001cafe000000000000000000000092d4781451000001f40000000050100000ec090000]dnl +[${zero500}] +[0a8f394fe0738abf7e2f058486dd60000000020806002001cafe0000000000000000000000]dnl +[882001cafe000000000000000000000092d4781451000003e80000000050100000ea150000]dnl +[${zero500}] +[0a8f394fe0738abf7e2f058486dd60000000020806002001cafe0000000000000000000000]dnl +[882001cafe000000000000000000000092d4781451000005dc0000000050100000e8210000]dnl +[${zero500}] +[0a8f394fe0738abf7e2f05840800450107f8000000004006fbaac0a87b02c0a87b01]dnl +[d478145100000000000000005010000047e60000${zero500}${zero500}${zero500}${zero500}] 
+[0a8f394fe0738abf7e2f058486dd6000000007e406002001cafe0000000000000000000000]dnl +[882001cafe000000000000000000000092d4781451000000000000000050100000e8210000]dnl +[${zero500}${zero500}${zero500}${zero500}] +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([dpif-netdev - revalidators handle dp modification fail correctly]) +OVS_VSWITCHD_START( + [add-port br0 p1 \ + -- set interface p1 type=dummy \ + -- set bridge br0 datapath-type=dummy \ + -- add-port br0 p2 \ + -- set interface p2 type=dummy -- + ]) + +AT_CHECK([ovs-ofctl add-flow br0 'table=0,in_port=p1,actions=p2']) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ipv4(src=10.0.0.1,dst=10.0.0.2),tcp(src=1,dst=2)']) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ipv4(src=10.0.0.1,dst=10.0.0.2),tcp(src=1,dst=2)']) + +AT_CHECK([ovs-appctl dpctl/dump-flows | sed 's/.*thread://' | strip_xout_keep_actions ], [0], [ +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), packets:0, bytes:0, used:0.0s, actions:2 +]) + +dnl Wait for the dp flow to enter OPERATIONAL state. +AT_CHECK([ovs-appctl revalidator/wait]) + +AT_CHECK([ovs-appctl revalidator/pause]) + +dnl Delete all dp flows, so flow modification will fail. +AT_CHECK([ovs-appctl dpctl/del-flows]) + +AT_CHECK([ovs-appctl revalidator/resume]) + +dnl Replace OpenFlow rules, trigger revalidation and wait for it to complete. +AT_CHECK([echo 'table=0,in_port=p1,ip actions=ct(commit)' | ovs-ofctl --bundle replace-flows br0 -]) +AT_CHECK([ovs-appctl revalidator/wait]) + +dnl Inconsistent ukey should be deleted. +AT_CHECK([ovs-appctl upcall/show | grep keys | grep -q -v 0], [1]) + +dnl Check the log for the flow modification error. +AT_CHECK([grep -q -E ".*failed to put.*$" ovs-vswitchd.log]) + +dnl Remove warning logs to let test suite pass. +OVS_VSWITCHD_STOP(["dnl + /.*failed to put.*$/d + /.*failed to flow_del.*$/d"]) +AT_CLEANUP + +AT_SETUP([dpif-netdev - MFEX Autovalidator]) +AT_SKIP_IF([! $PYTHON3 -c "import scapy"], [], []) +AT_SKIP_IF([! 
$PYTHON3 $srcdir/genpkts.py 2000 > packets]) +OVS_VSWITCHD_START( + [add-port br0 p1 \ + -- set Interface p1 type=dummy-pmd], [], [], [--dummy-numa="0,0,0,0,1,1,1,1"]) + +AT_SKIP_IF([! ovs-appctl dpif-netdev/miniflow-parser-get | sed 1,4d | grep "True"], [], [dnl +]) + +AT_CHECK([ovs-appctl dpif-netdev/dpif-impl-set dpif_avx512], [0], [dnl +DPIF implementation set to dpif_avx512. +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set autovalidator], [0], [dnl +Miniflow extract implementation set to autovalidator. +]) + +cat packets | while read line; do + AT_CHECK([ovs-appctl netdev-dummy/receive p1 $line], [0], [ignore]) +done + +OVS_WAIT_UNTIL([test `ovs-vsctl get interface p1 statistics | grep -oP 'rx_packets=\s*\K\d+'` -ge 16000]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([dpif-netdev - MFEX Autovalidator Fuzzy]) +AT_SKIP_IF([! $PYTHON3 -c "import scapy"], [], []) +AT_SKIP_IF([! $PYTHON3 $srcdir/genpkts.py 2000 fuzzy > packets]) +OVS_VSWITCHD_START( + [add-port br0 p1 \ + -- set Interface p1 type=dummy-pmd], [], [], [--dummy-numa="0,0,0,0,1,1,1,1"]) + +AT_SKIP_IF([! ovs-appctl dpif-netdev/miniflow-parser-get | sed 1,4d | grep "True"], [], [dnl +]) + +AT_CHECK([ovs-appctl dpif-netdev/dpif-impl-set dpif_avx512], [0], [dnl +DPIF implementation set to dpif_avx512. +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set autovalidator], [0], [dnl +Miniflow extract implementation set to autovalidator. +]) + +cat packets | while read line; do + AT_CHECK([ovs-appctl netdev-dummy/receive p1 $line], [0], [ignore]) +done + +OVS_WAIT_UNTIL([test `ovs-vsctl get interface p1 statistics | grep -oP 'rx_packets=\s*\K\d+'` -ge 16000]) + +OVS_VSWITCHD_STOP(["dnl +/upcall: datapath reached the dynamic limit of .* flows./d"]) +AT_CLEANUP + +AT_SETUP([dpif-netdev - MFEX Configuration]) +OVS_VSWITCHD_START( + [set Open_vSwitch . 
other_config:pmd-cpu-mask=0x1 \ + -- add-port br0 p1 \ + -- set Interface p1 type=dummy-pmd], [], [], [--dummy-numa="0,0,0,0,1,1,1,1"]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set scalar 1], [2], +[], [dnl +Error: unknown argument 1. +ovs-appctl: ovs-vswitchd: server returned an error +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 6 study 300 xyz], [2], +[], [dnl +Error: invalid study_pkt_cnt value: xyz. +ovs-appctl: ovs-vswitchd: server returned an error +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set scalar abcd], [2], +[], [dnl +Error: unknown argument abcd. +ovs-appctl: ovs-vswitchd: server returned an error +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 0 scalar abcd], [2], +[], [dnl +Error: unknown argument abcd. +ovs-appctl: ovs-vswitchd: server returned an error +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd], [2], +[], [dnl +Error: -pmd option requires a thread id argument. +ovs-appctl: ovs-vswitchd: server returned an error +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set tudy abcd], [2], +[], [dnl +Error: unknown argument abcd. +ovs-appctl: ovs-vswitchd: server returned an error +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 7 study abcd], [2], +[], [dnl +Error: invalid study_pkt_cnt value: abcd. +ovs-appctl: ovs-vswitchd: server returned an error +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 0 study], [0], [dnl +Miniflow extract implementation set to study, on pmd thread 0, studying 128 packets. +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 0 study 512], [0], [dnl +Miniflow extract implementation set to study, on pmd thread 0, studying 512 packets. +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set study 512], [0], [dnl +Miniflow extract implementation set to study, studying 512 packets. 
+]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set study], [0], [dnl +Miniflow extract implementation set to study, studying 128 packets. +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 0 autovalidator], [0], [dnl +Miniflow extract implementation set to autovalidator, on pmd thread 0. +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd zero study], [2], +[], [dnl +Error: miniflow extract parser not changed, PMD thread passed is not valid: 'zero'. Pass a valid pmd thread ID. +ovs-appctl: ovs-vswitchd: server returned an error +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 1], [2], +[], [dnl +Error: no miniflow extract name provided. Output of miniflow-parser-get shows implementation list. +ovs-appctl: ovs-vswitchd: server returned an error +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 1 superstudy], [2], +[], [dnl +Error: unknown miniflow extract implementation superstudy. +ovs-appctl: ovs-vswitchd: server returned an error +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set superstudy], [2], +[], [dnl +Error: unknown miniflow extract implementation superstudy. +ovs-appctl: ovs-vswitchd: server returned an error +]) + +AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 1 study -pmd], [2], +[], [dnl +Error: invalid study_pkt_cnt value: -pmd. +ovs-appctl: ovs-vswitchd: server returned an error +]) + +OVS_VSWITCHD_STOP(["dnl +/Error: unknown argument 1./d +/Error: invalid study_pkt_cnt value: xyz./d +/Error: unknown argument abcd./d +/Error: -pmd option requires a thread id argument./d +/Error: invalid study_pkt_cnt value: abcd./d +/Error: miniflow extract parser not changed, PMD thread passed is not valid: 'zero'. Pass a valid pmd thread ID./d +/Error: no miniflow extract name provided. 
Output of miniflow-parser-get shows implementation list./d +/Error: unknown miniflow extract implementation superstudy./d +/Error: invalid study_pkt_cnt value: -pmd./d"]) +AT_CLEANUP + +AT_SETUP([datapath - Actions Autovalidator Checksum]) + +OVS_VSWITCHD_START(add-port br0 p0 -- set Interface p0 type=dummy \ + -- add-port br0 p1 -- set Interface p1 type=dummy) + +AT_CHECK([ovs-appctl odp-execute/action-impl-set autovalidator], [0], [dnl +Action implementation set to autovalidator. +]) + +dnl Add flows to trigger checksum calculation. +AT_DATA([flows.txt], [dnl + in_port=p0,ip,actions=mod_nw_src=10.1.1.1,p1 + in_port=p0,ipv6,actions=set_field:fc00::100->ipv6_src,p1 +]) +AT_CHECK([ovs-ofctl del-flows br0]) +AT_CHECK([ovs-ofctl -Oopenflow13 add-flows br0 flows.txt]) + +dnl Make sure checksum won't be offloaded. +AT_CHECK([ovs-vsctl set Interface p0 options:ol_ip_csum=false]) +AT_CHECK([ovs-vsctl set Interface p0 options:ol_ip_csum_set_good=false]) + +AT_CHECK([ovs-vsctl set Interface p1 options:pcap=p1.pcap]) + +dnl IPv4 packet with values that will trigger carry-over addition for checksum. +flow_s_v4=" + eth_src=47:42:86:08:17:50,eth_dst=3e:55:b5:9e:3a:fb,dl_type=0x0800, + nw_src=229.167.36.90,nw_dst=130.161.64.186,nw_proto=6,nw_ttl=64,nw_frag=no, + tp_src=54392,tp_dst=5201,tcp_flags=ack" + +good_frame=$(ovs-ofctl compose-packet --bare "${flow_s_v4}") +AT_CHECK([ovs-appctl netdev-dummy/receive p0 ${good_frame}]) + +dnl Checksum should change to 0xAC33 with ip_src changed to 10.1.1.1 +dnl by the datapath while processing the packet. +flow_expected=$(echo "${flow_s_v4}" | sed 's/229.167.36.90/10.1.1.1/g') +good_expected=$(ovs-ofctl compose-packet --bare "${flow_expected}") +AT_CHECK([ovs-pcap p1.pcap > p1.pcap.txt 2>&1]) +AT_CHECK_UNQUOTED([tail -n 1 p1.pcap.txt], [0], [${good_expected} +]) + +dnl Repeat similar test for IPv6. 
+flow_s_v6=" + eth_src=8a:bf:7e:2f:05:84,eth_dst=0a:8f:39:4f:e0:73,dl_type=0x86dd, + ipv6_src=2f8a:2076:3926:9e7:2d47:4bc9:9c7:17f3, + ipv6_dst=7287:10dd:2fb9:41d5:3eb2:2c7a:11b0:6258, + ipv6_label=0x51ac,nw_proto=6,nw_ttl=142,nw_frag=no, + tp_src=20405,tp_dst=20662,tcp_flags=ack" + +good_frame_v6=$(ovs-ofctl compose-packet --bare "${flow_s_v6}") +AT_CHECK([ovs-appctl netdev-dummy/receive p0 ${good_frame_v6}]) + +dnl Checksum should change to 0x59FD with ipv6_src changed to fc00::100 +dnl by the datapath while processing the packet. +flow_expected_v6=$(echo "${flow_s_v6}" | \ + sed 's/2f8a:2076:3926:9e7:2d47:4bc9:9c7:17f3/fc00::100/g') +good_expected_v6=$(ovs-ofctl compose-packet --bare "${flow_expected_v6}") +AT_CHECK([ovs-pcap p1.pcap > p1.pcap.txt 2>&1]) +AT_CHECK_UNQUOTED([tail -n 1 p1.pcap.txt], [0], [${good_expected_v6} +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP diff --git a/tests/drop-stats.at b/tests/drop-stats.at index 1d3af98dabe..946c998a1fc 100644 --- a/tests/drop-stats.at +++ b/tests/drop-stats.at @@ -191,3 +191,171 @@ ovs-appctl coverage/read-counter drop_action_too_many_mpls_labels OVS_VSWITCHD_STOP(["/|WARN|/d"]) AT_CLEANUP + +m4_define([ICMP_PKT], [m4_join([,], + [in_port(1),packet_type(ns=0,id=0)], + [eth(src=3a:6d:d2:09:9c:ab,dst=1e:2c:e9:2a:66:9e)], + [ipv4(src=192.168.10.10,dst=192.168.10.30,proto=1,tos=0,ttl=64,frag=no)], + [icmp(type=8,code=0)])]) + +AT_SETUP([drop-stats - bridge sampling]) + +OVS_VSWITCHD_START([dnl + set bridge br0 datapath_type=dummy \ + protocols=OpenFlow10,OpenFlow13,OpenFlow14,OpenFlow15 -- \ + add-port br0 p1 -- set Interface p1 type=dummy ofport_request=1]) + +AT_DATA([flows.txt], [dnl +table=0,in_port=1,actions=drop +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +AT_CHECK([ovs-vsctl -- set bridge br0 ipfix=@fix -- \ + --id=@fix create ipfix targets=\"127.0.0.1:4739\" \ + sampling=1], + [0], [ignore]) + +for i in $(seq 1 3); do +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ICMP_PKT'], [0], [ignore]) +done + 
+AT_CHECK([ovs-appctl dpctl/dump-flows | strip_used | sort], [0], [dnl +flow-dump from the main thread: +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), dnl +packets:2, bytes:212, used:0.0s, dnl +actions:userspace(pid=0,ipfix(output_port=4294967295)) +]) + +AT_CHECK([ovs-appctl time/warp 5000], [0], [ignore]) + +AT_CHECK([ovs-appctl coverage/read-counter drop_action_of_pipeline], [0], [dnl +0 +]) + +dnl Now activate explicit sampled drops. +AT_CHECK([ovs-vsctl set Open_vSwitch . other-config:explicit-sampled-drops=true]) +AT_CHECK([ovs-appctl revalidator/wait]) + +for i in $(seq 1 3); do +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ICMP_PKT'], [0], [ignore]) +done + +AT_CHECK([ovs-appctl dpctl/dump-flows | strip_used | sort], [0], [dnl +flow-dump from the main thread: +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), dnl +packets:5, bytes:530, used:0.0s, dnl +actions:userspace(pid=0,ipfix(output_port=4294967295)),drop +]) + +AT_CHECK([ovs-appctl time/warp 5000], [0], [ignore]) + +AT_CHECK([ovs-appctl coverage/read-counter drop_action_of_pipeline], [0], [dnl +3 +]) + +OVS_VSWITCHD_STOP(["/sending to collector failed/d"]) +AT_CLEANUP + +AT_SETUP([drop-stats - sampling action]) + +OVS_VSWITCHD_START +add_of_ports br0 1 2 3 + +AT_DATA([flows.txt], [dnl +table=0,in_port=1,actions=sample(probability=65535,collector_set_id=1) +table=0,in_port=2,actions=sample(probability=32767,collector_set_id=1),load:0->reg0 +table=0,in_port=3,actions=clone(sample(probability=65535,collector_set_id=1)) +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +AT_CHECK([ovs-vsctl --id=@br0 get Bridge br0 \ + -- --id=@ipfix create IPFIX targets=\"127.0.0.1:4739\" \ + -- create Flow_Sample_Collector_Set id=1 bridge=@br0 \ + ipfix=@ipfix], + [0], [ignore]) + +m4_define([USERSPACE_SAMPLE_ACTION], [m4_join([,], + [userspace(pid=0], + [flow_sample(probability=$1,collector_set_id=1,obs_domain_id=0], + 
[obs_point_id=0,output_port=4294967295))])]) + +for i in $(seq 1 3); do +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ICMP_PKT'], [0], [ignore]) +done + +AT_CHECK([ovs-appctl dpctl/dump-flows | strip_used | sort], [0], [dnl +flow-dump from the main thread: +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), dnl +packets:2, bytes:212, used:0.0s, dnl +actions:USERSPACE_SAMPLE_ACTION(65535) +]) + +AT_CHECK([ovs-appctl time/warp 5000], [0], [ignore]) + +AT_CHECK([ovs-appctl coverage/read-counter drop_action_of_pipeline], [0], [dnl +0 +]) + +dnl Now activate explicit sampled drops. +AT_CHECK([ovs-vsctl set Open_vSwitch . other-config:explicit-sampled-drops=true]) +AT_CHECK([ovs-appctl revalidator/wait]) + +for i in $(seq 1 3); do +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ICMP_PKT'], [0], [ignore]) +done + +AT_CHECK([ovs-appctl dpctl/dump-flows | strip_used | sort], [0], [dnl +flow-dump from the main thread: +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), dnl +packets:5, bytes:530, used:0.0s, dnl +actions:USERSPACE_SAMPLE_ACTION(65535),drop +]) + +AT_CHECK([ovs-appctl time/warp 5000], [0], [ignore]) + +AT_CHECK([ovs-appctl coverage/read-counter drop_action_of_pipeline], [0], [dnl +3 +]) + +AT_CHECK([ovs-appctl dpctl/del-flows]) + +for i in $(seq 1 3); do +AT_CHECK([ovs-appctl netdev-dummy/receive p2 'ICMP_PKT'], [0], [ignore]) +done + +AT_CHECK([ovs-appctl dpctl/dump-flows | strip_used | sort], [0], [dnl +flow-dump from the main thread: +recirc_id(0),in_port(2),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), dnl +packets:2, bytes:212, used:0.0s, dnl +actions:sample(sample=50.0%,actions(USERSPACE_SAMPLE_ACTION(32767))),drop +]) + +AT_CHECK([ovs-appctl time/warp 5000], [0], [ignore]) + +AT_CHECK([ovs-appctl coverage/read-counter drop_action_of_pipeline], [0], [dnl +6 +]) + +AT_CHECK([ovs-appctl dpctl/del-flows]) + +for i in $(seq 1 3); do +AT_CHECK([ovs-appctl netdev-dummy/receive p3 'ICMP_PKT'], [0], 
[ignore]) +done + +AT_CHECK([ovs-appctl dpctl/dump-flows | strip_used | sort], [0], [dnl +flow-dump from the main thread: +recirc_id(0),in_port(3),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), dnl +packets:2, bytes:212, used:0.0s, dnl +actions:USERSPACE_SAMPLE_ACTION(65535),drop +]) + +AT_CHECK([ovs-appctl time/warp 5000], [0], [ignore]) + +AT_CHECK([ovs-appctl coverage/read-counter drop_action_of_pipeline], [0], [dnl +9 +]) + +OVS_VSWITCHD_STOP(["/sending to collector failed/d"]) +AT_CLEANUP diff --git a/tests/mfex_fuzzy.py b/tests/genpkts.py similarity index 57% rename from tests/mfex_fuzzy.py rename to tests/genpkts.py index ee2183f8eb3..3354e116d0c 100755 --- a/tests/mfex_fuzzy.py +++ b/tests/genpkts.py @@ -1,55 +1,60 @@ #!/usr/bin/python3 import sys +import warnings +try: + from cryptography.utils import CryptographyDeprecationWarning + warnings.filterwarnings( + "ignore", + category=CryptographyDeprecationWarning, + message=r"(blowfish|cast5)", + ) +except ModuleNotFoundError: + pass + +# flake8: noqa: E402 from scapy.all import RandMAC, RandIP, PcapWriter, RandIP6, RandShort, fuzz from scapy.all import IPv6, Dot1Q, IP, Ether, UDP, TCP, random -# Path for the pcap file location. -path = str(sys.argv[1]) # The number of packets generated will be size * 8. -size = int(sys.argv[2]) +size = int(sys.argv[1]) # Traffic option is used to choose between fuzzy or simple packet type. 
-if len(sys.argv) > 3: - traffic_opt = str(sys.argv[3]) +if len(sys.argv) > 2: + traffic_opt = str(sys.argv[2]) else: traffic_opt = "" -pktdump = PcapWriter(path, append=False, sync=True) - -pkt = [] - for i in range(0, size): + pkt = [] + if traffic_opt == "fuzzy": eth = Ether(src=RandMAC(), dst=RandMAC()) vlan = Dot1Q() - udp = UDP(dport=RandShort(), sport=RandShort()) ipv4 = IP(src=RandIP(), dst=RandIP(), len=random.randint(0, 100)) ipv6 = IPv6(src=RandIP6(), dst=RandIP6(), plen=random.randint(0, 100)) + udp = UDP(dport=RandShort(), sport=RandShort()) tcp = TCP(dport=RandShort(), sport=RandShort(), flags='S', dataofs=random.randint(0, 15)) # IPv4 packets with fuzzing - pkt.append(fuzz(eth / ipv4 / udp)) - pkt.append(fuzz(eth / ipv4 / tcp)) - pkt.append(fuzz(eth / vlan / ipv4 / udp)) - pkt.append(fuzz(eth / vlan / ipv4 / tcp)) + pkt.append(fuzz(eth / ipv4 / udp).build().hex()) + pkt.append(fuzz(eth / ipv4 / tcp).build().hex()) + pkt.append(fuzz(eth / vlan / ipv4 / udp).build().hex()) + pkt.append(fuzz(eth / vlan / ipv4 / tcp).build().hex()) # IPv6 packets with fuzzing - pkt.append(fuzz(eth / ipv6 / udp)) - pkt.append(fuzz(eth / ipv6 / tcp)) - pkt.append(fuzz(eth / vlan / ipv6 / udp)) - pkt.append(fuzz(eth / vlan / ipv6 / tcp)) + pkt.append(fuzz(eth / ipv6 / udp).build().hex()) + pkt.append(fuzz(eth / ipv6 / tcp).build().hex()) + pkt.append(fuzz(eth / vlan / ipv6 / udp).build().hex()) + pkt.append(fuzz(eth / vlan / ipv6 / tcp).build().hex()) else: mac_addr_src = "52:54:00:FF:FF:{:02X}".format(i % 0xff) mac_addr_dst = "80:FF:FF:FF:FF:{:02X}".format(i % 0xff) - src_port = 200 + (i % 20) - dst_port = 1000 + (i % 20) eth = Ether(src=mac_addr_src, dst=mac_addr_dst) vlan = Dot1Q(vlan=(i % 10)) - udp = UDP(dport=src_port, sport=dst_port) # IPv4 address range limits to 255 and IPv6 limit to 65535 ipv4_addr_src = "192.168.150." + str((i % 255)) ipv4_addr_dst = "200.100.198." 
+ str((i % 255)) @@ -59,18 +64,21 @@ .format(i % 0xffff) ipv4 = IP(src=ipv4_addr_src, dst=ipv4_addr_dst) ipv6 = IPv6(src=ipv6_addr_src, dst=ipv6_addr_dst) + src_port = 200 + (i % 20) + dst_port = 1000 + (i % 20) + udp = UDP(dport=src_port, sport=dst_port) tcp = TCP(dport=src_port, sport=dst_port, flags='S') # IPv4 packets - pkt.append(eth / ipv4 / udp) - pkt.append(eth / ipv4 / tcp) - pkt.append(eth / vlan / ipv4 / udp) - pkt.append(eth / vlan / ipv4 / tcp) + pkt.append((eth / ipv4 / udp).build().hex()) + pkt.append((eth / ipv4 / tcp).build().hex()) + pkt.append((eth / vlan / ipv4 / udp).build().hex()) + pkt.append((eth / vlan / ipv4 / tcp).build().hex()) # IPv6 packets - pkt.append(eth / ipv6 / udp) - pkt.append(eth / ipv6 / tcp) - pkt.append(eth / vlan / ipv6 / udp) - pkt.append(eth / vlan / ipv6 / tcp) + pkt.append((eth / ipv6 / udp).build().hex()) + pkt.append((eth / ipv6 / tcp).build().hex()) + pkt.append((eth / vlan / ipv6 / udp).build().hex()) + pkt.append((eth / vlan / ipv6 / tcp).build().hex()) -pktdump.write(pkt) + print(' '.join(pkt)) diff --git a/tests/learn.at b/tests/learn.at index 5f1d6df9de4..d0bcc83633c 100644 --- a/tests/learn.at +++ b/tests/learn.at @@ -6,7 +6,7 @@ actions=learn() actions=learn(send_flow_rem) actions=learn(delete_learned) actions=learn(send_flow_rem,delete_learned) -actions=learn(NXM_OF_VLAN_TCI[0..11], NXM_OF_ETH_DST[]=NXM_OF_ETH_SRC[], output:NXM_OF_IN_PORT[], load:10->NXM_NX_REG0[5..10]) +actions=learn(NXM_OF_VLAN_TCI[0..11], NXM_OF_ETH_DST[]=NXM_OF_ETH_SRC[], NXM_NX_REG3[3..19]=0x10011, output:NXM_OF_IN_PORT[], load:10->NXM_NX_REG0[5..10]) actions=learn(table=1,idle_timeout=10, hard_timeout=20, fin_idle_timeout=5, fin_hard_timeout=10, priority=10, cookie=0xfedcba9876543210, in_port=99,eth_dst=eth_src,load:in_port->reg1[16..31]) actions=learn(limit=4096) actions=learn(limit=4096,result_dst=reg0[0]) @@ -18,7 +18,7 @@ OFPT_FLOW_MOD (xid=0x1): ADD actions=learn(table=1) OFPT_FLOW_MOD (xid=0x2): ADD 
actions=learn(table=1,send_flow_rem) OFPT_FLOW_MOD (xid=0x3): ADD actions=learn(table=1,delete_learned) OFPT_FLOW_MOD (xid=0x4): ADD actions=learn(table=1,send_flow_rem,delete_learned) -OFPT_FLOW_MOD (xid=0x5): ADD actions=learn(table=1,NXM_OF_VLAN_TCI[0..11],NXM_OF_ETH_DST[]=NXM_OF_ETH_SRC[],output:NXM_OF_IN_PORT[],load:0xa->NXM_NX_REG0[5..10]) +OFPT_FLOW_MOD (xid=0x5): ADD actions=learn(table=1,NXM_OF_VLAN_TCI[0..11],NXM_OF_ETH_DST[]=NXM_OF_ETH_SRC[],NXM_NX_REG3[3..19]=0x10011,output:NXM_OF_IN_PORT[],load:0xa->NXM_NX_REG0[5..10]) OFPT_FLOW_MOD (xid=0x6): ADD actions=learn(table=1,idle_timeout=10,hard_timeout=20,fin_idle_timeout=5,fin_hard_timeout=10,priority=10,cookie=0xfedcba9876543210,in_port=99,NXM_OF_ETH_DST[]=NXM_OF_ETH_SRC[],load:NXM_OF_IN_PORT[]->NXM_NX_REG1[16..31]) OFPT_FLOW_MOD (xid=0x7): ADD actions=learn(table=1,limit=4096) OFPT_FLOW_MOD (xid=0x8): ADD actions=learn(table=1,limit=4096,result_dst=NXM_NX_REG0[0]) @@ -836,3 +836,63 @@ AT_CHECK([ovs-vsctl add-br br1 -- set b br1 datapath_type=dummy]) OVS_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([learning action - flapping learn rule]) +OVS_VSWITCHD_START +add_of_ports br0 1 2 3 + +AT_CHECK([ovs-appctl time/stop], [0], [ignore]) +AT_CHECK([[ovs-ofctl add-flow br0 'table=0,priority=2,in_port=1,actions=resubmit(,2)']]) +AT_CHECK([[ovs-ofctl add-flow br0 'table=0,priority=2,in_port=2,actions=resubmit(,2)']]) +AT_CHECK([[ovs-ofctl add-flow br0 'table=2,actions=learn(table=0,hard_timeout=3,priority=1,cookie=0x123,NXM_OF_ETH_DST[]=NXM_OF_ETH_SRC[],output:OXM_OF_IN_PORT[]),output:3']]) + +packet="eth(src=50:54:00:00:00:06,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=6,tos=0,ttl=64,frag=no),tcp(src=8,dst=9)" + +dnl Run this test a few times in a loop to reduce the likelihood that it passes by chance. 
+for i in 1 2 3; do + AT_CHECK([ovs-appctl revalidator/pause], [0]) + AT_CHECK([ovs-appctl netdev-dummy/receive p2 $packet], [0]) + AT_CHECK([ovs-appctl time/warp 75], [0], [ignore]) + AT_CHECK([ovs-appctl netdev-dummy/receive p1 $packet], [0]) + AT_CHECK([ovs-appctl time/warp 75], [0], [ignore]) + AT_CHECK([ovs-appctl netdev-dummy/receive p2 $packet], [0]) + AT_CHECK([ovs-appctl time/warp 75], [0], [ignore]) + AT_CHECK([ovs-appctl netdev-dummy/receive p1 $packet], [0]) + AT_CHECK([ovs-appctl time/warp 75], [0], [ignore]) + + AT_CHECK([ovs-appctl revalidator/resume], [0]) + AT_CHECK([ovs-appctl revalidator/wait], [0]) + + AT_CHECK([ovs-ofctl --no-stats dump-flows br0 | ofctl_strip | sort | grep 0x123], [0], [dnl + cookie=0x123, hard_timeout=3, priority=1,dl_dst=50:54:00:00:00:06 actions=output:1 + table=2, actions=learn(table=0,hard_timeout=3,priority=1,cookie=0x123,NXM_OF_ETH_DST[[]]=NXM_OF_ETH_SRC[[]],output:OXM_OF_IN_PORT[[]]),output:3 +]) + + AT_CHECK([ovs-appctl revalidator/pause], [0]) + AT_CHECK([ovs-appctl netdev-dummy/receive p1 $packet], [0]) + AT_CHECK([ovs-appctl time/warp 75], [0], [ignore]) + AT_CHECK([ovs-appctl netdev-dummy/receive p2 $packet], [0]) + AT_CHECK([ovs-appctl time/warp 75], [0], [ignore]) + AT_CHECK([ovs-appctl netdev-dummy/receive p1 $packet], [0]) + AT_CHECK([ovs-appctl time/warp 75], [0], [ignore]) + AT_CHECK([ovs-appctl netdev-dummy/receive p2 $packet], [0]) + AT_CHECK([ovs-appctl time/warp 75], [0], [ignore]) + + AT_CHECK([ovs-appctl revalidator/resume], [0]) + AT_CHECK([ovs-appctl revalidator/wait], [0]) + + AT_CHECK([ovs-ofctl --no-stats dump-flows br0 | ofctl_strip | sort | grep 0x123], [0], [dnl + cookie=0x123, hard_timeout=3, priority=1,dl_dst=50:54:00:00:00:06 actions=output:2 + table=2, actions=learn(table=0,hard_timeout=3,priority=1,cookie=0x123,NXM_OF_ETH_DST[[]]=NXM_OF_ETH_SRC[[]],output:OXM_OF_IN_PORT[[]]),output:3 +]) +done + +dnl Wait and check for learned rule eviction due to hard timeout. 
+AT_CHECK([ovs-appctl time/warp 3200], [0], [ignore]) + +AT_CHECK([ovs-ofctl --no-stats dump-flows br0 | ofctl_strip | grep 0x123], [0], [dnl + table=2, actions=learn(table=0,hard_timeout=3,priority=1,cookie=0x123,NXM_OF_ETH_DST[[]]=NXM_OF_ETH_SRC[[]],output:OXM_OF_IN_PORT[[]]),output:3 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP diff --git a/tests/learning-switch.at b/tests/learning-switch.at new file mode 100644 index 00000000000..ac2fc1b8017 --- /dev/null +++ b/tests/learning-switch.at @@ -0,0 +1,23 @@ +AT_BANNER([learning switch]) + +### ----------------------------------------------------------------- +### learning switch OpenFlow15 test case +### ----------------------------------------------------------------- + +AT_SETUP([learning switch - OpenFlow15]) +dnl Start ovs-testcontroller +AT_CHECK([ovs-testcontroller --no-chdir --detach punix:controller --pidfile -v ptcp:], [0], [ignore]) +dnl Start ovs +OVS_VSWITCHD_START([dnl + set bridge br0 datapath_type=dummy \ + protocols=OpenFlow15 -- \ + add-port br0 p1 -- set Interface p1 type=dummy ofport_request=1 -- \ + set-controller br0 tcp:127.0.0.1:6653]) +AT_CHECK([ + ovs-appctl netdev-dummy/receive p1 1e2ce92a669e3a6dd2099cab0800450000548a53400040011addc0a80a0ac0a80a1e08006f200a4d0001fc509a58000000002715020000000000101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f3031323334353637 +], [0], [ignore]) +AT_CHECK([kill `cat ovs-testcontroller.pid`]) + +OVS_WAIT_UNTIL([! test -e controller]) +OVS_VSWITCHD_STOP(["/cannot find route for controller/d"]) +AT_CLEANUP diff --git a/tests/library.at b/tests/library.at index bafb28277e8..d962e1b3fd2 100644 --- a/tests/library.at +++ b/tests/library.at @@ -27,6 +27,11 @@ AT_CHECK([ovstest test-hindex], [0], [..................... ]) AT_CLEANUP +AT_SETUP([test rcu linked lists]) +AT_CHECK([ovstest test-rculist], [0], [..... +]) +AT_CLEANUP + AT_SETUP([cuckoo hash]) AT_KEYWORDS([cmap]) AT_CHECK([ovstest test-cmap check 1], [0], [... 
@@ -83,6 +88,19 @@ AT_KEYWORDS([byte order]) AT_CHECK([ovstest test-byte-order]) AT_CLEANUP +AT_SETUP([byteq - basic]) +AT_KEYWORDS([byteq]) +AT_CHECK([ovstest test-byteq basic], [0], [... +]) +AT_CLEANUP + +AT_SETUP([byteq - write_read]) +AT_KEYWORDS([byteq]) +AT_SKIP_IF([test "$IS_WIN32" = "yes"]) +AT_CHECK([ovstest test-byteq write_read], [0], [. +]) +AT_CLEANUP + AT_SETUP([random number generator]) AT_CHECK([ovstest test-random], [0], [dnl average=7fa2014f @@ -212,7 +230,9 @@ AT_CHECK([ovstest test-util -voff -vfile:info '-vPATTERN:file:%c|%p|%m' --log-fi [$exit_status], [], [stderr]) AT_CHECK([sed 's/\(opened log file\) .*/\1/ -s/|[[^|]]*: /|/' test-util.log], [0], [dnl +s/|[[^|]]*: /|/ +/backtrace/d +/|.*|/!d' test-util.log], [0], [dnl vlog|INFO|opened log file util|EMER|assertion false failed in test_assert() ]) @@ -278,3 +298,13 @@ AT_CLEANUP AT_SETUP([uuidset module]) AT_CHECK([ovstest test-uuidset], [0], [], [ignore]) AT_CLEANUP + +AT_SETUP([cooperative-multitasking module]) +AT_CHECK([ovstest test-cooperative-multitasking], [0], []) +AT_CLEANUP + +AT_SETUP([cooperative-multitasking module nested yield detection]) +AT_CHECK([ovstest test-cooperative-multitasking-nested-yield], [0], [], [dnl +cooperative_multitasking|ERR|Nested yield avoided, this is a bug! Enable debug logging for more details. +]) +AT_CLEANUP diff --git a/tests/mcast-snooping.at b/tests/mcast-snooping.at index fe475e7b38c..adbb66c7059 100644 --- a/tests/mcast-snooping.at +++ b/tests/mcast-snooping.at @@ -31,33 +31,33 @@ dummy@ovs-dummy: hit:0 missed:0 ovs-appctl time/stop -# Send IGMPv3 query on p2 with vlan 1725 +# Send IGMPv3 query on p2 with vlan 1725. 
# 5c:8a:38:55:25:52 > 01:00:5e:00:00:01, ethertype 802.1Q (0x8100), length 64: vlan 1725, p 0, ethertype IPv4, # 172.17.25.1 > 224.0.0.1: igmp query v3 AT_CHECK([ovs-appctl netdev-dummy/receive p2 \ '01005e0000015c8a38552552810006bd080046c000240000000001027f00ac111901e0000001940400001164ec1e00000000027d000000000000000000000000']) -# Send IGMPv3 query on p2 with vlan 1728 +# Send IGMPv3 query on p2 with vlan 1728. # 5c:8a:38:55:25:52 > 01:00:5e:00:00:01, ethertype 802.1Q (0x8100), length 64: vlan 1728, p 0, ethertype IPv4, # 172.17.28.1 > 224.0.0.1: igmp query v3 AT_CHECK([ovs-appctl netdev-dummy/receive p2 \ '01005e0000015c8a38552552810006c0080046c000240000000001027c00ac111c01e0000001940400001164ec1e00000000027d000000000000000000000000']) AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl - port VLAN GROUP Age - 2 1725 querier 0 - 2 1728 querier 0 + port VLAN protocol GROUP Age + 2 1725 UNKNOWN querier 0 + 2 1728 UNKNOWN querier 0 ]) AT_CHECK([ovs-vsctl set Interface p2 options:tx_pcap=p2.pcap]) -# Send a multicast packet on p1 +# Send a multicast packet on p1. AT_CHECK([ ovs-appctl netdev-dummy/receive p1 \ 'in_port(1),eth(src=aa:55:aa:55:00:01,dst=01:00:5e:5e:01:01),eth_type(0x0800),ipv4(src=10.0.0.1,dst=239.94.1.1,proto=17,tos=0,ttl=64,frag=no),udp(src=0,dst=8000)' ]) -# Check this packet was forwarded exactly once to p2 and has vlan tag 1725 +# Check this packet was forwarded exactly once to p2 and has vlan tag 1725. 
# aa:55:aa:55:00:01 > 01:00:5e:5e:01:01, ethertype 802.1Q (0x8100), length 46: vlan 1725, p 0, ethertype IPv4, # 10.0.0.1.0 > 239.94.1.1.8000: UDP, length 0 AT_CHECK([ovs-pcap p2.pcap > p2.pcap.txt 2>&1]) @@ -75,7 +75,7 @@ AT_CHECK([ovs-appctl netdev-dummy/receive p2 \ '01005e0000015c8a38552552810006bd080046c000240000000001027f00ac111901e0000001940400001164ec1000000000027d000000000000000000000000']) AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl - port VLAN GROUP Age + port VLAN protocol GROUP Age ]) @@ -87,8 +87,8 @@ AT_CHECK([ovs-appctl netdev-dummy/receive p2 \ '3333ff0e4c67000c290e4c6786dd600000000020000100000000000000000000000000000000ff0200000000000000000001ff0e4c673a000502000001008300e7b800000000ff0200000000000000000001ff0e4c67']) AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl - port VLAN GROUP Age - 2 0 ff02::1:ff0e:4c67 0 + port VLAN protocol GROUP Age + 2 0 MLDv1 ff02::1:ff0e:4c67 0 ]) AT_CHECK([ovs-appctl mdb/flush br0], [0], [dnl @@ -99,12 +99,334 @@ AT_CHECK([ovs-appctl netdev-dummy/receive p2 \ '3333ff0e4c67000c290e4c6786dd600000000020000100000000000000000000000000000000ff0200000000000000000001ff0e4c673a000502000001008300e7b000000000ff0200000000000000000001ff0e4c67']) AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl - port VLAN GROUP Age + port VLAN protocol GROUP Age ]) OVS_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([mcast - check multicast per port flooding]) +OVS_VSWITCHD_START([]) + +AT_CHECK([ + ovs-vsctl set bridge br0 \ + datapath_type=dummy \ + mcast_snooping_enable=true \ + other-config:mcast-snooping-disable-flood-unregistered=false +], [0]) + +AT_CHECK([ovs-ofctl add-flow br0 action=normal]) + +AT_CHECK([ + ovs-vsctl add-port br0 p1 \ + -- set Interface p1 type=dummy other-config:hwaddr=aa:55:aa:55:00:01 ofport_request=1 \ + -- add-port br0 p2 \ + -- set Interface p2 type=dummy other-config:hwaddr=aa:55:aa:55:00:02 ofport_request=2 \ + -- add-port br0 p3 \ + -- set Interface p3 type=dummy other-config:hwaddr=aa:55:aa:55:00:03 ofport_request=3 \ +], 
[0]) + +ovs-appctl time/stop + +AT_CHECK([ovs-appctl ofproto/trace "in_port(3),eth(src=aa:55:aa:55:00:ff,dst=01:00:5e:01:01:01),eth_type(0x0800),ipv4(src=10.0.0.1,dst=224.1.1.1,proto=17,tos=0,ttl=64,frag=no),udp(src=0,dst=8000)"], [0], [stdout]) +AT_CHECK([grep -v 'Datapath actions:' stdout], [0], [dnl +Flow: udp,in_port=3,vlan_tci=0x0000,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_src=10.0.0.1,nw_dst=224.1.1.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=0,tp_dst=8000 + +bridge("br0") +------------- + 0. priority 32768 + NORMAL + -> unregistered multicast, flooding + +Final flow: unchanged +Megaflow: recirc_id=0,eth,udp,in_port=3,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_dst=224.1.1.1,nw_frag=no +]) +AT_CHECK([sed -ne 's/^Datapath actions: \(.*\)$/\1/p' stdout | tr "," "\n" | sort -n], [0], [dnl +1 +2 +100 +]) + +# Send report packets. +AT_CHECK([ + ovs-appctl netdev-dummy/receive p1 \ + '01005E010101000C29A027A108004500001C000100004002CBAEAC10221EE001010112140CE9E0010101' +], [0]) +AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl + port VLAN protocol GROUP Age + 1 0 IGMPv1 224.1.1.1 0 +]) + +AT_CHECK([ovs-appctl ofproto/trace "in_port(3),eth(src=aa:55:aa:55:00:ff,dst=01:00:5e:01:01:01),eth_type(0x0800),ipv4(src=10.0.0.1,dst=224.1.1.1,proto=17,tos=0,ttl=64,frag=no),udp(src=0,dst=8000)"], [0], [dnl +Flow: udp,in_port=3,vlan_tci=0x0000,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_src=10.0.0.1,nw_dst=224.1.1.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=0,tp_dst=8000 + +bridge("br0") +------------- + 0. 
priority 32768 + NORMAL + -> forwarding to mcast group port + +Final flow: unchanged +Megaflow: recirc_id=0,eth,udp,in_port=3,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_dst=224.1.1.1,nw_frag=no +Datapath actions: 1 +]) + +AT_CHECK([ovs-vsctl set port p2 other_config:mcast-snooping-flood=true]) + +AT_CHECK([ovs-appctl ofproto/trace "in_port(3),eth(src=aa:55:aa:55:00:ff,dst=01:00:5e:01:01:01),eth_type(0x0800),ipv4(src=10.0.0.1,dst=224.1.1.1,proto=17,tos=0,ttl=64,frag=no),udp(src=0,dst=8000)"], [0], [dnl +Flow: udp,in_port=3,vlan_tci=0x0000,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_src=10.0.0.1,nw_dst=224.1.1.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=0,tp_dst=8000 + +bridge("br0") +------------- + 0. priority 32768 + NORMAL + -> forwarding to mcast group port + -> forwarding to mcast flood port + +Final flow: unchanged +Megaflow: recirc_id=0,eth,udp,in_port=3,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_dst=224.1.1.1,nw_frag=no +Datapath actions: 1,2 +]) + +AT_CHECK([ovs-vsctl set port p3 other_config:mcast-snooping-flood=true]) + +AT_CHECK([ovs-appctl ofproto/trace "in_port(3),eth(src=aa:55:aa:55:00:ff,dst=01:00:5e:01:01:01),eth_type(0x0800),ipv4(src=10.0.0.1,dst=224.1.1.1,proto=17,tos=0,ttl=64,frag=no),udp(src=0,dst=8000)"], [0], [dnl +Flow: udp,in_port=3,vlan_tci=0x0000,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_src=10.0.0.1,nw_dst=224.1.1.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=0,tp_dst=8000 + +bridge("br0") +------------- + 0. priority 32768 + NORMAL + -> forwarding to mcast group port + -> forwarding to mcast flood port + -> mcast flood port is input port, dropping + +Final flow: unchanged +Megaflow: recirc_id=0,eth,udp,in_port=3,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_dst=224.1.1.1,nw_frag=no +Datapath actions: 1,2 +]) + +# Change p2 ofport to force an ofbundle change and check that the mdb contains +# no stale port. 
+AT_CHECK([ovs-vsctl set interface p2 ofport_request=4]) + +AT_CHECK([ovs-appctl ofproto/trace "in_port(3),eth(src=aa:55:aa:55:00:ff,dst=01:00:5e:01:01:01),eth_type(0x0800),ipv4(src=10.0.0.1,dst=224.1.1.1,proto=17,tos=0,ttl=64,frag=no),udp(src=0,dst=8000)"], [0], [dnl +Flow: udp,in_port=3,vlan_tci=0x0000,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_src=10.0.0.1,nw_dst=224.1.1.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=0,tp_dst=8000 + +bridge("br0") +------------- + 0. priority 32768 + NORMAL + -> forwarding to mcast group port + -> mcast flood port is input port, dropping + -> forwarding to mcast flood port + +Final flow: unchanged +Megaflow: recirc_id=0,eth,udp,in_port=3,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_dst=224.1.1.1,nw_frag=no +Datapath actions: 1,2 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + + +AT_SETUP([mcast - check multicast per port flooding (unregistered flood disabled)]) +OVS_VSWITCHD_START([]) + +AT_CHECK([ + ovs-vsctl set bridge br0 \ + datapath_type=dummy \ + mcast_snooping_enable=true \ + other-config:mcast-snooping-disable-flood-unregistered=true +], [0]) + +AT_CHECK([ovs-ofctl add-flow br0 action=normal]) + +AT_CHECK([ + ovs-vsctl add-port br0 p1 \ + -- set Interface p1 type=dummy other-config:hwaddr=aa:55:aa:55:00:01 ofport_request=1 \ + -- add-port br0 p2 \ + -- set Interface p2 type=dummy other-config:hwaddr=aa:55:aa:55:00:02 ofport_request=2 \ + -- add-port br0 p3 \ + -- set Interface p3 type=dummy other-config:hwaddr=aa:55:aa:55:00:03 ofport_request=3 \ +], [0]) + +ovs-appctl time/stop + +AT_CHECK([ovs-appctl ofproto/trace "in_port(3),eth(src=aa:55:aa:55:00:ff,dst=01:00:5e:01:01:01),eth_type(0x0800),ipv4(src=10.0.0.1,dst=224.1.1.1,proto=17,tos=0,ttl=64,frag=no),udp(src=0,dst=8000)"], [0], [dnl +Flow: udp,in_port=3,vlan_tci=0x0000,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_src=10.0.0.1,nw_dst=224.1.1.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=0,tp_dst=8000 + +bridge("br0") +------------- + 0. 
priority 32768 + NORMAL + +Final flow: unchanged +Megaflow: recirc_id=0,eth,udp,in_port=3,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_dst=224.1.1.1,nw_frag=no +Datapath actions: drop +]) + +AT_CHECK([ovs-vsctl set port p2 other_config:mcast-snooping-flood=true]) + +AT_CHECK([ovs-appctl ofproto/trace "in_port(3),eth(src=aa:55:aa:55:00:ff,dst=01:00:5e:01:01:01),eth_type(0x0800),ipv4(src=10.0.0.1,dst=224.1.1.1,proto=17,tos=0,ttl=64,frag=no),udp(src=0,dst=8000)"], [0], [dnl +Flow: udp,in_port=3,vlan_tci=0x0000,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_src=10.0.0.1,nw_dst=224.1.1.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=0,tp_dst=8000 + +bridge("br0") +------------- + 0. priority 32768 + NORMAL + -> forwarding to mcast flood port + +Final flow: unchanged +Megaflow: recirc_id=0,eth,udp,in_port=3,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_dst=224.1.1.1,nw_frag=no +Datapath actions: 2 +]) + +AT_CHECK([ovs-vsctl set port p3 other_config:mcast-snooping-flood=true]) + +AT_CHECK([ovs-appctl ofproto/trace "in_port(3),eth(src=aa:55:aa:55:00:ff,dst=01:00:5e:01:01:01),eth_type(0x0800),ipv4(src=10.0.0.1,dst=224.1.1.1,proto=17,tos=0,ttl=64,frag=no),udp(src=0,dst=8000)"], [0], [dnl +Flow: udp,in_port=3,vlan_tci=0x0000,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_src=10.0.0.1,nw_dst=224.1.1.1,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=0,tp_dst=8000 + +bridge("br0") +------------- + 0. 
priority 32768 + NORMAL + -> forwarding to mcast flood port + -> mcast flood port is input port, dropping + +Final flow: unchanged +Megaflow: recirc_id=0,eth,udp,in_port=3,dl_src=aa:55:aa:55:00:ff,dl_dst=01:00:5e:01:01:01,nw_dst=224.1.1.1,nw_frag=no +Datapath actions: 2 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + + +AT_SETUP([mcast - check reports per port flooding]) +OVS_VSWITCHD_START([]) + +AT_CHECK([ + ovs-vsctl set bridge br0 \ + datapath_type=dummy \ + mcast_snooping_enable=true \ + other-config:mcast-snooping-disable-flood-unregistered=false +], [0]) + +AT_CHECK([ovs-ofctl add-flow br0 action=normal]) + +AT_CHECK([ + ovs-vsctl add-port br0 p1 \ + -- set Interface p1 type=dummy other-config:hwaddr=aa:55:aa:55:00:01 ofport_request=1 \ + -- add-port br0 p2 \ + -- set Interface p2 type=dummy other-config:hwaddr=aa:55:aa:55:00:02 ofport_request=2 \ + -- add-port br0 p3 \ + -- set Interface p3 type=dummy other-config:hwaddr=aa:55:aa:55:00:03 ofport_request=3 \ +], [0]) + +ovs-appctl time/stop + +AT_CHECK([ovs-appctl ofproto/trace "in_port(1)" '01005E010101000C29A027A108004500001C000100004002CBAEAC10221EE001010112140CE9E0010101'], [0], [dnl +Flow: ip,in_port=1,vlan_tci=0x0000,dl_src=00:0c:29:a0:27:a1,dl_dst=01:00:5e:01:01:01,nw_src=172.16.34.30,nw_dst=224.1.1.1,nw_proto=2,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=18,tp_dst=20 + +bridge("br0") +------------- + 0. priority 32768 + NORMAL + -> learned that 00:0c:29:a0:27:a1 is on port p1 in VLAN 0 + -> multicast snooping learned that 224.1.1.1 is on port p1 in VLAN 0 + +Final flow: unchanged +Megaflow: recirc_id=0,eth,ip,in_port=1,dl_src=00:0c:29:a0:27:a1,dl_dst=01:00:5e:01:01:01,nw_proto=2,nw_frag=no +Datapath actions: drop +This flow is handled by the userspace slow path because it: + - Uses action(s) not supported by datapath. 
+]) + +AT_CHECK([ovs-vsctl set port p3 other_config:mcast-snooping-flood-reports=true]) + +AT_CHECK([ovs-appctl ofproto/trace "in_port(1)" '01005E010101000C29A027A108004500001C000100004002CBAEAC10221EE001010112140CE9E0010101'], [0], [dnl +Flow: ip,in_port=1,vlan_tci=0x0000,dl_src=00:0c:29:a0:27:a1,dl_dst=01:00:5e:01:01:01,nw_src=172.16.34.30,nw_dst=224.1.1.1,nw_proto=2,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=18,tp_dst=20 + +bridge("br0") +------------- + 0. priority 32768 + NORMAL + -> forwarding report to mcast flagged port + +Final flow: unchanged +Megaflow: recirc_id=0,eth,ip,in_port=1,dl_src=00:0c:29:a0:27:a1,dl_dst=01:00:5e:01:01:01,nw_proto=2,nw_frag=no +Datapath actions: 3 +This flow is handled by the userspace slow path because it: + - Uses action(s) not supported by datapath. +]) + +AT_CHECK([ovs-vsctl set port p2 other_config:mcast-snooping-flood-reports=true]) + +AT_CHECK([ovs-appctl ofproto/trace "in_port(1)" '01005E010101000C29A027A108004500001C000100004002CBAEAC10221EE001010112140CE9E0010101'], [0], [dnl +Flow: ip,in_port=1,vlan_tci=0x0000,dl_src=00:0c:29:a0:27:a1,dl_dst=01:00:5e:01:01:01,nw_src=172.16.34.30,nw_dst=224.1.1.1,nw_proto=2,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=18,tp_dst=20 + +bridge("br0") +------------- + 0. priority 32768 + NORMAL + -> forwarding report to mcast flagged port + -> forwarding report to mcast flagged port + +Final flow: unchanged +Megaflow: recirc_id=0,eth,ip,in_port=1,dl_src=00:0c:29:a0:27:a1,dl_dst=01:00:5e:01:01:01,nw_proto=2,nw_frag=no +Datapath actions: 3,2 +This flow is handled by the userspace slow path because it: + - Uses action(s) not supported by datapath. 
+]) + +AT_CHECK([ovs-vsctl set port p1 other_config:mcast-snooping-flood-reports=true]) + +AT_CHECK([ovs-appctl ofproto/trace "in_port(1)" '01005E010101000C29A027A108004500001C000100004002CBAEAC10221EE001010112140CE9E0010101'], [0], [dnl +Flow: ip,in_port=1,vlan_tci=0x0000,dl_src=00:0c:29:a0:27:a1,dl_dst=01:00:5e:01:01:01,nw_src=172.16.34.30,nw_dst=224.1.1.1,nw_proto=2,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=18,tp_dst=20 + +bridge("br0") +------------- + 0. priority 32768 + NORMAL + -> forwarding report to mcast flagged port + -> forwarding report to mcast flagged port + -> mcast port is input port, dropping the Report + +Final flow: unchanged +Megaflow: recirc_id=0,eth,ip,in_port=1,dl_src=00:0c:29:a0:27:a1,dl_dst=01:00:5e:01:01:01,nw_proto=2,nw_frag=no +Datapath actions: 3,2 +This flow is handled by the userspace slow path because it: + - Uses action(s) not supported by datapath. +]) + +# Change p3 ofport to force an ofbundle change and check that the mdb contains +# no stale port. +AT_CHECK([ovs-vsctl set interface p3 ofport_request=4]) + +AT_CHECK([ovs-appctl ofproto/trace "in_port(1)" '01005E010101000C29A027A108004500001C000100004002CBAEAC10221EE001010112140CE9E0010101'], [0], [dnl +Flow: ip,in_port=1,vlan_tci=0x0000,dl_src=00:0c:29:a0:27:a1,dl_dst=01:00:5e:01:01:01,nw_src=172.16.34.30,nw_dst=224.1.1.1,nw_proto=2,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=18,tp_dst=20 + +bridge("br0") +------------- + 0. priority 32768 + NORMAL + -> forwarding report to mcast flagged port + -> mcast port is input port, dropping the Report + -> forwarding report to mcast flagged port + +Final flow: unchanged +Megaflow: recirc_id=0,eth,ip,in_port=1,dl_src=00:0c:29:a0:27:a1,dl_dst=01:00:5e:01:01:01,nw_proto=2,nw_frag=no +Datapath actions: 2,3 +This flow is handled by the userspace slow path because it: + - Uses action(s) not supported by datapath. 
+]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + + AT_SETUP([mcast - delete the port mdb when vlan configuration changed]) OVS_VSWITCHD_START([]) @@ -128,7 +450,7 @@ AT_CHECK([ ovs-appctl time/stop -# send report packets +# Send report packets. AT_CHECK([ ovs-appctl netdev-dummy/receive p1 \ '01005E010101000C29A027A18100000108004500001C000100004002CBAEAC10221EE001010112140CE9E0010101' @@ -136,7 +458,7 @@ AT_CHECK([ '01005E010101000C29A027A28100000208004500001C000100004002CBAEAC10221EE001010112140CE9E0010101' ], [0]) -# send query packets +# Send query packets. AT_CHECK([ ovs-appctl netdev-dummy/receive p3 \ '01005E010101000C29A027D18100000108004500001C000100004002CBCBAC102201E00101011114EEEB00000000' @@ -145,19 +467,19 @@ AT_CHECK([ ], [0]) AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl - port VLAN GROUP Age - 1 1 224.1.1.1 0 - 1 2 224.1.1.1 0 - 3 1 querier 0 - 3 2 querier 0 + port VLAN protocol GROUP Age + 1 1 IGMPv1 224.1.1.1 0 + 1 2 IGMPv1 224.1.1.1 0 + 3 1 UNKNOWN querier 0 + 3 2 UNKNOWN querier 0 ]) AT_CHECK([ovs-vsctl set port p3 tag=2], [0]) AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl - port VLAN GROUP Age - 1 1 224.1.1.1 0 - 1 2 224.1.1.1 0 + port VLAN protocol GROUP Age + 1 1 IGMPv1 224.1.1.1 0 + 1 2 IGMPv1 224.1.1.1 0 ]) AT_CLEANUP @@ -183,7 +505,7 @@ AT_CHECK([ ovs-appctl time/stop -# send report packets +# Send report packets. AT_CHECK([ ovs-appctl netdev-dummy/receive p1 \ '01005E010101000C29A027A18100000108004500001C000100004002CBAEAC10221EE001010112140CE9E0010101' @@ -191,7 +513,7 @@ AT_CHECK([ '01005E010101000C29A027A28100000208004500001C000100004002CBAEAC10221EE001010112140CE9E0010101' ], [0]) -# send query packets +# Send query packets. 
AT_CHECK([ ovs-appctl netdev-dummy/receive p2 \ '01005E010101000C29A027D18100000108004500001C000100004002CBCBAC102201E00101011114EEEB00000000' @@ -200,19 +522,19 @@ AT_CHECK([ ], [0]) AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl - port VLAN GROUP Age - 1 1 224.1.1.1 0 - 1 2 224.1.1.1 0 - 2 1 querier 0 - 2 2 querier 0 + port VLAN protocol GROUP Age + 1 1 IGMPv1 224.1.1.1 0 + 1 2 IGMPv1 224.1.1.1 0 + 2 1 UNKNOWN querier 0 + 2 2 UNKNOWN querier 0 ]) AT_CHECK([ovs-vsctl del-port br0 p2], [0]) AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl - port VLAN GROUP Age - 1 1 224.1.1.1 0 - 1 2 224.1.1.1 0 + port VLAN protocol GROUP Age + 1 1 IGMPv1 224.1.1.1 0 + 1 2 IGMPv1 224.1.1.1 0 ]) AT_CLEANUP @@ -277,9 +599,91 @@ AT_CHECK([ovs-appctl dpctl/dump-flows | grep -e .*ipv4 | sort | dnl sed 's/pid=[[0-9]]*,// s/,packet_type(ns=[[0-9]]*,id=[[0-9]]*),/,/'], [0], [dnl -ct_state(+new-inv+trk),recirc_id(),in_port(1),eth_type(0x0800),ipv4(proto=1,frag=no), packets:0, bytes:0, used:never, actions:2 -ct_state(+new-inv+trk),recirc_id(),in_port(1),eth_type(0x0800),ipv4(proto=2,frag=no), packets:0, bytes:0, used:never, actions:userspace(controller(reason=1,dont_send=0,continuation=0,recirc_id=,rule_cookie=0,controller_id=0,max_len=65535)) recirc_id(),in_port(1),eth_type(0x0800),ipv4(frag=no), packets:0, bytes:0, used:0.0s, actions:ct(zone=64000),recirc() +recirc_id(),in_port(1),ct_state(+new-inv+trk),eth_type(0x0800),ipv4(proto=1,frag=no), packets:0, bytes:0, used:never, actions:2 +recirc_id(),in_port(1),ct_state(+new-inv+trk),eth_type(0x0800),ipv4(proto=2,frag=no), packets:0, bytes:0, used:never, actions:userspace(controller(reason=1,dont_send=0,continuation=0,recirc_id=,rule_cookie=0,controller_id=0,max_len=65535)) +]) + +AT_CLEANUP + +AT_SETUP([mcast - mcast_group protocol updated in mdb]) +OVS_VSWITCHD_START([]) + +AT_CHECK([ + ovs-vsctl set bridge br0 \ + datapath_type=dummy \ + mcast_snooping_enable=true \ +], [0]) + +AT_CHECK([ovs-ofctl add-flow br0 action=normal]) + +AT_CHECK([ + 
ovs-vsctl add-port br0 p1 -- set Interface p1 type=dummy \ + other-config:hwaddr=aa:55:aa:55:00:01 ofport_request=1 \ +], [0]) + +AT_CHECK([ovs-appctl time/stop]) + +# Send IGMPv1 report packet. +AT_CHECK([ + ovs-appctl netdev-dummy/receive p1 \ + '01005E010101000C29A027A18100000008004500001C000100004002CBAEAC10221EE001010112140CE9E0010101' +], [0]) + +# Send IGMPv2 report packet. +AT_CHECK([ + ovs-appctl netdev-dummy/receive p1 \ + '01005e010102505400000103080046c00020000040000102f8110a000103e001010294040000160008fce0010102' +], [0]) + +# Send IGMPv3 report packet. +AT_CHECK([ + ovs-appctl netdev-dummy/receive p1 \ + '01005e000016505400000003080046c00028000040000102f9f60a000003e0000016940400002200e3e10000000104000000e9360ce6' +], [0]) + +# Check that all the ipv4 mcast groups were updated in +# the mdb with the appropriate protocol. +AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl + port VLAN protocol GROUP Age + 1 0 IGMPv1 224.1.1.1 0 + 1 0 IGMPv2 224.1.1.2 0 + 1 0 IGMPv3 233.54.12.230 0 +]) + +# Send IGMPv1 report packet to address 224.1.1.2 +# and make sure that the protocol will be updated to +# IGMPv1. +AT_CHECK([ + ovs-appctl netdev-dummy/receive p1 \ + '01005e010102505400000103080046c00020000040000102f8110a000103e00101029404000012000cfce0010102' +], [0]) + +AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl + port VLAN protocol GROUP Age + 1 0 IGMPv1 224.1.1.1 0 + 1 0 IGMPv3 233.54.12.230 0 + 1 0 IGMPv1 224.1.1.2 0 ]) +# Flush the mdb. +AT_CHECK([ovs-appctl mdb/flush br0], [0], [dnl +table successfully flushed +]) + +# Send MLDv2 packet. +AT_CHECK([ovs-appctl netdev-dummy/receive p1 \ +'333300000016d0509956ddf986dd60000000001c3a01fe80000000000000712065589886fa88ff0200000000000000000000000000168f00134d0000000104000000ff0200000000000000000001ff52f3e1']) + +# Send MLDv1 packet. 
+AT_CHECK([ovs-appctl netdev-dummy/receive p1 \ +'3333ff0e4c67000c290e4c6786dd600000000020000100000000000000000000000000000000ff0200000000000000000001ff0e4c673a000502000001008300e7b800000000ff0200000000000000000001ff0e4c67']) + +# Check that all the ipv6 mcast groups were updated in +# the mdb with the appropriate protocol. +AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl + port VLAN protocol GROUP Age + 1 0 MLDv2 ff02::1:ff52:f3e1 0 + 1 0 MLDv1 ff02::1:ff0e:4c67 0 +]) AT_CLEANUP diff --git a/tests/nsh.at b/tests/nsh.at index 6b7b6856f26..0040a50b36c 100644 --- a/tests/nsh.at +++ b/tests/nsh.at @@ -521,51 +521,45 @@ AT_CHECK([ set interface vxlangpe32 type=vxlan options:exts=gpe options:remote_ip=30.0.0.2 options:packet_type=ptap ofport_request=3020 ovs-appctl netdev-dummy/ip4addr br-p1 10.0.0.1/24 - ovs-appctl ovs/route/add 10.0.0.0/24 br-p1 ovs-appctl tnl/arp/set br-p1 10.0.0.1 $HWADDR_BRP1 ovs-appctl tnl/arp/set br-p1 10.0.0.2 $HWADDR_BRP2 ovs-appctl tnl/arp/set br-p1 10.0.0.3 $HWADDR_BRP3 ovs-appctl netdev-dummy/ip4addr br-p2 20.0.0.2/24 - ovs-appctl ovs/route/add 20.0.0.0/24 br-p2 ovs-appctl tnl/arp/set br-p2 20.0.0.1 $HWADDR_BRP1 ovs-appctl tnl/arp/set br-p2 20.0.0.2 $HWADDR_BRP2 ovs-appctl tnl/arp/set br-p2 20.0.0.3 $HWADDR_BRP3 ovs-appctl netdev-dummy/ip4addr br-p3 30.0.0.3/24 - ovs-appctl ovs/route/add 30.0.0.0/24 br-p3 ovs-appctl tnl/arp/set br-p3 30.0.0.1 $HWADDR_BRP1 ovs-appctl tnl/arp/set br-p3 30.0.0.2 $HWADDR_BRP2 ovs-appctl tnl/arp/set br-p3 30.0.0.3 $HWADDR_BRP3 ], [0], [stdout]) AT_CHECK([ - ovs-appctl ovs/route/add 10.0.0.0/24 br-p1 ovs-appctl tnl/arp/set br-p1 10.0.0.1 $HWADDR_BRP1 ovs-appctl tnl/arp/set br-p1 10.0.0.2 $HWADDR_BRP2 ovs-appctl tnl/arp/set br-p1 10.0.0.3 $HWADDR_BRP3 ], [0], [stdout]) AT_CHECK([ - ovs-appctl ovs/route/add 20.0.0.0/24 br-p2 ovs-appctl tnl/arp/set br-p2 20.0.0.1 $HWADDR_BRP1 ovs-appctl tnl/arp/set br-p2 20.0.0.2 $HWADDR_BRP2 ovs-appctl tnl/arp/set br-p2 20.0.0.3 $HWADDR_BRP3 ], [0], [stdout]) AT_CHECK([ - ovs-appctl 
ovs/route/add 30.0.0.0/24 br-p3 ovs-appctl tnl/arp/set br-p3 30.0.0.1 $HWADDR_BRP1 ovs-appctl tnl/arp/set br-p3 30.0.0.2 $HWADDR_BRP2 ovs-appctl tnl/arp/set br-p3 30.0.0.3 $HWADDR_BRP3 ], [0], [stdout]) AT_CHECK([ - ovs-appctl ovs/route/show | grep User: + ovs-appctl ovs/route/show | grep Cached: | sort ], [0], [dnl -User: 10.0.0.0/24 dev br-p1 SRC 10.0.0.1 -User: 20.0.0.0/24 dev br-p2 SRC 20.0.0.2 -User: 30.0.0.0/24 dev br-p3 SRC 30.0.0.3 +Cached: 10.0.0.0/24 dev br-p1 SRC 10.0.0.1 local +Cached: 20.0.0.0/24 dev br-p2 SRC 20.0.0.2 local +Cached: 30.0.0.0/24 dev br-p3 SRC 30.0.0.3 local ]) AT_CHECK([ @@ -725,8 +719,8 @@ AT_CHECK([ ovs-appctl dpctl/dump-flows dummy@ovs-dummy | strip_used | grep -v ipv6 | sort ], [0], [flow-dump from the main thread: recirc_id(0),in_port(4),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.10.30,frag=no), packets:1, bytes:98, used:0.0s, actions:pop_eth,push_nsh(flags=0,ttl=63,mdtype=1,np=1,spi=0x3000,si=255,c1=0x0,c2=0x0,c3=0x0,c4=0x0),tnl_push(tnl_port(4789),header(size=50,type=4,eth(dst=aa:55:00:00:00:03,src=aa:55:00:00:00:01,dl_type=0x0800),ipv4(src=10.0.0.1,dst=10.0.0.3,proto=17,tos=0,ttl=64,frag=0x4000),udp(src=0,dst=4789,csum=0x0),vxlan(flags=0xc000004,vni=0x0)),out_port(1)),set(ipv4(src=30.0.0.1,dst=30.0.0.3)),tnl_pop(4789) -tunnel(tun_id=0x0,src=30.0.0.1,dst=30.0.0.3,flags(-df-csum+key)),recirc_id(0),in_port(4789),packet_type(ns=1,id=0x894f),eth_type(0x894f),nsh(np=1,spi=0x3000,si=255), packets:1, bytes:108, used:0.0s, actions:pop_nsh(),recirc(0x1) -tunnel(tun_id=0x0,src=30.0.0.1,dst=30.0.0.3,flags(-df-csum+key)),recirc_id(0x1),in_port(4789),packet_type(ns=1,id=0x800),eth_type(0x0800),ipv4(frag=no), packets:1, bytes:84, used:0.0s, actions:push_eth(src=00:00:00:00:00:00,dst=aa:55:aa:55:00:03),6 +recirc_id(0),tunnel(tun_id=0x0,src=30.0.0.1,dst=30.0.0.3,flags(-df-csum+key)),in_port(4789),packet_type(ns=1,id=0x894f),eth_type(0x894f),nsh(np=1,spi=0x3000,si=255), packets:1, bytes:108, used:0.0s, actions:pop_nsh(),recirc(0x1) 
+recirc_id(0x1),tunnel(tun_id=0x0,src=30.0.0.1,dst=30.0.0.3,flags(-df-csum+key)),in_port(4789),packet_type(ns=1,id=0x800),eth_type(0x0800),ipv4(frag=no), packets:1, bytes:84, used:0.0s, actions:push_eth(src=00:00:00:00:00:00,dst=aa:55:aa:55:00:03),6 ]) AT_CHECK([ @@ -779,9 +773,9 @@ AT_CHECK([ ovs-appctl dpctl/dump-flows dummy@ovs-dummy | strip_used | grep -v ipv6 | sort ], [0], [flow-dump from the main thread: recirc_id(0),in_port(4),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.10.20/255.255.255.248,frag=no), packets:1, bytes:98, used:0.0s, actions:pop_eth,push_nsh(flags=0,ttl=63,mdtype=1,np=1,spi=0x3020,si=255,c1=0x0,c2=0x0,c3=0x0,c4=0x0),tnl_push(tnl_port(4789),header(size=50,type=4,eth(dst=aa:55:00:00:00:02,src=aa:55:00:00:00:01,dl_type=0x0800),ipv4(src=10.0.0.1,dst=10.0.0.2,proto=17,tos=0,ttl=64,frag=0x4000),udp(src=0,dst=4789,csum=0x0),vxlan(flags=0xc000004,vni=0x0)),out_port(1)),set(ipv4(src=20.0.0.1,dst=20.0.0.2)),tnl_pop(4789) -tunnel(tun_id=0x0,src=20.0.0.1,dst=20.0.0.2,flags(-df-csum+key)),recirc_id(0),in_port(4789),packet_type(ns=1,id=0x894f),eth_type(0x894f),nsh(spi=0x3020,si=255), packets:1, bytes:108, used:0.0s, actions:push_eth(src=00:00:00:00:00:00,dst=11:22:33:44:55:66),set(nsh(spi=0x3020,si=254)),pop_eth,tnl_push(tnl_port(4789),header(size=50,type=4,eth(dst=aa:55:00:00:00:03,src=aa:55:00:00:00:02,dl_type=0x0800),ipv4(src=20.0.0.2,dst=20.0.0.3,proto=17,tos=0,ttl=64,frag=0x4000),udp(src=0,dst=4789,csum=0x0),vxlan(flags=0xc000004,vni=0x0)),out_port(2)),set(ipv4(src=30.0.0.2,dst=30.0.0.3)),tnl_pop(4789) -tunnel(tun_id=0x0,src=30.0.0.2,dst=30.0.0.3,flags(-df-csum+key)),recirc_id(0),in_port(4789),packet_type(ns=1,id=0x894f),eth_type(0x894f),nsh(np=1,spi=0x3020,si=254), packets:1, bytes:108, used:0.0s, actions:pop_nsh(),recirc(0x2) -tunnel(tun_id=0x0,src=30.0.0.2,dst=30.0.0.3,flags(-df-csum+key)),recirc_id(0x2),in_port(4789),packet_type(ns=1,id=0x800),eth_type(0x0800),ipv4(frag=no), packets:1, bytes:84, used:0.0s, 
actions:push_eth(src=00:00:00:00:00:00,dst=aa:55:aa:55:00:03),6 +recirc_id(0),tunnel(tun_id=0x0,src=20.0.0.1,dst=20.0.0.2,flags(-df-csum+key)),in_port(4789),packet_type(ns=1,id=0x894f),eth_type(0x894f),nsh(spi=0x3020,si=255), packets:1, bytes:108, used:0.0s, actions:push_eth(src=00:00:00:00:00:00,dst=11:22:33:44:55:66),set(nsh(spi=0x3020,si=254)),pop_eth,tnl_push(tnl_port(4789),header(size=50,type=4,eth(dst=aa:55:00:00:00:03,src=aa:55:00:00:00:02,dl_type=0x0800),ipv4(src=20.0.0.2,dst=20.0.0.3,proto=17,tos=0,ttl=64,frag=0x4000),udp(src=0,dst=4789,csum=0x0),vxlan(flags=0xc000004,vni=0x0)),out_port(2)),set(ipv4(src=30.0.0.2,dst=30.0.0.3)),tnl_pop(4789) +recirc_id(0),tunnel(tun_id=0x0,src=30.0.0.2,dst=30.0.0.3,flags(-df-csum+key)),in_port(4789),packet_type(ns=1,id=0x894f),eth_type(0x894f),nsh(np=1,spi=0x3020,si=254), packets:1, bytes:108, used:0.0s, actions:pop_nsh(),recirc(0x2) +recirc_id(0x2),tunnel(tun_id=0x0,src=30.0.0.2,dst=30.0.0.3,flags(-df-csum+key)),in_port(4789),packet_type(ns=1,id=0x800),eth_type(0x0800),ipv4(frag=no), packets:1, bytes:84, used:0.0s, actions:push_eth(src=00:00:00:00:00:00,dst=aa:55:aa:55:00:03),6 ]) AT_CHECK([ diff --git a/tests/odp.at b/tests/odp.at index 7a1cf3b2ceb..402b2386d37 100644 --- a/tests/odp.at +++ b/tests/odp.at @@ -3,92 +3,87 @@ AT_BANNER([datapath parsing and formatting]) AT_SETUP([OVS datapath key parsing and formatting - valid forms]) dnl We could add a test for invalid forms, but that's less important. 
AT_DATA([odp-base.txt], [dnl -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x1234) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0800),ipv4(src=35.8.2.41,dst=172.16.0.20,proto=5,tos=0x80,ttl=128,frag=no) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0800),ipv4(src=35.8.2.41,dst=172.16.0.20,proto=5,tos=0x81,ttl=128,frag=no) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0800),ipv4(src=35.8.2.41,dst=172.16.0.20,proto=5,tos=0x80,ttl=128,frag=first) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0800),ipv4(src=35.8.2.41,dst=172.16.0.20,proto=5,tos=0x80,ttl=128,frag=later) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0800),ipv4(src=35.8.2.41,dst=172.16.0.20,proto=6,tos=0,ttl=128,frag=no),tcp(src=80,dst=8080) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0800),ipv4(src=35.8.2.41,dst=172.16.0.20,proto=17,tos=0,ttl=128,frag=no),udp(src=81,dst=6632) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0800),ipv4(src=35.8.2.41,dst=172.16.0.20,proto=1,tos=0,ttl=128,frag=no),icmp(type=1,code=2) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=10,tclass=0x70,hlimit=128,frag=no) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=10,tclass=0x71,hlimit=128,frag=no) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=10,tclass=0x70,hlimit=128,frag=first) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=10,tclass=0x70,hlimit=128,frag=later) 
-in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=6,tclass=0,hlimit=128,frag=no),tcp(src=80,dst=8080) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=17,tclass=0,hlimit=128,frag=no),udp(src=6630,dst=22) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=58,tclass=0,hlimit=128,frag=no),icmpv6(type=1,code=2) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=58,tclass=0,hlimit=128,frag=no),icmpv6(type=136,code=0),nd(target=::3,sll=00:05:06:07:08:09,tll=00:0a:0b:0c:0d:0e) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0806),arp(sip=1.2.3.4,tip=5.6.7.8,op=1,sha=00:0f:10:11:12:13,tha=00:14:15:16:17:18) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=58,tclass=0,hlimit=128,frag=no),icmpv6(type=136,code=0),nd(target=::3,sll=00:05:06:07:08:09,tll=00:0a:0b:0c:0d:0e) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=58,tclass=0,hlimit=128,frag=no),icmpv6(type=136,code=0),nd(target=::3,sll=00:05:06:07:08:09,tll=00:0a:0b:0c:0d:0e),nd_ext(nd_reserved=0x0,nd_options_type=2) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x8847),mpls(label=100,tc=3,ttl=64,bos=1) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x8847),mpls(label=100,tc=7,ttl=100,bos=1) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x8847),mpls(label=100,tc=7,ttl=100,bos=0) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x8848),mpls(label=1000,tc=4,ttl=200,bos=1) -in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x8848),mpls(label=1000,tc=4,ttl=200,bos=0) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15) 
+eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x1234) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0800),ipv4(src=35.8.2.41,dst=172.16.0.20,proto=5,tos=0x80,ttl=128,frag=no) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0800),ipv4(src=35.8.2.41,dst=172.16.0.20,proto=5,tos=0x81,ttl=128,frag=no) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0800),ipv4(src=35.8.2.41,dst=172.16.0.20,proto=5,tos=0x80,ttl=128,frag=first) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0800),ipv4(src=35.8.2.41,dst=172.16.0.20,proto=5,tos=0x80,ttl=128,frag=later) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0800),ipv4(src=35.8.2.41,dst=172.16.0.20,proto=6,tos=0,ttl=128,frag=no),tcp(src=80,dst=8080) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0800),ipv4(src=35.8.2.41,dst=172.16.0.20,proto=17,tos=0,ttl=128,frag=no),udp(src=81,dst=6632) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0800),ipv4(src=35.8.2.41,dst=172.16.0.20,proto=1,tos=0,ttl=128,frag=no),icmp(type=1,code=2) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=10,tclass=0x70,hlimit=128,frag=no) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=10,tclass=0x71,hlimit=128,frag=no) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=10,tclass=0x70,hlimit=128,frag=first) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=10,tclass=0x70,hlimit=128,frag=later) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=6,tclass=0,hlimit=128,frag=no),tcp(src=80,dst=8080) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=17,tclass=0,hlimit=128,frag=no),udp(src=6630,dst=22) 
+eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=58,tclass=0,hlimit=128,frag=no),icmpv6(type=1,code=2) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=58,tclass=0,hlimit=128,frag=no),icmpv6(type=136,code=0),nd(target=::3,sll=00:05:06:07:08:09,tll=00:0a:0b:0c:0d:0e) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x0806),arp(sip=1.2.3.4,tip=5.6.7.8,op=1,sha=00:0f:10:11:12:13,tha=00:14:15:16:17:18) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=58,tclass=0,hlimit=128,frag=no),icmpv6(type=136,code=0),nd(target=::3,sll=00:05:06:07:08:09,tll=00:0a:0b:0c:0d:0e) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=58,tclass=0,hlimit=128,frag=no),icmpv6(type=136,code=0),nd(target=::3,sll=00:05:06:07:08:09,tll=00:0a:0b:0c:0d:0e),nd_ext(nd_reserved=0x0,nd_options_type=2) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x8847),mpls(label=100,tc=3,ttl=64,bos=1) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x8847),mpls(label=100,tc=7,ttl=100,bos=1) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x8847),mpls(label=100,tc=7,ttl=100,bos=0) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x8848),mpls(label=1000,tc=4,ttl=200,bos=1) +eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x8848),mpls(label=1000,tc=4,ttl=200,bos=0) ]) (echo '# Valid forms without tun_id or VLAN header.' - sed 's/^/skb_priority(0),skb_mark(0),recirc_id(0),dp_hash(0),/' odp-base.txt - - sed ' -s/^/skb_priority(0),skb_mark(0),recirc_id(0),dp_hash(0),/ -' odp-base.txt - + sed 's/^/recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),/' odp-base.txt echo echo '# Valid forms with tunnel header.' 
- sed 's/^/skb_priority(0),tunnel(tun_id=0x7f10354,src=10.10.10.10,dst=20.20.20.20,ttl=64,flags(csum|key)),skb_mark(0x1234),recirc_id(0),dp_hash(0),/' odp-base.txt + sed 's/^/recirc_id(0),dp_hash(0),skb_priority(0),tunnel(tun_id=0x7f10354,src=10.10.10.10,dst=20.20.20.20,ttl=64,flags(csum|key)),in_port(1),skb_mark(0x1234),/' odp-base.txt echo echo '# Valid forms with VLAN header.' - sed 's/^/skb_priority(0),skb_mark(0),recirc_id(0),dp_hash(0),/ + sed 's/^/recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),/ s/\(eth([[^)]]*)\),*/\1,eth_type(0x8100),vlan(vid=99,pcp=7),encap(/ s/$/)/' odp-base.txt echo echo '# Valid forms with MPLS header.' - sed 's/^/skb_priority(0),skb_mark(0),recirc_id(0),dp_hash(0),/ + sed 's/^/recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),/ s/\(eth([[^)]]*),?\)/\1,eth_type(0x8847),mpls(label=100,tc=7,ttl=64,bos=1)/' odp-base.txt echo echo '# Valid forms with MPLS multicast header.' - sed 's/^/skb_priority(0),skb_mark(0),recirc_id(0),dp_hash(0),/ + sed 's/^/recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),/ s/\(eth([[^)]]*),?\)/\1,eth_type(0x8848),mpls(label=100,tc=7,ttl=64,bos=1)/' odp-base.txt echo echo '# Valid forms with tunnel and VLAN headers.' - sed 's/^/skb_priority(0),tunnel(tun_id=0xfedcba9876543210,src=10.0.0.1,dst=10.0.0.2,tos=0x8,ttl=128,flags(key)),skb_mark(0),recirc_id(0),dp_hash(0),/ + sed 's/^/recirc_id(0),dp_hash(0),skb_priority(0),tunnel(tun_id=0xfedcba9876543210,src=10.0.0.1,dst=10.0.0.2,tos=0x8,ttl=128,flags(key)),in_port(1),skb_mark(0),/ s/\(eth([[^)]]*)\),*/\1,eth_type(0x8100),vlan(vid=99,pcp=7),encap(/ s/$/)/' odp-base.txt echo echo '# Valid forms with QOS priority, tunnel, and VLAN headers.' 
- sed 's/^/skb_priority(0x1234),tunnel(tun_id=0xfedcba9876543210,src=10.10.10.10,dst=20.20.20.20,tos=0x8,ttl=64,flags(key)),skb_mark(0),recirc_id(0),dp_hash(0),/ + sed 's/^/recirc_id(0),dp_hash(0),skb_priority(0x1234),tunnel(tun_id=0xfedcba9876543210,src=10.10.10.10,dst=20.20.20.20,tos=0x8,ttl=64,flags(key)),in_port(1),skb_mark(0),/ s/\(eth([[^)]]*)\),*/\1,eth_type(0x8100),vlan(vid=99,pcp=7),encap(/ s/$/)/' odp-base.txt echo echo '# Valid forms with conntrack fields.' - sed 's/^/skb_priority(0),skb_mark(0),ct_mark(0x12345678),ct_label(0x1234567890abcdef1234567890abcdef),recirc_id(0),dp_hash(0),/' odp-base.txt + sed 's/^/recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_mark(0x12345678),ct_label(0x1234567890abcdef1234567890abcdef),/' odp-base.txt echo echo '# Valid forms with IP first fragment.' -sed 's/^/skb_priority(0),skb_mark(0),recirc_id(0),dp_hash(0),/' odp-base.txt | sed -n 's/,frag=no),/,frag=first),/p' + sed 's/^/recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),/' odp-base.txt | sed -n 's/,frag=no),/,frag=first),/p' echo echo '# Valid forms with IP later fragment.' -sed 's/^/skb_priority(0),skb_mark(0),recirc_id(0),dp_hash(0),/' odp-base.txt | sed -n 's/,frag=no),.*/,frag=later)/p' + sed 's/^/recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),/' odp-base.txt | sed -n 's/,frag=no),.*/,frag=later)/p' echo echo '# Valid forms with tunnel and ERSPAN v1 headers.' - sed 's/^/skb_priority(0),tunnel(tun_id=0xfedcba9876543210,src=10.0.0.1,dst=10.0.0.2,ttl=128,erspan(ver=1,idx=0x7),flags(df|key)),skb_mark(0),recirc_id(0),dp_hash(0),/' odp-base.txt + sed 's/^/recirc_id(0),dp_hash(0),skb_priority(0),tunnel(tun_id=0xfedcba9876543210,src=10.0.0.1,dst=10.0.0.2,ttl=128,erspan(ver=1,idx=0x7),flags(df|key)),in_port(1),skb_mark(0),/' odp-base.txt echo echo '# Valid forms with tunnel and ERSPAN v2 headers.' 
- sed 's/^/skb_priority(0),tunnel(tun_id=0xfedcba9876543210,src=10.0.0.1,dst=10.0.0.2,ttl=128,erspan(ver=2,dir=1,hwid=0x7),flags(df|key)),skb_mark(0),recirc_id(0),dp_hash(0),/' odp-base.txt + sed 's/^/recirc_id(0),dp_hash(0),skb_priority(0),tunnel(tun_id=0xfedcba9876543210,src=10.0.0.1,dst=10.0.0.2,ttl=128,erspan(ver=2,dir=1,hwid=0x7),flags(df|key)),in_port(1),skb_mark(0),/' odp-base.txt ) > odp-in.txt AT_CAPTURE_FILE([odp-in.txt]) @@ -102,11 +97,12 @@ s/^/ODP_FIT_TOO_LITTLE: / dnl Some fields are always printed for this test, because wildcards aren't dnl specified. We can skip these. sed -i'back' 's/\(skb_mark(0)\),\(ct\)/\1,ct_state(0),ct_zone(0),\2/' odp-out.txt -sed -i'back' 's/\(skb_mark([[^)]]*)\),\(recirc\)/\1,ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),\2/' odp-out.txt -sed -i'back' 's/\(in_port(1)\),\(eth\)/\1,packet_type(ns=0,id=0),\2/' odp-out.txt +sed -i'back' 's/\(skb_mark([[^)]]*)\),\(eth\)/\1,ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),\2/' odp-out.txt +sed -i'back' 's/\(ct_label([[^)]]*)\),\(eth\)/\1,packet_type(ns=0,id=0),\2/' odp-out.txt AT_CHECK_UNQUOTED([ovstest test-odp parse-keys < odp-in.txt], [0], [`cat odp-out.txt` ]) +AT_CHECK_UNQUOTED([cat odp-in.txt | sed 's/^#.*//' | sed 's/$/ actions:drop/' | test-dpparse.py]) AT_CLEANUP AT_SETUP([OVS datapath wildcarded key parsing and formatting - valid forms]) @@ -194,6 +190,7 @@ sed -n 's/,frag=no),.*/,frag=later)/p' odp-base.txt AT_CAPTURE_FILE([odp.txt]) AT_CHECK_UNQUOTED([ovstest test-odp parse-wc-keys < odp.txt], [0], [`cat odp.txt` ]) +AT_CHECK_UNQUOTED([cat odp.txt | sed 's/^#.*//' | sed 's/$/ actions:drop/' | test-dpparse.py]) AT_CLEANUP AT_SETUP([OVS datapath wildcarded key filtering.]) @@ -241,24 +238,31 @@ in_port(1),eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15),eth_type(0x86dd),ipv ]) AT_CHECK_UNQUOTED([ovstest test-odp parse-filter filter='dl_type=0x1235' < odp-base.txt], [0], [`cat odp-eth-type.txt` ]) +AT_CHECK_UNQUOTED([cat odp-eth-type.txt | sed 's/^#.*//' | sed 's/$/ 
actions:drop/' | test-dpparse.py]) AT_CHECK_UNQUOTED([ovstest test-odp parse-filter filter='dl_vlan=99' < odp-vlan-base.txt], [0], [`cat odp-vlan.txt` ]) +AT_CHECK_UNQUOTED([cat odp-vlan.txt | sed 's/^#.*//' | sed 's/$/ actions:drop/' | test-dpparse.py]) AT_CHECK_UNQUOTED([ovstest test-odp parse-filter filter='dl_vlan=99,ip' < odp-vlan-base.txt], [0], [`cat odp-vlan.txt` ]) AT_CHECK_UNQUOTED([ovstest test-odp parse-filter filter='ip,nw_src=35.8.2.199' < odp-base.txt], [0], [`cat odp-ipv4.txt` ]) AT_CHECK_UNQUOTED([ovstest test-odp parse-filter filter='ip,nw_dst=172.16.0.199' < odp-base.txt], [0], [`cat odp-ipv4.txt` ]) +AT_CHECK_UNQUOTED([cat odp-ipv4.txt | sed 's/^#.*//' | sed 's/$/ actions:drop/' | test-dpparse.py]) AT_CHECK_UNQUOTED([ovstest test-odp parse-filter filter='dl_type=0x0800,nw_src=35.8.2.199,nw_dst=172.16.0.199' < odp-base.txt], [0], [`cat odp-ipv4.txt` ]) AT_CHECK_UNQUOTED([ovstest test-odp parse-filter filter='icmp,nw_src=35.8.2.199' < odp-base.txt], [0], [`cat odp-icmp.txt` ]) +AT_CHECK_UNQUOTED([cat odp-icmp.txt | sed 's/^#.*//' | sed 's/$/ actions:drop/' | test-dpparse.py]) AT_CHECK_UNQUOTED([ovstest test-odp parse-filter filter='arp,arp_spa=1.2.3.5' < odp-base.txt], [0], [`cat odp-arp.txt` ]) +AT_CHECK_UNQUOTED([cat odp-arp.txt | sed 's/^#.*//' | sed 's/$/ actions:drop/' | test-dpparse.py]) AT_CHECK_UNQUOTED([ovstest test-odp parse-filter filter='tcp,tp_src=90' < odp-base.txt], [0], [`cat odp-tcp.txt` ]) +AT_CHECK_UNQUOTED([cat odp-tcp.txt | sed 's/^#.*//' | sed 's/$/ actions:drop/' | test-dpparse.py]) AT_CHECK_UNQUOTED([ovstest test-odp parse-filter filter='tcp6,tp_src=90' < odp-base.txt], [0], [`cat odp-tcp6.txt` ]) +AT_CHECK_UNQUOTED([cat odp-tcp6.txt | sed 's/^#.*//' | sed 's/$/ actions:drop/' | test-dpparse.py]) AT_CLEANUP AT_SETUP([OVS datapath actions parsing and formatting - valid forms]) @@ -338,6 +342,8 @@ tnl_push(tnl_port(6),header(size=70,type=4,eth(dst=f8:bc:12:44:34:b6,src=f8:bc:1 
tnl_push(tnl_port(6),header(size=70,type=5,eth(dst=f8:bc:12:44:34:b6,src=f8:bc:12:46:58:e0,dl_type=0x86dd),ipv6(src=2001:cafe::88,dst=2001:cafe::92,label=0,proto=17,tclass=0x0,hlimit=64),udp(src=0,dst=6081,csum=0x0),geneve(oam,vni=0x1c7)),out_port(1)) tnl_push(tnl_port(6),header(size=78,type=5,eth(dst=f8:bc:12:44:34:b6,src=f8:bc:12:46:58:e0,dl_type=0x86dd),ipv6(src=2001:cafe::88,dst=2001:cafe::92,label=0,proto=17,tclass=0x0,hlimit=64),udp(src=0,dst=6081,csum=0x0),geneve(crit,vni=0x1c7,options({class=0xffff,type=0x80,len=4,0xa}))),out_port(1)) tnl_push(tnl_port(6),header(size=70,type=5,eth(dst=f8:bc:12:44:34:b6,src=f8:bc:12:46:58:e0,dl_type=0x86dd),ipv6(src=2001:cafe::88,dst=2001:cafe::92,label=0,proto=17,tclass=0x0,hlimit=64),udp(src=0,dst=6081,csum=0xffff),geneve(vni=0x1c7)),out_port(1)) +tnl_push(tnl_port(6),header(size=78,type=112,eth(dst=f8:bc:12:44:34:b6,src=f8:bc:12:46:58:e0,dl_type=0x86dd),ipv6(src=2001:cafe::88,dst=2001:cafe::92,label=0,proto=43,tclass=0x0,hlimit=64),srv6(segments_left=0,segs(2001:cafe::90))),out_port(1)) +tnl_push(tnl_port(6),header(size=110,type=112,eth(dst=f8:bc:12:44:34:b6,src=f8:bc:12:46:58:e0,dl_type=0x86dd),ipv6(src=2001:cafe::88,dst=2001:cafe::92,label=0,proto=43,tclass=0x0,hlimit=64),srv6(segments_left=2,segs(2001:cafe::90,2001:cafe::91,2001:cafe::92))),out_port(1)) ct ct(commit) ct(commit,zone=5) @@ -348,7 +354,9 @@ ct(commit,helper=tftp) ct(commit,timeout=ovs_tp_1_tcp4) ct(nat) ct(commit,nat(src)) +ct(commit,timeout=ovs_tp_1_tcp4,nat(src)) ct(commit,nat(dst)) +ct(commit,timeout=ovs_tp_1_tcp4,nat(dst)) ct(commit,nat(src=10.0.0.240,random)) ct(commit,nat(src=10.0.0.240:32768-65535,random)) ct(commit,nat(dst=10.0.0.128-10.0.0.254,hash)) @@ -385,16 +393,39 @@ check_pkt_len(size=200,gt(ct(nat)),le(drop)) check_pkt_len(size=200,gt(set(eth(src=00:01:02:03:04:05,dst=10:11:12:13:14:15))),le(set(eth(src=00:01:02:03:04:06,dst=10:11:12:13:14:16)))) lb_output(1) add_mpls(label=200,tc=7,ttl=64,bos=1,eth_type=0x8847) 
+psample(group=12,cookie=0xf1020304050607080910111213141516) +psample(group=12) +sample(sample=50.0%,actions(psample(group=12,cookie=0xf1020304))) +sample(sample=50.0%,actions(userspace(pid=42,userdata(0102030400000000)),psample(group=12))) ]) AT_CHECK_UNQUOTED([ovstest test-odp parse-actions < actions.txt], [0], [`cat actions.txt` ]) +AT_CHECK_UNQUOTED([cat actions.txt | sed 's/^/actions:/' | test-dpparse.py]) AT_CLEANUP AT_SETUP([OVS datapath actions parsing and formatting - invalid forms]) dnl This caused a hang in older versions. -AT_CHECK([echo 'encap_nsh@:{@' | ovstest test-odp parse-actions -], [0], [dnl +AT_DATA([actions.txt], [dnl +encap_nsh@:{@ +tnl_push(tnl_port(6),header(size=94,type=112,eth(dst=f8:bc:12:44:34:b6,src=f8:bc:12:46:58:e0,dl_type=0x86dd),ipv6(src=2001:cafe::88,dst=2001:cafe::92,label=0,proto=43,tclass=0x0,hlimit=64),srv6(segments_left=2,segs(2001:cafe::90,2001:cafe::91))),out_port(1)) +tnl_push(tnl_port(6),header(size=126,type=112,eth(dst=f8:bc:12:44:34:b6,src=f8:bc:12:46:58:e0,dl_type=0x86dd),ipv6(src=2001:cafe::88,dst=2001:cafe::92,label=0,proto=43,tclass=0x0,hlimit=64),srv6(segments_left=2,segs(2001:cafe::90,2001:cafe::91,2001:cafe::92,2001:cafe::93))),out_port(1)) +psample(group_id=12,cookie=0x0102030405060708090a0b0c0d0e0f0f0f) +psample(cookie=0x010203) +psample(group=12,cookie=0x010203,group=12) +psample(group=abc) +psample(group=12,cookie=wrong) +psample() +]) +AT_CHECK_UNQUOTED([ovstest test-odp parse-actions < actions.txt], [0], [dnl +odp_actions_from_string: error +odp_actions_from_string: error +odp_actions_from_string: error +odp_actions_from_string: error +odp_actions_from_string: error +odp_actions_from_string: error +odp_actions_from_string: error +odp_actions_from_string: error odp_actions_from_string: error ]) AT_CLEANUP @@ -434,6 +465,7 @@ odp_actions_from_string: error `cat actions.txt | head -3 | tail -1` odp_actions_from_string: error ]) +AT_CHECK_UNQUOTED([cat actions.txt | sed 's/^/actions:/' | test-dpparse.py]) 
AT_CLEANUP AT_SETUP([OVS datapath actions parsing and formatting - actions too long]) diff --git a/tests/ofp-actions.at b/tests/ofp-actions.at index 9d820eba6d4..8a0504b3cb3 100644 --- a/tests/ofp-actions.at +++ b/tests/ofp-actions.at @@ -136,6 +136,9 @@ ffff 0020 00002320 0026 3039 00005BA0 00008707 0000B26E DDD50000 00000000 # actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,sampling_port=56789,egress) ffff 0020 00002320 0029 3039 00005BA0 00008707 0000B26E DDD50200 00000000 +# actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[],obs_point_id=NXM_NX_CT_LABEL[32..63],sampling_port=0) +ffff 0028 00002320 0033 3039 00005ba0 00000002 000f0000 0001d810 081f0000 0000 000000000000 + # bad OpenFlow10 actions: OFPBAC_BAD_LEN & ofp_actions|WARN|OpenFlow action OFPAT_OUTPUT length 240 exceeds action buffer length 8 & ofp_actions|WARN|bad action at offset 0 (OFPBAC_BAD_LEN): @@ -329,6 +332,7 @@ AT_CAPTURE_FILE([experr]) AT_CHECK( [ovs-ofctl '-vPATTERN:console:%c|%p|%m' parse-actions OpenFlow10 < input.txt], [0], [expout], [experr]) +AT_CHECK([cat expout | grep 'actions=' | test-ofparse.py]) AT_CLEANUP AT_SETUP([OpenFlow 1.0 "instruction" translations]) @@ -359,6 +363,7 @@ AT_CAPTURE_FILE([experr]) AT_CHECK( [ovs-ofctl '-vPATTERN:console:%c|%p|%m' parse-instructions OpenFlow10 < input.txt], [0], [expout], [experr]) +AT_CHECK([cat expout | grep 'actions=' | test-ofparse.py]) AT_CLEANUP AT_SETUP([OpenFlow 1.1 action translation]) @@ -487,6 +492,9 @@ ffff 0020 00002320 0015 000500000000 80003039005A02fd 0400000000000000 # actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678) ffff 0018 00002320 001d 3039 00005BA0 00008707 0000B26E +# actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[],obs_point_id=NXM_NX_CT_LABEL[32..63],sampling_port=0) +ffff 0028 00002320 0033 3039 00005ba0 00000002 000f0000 0001d810 081f0000 0000 
000000000000 + # bad OpenFlow11 actions: OFPBAC_BAD_OUT_PORT & ofp_actions|WARN|bad action at offset 0 (OFPBAC_BAD_OUT_PORT): & 00000000 00 00 00 10 ff ff ff ff-00 00 00 00 00 00 00 00 @@ -502,6 +510,7 @@ AT_CAPTURE_FILE([experr]) AT_CHECK( [ovs-ofctl '-vPATTERN:console:%c|%p|%m' parse-actions OpenFlow11 < input.txt], [0], [expout], [experr]) +AT_CHECK([cat expout | grep 'actions=' | test-ofparse.py]) AT_CLEANUP AT_SETUP([OpenFlow 1.1 instruction translation]) @@ -737,6 +746,7 @@ AT_CAPTURE_FILE([experr]) AT_CHECK( [ovs-ofctl '-vPATTERN:console:%c|%p|%m' parse-actions OpenFlow12 < input.txt], [0], [expout], [experr]) +AT_CHECK([cat expout | grep 'actions=' | test-ofparse.py]) AT_CLEANUP dnl Our primary goal here is to verify OpenFlow 1.3-specific changes, @@ -798,6 +808,7 @@ AT_CAPTURE_FILE([experr]) AT_CHECK( [ovs-ofctl '-vPATTERN:console:%c|%p|%m' parse-actions OpenFlow13 < input.txt], [0], [expout], [experr]) +AT_CHECK([cat expout | grep 'actions=' | test-ofparse.py]) AT_CLEANUP dnl Our primary goal here is to verify that OpenFlow 1.5-specific changes, @@ -827,17 +838,20 @@ AT_CAPTURE_FILE([experr]) AT_CHECK( [ovs-ofctl '-vPATTERN:console:%c|%p|%m' parse-actions OpenFlow15 < input.txt], [0], [expout], [experr]) +AT_CHECK([cat expout | grep 'actions=' | test-ofparse.py]) AT_CLEANUP AT_SETUP([ofp-actions - inconsistent MPLS actions]) OVS_VSWITCHD_START dnl OK: Use fin_timeout action on TCP flow AT_CHECK([ovs-ofctl -O OpenFlow11 -vwarn add-flow br0 'tcp actions=fin_timeout(idle_timeout=1)']) +AT_CHECK([echo 'tcp actions=fin_timeout(idle_timeout=1)' | test-ofparse.py]) dnl Bad: Use fin_timeout action on TCP flow that has been converted to MPLS AT_CHECK([ovs-ofctl -O OpenFlow11 -vwarn add-flow br0 'tcp actions=push_mpls:0x8847,fin_timeout(idle_timeout=1)'], [1], [], [dnl ovs-ofctl: none of the usable flow formats (OpenFlow10,NXM) is among the allowed flow formats (OpenFlow11) ]) +AT_CHECK([echo 'tcp actions=push_mpls:0x8847,fin_timeout(idle_timeout=1)' | 
test-ofparse.py]) OVS_VSWITCHD_STOP AT_CLEANUP @@ -853,6 +867,8 @@ AT_CHECK([ovs-ofctl -O OpenFlow10 dump-flows br0 | ofctl_strip], [0], [dnl NXST_FLOW reply: mpls actions=load:0xa->OXM_OF_MPLS_LABEL[[]] ]) +AT_CHECK([echo 'mpls actions=set_field:10->mpls_label' | test-ofparse.py]) +AT_CHECK([echo 'mpls actions=load:0xa->OXM_OF_MPLS_LABEL[[]]'| test-ofparse.py]) OVS_VSWITCHD_STOP AT_CLEANUP @@ -862,14 +878,17 @@ OVS_VSWITCHD_START dnl OpenFlow 1.0 has an "enqueue" action. For OpenFlow 1.1+, we translate dnl it to a series of actions that accomplish the same thing. AT_CHECK([ovs-ofctl -O OpenFlow10 add-flow br0 'actions=enqueue(123,456)']) +AT_CHECK([echo 'actions=enqueue(123,456)' | test-ofparse.py]) AT_CHECK([ovs-ofctl -O OpenFlow10 dump-flows br0 | ofctl_strip], [0], [dnl NXST_FLOW reply: actions=enqueue:123:456 ]) +AT_CHECK([echo 'actions=enqueue:123:456' | test-ofparse.py]) AT_CHECK([ovs-ofctl -O OpenFlow13 dump-flows br0 | ofctl_strip], [0], [dnl OFPST_FLOW reply (OF1.3): reset_counts actions=set_queue:456,output:123,pop_queue ]) +AT_CHECK([echo 'actions=set_queue:456,output:123,pop_queue' | test-ofparse.py]) OVS_VSWITCHD_STOP AT_CLEANUP @@ -887,6 +906,8 @@ AT_CHECK([ovs-ofctl -O OpenFlow11 dump-flows br0 | ofctl_strip], [0], [dnl OFPST_FLOW reply (OF1.1): ip actions=mod_nw_ttl:123 ]) +AT_CHECK([echo 'ip,actions=mod_nw_ttl:123' | test-ofparse.py]) +AT_CHECK([echo 'ip actions=load:0x7b->NXM_NX_IP_TTL[[]]' | test-ofparse.py]) OVS_VSWITCHD_STOP AT_CLEANUP @@ -898,10 +919,12 @@ dnl OpenFlow 1.1, but no other version, has a "mod_nw_ecn" action. dnl Check that we translate it properly for OF1.0 and OF1.2. dnl (OF1.3+ should be the same as OF1.2.) 
AT_CHECK([ovs-ofctl -O OpenFlow11 add-flow br0 'ip,actions=mod_nw_ecn:2']) +AT_CHECK([echo 'ip,actions=mod_nw_ecn:2' | test-ofparse.py]) AT_CHECK([ovs-ofctl -O OpenFlow10 dump-flows br0 | ofctl_strip], [0], [dnl NXST_FLOW reply: ip actions=load:0x2->NXM_NX_IP_ECN[[]] ]) +AT_CHECK([echo 'ip actions=load:0x2->NXM_NX_IP_ECN[[]]' | test-ofparse.py]) AT_CHECK([ovs-ofctl -O OpenFlow11 dump-flows br0 | ofctl_strip], [0], [dnl OFPST_FLOW reply (OF1.1): ip actions=mod_nw_ecn:2 @@ -910,6 +933,7 @@ AT_CHECK([ovs-ofctl -O OpenFlow12 dump-flows br0 | ofctl_strip], [0], [dnl OFPST_FLOW reply (OF1.2): ip actions=set_field:2->nw_ecn ]) +AT_CHECK([echo 'ip actions=set_field:2->nw_ecn' | test-ofparse.py]) dnl Check that OF1.2+ set_field to set ECN is translated into the OF1.1 dnl mod_nw_ecn action. @@ -1103,6 +1127,10 @@ bad_action 'unroll_xlate' "UNROLL is an internal action that shouldn't be used v # sample bad_action 'sample(probability=0)' 'invalid probability value "0"' bad_action 'sample(sampling_port=asdf)' 'asdf: unknown port' +bad_action 'sample(probability=12345,obs_domain_id=NXM_NX_CT_LABEL[[5..40]])' \ + 'size of obs_domain_id field (36) exceeds maximum (32)' +bad_action 'sample(probability=12345,obs_point_id=NXM_NX_CT_LABEL[[0..32]])' \ + 'size of obs_point_id field (33) exceeds maximum (32)' bad_action 'sample(foo=bar)' 'invalid key "foo" in "sample" argument' bad_action 'sample' 'non-zero "probability" must be specified on sample' diff --git a/tests/ofp-print.at b/tests/ofp-print.at index fe41cc42c7f..b2e69c10038 100644 --- a/tests/ofp-print.at +++ b/tests/ofp-print.at @@ -4073,3 +4073,213 @@ AT_CHECK([ovs-ofctl ofp-print "\ NXT_CT_FLUSH_ZONE (xid=0x3): zone_id=13 ]) AT_CLEANUP + +AT_SETUP([NXT_CT_FLUSH]) +AT_KEYWORDS([ofp-print]) +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 18 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +"], [0], [dnl +NXT_CT_FLUSH (xid=0x3): zone=0 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0,ct_nw_proto=6' 
'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0' +]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 20 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 02 00 08 00 0d 00 00 \ +"], [0], [dnl +NXT_CT_FLUSH (xid=0x3): zone=13 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0,ct_nw_proto=6' 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0' +]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 20 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 03 00 08 00 00 00 ab \ +"], [0], [dnl +NXT_CT_FLUSH (xid=0x3): zone=0 mark=0xab 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0,ct_nw_proto=6' 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0' +]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 20 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 04 00 08 00 00 00 cd \ +"], [0], [dnl +NXT_CT_FLUSH (xid=0x3): zone=0 mark=0/0xcd 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0,ct_nw_proto=6' 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0' +]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 28 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 03 00 08 00 00 00 ab \ +00 04 00 08 00 00 00 cd \ +"], [0], [dnl +NXT_CT_FLUSH (xid=0x3): zone=0 mark=0xab/0xcd 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0,ct_nw_proto=6' 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0' +]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 30 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 05 00 14 00 00 00 00 00 00 00 00 00 00 00 00 00 ff ab 00 00 00 00 00 \ +"], [0], [dnl +NXT_CT_FLUSH (xid=0x3): zone=0 labels=0xffab00 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0,ct_nw_proto=6' 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0' +]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 30 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 06 00 14 00 00 00 00 00 00 00 00 00 00 00 00 00 ff cd 00 00 00 00 00 \ +"], [0], 
[dnl +NXT_CT_FLUSH (xid=0x3): zone=0 labels=0/0xffcd00 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0,ct_nw_proto=6' 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0' +]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 48 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 05 00 14 00 00 00 00 00 00 00 00 00 00 00 00 00 ff ab 00 00 00 00 00 \ +00 06 00 14 00 00 00 00 00 00 00 00 00 00 00 00 00 ff cd 00 00 00 00 00 \ +"], [0], [dnl +NXT_CT_FLUSH (xid=0x3): zone=0 labels=0xffab00/0xffcd00 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0,ct_nw_proto=6' 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0' +]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 38 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 03 00 08 00 00 00 ab \ +00 05 00 14 00 00 00 00 00 00 00 00 00 00 00 00 00 ff ab 00 00 00 00 00 \ +"], [0], [dnl +NXT_CT_FLUSH (xid=0x3): zone=0 mark=0xab labels=0xffab00 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0,ct_nw_proto=6' 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0' +]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 58 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 03 00 08 00 00 00 ab \ +00 04 00 08 00 00 00 cd \ +00 05 00 14 00 00 00 00 00 00 00 00 00 00 00 00 00 ff ab 00 00 00 00 00 \ +00 06 00 14 00 00 00 00 00 00 00 00 00 00 00 00 00 ff cd 00 00 00 00 00 \ +"], [0], [dnl +NXT_CT_FLUSH (xid=0x3): zone=0 mark=0xab/0xcd labels=0xffab00/0xffcd00 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0,ct_nw_proto=6' 'ct_ipv6_src=::,ct_ipv6_dst=::,ct_tp_src=0,ct_tp_dst=0' +]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 68 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 02 00 08 00 0d 00 00 \ +00 00 00 48 00 00 00 00 \ +00 00 00 14 00 00 00 00 00 00 00 00 00 00 ff ff 0a 0a 00 01 00 00 00 00 \ +00 01 00 14 00 00 00 00 00 00 00 00 00 00 ff ff 0a 0a 00 02 00 00 00 00 \ +00 02 00 08 00 50 00 00 \ +00 03 00 08 1f 90 00 00 \ +"], [0], 
[dnl +NXT_CT_FLUSH (xid=0x3): zone=13 'ct_nw_src=10.10.0.1,ct_nw_dst=10.10.0.2,ct_tp_src=80,ct_tp_dst=8080,ct_nw_proto=6' 'ct_nw_src=::,ct_nw_dst=::,ct_tp_src=0,ct_tp_dst=0' +]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 68 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 02 00 08 00 0d 00 00 \ +00 01 00 48 00 00 00 00 \ +00 01 00 14 00 00 00 00 00 00 00 00 00 00 ff ff 0a 0a 00 01 00 00 00 00 \ +00 00 00 14 00 00 00 00 00 00 00 00 00 00 ff ff 0a 0a 00 02 00 00 00 00 \ +00 03 00 08 00 50 00 00 \ +00 02 00 08 1f 90 00 00 \ +"], [0], [dnl +NXT_CT_FLUSH (xid=0x3): zone=13 'ct_nw_src=::,ct_nw_dst=::,ct_tp_src=0,ct_tp_dst=0,ct_nw_proto=6' 'ct_nw_src=10.10.0.2,ct_nw_dst=10.10.0.1,ct_tp_src=8080,ct_tp_dst=80' +]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 b0 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 02 00 08 00 0d 00 00 \ +00 00 00 48 00 00 00 00 \ +00 00 00 14 00 00 00 00 00 00 00 00 00 00 ff ff 0a 0a 00 01 00 00 00 00 \ +00 01 00 14 00 00 00 00 00 00 00 00 00 00 ff ff 0a 0a 00 02 00 00 00 00 \ +00 02 00 08 00 50 00 00 \ +00 03 00 08 1f 90 00 00 \ +00 01 00 48 00 00 00 00 \ +00 01 00 14 00 00 00 00 00 00 00 00 00 00 ff ff 0a 0a 00 01 00 00 00 00 \ +00 00 00 14 00 00 00 00 00 00 00 00 00 00 ff ff 0a 0a 00 02 00 00 00 00 \ +00 03 00 08 00 50 00 00 \ +00 02 00 08 1f 90 00 00 \ +"], [0], [dnl +NXT_CT_FLUSH (xid=0x3): zone=13 'ct_nw_src=10.10.0.1,ct_nw_dst=10.10.0.2,ct_tp_src=80,ct_tp_dst=8080,ct_nw_proto=6' 'ct_nw_src=10.10.0.2,ct_nw_dst=10.10.0.1,ct_tp_src=8080,ct_tp_dst=80' +]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 b8 00 00 00 03 00 00 23 20 00 00 00 20 \ +01 \ +00 00 00 00 00 00 00 \ +00 00 00 50 00 00 00 00 \ +00 00 00 14 fd 18 00 00 00 00 00 00 00 00 ff ff ab cd 00 01 00 00 00 00 \ +00 01 00 14 fd 18 00 00 00 00 00 00 00 00 ff ff ab cd 00 02 00 00 00 00 \ +00 04 00 08 00 0a 00 00 \ +00 05 00 05 01 00 00 00 \ +00 06 00 05 02 00 00 00 \ +00 01 00 50 00 00 00 00 \ +00 01 00 14 fd 18 00 00 00 00 00 00 00 00 
ff ff ab cd 00 02 00 00 00 00 \ +00 00 00 14 fd 18 00 00 00 00 00 00 00 00 ff ff ab cd 00 01 00 00 00 00 \ +00 04 00 08 00 0a 00 00 \ +00 05 00 05 03 00 00 00 \ +00 06 00 05 04 00 00 00 \ +"], [0], [dnl +NXT_CT_FLUSH (xid=0x3): zone=0 'ct_ipv6_src=fd18::ffff:abcd:1,ct_ipv6_dst=fd18::ffff:abcd:2,icmp_id=10,icmp_type=1,icmp_code=2,ct_nw_proto=1' 'ct_ipv6_src=fd18::ffff:abcd:1,ct_ipv6_dst=fd18::ffff:abcd:2,icmp_id=10,icmp_type=3,icmp_code=4' +]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 58 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 02 00 08 00 0d 00 00 \ +00 00 00 38 00 00 00 00 \ +00 00 00 14 00 0a 00 00 00 00 00 00 00 00 ff ff 0a 0a 00 01 00 00 00 00 \ +00 01 00 14 00 00 00 00 00 00 00 00 00 00 ff ff 0a 0a 00 02 00 00 00 00 \ +" | grep -q OFPBPC_BAD_VALUE], [0]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 60 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 02 00 08 00 0d 00 00 \ +00 00 00 20 00 00 00 00 \ +00 00 00 14 00 0a 00 00 00 00 00 00 00 00 ff ff 0a 0a 00 01 00 00 00 00 \ +00 01 00 20 00 00 00 00 \ +00 00 00 14 00 00 00 00 00 00 00 00 00 00 ff ff 0a 0a 00 02 00 00 00 00 \ +" | grep -q OFPBPC_BAD_VALUE], [0]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 20 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 80 00 08 00 00 00 00 \ +"| grep -q OFPBPC_BAD_TYPE], [0], [ignore], [stderr]) +AT_CHECK([grep -q "unknown NXT_CT_FLUSH property type 128" stderr], [0]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 28 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 00 00 10 00 00 00 00 \ +00 80 00 08 00 50 00 00 \ +"| grep -q OFPBPC_BAD_TYPE], [0], [ignore], [stderr]) +AT_CHECK([grep -q "unknown NXT_CT_TUPLE property type 128" stderr], [0]) + +AT_CHECK([ovs-ofctl ofp-print "\ +01 04 00 30 00 00 00 03 00 00 23 20 00 00 00 20 \ +06 \ +00 00 00 00 00 00 00 \ +00 06 00 15 00 00 00 00 00 00 00 00 00 00 00 00 00 ff cd 00 00 00 00 00 \ +" | grep -q OFPBPC_BAD_LEN], [0]) 
+AT_CLEANUP diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at index 8e993c585ff..42fb66de687 100644 --- a/tests/ofproto-dpif.at +++ b/tests/ofproto-dpif.at @@ -62,6 +62,25 @@ AT_CHECK([ovs-appctl coverage/read-counter rev_reconfigure], [0], [dnl OVS_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([ofproto-dpif - malformed lldp autoattach tlv]) +OVS_VSWITCHD_START() +add_of_ports br0 1 + +dnl Enable lldp +AT_CHECK([ovs-vsctl set interface p1 lldp:enable=true]) + +dnl Send a malformed lldp packet +packet="0180c200000ef6b426aa5f0088cc020704f6b426aa5f000403057632060200780c"dnl +"5044454144424545464445414442454546444541444245454644454144424545464445414"dnl +"4424545464445414442454546444541444245454644454144424545464445414442454546"dnl +"4445414442454546fe0500040d0c010000" +AT_CHECK([ovs-appctl netdev-dummy/receive p1 "$packet"], [0], [stdout]) + +OVS_WAIT_UNTIL([grep -q "ISID_VLAN_ASGNS TLV too short" ovs-vswitchd.log]) + +OVS_VSWITCHD_STOP(["/|WARN|ISID_VLAN_ASGNS TLV too short received on/d"]) +AT_CLEANUP + AT_SETUP([ofproto-dpif - active-backup bonding (with primary)]) dnl Create br0 with members p1, p2 and p7, creating bond0 with p1 and @@ -528,6 +547,23 @@ ovs-appctl time/warp 1000 100 ovs-appctl bond/show > bond3.txt AT_CHECK([sed -n '/member p2/,/^$/p' bond3.txt | grep 'hash'], [0], [ignore]) +# Check that both ports going down and back up doesn't break statistics. +AT_CHECK([ovs-appctl netdev-dummy/set-admin-state p1 down], 0, [OK +]) +AT_CHECK([ovs-appctl netdev-dummy/set-admin-state p2 down], 0, [OK +]) +ovs-appctl time/warp 1000 100 +AT_CHECK([ovs-appctl netdev-dummy/set-admin-state p1 up], 0, [OK +]) +AT_CHECK([ovs-appctl netdev-dummy/set-admin-state p2 up], 0, [OK +]) +ovs-appctl time/warp 1000 100 + +AT_CHECK([SEND_TCP_BOND_PKTS([p5], [5], [65500])]) +# We sent 49125 KB of data total in 3 batches. No hash should have more +# than that amount of load. Just checking that it is within 5 digits. 
+AT_CHECK([ovs-appctl bond/show | grep -E '[[0-9]]{6}'], [1]) + OVS_VSWITCHD_STOP() AT_CLEANUP @@ -657,7 +693,7 @@ NXST_FLOW reply: OVS_VSWITCHD_STOP() AT_CLEANUP -AT_SETUP([bond - discard duplicated frames]) +AT_SETUP([ofproto-dpif - bond - discard duplicated frames]) dnl With an active/active non-lacp bond, the default behaviour dnl is to discard multicast frames on the secondary interface. OVS_VSWITCHD_START([dnl @@ -721,7 +757,7 @@ Datapath actions: drop OVS_VSWITCHD_STOP() AT_CLEANUP -AT_SETUP([bond - allow duplicated frames]) +AT_SETUP([ofproto-dpif - bond - allow duplicated frames]) dnl Receiving of duplicated multicast frames should be allowed with 'all_members_active'. OVS_VSWITCHD_START([dnl add-bond br0 bond0 p1 p2 -- dnl @@ -830,7 +866,7 @@ table=2 ip actions=set_field:192.168.3.91->ip_src,output(11) AT_CHECK([ovs-ofctl -O OpenFlow12 add-flows br0 flows.txt]) AT_CHECK([ovs-appctl ofproto/trace br0 'in_port=1,dl_src=50:54:00:00:00:05,dl_dst=50:54:00:00:00:07,dl_type=0x0800,nw_src=192.168.0.1,nw_dst=192.168.0.2,nw_proto=1,nw_tos=0,nw_ttl=128,nw_frag=no,icmp_type=8,icmp_code=0'], [0], [stdout]) AT_CHECK([tail -2 stdout], [0], - [Megaflow: recirc_id=0,eth,ip,in_port=1,nw_src=192.168.0.1,nw_frag=no + [Megaflow: recirc_id=0,eth,icmp,in_port=1,nw_src=192.168.0.1,nw_frag=no Datapath actions: 10,set(ipv4(src=192.168.3.91)),11,set(ipv4(src=192.168.3.90)),13 ]) OVS_VSWITCHD_STOP @@ -893,7 +929,7 @@ AT_CHECK([ovs-appctl ofproto/trace br0 'in_port=1,dl_src=50:54:00:00:00:05,dl_ds # Must match on the source address to be able to restore it's value for # the second bucket AT_CHECK([tail -2 stdout], [0], - [Megaflow: recirc_id=0,eth,ip,in_port=1,nw_src=192.168.0.1,nw_frag=no + [Megaflow: recirc_id=0,eth,icmp,in_port=1,nw_src=192.168.0.1,nw_frag=no Datapath actions: set(ipv4(src=192.168.3.90)),10,set(ipv4(src=192.168.0.1)),11 ]) OVS_VSWITCHD_STOP @@ -911,6 +947,28 @@ AT_CHECK([tail -1 stdout], [0], OVS_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([ofproto-dpif - group with ct and 
dnat recirculation in action list]) +OVS_VSWITCHD_START +add_of_ports br0 1 10 +AT_CHECK([ovs-ofctl -O OpenFlow12 add-group br0 \ + 'group_id=1234,type=all,bucket=ct(nat(dst=10.10.10.7:80),commit,table=2)']) +AT_DATA([flows.txt], [dnl +table=0 ip,ct_state=-trk actions=group:1234 +table=2 ip,ct_state=+trk actions=output:10 +]) +AT_CHECK([ovs-ofctl -O OpenFlow12 add-flows br0 flows.txt]) +AT_CHECK([ovs-appctl ofproto/trace br0 ' + in_port=1,dl_src=50:54:00:00:00:05,dl_dst=50:54:00:00:00:07,dl_type=0x0800, + nw_src=192.168.0.1,nw_dst=192.168.0.2,nw_proto=1,nw_tos=0,nw_ttl=128,nw_frag=no, + icmp_type=8,icmp_code=0 +'], [0], [stdout]) +AT_CHECK([grep 'Datapath actions' stdout], [0], [dnl +Datapath actions: ct(commit,nat(dst=10.10.10.7:80)),recirc(0x1) +Datapath actions: 10 +]) +OVS_VSWITCHD_STOP +AT_CLEANUP + AT_SETUP([ofproto-dpif - group actions have no effect afterwards]) OVS_VSWITCHD_START add_of_ports br0 1 10 @@ -925,7 +983,7 @@ done AT_CHECK([ovs-appctl dpctl/dump-flows | sed 's/dp_hash(.*\/0xf)/dp_hash(0xXXXX\/0xf)/' | sed 's/packets.*actions:/actions:/' | strip_ufid | strip_used | sort], [0], [dnl flow-dump from the main thread: recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:hash(sym_l4(0)),recirc(0x1) -recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(src=192.168.0.1,frag=no), actions:set(ipv4(src=192.168.3.90)),10,set(ipv4(src=192.168.0.1)),10 +recirc_id(0x1),dp_hash(0xXXXX/0xf),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(src=192.168.0.1,proto=1,frag=no), actions:set(ipv4(src=192.168.3.90)),10,set(ipv4(src=192.168.0.1)),10 ]) OVS_VSWITCHD_STOP @@ -940,7 +998,7 @@ AT_CHECK([ovs-appctl ofproto/trace br0 'in_port=1,dl_src=50:54:00:00:00:05,dl_ds # Must match on the source address to be able to restore it's value for # the third bucket AT_CHECK([tail -2 stdout], [0], - [Megaflow: recirc_id=0,eth,ip,in_port=1,nw_src=192.168.0.1,nw_frag=no + [Megaflow: 
recirc_id=0,eth,icmp,in_port=1,nw_src=192.168.0.1,nw_frag=no Datapath actions: set(ipv4(src=192.168.3.90)),10,set(ipv4(src=192.168.0.1)),11 ]) OVS_VSWITCHD_STOP @@ -1517,17 +1575,17 @@ AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=111,tos=0,ttl=2,frag=no)' -generate], [0], [stdout]) AT_CHECK([tail -4 stdout], [0], [ Final flow: ip,in_port=1,vlan_tci=0x0000,dl_src=50:54:00:00:00:05,dl_dst=50:54:00:00:00:07,nw_src=192.168.0.1,nw_dst=192.168.0.2,nw_proto=111,nw_tos=0,nw_ecn=0,nw_ttl=1,nw_frag=no -Megaflow: recirc_id=0,eth,ip,in_port=1,nw_ttl=2,nw_frag=no +Megaflow: recirc_id=0,eth,ip,in_port=1,nw_proto=111,nw_ttl=2,nw_frag=no Datapath actions: set(ipv4(ttl=1)),2,userspace(pid=0,controller(reason=2,dont_send=0,continuation=0,recirc_id=1,rule_cookie=0,controller_id=0,max_len=65535)),4 ]) AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=111,tos=0,ttl=3,frag=no)'], [0], [stdout]) AT_CHECK([tail -2 stdout], [0], - [Megaflow: recirc_id=0,eth,ip,in_port=1,nw_ttl=3,nw_frag=no + [Megaflow: recirc_id=0,eth,ip,in_port=1,nw_proto=111,nw_ttl=3,nw_frag=no Datapath actions: set(ipv4(ttl=2)),2,set(ipv4(ttl=1)),3,4 ]) AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x86dd),ipv6(src=::1,dst=::2,label=0,proto=10,tclass=0x70,hlimit=128,frag=no)'], [0], [stdout]) AT_CHECK([tail -2 stdout], [0], - [Megaflow: recirc_id=0,eth,ipv6,in_port=1,nw_ttl=128,nw_frag=no + [Megaflow: recirc_id=0,eth,ipv6,in_port=1,nw_proto=10,nw_ttl=128,nw_frag=no Datapath actions: set(ipv6(hlimit=127)),2,set(ipv6(hlimit=126)),3,4 ]) @@ -1637,7 +1695,7 @@ AT_CHECK([ovs-vsctl -- \ --id=@q2 create Queue dscp=2], [0], [ignore]) AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 
'in_port(9),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=1.1.1.1,dst=2.2.2.2,proto=1,tos=0xff,ttl=128,frag=no),icmp(type=8,code=0)'], [0], [stdout]) AT_CHECK([tail -2 stdout], [0], - [Megaflow: recirc_id=0,skb_priority=0,eth,ip,in_port=9,nw_tos=252,nw_frag=no + [Megaflow: recirc_id=0,skb_priority=0,eth,icmp,in_port=9,nw_tos=252,nw_frag=no Datapath actions: dnl 100,dnl set(ipv4(tos=0x4/0xfc)),set(skb_priority(0x1)),1,dnl @@ -5162,6 +5220,172 @@ AT_CHECK([tail -1 stdout], [0], OVS_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([ofproto-dpif - mirroring, filter]) +AT_KEYWORDS([mirror mirrors mirroring]) +OVS_VSWITCHD_START +add_of_ports br0 1 2 3 +AT_CHECK([ovs-vsctl \ + set Bridge br0 mirrors=@m -- \ + --id=@p3 get Port p3 -- \ + --id=@m create Mirror name=mymirror select_all=true output_port=@p3 filter="icmp"], [0], [ignore]) + +icmp_flow="eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=1,tos=0,ttl=128,frag=no),icmp(type=8,code=0)" +tcp_flow1="eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=6,tos=0,ttl=128,frag=no),tcp(dst=443)" +tcp_flow2="eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=6,tos=0,ttl=128,frag=no),tcp(dst=80)" + +AT_CHECK([ovs-ofctl del-flows br0]) +AT_CHECK([ovs-ofctl add-flow br0 'actions=normal' ]) + +dnl Add non-matching flows, then change the mirror to match one of the flows, +dnl then add a matching flow. 
+AT_CHECK([ovs-appctl netdev-dummy/receive p1 $icmp_flow]) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 $tcp_flow1]) +AT_CHECK([ovs-vsctl set mirror mymirror filter="tcp"], [0]) +AT_CHECK([ovs-appctl revalidator/wait]) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 $tcp_flow2]) +AT_CHECK([ovs-appctl dpif/dump-flows --names br0 | strip_ufid | strip_used | sort], [0], [dnl +recirc_id(0),in_port(p1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),dnl +eth_type(0x0800),ipv4(proto=1,frag=no), packets:0, bytes:0, used:never, actions:br0,p2 +recirc_id(0),in_port(p1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),dnl +eth_type(0x0800),ipv4(proto=6,frag=no), packets:1, bytes:118, used:0.0s, actions:p3,br0,p2 +]) +AT_CHECK([ovs-appctl dpctl/dump-flows --names | strip_ufid | strip_used | sort], [0], [dnl +flow-dump from the main thread: +recirc_id(0),in_port(p1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),dnl +eth_type(0x0800),ipv4(proto=1,frag=no), packets:0, bytes:0, used:never, actions:br0,p2 +recirc_id(0),in_port(p1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),dnl +eth_type(0x0800),ipv4(proto=6,frag=no), packets:1, bytes:118, used:0.0s, actions:p3,br0,p2 +]) + +AT_CHECK([ovs-ofctl del-flows br0]) +AT_CHECK([ovs-ofctl add-flow br0 "in_port=1 actions=output:2"]) +AT_CHECK([ovs-ofctl add-flow br0 "in_port=2 actions=output:1"]) + +dnl Add mirrored flow after non-mirrored flow. 
+AT_CHECK([ovs-vsctl set mirror mymirror filter="icmp"], [0]) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 $tcp_flow1]) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 $icmp_flow]) +AT_CHECK([ovs-appctl dpif/dump-flows --names br0 | strip_ufid | strip_used | sort], [0], [dnl +recirc_id(0),in_port(p1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),dnl +eth_type(0x0800),ipv4(proto=1,frag=no), packets:1, bytes:106, used:0.0s, actions:p3,p2 +recirc_id(0),in_port(p1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),dnl +eth_type(0x0800),ipv4(proto=6,frag=no), packets:2, bytes:236, used:0.0s, actions:p2 +]) + +dnl Check one direction, only icmp should mirror. +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(1),$icmp_flow"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 3,2 +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(1),$tcp_flow1"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 2 +]) + +dnl Check other direction, only icmp should mirror. +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(2),$icmp_flow"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 3,1 +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(2),$tcp_flow1"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 1 +]) + +dnl Change filter to tcp, only tcp should mirror. 
+AT_CHECK([ovs-vsctl set mirror mymirror filter="tcp"], [0]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(1),$icmp_flow"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 2 +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(1),$tcp_flow1"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 3,2 +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(2),$icmp_flow"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 1 +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(2),$tcp_flow1"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 3,1 +]) + +dnl Invalid filter. Nothing should mirror, error should be logged. +AT_CHECK([ovs-vsctl set mirror mymirror filter="invalid"], [0]) +dnl Setting an in_port is also invalid. +AT_CHECK([ovs-vsctl set mirror mymirror filter="\"in_port=p1\""], [0]) + +dnl Each of the above two lines should produce two log messages. +OVS_WAIT_UNTIL([test $(grep -Ec "filter is invalid|mirror mymirror configuration is invalid" ovs-vswitchd.log) -eq 4]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(1),$icmp_flow"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 2 +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(1),$tcp_flow1"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 2 +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(2),$icmp_flow"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 1 +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(2),$tcp_flow1"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 1 +]) + +dnl Check more complex filter cases with partially overlapping default wildcards. 
+AT_CHECK([ovs-vsctl set mirror mymirror filter="\"tcp,tcp_dst=80\""], [0]) +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(1),$tcp_flow1"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 2 +]) + +dnl Change port number. +AT_CHECK([ovs-appctl dpif-dummy/change-port-number ovs-dummy p1 8]) +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(8),$tcp_flow2"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 3,2 +]) + +dnl Empty filter, all traffic should mirror. +AT_CHECK([ovs-vsctl clear mirror mymirror filter], [0]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(8),$icmp_flow"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 3,2 +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(8),$tcp_flow1"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 3,2 +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(2),$icmp_flow"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 3,8 +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "in_port(2),$tcp_flow1"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 3,8 +]) + +OVS_VSWITCHD_STOP(["/filter is invalid: invalid: unknown field invalid/d +/filter is invalid due to in_port field/d +/mirror mymirror configuration is invalid/d"]) +AT_CLEANUP AT_SETUP([ofproto-dpif - mirroring, select_all]) AT_KEYWORDS([mirror mirrors mirroring]) @@ -5330,7 +5554,7 @@ AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) flow="in_port(1),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=1,tos=0,ttl=128,frag=no),icmp(type=8,code=0)" AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "$flow"], [0], [stdout]) AT_CHECK_UNQUOTED([tail -1 stdout], [0], - [Datapath actions: 3,push_vlan(vid=17,pcp=0),2 + [Datapath actions: 3,push_vlan(vid=17,pcp=0),2,3 ]) 
flow="in_port(2),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=1,tos=0,ttl=128,frag=no),icmp(type=8,code=0)" @@ -5369,7 +5593,7 @@ flow="in_port(2),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x080 AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "$flow"], [0], [stdout]) actual=`tail -1 stdout | sed 's/Datapath actions: //'` -expected="push_vlan(vid=17,pcp=0),1,pop_vlan,push_vlan(vid=12,pcp=0),1,2,100" +expected="push_vlan(vid=12,pcp=0),100,2,1,pop_vlan,push_vlan(vid=17,pcp=0),1,pop_vlan,push_vlan(vid=12,pcp=0),100,2,1" AT_CHECK([ovs-dpctl normalize-actions "$flow" "$expected"], [0], [stdout]) mv stdout expout AT_CHECK([ovs-dpctl normalize-actions "$flow" "$actual"], [0], [expout]) @@ -5637,7 +5861,7 @@ AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) flow="in_port(1),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=1,tos=0,ttl=128,frag=no),icmp(type=8,code=0)" AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "$flow"], [0], [stdout]) AT_CHECK_UNQUOTED([tail -1 stdout], [0], - [Datapath actions: trunc(100),3,push_vlan(vid=17,pcp=0),2 + [Datapath actions: trunc(100),3,push_vlan(vid=17,pcp=0),2,trunc(100),3 ]) flow="in_port(2),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=1,tos=0,ttl=128,frag=no),icmp(type=8,code=0)" @@ -5835,6 +6059,40 @@ OVS_WAIT_UNTIL([check_flows], [ovs-ofctl dump-flows br0]) OVS_VSWITCHD_STOP AT_CLEANUP +dnl Checks for regression against a bug in which OVS dropped packets +dnl originating from a controller passing through a patch port. 
+AT_SETUP([ofproto-dpif - packet-out recirculation OFPP_CONTROLLER and patch port]) +OVS_VSWITCHD_START( + [add-port br0 patch-br1 -- \ + set interface patch-br1 type=patch options:peer=patch-br0 -- \ + add-br br1 -- set bridge br1 datapath-type=dummy fail-mode=secure -- \ + add-port br1 patch-br0 -- set interface patch-br0 type=patch options:peer=patch-br1 +]) + +add_of_ports --pcap br1 1 + +AT_DATA([flows-br0.txt], [dnl +table=0 icmp actions=output:patch-br1 +]) +AT_CHECK([ovs-ofctl add-flows br0 flows-br0.txt]) + +AT_DATA([flows-br1.txt], [dnl +table=0, icmp actions=ct(table=1,zone=1) +table=1, ct_state=+trk, icmp actions=p1 +]) +AT_CHECK([ovs-ofctl add-flows br1 flows-br1.txt]) + +packet=50540000000750540000000508004500005c000000008001b94dc0a80001c0a80002080013fc00000000000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f +AT_CHECK([ovs-ofctl packet-out br0 "in_port=CONTROLLER packet=$packet actions=table"]) + +OVS_WAIT_UNTIL_EQUAL([ovs-ofctl dump-flows -m br1 | grep "ct_state" | ofctl_strip], [dnl + table=1, n_packets=1, n_bytes=106, ct_state=+trk,icmp actions=output:2]) + +OVS_WAIT_UNTIL([ovs-pcap p1-tx.pcap | grep -q "$packet"]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + AT_SETUP([ofproto-dpif - debug_slow action]) OVS_VSWITCHD_START add_of_ports br0 1 2 3 @@ -6125,6 +6383,57 @@ AT_CHECK([test 1 = `$PYTHON3 "$top_srcdir/utilities/ovs-pcap.in" p2-tx.pcap | wc OVS_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([ofproto-dpif - continuation with meters]) +AT_KEYWORDS([continuations pause meters]) +OVS_VSWITCHD_START +add_of_ports br0 1 2 + +dnl Add meter with id=1. 
+AT_CHECK([ovs-ofctl -O OpenFlow13 add-meter br0 'meter=1 pktps bands=type=drop rate=1']) + +AT_DATA([flows.txt], [dnl +table=0 dl_dst=50:54:00:00:00:0a actions=goto_table(1) +table=1 dl_dst=50:54:00:00:00:0a actions=controller(pause,meter_id=1) +]) +AT_CHECK([ovs-ofctl -O OpenFlow13 add-flows br0 flows.txt]) + +on_exit 'kill $(cat ovs-ofctl.pid)' +AT_CAPTURE_FILE([ofctl_monitor.log]) +AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl -P nxt_packet_in \ + --detach --no-chdir --pidfile 2> ofctl_monitor.log]) + +AT_CHECK([ovs-appctl netdev-dummy/receive p1 \ + 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x1234)']) + +OVS_WAIT_UNTIL([test $(wc -l < ofctl_monitor.log) -ge 2]) +OVS_APP_EXIT_AND_WAIT([ovs-ofctl]) +AT_CHECK([cat ofctl_monitor.log], [0], [dnl +NXT_PACKET_IN (xid=0x0): cookie=0x0 total_len=14 in_port=1 (via action) data_len=14 (unbuffered) +vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,dl_type=0x1234 +]) + +AT_CHECK([ovs-appctl revalidator/purge], [0]) +AT_CHECK([ovs-ofctl -O OpenFlow13 dump-flows br0 | ofctl_strip | sort], [0], [dnl + n_packets=1, n_bytes=14, dl_dst=50:54:00:00:00:0a actions=goto_table:1 + table=1, n_packets=1, n_bytes=14, dl_dst=50:54:00:00:00:0a actions=controller(pause,meter_id=1) +OFPST_FLOW reply (OF1.3): +]) + +AT_CHECK([ovs-ofctl -O OpenFlow13 dump-meters br0 | ofctl_strip | sort], [0], [dnl +OFPST_METER_CONFIG reply (OF1.3): +meter=1 pktps bands= +type=drop rate=1 +]) + +AT_CHECK([ovs-ofctl -O OpenFlow13 meter-stats br0 | strip_timers], [0], [dnl +OFPST_METER reply (OF1.3) (xid=0x2): +meter:1 flow_count:0 packet_in_count:1 byte_in_count:14 duration:0.0s bands: +0: packet_count:0 byte_count:0 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + AT_SETUP([ofproto-dpif - continuation with patch port]) AT_KEYWORDS([continuations pause resume]) OVS_VSWITCHD_START( @@ -7287,6 +7596,29 @@ AT_CHECK([ovs-appctl coverage/read-counter mac_learning_static_none_move], [0], OVS_VSWITCHD_STOP AT_CLEANUP 
+AT_SETUP([ofproto-dpif - static-mac learned mac age out]) +OVS_VSWITCHD_START([set bridge br0 fail-mode=standalone -- set bridge br0 other_config:mac-aging-time=5]) +add_of_ports br0 1 2 + +dnl Add some static mac entries. +AT_CHECK([ovs-appctl fdb/add br0 p1 0 50:54:00:00:01:01]) +AT_CHECK([ovs-appctl fdb/add br0 p2 0 50:54:00:00:02:02]) + +dnl Generate some dynamic fdb entries on some ports. +OFPROTO_TRACE([ovs-dummy], [in_port(1),eth(src=60:54:00:00:00:01)], [-generate], [100,2]) +OFPROTO_TRACE([ovs-dummy], [in_port(2),eth(src=60:54:00:00:00:02)], [-generate], [100,1]) + +dnl Waiting for aging out. +ovs-appctl time/warp 20000 + +dnl Count number of expired entries; only the two dynamic ones are expected. +AT_CHECK_UNQUOTED([ovs-appctl fdb/stats-show br0 | grep expired], [0], [dnl + Total number of expired MAC entries : 2 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + AT_SETUP([ofproto-dpif - basic truncate action]) OVS_VSWITCHD_START add_of_ports br0 1 2 3 4 5 @@ -7577,12 +7909,14 @@ dummy@ovs-dummy: hit:0 missed:0 vm1 5/3: (dummy: ifindex=2011) ]) -dnl set up route to 1.1.2.92 via br0 and action=normal +dnl Add 1.1.2.88/24 to br0 and action=normal AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 1.1.2.88/24], [0], [OK ]) -AT_CHECK([ovs-appctl ovs/route/add 1.1.2.92/24 br0], [0], [OK -]) AT_CHECK([ovs-ofctl add-flow br0 action=normal]) +dnl Checking that a local route for added IP was successfully installed. 
+AT_CHECK([ovs-appctl ovs/route/show | grep Cached], [0], [dnl +Cached: 1.1.2.0/24 dev br0 SRC 1.1.2.88 local +]) dnl Prime ARP Cache for 1.1.2.92 AT_CHECK([ovs-appctl netdev-dummy/receive p0 'recirc_id(0),in_port(1),eth(src=f8:bc:12:44:34:b6,dst=ff:ff:ff:ff:ff:ff),eth_type(0x0806),arp(sip=1.1.2.92,tip=1.1.2.88,op=2,sha=f8:bc:12:44:34:b6,tha=00:00:00:00:00:00)']) @@ -7593,10 +7927,13 @@ ovs-vsctl \ --id=@sf create sflow targets=\"127.0.0.1:$SFLOW_PORT\" agent=127.0.0.1 \ header=128 sampling=1 polling=0 -dnl set up route to 192.168.1.2 via br0 +dnl Add 192.168.1.1/16 to br0. AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 192.168.1.1/16], [0], [OK ]) -AT_CHECK([ovs-appctl ovs/route/add 192.168.0.0/16 br0], [0], [OK +dnl Checking that a local route for added IP was successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached | sort], [0], [dnl +Cached: 1.1.2.0/24 dev br0 SRC 1.1.2.88 local +Cached: 192.168.0.0/16 dev br0 SRC 192.168.1.1 local ]) dnl add rule for int-br to force packet onto tunnel. There is no ifindex @@ -8133,6 +8470,61 @@ AT_CHECK([ovs-vsctl destroy Flow_Sample_Collector_Set 1], [0], [ignore]) OVS_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([ofproto-dpif - Flow IPFIX sanity check - from field]) +OVS_VSWITCHD_START +add_of_ports br0 1 2 3 + +AT_CHECK([ovs-vsctl -- --id=@br0 get Bridge br0 \ + -- --id=@ipfix create IPFIX targets=\"127.0.0.1:5500\" \ + -- --id=@cs create Flow_Sample_Collector_Set id=0 \ + bridge=@br0 ipfix=@ipfix], + [0], [ignore]) + +m4_define([SAMPLE_ACTION], + [sample(probability=65535,collector_set_id=1,obs_domain_id=NXM_OF_IN_PORT,obs_point_id=$1)]dnl +) + +dnl Store in_port in obs_domain_id and dp_hash in the obs_point_id. 
+AT_DATA([flows.txt], [dnl +priority=100,arp,action=normal +priority=10,in_port=1,ip actions=SAMPLE_ACTION(NXM_NX_DP_HASH),2 +priority=10,in_port=2,ip actions=SAMPLE_ACTION(NXM_NX_CT_LABEL[[[0..31]]]),1 +priority=10,in_port=3,ip actions=SAMPLE_ACTION(NXM_NX_CT_LABEL[[[10..14]]]),1 +]) +AT_CHECK([ovs-ofctl add-flows br0 flows.txt], [0], [ignore]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy \ + "in_port(1),dp_hash(45),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),\ + ipv4(src=10.10.10.2,dst=10.10.10.1,proto=1,tos=1,ttl=128,frag=no),icmp(type=8,code=0)"], [0], [stdout]) + +AT_CHECK([tail -2 stdout], [0], [dnl +Megaflow: recirc_id=0,dp_hash=0x2d,eth,ip,in_port=1,nw_frag=no +Datapath actions: userspace(pid=0,flow_sample(probability=65535,collector_set_id=1,obs_domain_id=1,obs_point_id=45,output_port=4294967295)),2 +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy \ + "in_port(2),ct_label(0x1234567890abcdef1234567890abcdef),\ + eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),\ + ipv4(src=10.10.10.2,dst=10.10.10.1,proto=1,tos=1,ttl=128,frag=no),icmp(type=8,code=0)"], [0], [stdout]) + +AT_CHECK([tail -2 stdout], [0], [dnl +Megaflow: recirc_id=0,ct_label=0x90abcdef/0xffffffff,eth,ip,in_port=2,nw_frag=no +Datapath actions: userspace(pid=0,flow_sample(probability=65535,collector_set_id=1,obs_domain_id=2,obs_point_id=2427178479,output_port=4294967295)),1 +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy \ + "in_port(3),ct_label(0x1234567890abcdef1234567890abcdef),\ + eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),\ + ipv4(src=10.10.10.2,dst=10.10.10.1,proto=1,tos=1,ttl=128,frag=no),icmp(type=8,code=0)"], [0], [stdout]) + +AT_CHECK([tail -2 stdout], [0], [dnl +Megaflow: recirc_id=0,ct_label=0x4c00/0x7c00,eth,ip,in_port=3,nw_frag=no +Datapath actions: userspace(pid=0,flow_sample(probability=65535,collector_set_id=1,obs_domain_id=3,obs_point_id=19,output_port=4294967295)),1 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + 
AT_SETUP([ofproto-dpif - clone action]) OVS_VSWITCHD_START add_of_ports br0 1 2 3 4 @@ -8708,6 +9100,46 @@ dummy@ovs-dummy: hit:0 missed:0 br1 65534/101: (dummy-internal) p3 3/3: (dummy) ]) + +AT_CHECK([ovs-appctl --format json --pretty dpif/show], [0], [dnl +[{ + "dummy@ovs-dummy": { + "bridges": { + "br0": { + "br0": { + "ofport": "65534", + "port_no": "100", + "type": "dummy-internal"}, + "p1": { + "config": { + "n_rxq": "1", + "n_txq": "1", + "numa_id": "0"}, + "ofport": "1", + "port_no": "1", + "type": "dummy-pmd"}, + "p2": { + "config": { + "n_rxq": "1", + "n_txq": "1", + "numa_id": "0"}, + "ofport": "2", + "port_no": "2", + "type": "dummy-pmd"}}, + "br1": { + "br1": { + "ofport": "65534", + "port_no": "101", + "type": "dummy-internal"}, + "p3": { + "ofport": "3", + "port_no": "3", + "type": "dummy"}}}, + "stats": { + "hit": "0", + "missed": "0"}}}] +]) + OVS_VSWITCHD_STOP AT_CLEANUP @@ -8735,12 +9167,12 @@ recirc_id(0),in_port(3),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), p ]) AT_CHECK([ovs-appctl dpif/dump-flows -m br0 | strip_ufid | strip_used | sort], [0], [dnl -skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),recirc_id(0),dp_hash(0/0),in_port(p1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:05/00:00:00:00:00:00,dst=50:54:00:00:00:07/00:00:00:00:00:00),eth_type(0x0800),ipv4(src=192.168.0.1/0.0.0.0,dst=192.168.0.2/0.0.0.0,proto=1/0,tos=0/0,ttl=64/0,frag=no),icmp(type=8/0,code=0/0), packets:0, bytes:0, used:never, actions:drop -skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),recirc_id(0),dp_hash(0/0),in_port(p2),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:07/00:00:00:00:00:00,dst=50:54:00:00:00:05/00:00:00:00:00:00),eth_type(0x0800),ipv4(src=192.168.0.2/0.0.0.0,dst=192.168.0.1/0.0.0.0,proto=1/0,tos=0/0,ttl=64/0,frag=no),icmp(type=0/0,code=0/0), packets:0, bytes:0, used:never, actions:drop 
+recirc_id(0),dp_hash(0/0),skb_priority(0/0),in_port(p1),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:05/00:00:00:00:00:00,dst=50:54:00:00:00:07/00:00:00:00:00:00),eth_type(0x0800),ipv4(src=192.168.0.1/0.0.0.0,dst=192.168.0.2/0.0.0.0,proto=1/0,tos=0/0,ttl=64/0,frag=no),icmp(type=8/0,code=0/0), packets:0, bytes:0, used:never, actions:drop +recirc_id(0),dp_hash(0/0),skb_priority(0/0),in_port(p2),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:07/00:00:00:00:00:00,dst=50:54:00:00:00:05/00:00:00:00:00:00),eth_type(0x0800),ipv4(src=192.168.0.2/0.0.0.0,dst=192.168.0.1/0.0.0.0,proto=1/0,tos=0/0,ttl=64/0,frag=no),icmp(type=0/0,code=0/0), packets:0, bytes:0, used:never, actions:drop ]) AT_CHECK([ovs-appctl dpif/dump-flows -m br1 | strip_ufid | strip_used | sort], [0], [dnl -skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),recirc_id(0),dp_hash(0/0),in_port(p3),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09/00:00:00:00:00:00,dst=50:54:00:00:00:0a/00:00:00:00:00:00),eth_type(0x0800),ipv4(src=10.0.0.2/0.0.0.0,dst=10.0.0.1/0.0.0.0,proto=1/0,tos=0/0,ttl=64/0,frag=no),icmp(type=8/0,code=0/0), packets:0, bytes:0, used:never, actions:drop +recirc_id(0),dp_hash(0/0),skb_priority(0/0),in_port(p3),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09/00:00:00:00:00:00,dst=50:54:00:00:00:0a/00:00:00:00:00:00),eth_type(0x0800),ipv4(src=10.0.0.2/0.0.0.0,dst=10.0.0.1/0.0.0.0,proto=1/0,tos=0/0,ttl=64/0,frag=no),icmp(type=8/0,code=0/0), packets:0, bytes:0, used:never, actions:drop ]) OVS_VSWITCHD_STOP @@ -8900,10 +9332,10 @@ recirc_id(0),in_port(101),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), ]) AT_CHECK([grep -e 'in_port(100).*packets:9' ovs-vswitchd.log | strip_ufid | filter_flow_dump], [0], [dnl 
-skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),recirc_id(0),dp_hash(0/0),in_port(100),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:05/00:00:00:00:00:00,dst=50:54:00:00:00:07/00:00:00:00:00:00),eth_type(0x0800),ipv4(src=192.168.0.1/0.0.0.0,dst=192.168.0.2/0.0.0.0,proto=1/0,tos=0/0,ttl=64/0,frag=no),icmp(type=8/0,code=0/0), packets:9, bytes:954, used:0.0s, actions:101,3,2 +recirc_id(0),dp_hash(0/0),skb_priority(0/0),in_port(100),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:05/00:00:00:00:00:00,dst=50:54:00:00:00:07/00:00:00:00:00:00),eth_type(0x0800),ipv4(src=192.168.0.1/0.0.0.0,dst=192.168.0.2/0.0.0.0,proto=1/0,tos=0/0,ttl=64/0,frag=no),icmp(type=8/0,code=0/0), packets:9, bytes:954, used:0.0s, actions:101,3,2 ]) AT_CHECK([grep -e 'in_port(101).*packets:4' ovs-vswitchd.log | strip_ufid | filter_flow_dump], [0], [dnl -skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),recirc_id(0),dp_hash(0/0),in_port(101),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:07/00:00:00:00:00:00,dst=50:54:00:00:00:05/00:00:00:00:00:00),eth_type(0x0800),ipv4(src=192.168.0.2/0.0.0.0,dst=192.168.0.1/0.0.0.0,proto=1/0,tos=0/0,ttl=64/0,frag=no),icmp(type=8/0,code=0/0), packets:4, bytes:424, used:0.0s, actions:100,2,3 +recirc_id(0),dp_hash(0/0),skb_priority(0/0),in_port(101),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:07/00:00:00:00:00:00,dst=50:54:00:00:00:05/00:00:00:00:00:00),eth_type(0x0800),ipv4(src=192.168.0.2/0.0.0.0,dst=192.168.0.1/0.0.0.0,proto=1/0,tos=0/0,ttl=64/0,frag=no),icmp(type=8/0,code=0/0), packets:4, bytes:424, used:0.0s, actions:100,2,3 ]) AT_CHECK([ovs-ofctl dump-ports br0 pbr0], [0], [dnl @@ -9595,12 +10027,12 @@ table=0 in_port=1,ip,nw_dst=10.0.0.3 actions=drop done sleep 1 AT_CHECK([strip_ufid < ovs-vswitchd.log | filter_flow_install | strip_used], [0], [dnl 
-skb_priority(0),skb_mark(0),ct_state(-new-est-rel-rpl-inv-trk-snat-dnat),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0), actions:2 -skb_priority(0),skb_mark(0),ct_state(-new-est-rel-rpl-inv-trk-snat-dnat),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=10.0.0.4,dst=10.0.0.3,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0), actions:drop +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(-new-est-rel-rpl-inv-trk-snat-dnat),ct_zone(0),ct_mark(0),ct_label(0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0), actions:2 +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(-new-est-rel-rpl-inv-trk-snat-dnat),ct_zone(0),ct_mark(0),ct_label(0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=10.0.0.4,dst=10.0.0.3,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0), actions:drop ]) AT_CHECK([strip_ufid < ovs-vswitchd.log | filter_flow_dump | grep 'packets:3'], [0], [dnl -skb_priority(0),skb_mark(0),ct_state(0/0xff),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0), packets:3, bytes:318, used:0.0s, actions:2 
-skb_priority(0),skb_mark(0),ct_state(0/0xff),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=10.0.0.4,dst=10.0.0.3,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0), packets:3, bytes:318, used:0.0s, actions:drop +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(0/0xff),ct_zone(0),ct_mark(0),ct_label(0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0), packets:3, bytes:318, used:0.0s, actions:2 +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(0/0xff),ct_zone(0),ct_mark(0),ct_label(0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=10.0.0.4,dst=10.0.0.3,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0), packets:3, bytes:318, used:0.0s, actions:drop ]) OVS_VSWITCHD_STOP AT_CLEANUP]) @@ -9630,7 +10062,7 @@ OVS_VSWITCHD_STOP AT_CLEANUP # Tests the bundling with various bfd and cfm configurations. 
-AT_SETUP([ofproto - bundle with variable bfd/cfm config]) +AT_SETUP([ofproto-dpif - bundle with variable bfd/cfm config]) OVS_VSWITCHD_START([add-br br1 -- set bridge br1 datapath-type=dummy -- \ add-bond br0 br0bond p0 p2 bond-mode=active-backup -- \ add-bond br1 br1bond p1 p3 bond-mode=active-backup -- \ @@ -10302,7 +10734,7 @@ recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x1234), packets:5, byte ]) AT_CHECK([grep 'modify' ovs-vswitchd.log | strip_ufid ], [0], [dnl -dpif|DBG|dummy@ovs-dummy: put[[modify]] skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),recirc_id(0),dp_hash(0/0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09/00:00:00:00:00:00,dst=50:54:00:00:00:0a/00:00:00:00:00:00),eth_type(0x1234), actions:push_vlan(vid=4,pcp=0),100 +dpif|DBG|dummy@ovs-dummy: put[[modify]] recirc_id(0),dp_hash(0/0),skb_priority(0/0),in_port(1),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09/00:00:00:00:00:00,dst=50:54:00:00:00:0a/00:00:00:00:00:00),eth_type(0x1234), actions:push_vlan(vid=4,pcp=0),100 ]) OVS_VSWITCHD_STOP AT_CLEANUP @@ -10383,8 +10815,8 @@ recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x8100),vlan(vid=99,pcp= # are wildcarded. 
AT_CHECK([grep '\(modify\)\|\(flow_add\)' ovs-vswitchd.log | strip_ufid ], [0], [dnl dpif_netdev|DBG|flow_add: recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x1234), actions:100 -dpif|DBG|dummy@ovs-dummy: put[[modify]] skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),recirc_id(0),dp_hash(0/0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09/00:00:00:00:00:00,dst=50:54:00:00:00:0a/00:00:00:00:00:00),eth_type(0x1234), actions:drop -dpif|DBG|dummy@ovs-dummy: put[[modify]] skb_priority(0/0),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),recirc_id(0),dp_hash(0/0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09/00:00:00:00:00:00,dst=50:54:00:00:00:0a/00:00:00:00:00:00),eth_type(0x1234), actions:100 +dpif|DBG|dummy@ovs-dummy: put[[modify]] recirc_id(0),dp_hash(0/0),skb_priority(0/0),in_port(1),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09/00:00:00:00:00:00,dst=50:54:00:00:00:0a/00:00:00:00:00:00),eth_type(0x1234), actions:drop +dpif|DBG|dummy@ovs-dummy: put[[modify]] recirc_id(0),dp_hash(0/0),skb_priority(0/0),in_port(1),skb_mark(0/0),ct_state(0/0),ct_zone(0/0),ct_mark(0/0),ct_label(0/0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09/00:00:00:00:00:00,dst=50:54:00:00:00:0a/00:00:00:00:00:00),eth_type(0x1234), actions:100 dpif_netdev|DBG|flow_add: recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x8100),vlan(vid=99,pcp=7/0x0),encap(eth_type(0x1234)), actions:drop ]) OVS_VSWITCHD_STOP @@ -10710,10 +11142,10 @@ AT_CHECK([ovs-appctl netdev-dummy/receive p2 'in_port(2),eth(src=50:54:00:00:00: AT_CHECK([cat ovs-vswitchd.log | strip_ufid | filter_flow_install], [0], [dnl -ct_state(+new-est+trk),recirc_id(0x1),in_port(2),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:drop -ct_state(-new+est+trk),recirc_id(0x1),in_port(2),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(proto=17,frag=no), actions:1 
recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(proto=17,frag=no), actions:ct(commit),2 recirc_id(0),in_port(2),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(proto=17,frag=no), actions:ct,recirc(0x1) +recirc_id(0x1),in_port(2),ct_state(+new-est+trk),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:drop +recirc_id(0x1),in_port(2),ct_state(-new+est+trk),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(proto=17,frag=no), actions:1 ]) OVS_VSWITCHD_STOP @@ -11119,9 +11551,9 @@ AT_CHECK([ovs-appctl netdev-dummy/receive p2 'in_port(2),eth(src=50:54:00:00:00: ovs-appctl revalidator/wait AT_CHECK([cat ovs-vswitchd.log | strip_ufid | filter_flow_install], [0], [dnl -ct_state(+rpl+trk),ct_label(0x1),recirc_id(0x1),in_port(2),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:1 recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(proto=17,frag=no),udp(src=1), actions:ct(commit,label=0x1),2 recirc_id(0),in_port(2),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:ct,recirc(0x1) +recirc_id(0x1),in_port(2),ct_state(+rpl+trk),ct_label(0x1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), actions:1 ]) OVS_VSWITCHD_STOP @@ -11679,7 +12111,29 @@ AT_CHECK([tail -1 stdout], [0], OVS_VSWITCHD_STOP AT_CLEANUP -AT_SETUP([ofproto - set mtu]) +dnl Checks the get/set sweep interval +AT_SETUP([ofproto-dpif - conntrack - change sweep interval]) +OVS_VSWITCHD_START + +# Check the default value. +AT_CHECK([ovs-appctl dpctl/ct-get-sweep-interval], [0], [dnl +20000 +]) + +# Set the interval to 5s. +AT_CHECK([ovs-appctl dpctl/ct-set-sweep-interval 5000], [0], [dnl +setting sweep interval successful +]) + +# Verify that the previous value has been applied. 
+AT_CHECK([ovs-appctl dpctl/ct-get-sweep-interval], [0], [dnl +5000 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([ofproto-dpif - set mtu]) OVS_VSWITCHD_START add_of_ports br0 1 @@ -11729,7 +12183,7 @@ AT_CHECK([ovs-vsctl wait-until Interface br0 mtu=1400]) OVS_VSWITCHD_STOP AT_CLEANUP -AT_SETUP([ofproto - fragment prerequisites]) +AT_SETUP([ofproto-dpif - fragment prerequisites]) OVS_VSWITCHD_START AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg]) @@ -11842,7 +12296,7 @@ ovs-ofctl dump-flows br0 AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.10.10.2,dst=10.10.10.1,proto=1,tos=1,ttl=128,frag=no),icmp(type=8,code=0)'], [0], [stdout]) AT_CHECK([tail -3 stdout], [0], [dnl -Megaflow: recirc_id=0,eth,ip,reg0=0/0x1,in_port=1,nw_src=10.10.10.2,nw_frag=no +Megaflow: recirc_id=0,eth,icmp,reg0=0/0x1,in_port=1,nw_src=10.10.10.2,nw_frag=no Datapath actions: drop Translation failed (Recursion too deep), packet is dropped. 
]) @@ -11943,3 +12397,267 @@ AT_CHECK([test 1 = `ovs-ofctl parse-pcap p2-tx.pcap | wc -l`]) OVS_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([ofproto-dpif - Local sampling - not supported]) +OVS_VSWITCHD_START +add_of_ports br0 1 2 + +AT_CHECK([ovs-vsctl -- --id=@br0 get Bridge br0 \ + -- create Flow_Sample_Collector_Set id=1 bridge=@br0 \ + local-group-id=10 \ + -- create Flow_Sample_Collector_Set id=2 bridge=@br0 \ + local-group-id=12], + [0], [ignore]) + +m4_define([NOT_SUPPORTED_WARN], [dnl +ignoring local sampling configuration: not supported by this datapath]) + +AT_CHECK([grep -q "NOT_SUPPORTED_WARN" ovs-vswitchd.log ]) + +AT_DATA([flows.txt], [dnl +in_port=1 actions=sample(probability=32767,obs_domain_id=100,obs_point_id=200),2 +]) + +AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) + +m4_define([TRACE_PKT], [m4_join([,], + [in_port(1)], + [eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800)], + [ipv4(src=10.10.10.2,dst=10.10.10.1,proto=1,tos=1,ttl=128,frag=no)], + [icmp(type=8,code=0)])]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'TRACE_PKT'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], [dnl +Datapath actions: 2 +]) + +OVS_VSWITCHD_STOP(["/NOT_SUPPORTED_WARN/d"]) +AT_CLEANUP + +AT_SETUP([ofproto-dpif - Local sampling - sanity check]) +OVS_VSWITCHD_START +add_of_ports br0 1 2 3 + +dnl Enabling an unsupported feature is dangerous but we are not sending traffic. 
+AT_CHECK([ovs-appctl dpif/set-dp-features --force br0 psample true], [0], [ignore]) + +AT_CHECK([ovs-vsctl -- --id=@br0 get Bridge br0 \ + -- create Flow_Sample_Collector_Set id=1 bridge=@br0 \ + local-group-id=42], + [0], [ignore]) + +AT_DATA([flows.txt], [dnl +in_port=1, actions=sample(probability=32767,collector_set_id=1,obs_domain_id=100,obs_point_id=200),3 +in_port=2, actions=sample(probability=32767,collector_set_id=20,obs_domain_id=100,obs_point_id=200),3 +]) + +AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) + +m4_define([TRACE_PKT], [m4_join([,], + [eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800)], + [ipv4(src=10.10.10.2,dst=10.10.10.1,proto=1,tos=1,ttl=128,frag=no)], + [icmp(type=8,code=0)])]) + +dnl collector_set_id does not match. +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(2) TRACE_PKT'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], [dnl +Datapath actions: 3 +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1) TRACE_PKT'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], [dnl +Datapath actions: sample(sample=50.0%,actions(psample(group=42,cookie=0x64000000c8))),3 +]) + +OVS_VSWITCHD_STOP("/Enabling an unsupported feature is very dangerous/d") +AT_CLEANUP + +AT_SETUP([ofproto-dpif - Local sampling - with IPFIX]) +OVS_VSWITCHD_START +add_of_ports br0 1 2 + +dnl Enabling an unsupported feature is dangerous but we are not sending traffic. 
+AT_CHECK([ovs-appctl dpif/set-dp-features --force br0 psample true], [0], [ignore]) + +AT_CHECK([ovs-vsctl -- --id=@br0 get Bridge br0 \ + -- --id=@i create ipfix targets=\"127.0.0.1:4739\" \ + -- create Flow_Sample_Collector_Set ipfix=@i id=1 \ + bridge=@br0 local-group-id=42], + [0], [ignore]) + +AT_DATA([flows.txt], [dnl +in_port=1, actions=sample(probability=32767,collector_set_id=1,obs_domain_id=100,obs_point_id=200),2 +]) + +AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) + +m4_define([TRACE_PKT], [m4_join([,], + [eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800)], + [ipv4(src=10.10.10.2,dst=10.10.10.1,proto=1,tos=1,ttl=128,frag=no)], + [icmp(type=8,code=0)])]) + +m4_define([EXPECTED_ACT], [m4_join([], + [sample(sample=50.0%,actions(], + [psample(group=42,cookie=0x64000000c8),], + [userspace(pid=0,], + [flow_sample(probability=32767,collector_set_id=1,obs_domain_id=100,obs_point_id=200,output_port=4294967295)], + [))),], + [2], +)]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1) TRACE_PKT'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], [dnl +Datapath actions: EXPECTED_ACT +]) + +OVS_VSWITCHD_STOP("/Enabling an unsupported feature is very dangerous/d") +AT_CLEANUP + +AT_SETUP([ofproto-dpif - Local sampling - with metered IPFIX]) +OVS_VSWITCHD_START +add_of_ports br0 1 2 + +dnl Enabling an unsupported feature is dangerous but we are not sending traffic. 
+AT_CHECK([ovs-appctl dpif/set-dp-features --force br0 psample true], [0], [ignore]) + +AT_CHECK([ovs-vsctl -- --id=@br0 get Bridge br0 \ + -- --id=@i create ipfix targets=\"127.0.0.1:4739\" \ + -- create Flow_Sample_Collector_Set ipfix=@i id=1 \ + bridge=@br0 local-group-id=42], + [0], [ignore]) + +AT_CHECK([ovs-ofctl -O OpenFlow13 add-meter br0 'meter=slowpath pktps stats bands=type=drop rate=2']) + +AT_DATA([flows.txt], [dnl +in_port=1, actions=sample(probability=32767,collector_set_id=1,obs_domain_id=100,obs_point_id=200),2 +]) + +AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) + +m4_define([TRACE_PKT], [m4_join([,], + [eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800)], + [ipv4(src=10.10.10.2,dst=10.10.10.1,proto=1,tos=1,ttl=128,frag=no)], + [icmp(type=8,code=0)])]) + +m4_define([EXPECTED_ACT], [m4_join([], + [sample(sample=50.0%,actions(], + [psample(group=42,cookie=0x64000000c8),], + [meter(0),], + [userspace(pid=0,], + [flow_sample(probability=32767,collector_set_id=1,obs_domain_id=100,obs_point_id=200,output_port=4294967295)], + [))),], + [2], +)]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1) TRACE_PKT'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], [dnl +Datapath actions: EXPECTED_ACT +]) + +OVS_VSWITCHD_STOP("/Enabling an unsupported feature is very dangerous/d") +AT_CLEANUP + +AT_SETUP([ofproto-dpif - Local sampling - drop]) +OVS_VSWITCHD_START +add_of_ports br0 1 2 + +AT_CHECK([ovs-appctl dpif/set-dp-features --force br0 psample true], [0], [ignore]) + +AT_CHECK([ovs-vsctl -- --id=@br0 get Bridge br0 \ + -- create Flow_Sample_Collector_Set id=1 bridge=@br0 local-group-id=42], + [0], [ignore]) + +AT_CHECK([ovs-ofctl -O OpenFlow13 add-meter br0 'meter=slowpath pktps stats bands=type=drop rate=2']) + +AT_DATA([flows.txt], [dnl +in_port=1, actions=sample(probability=32767,collector_set_id=1,obs_domain_id=100,obs_point_id=200) +in_port=2, 
actions=sample(probability=65535,collector_set_id=1,obs_domain_id=100,obs_point_id=200) +]) + +AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) + +m4_define([TRACE_PKT], [m4_join([,], + [eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800)], + [ipv4(src=10.10.10.2,dst=10.10.10.1,proto=1,tos=1,ttl=128,frag=no)], + [icmp(type=8,code=0)])]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1) TRACE_PKT'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], [dnl +Datapath actions: sample(sample=50.0%,actions(psample(group=42,cookie=0x64000000c8))) +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(2) TRACE_PKT'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], [dnl +Datapath actions: psample(group=42,cookie=0x64000000c8) +]) + +AT_CHECK([ovs-vsctl set Open_vSwitch . other-config:explicit-sampled-drops=true]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1) TRACE_PKT'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], [dnl +Datapath actions: sample(sample=50.0%,actions(psample(group=42,cookie=0x64000000c8))),drop +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(2) TRACE_PKT'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], [dnl +Datapath actions: psample(group=42,cookie=0x64000000c8),drop +]) + +OVS_VSWITCHD_STOP("/Enabling an unsupported feature is very dangerous/d") +AT_CLEANUP + +AT_SETUP([ofproto-dpif - Dump OF rules corresponding to UFID]) +OVS_VSWITCHD_START + +add_of_ports br0 1 2 3 + +dnl Add some OpenFlow rules and groups. 
+AT_DATA([groups.txt], [dnl +group_id=1,type=select,selection_method=dp_hash,bucket=bucket_id:0,weight:100,actions=ct(commit,table=2,nat(dst=20.0.0.2)) +group_id=2,type=all,bucket=resubmit(,3),bucket=resubmit(,4) +]) +AT_DATA([flows.txt], [dnl +table=0,priority=100,cookie=0x12345678,in_port=p1,ip,nw_dst=10.0.0.2,actions=resubmit(,1) +table=1,priority=200,ip,actions=group:1 +table=2,ip,actions=group:2 +table=3,ip,actions=p2 +table=4,ip,actions=p3 +]) +AT_CHECK([ovs-ofctl add-groups br0 groups.txt]) +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ipv4(src=10.0.0.1,dst=10.0.0.2,proto=6),tcp(src=1,dst=2)']) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ipv4(src=10.0.0.1,dst=10.0.0.2,proto=6),tcp(src=1,dst=2)']) +AT_CHECK([ovs-appctl revalidator/wait]) +AT_CHECK([ovs-appctl revalidator/pause]) + +AT_CHECK([ovs-appctl dpctl/dump-flows | strip_used | strip_stats | strip_duration | strip_dp_hash | sort], [0], [dnl +flow-dump from the main thread: +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=10.0.0.2,frag=no), packets:0, bytes:0, used:0.0s, actions:hash(l4(0)),recirc(0x1) +recirc_id(0x1),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), packets:0, bytes:0, used:0.0s, actions:ct(commit,nat(dst=20.0.0.2)),recirc(0x2) +recirc_id(0x2),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), packets:0, bytes:0, used:0.0s, actions:2,3 +]) + +ufid=$(ovs-appctl dpctl/dump-flows -m filter='recirc_id(0)' | parse_ufid) +AT_CHECK([ovs-appctl ofproto/detrace $ufid | ofctl_strip], [0], [dnl +cookie=0x12345678, n_packets=2, n_bytes=236, priority=100,ip,in_port=1,nw_dst=10.0.0.2,actions=resubmit(,1) +table_id=1, n_packets=2, n_bytes=236, priority=200,ip,actions=group:1 +]) + +ufid=$(ovs-appctl dpctl/dump-flows -m filter='recirc_id(0x1)' | parse_ufid) +AT_CHECK([ovs-appctl ofproto/detrace $ufid | ofctl_strip], [0], [dnl 
+group_id=1,type=select,selection_method=dp_hash,bucket=bucket_id:0,weight:100,actions=ct(commit,table=2,nat(dst=20.0.0.2)) +]) + +ufid=$(ovs-appctl dpctl/dump-flows -m filter='recirc_id(0x2)' | parse_ufid) +AT_CHECK([ovs-appctl ofproto/detrace $ufid | ofctl_strip], [0], [dnl +table_id=2, n_packets=2, n_bytes=236, ip,actions=group:2 +table_id=3, n_packets=2, n_bytes=236, ip,actions=output:2 +table_id=4, n_packets=2, n_bytes=236, ip,actions=output:3 +group_id=2,type=all,bucket=bucket_id:0,actions=resubmit(,3),bucket=bucket_id:1,actions=resubmit(,4) +]) + +AT_CHECK([ovs-appctl revalidator/resume]) + +OVS_VSWITCHD_STOP +AT_CLEANUP diff --git a/tests/ofproto-macros.at b/tests/ofproto-macros.at index 676d55aa956..c27d96177b6 100644 --- a/tests/ofproto-macros.at +++ b/tests/ofproto-macros.at @@ -9,7 +9,9 @@ s/ duration=[0-9.]*s,// s/ cookie=0x0,// s/ table=0,// s/ n_packets=0,// +s/ n_offload_packets=0,// s/ n_bytes=0,// +s/ n_offload_bytes=0,// s/ idle_age=[0-9]*,// s/ hard_age=[0-9]*,// s/dp_hash=0x[0-9a-f]*\//dp_hash=0x0\// @@ -19,6 +21,11 @@ s/dir\/[0-9]*\/br0.mgmt/dir\/XXXX\/br0.mgmt/ ' } +# Strips out byte counters from ovs-ofctl output +ofctl_strip_bytes () { + sed 's/ n_bytes=[0-9]*,//' +} + # Filter (multiline) vconn debug messages from ovs-vswitchd.log. # Use with vconn_sub() and ofctl_strip() print_vconn_debug () { awk -F\| < ovs-vswitchd.log ' @@ -125,7 +132,7 @@ strip_used () { # Removes all 'duration=...' to make output easier to compare. strip_duration () { - sed 's/duration=[[0-9]]*\.[[0-9]]*s,//' + sed 's/duration=[[0-9.]]*s,//' } # Strips 'ufid:...' from output, to make it easier to compare. @@ -135,12 +142,31 @@ strip_ufid () { s/ufid:[[-0-9a-f]]* //' } +parse_ufid () { + grep -o 'ufid:[[-0-9a-f]]*' +} + # Strips packets: and bytes: from output strip_stats () { sed 's/packets:[[0-9]]*/packets:0/ s/bytes:[[0-9]]*/bytes:0/' } +# Strips key32 field from output. +strip_key32 () { + sed 's/key32([[0-9 \/]]*),//' +} + +# Strips packet-type from output. 
+strip_ptype () { + sed 's/packet_type(ns=[[0-9]]*,id=[[0-9]]*),//' +} + +# Strips bare eth from output. +strip_eth () { + sed 's/eth(),//' +} + # Changes all 'recirc(...)' and 'recirc=...' to say 'recirc()' and # 'recirc=' respectively. This should make output easier to # compare. @@ -149,6 +175,12 @@ strip_recirc() { s/recirc_id=[[x0-9]]*/recirc_id=/ s/recirc([[x0-9]]*)/recirc()/' } + +# Strips dp_hash from output. +strip_dp_hash() { + sed 's/dp_hash([[0-9a-fx/]]*),//' +} + m4_divert_pop([PREPARE_TESTS]) m4_define([TESTABLE_LOG], [-vPATTERN:ANY:'%c|%p|%m']) @@ -249,6 +281,9 @@ check_logs () { # we ignore the messages that were rate-limited, we can end up failing just # because of the announcement that rate-limiting happened (and in a racy, # timing-dependent way, too). + # + # We also ignore the "Spent an unreasonably long XXms dumping flows" as + # they can appear when large time/warps are used during tests. sed -n "$1 /reset by peer/d /Broken pipe/d @@ -260,11 +295,20 @@ check_logs () { /ovs_rcu.*blocked [[0-9]]* ms waiting for .* to quiesce/d /Dropped [[0-9]]* log messages/d /setting extended ack support failed/d +/ETHTOOL_GSSET_INFO/d +/Spent an unreasonably long .*ms dumping flows/d /|WARN|/p /|ERR|/p /|EMER|/p" ${logs} } +# Gets the last line number in ovs-vswitchd.log +1. This can be used to +# help ensure that an output in the log is newly written as the result of +# a test command and it is not just matching an earlier log line. +get_log_next_line_num () { + LINENUM=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1)) +} + # add_of_br BRNUM [ARG...] 
add_of_br () { local brnum=$1; shift diff --git a/tests/ofproto.at b/tests/ofproto.at index a666bebcac4..2889f81fb17 100644 --- a/tests/ofproto.at +++ b/tests/ofproto.at @@ -6538,3 +6538,213 @@ verify_deleted OVS_VSWITCHD_STOP(["/nw_dst,output=2 +table=0 in_port=1 priority=83,ip,nw_dst=192.168.1.15,actions=set_field:192.168.21.26->nw_src,output=2 +table=0 in_port=1 priority=82,ip,nw_dst=192.168.1.14,actions=set_field:0x40->nw_tos,output=2 +table=0 in_port=1 priority=0,actions=drop +]) +AT_CHECK([ovs-ofctl del-flows br0]) +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +dnl send a proto 0 packet to try and poison the DP flow path +AT_CHECK([ovs-appctl netdev-dummy/receive p1 \ + '5054000000075054000000050800450000548de140004000289fc0a801c4c0a8011408003bf60002001bbf080a640000000032ad010000000000101112131415161718191a1b1c1d1e1f202122232425262728292a2b2c2d2e2f3031323334353637']) + +AT_CHECK([ovs-appctl dpctl/dump-flows], [0], [dnl +flow-dump from the main thread: +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.1.20,proto=0,frag=no), packets:0, bytes:0, used:never, actions:2 +]) + +dnl Send ICMP for mod nw_src and mod nw_dst +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=192.168.1.1,dst=192.168.1.21,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)']) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=192.168.1.1,dst=192.168.1.20,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)']) + +dnl send ICMP that will dec TTL +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=192.168.1.1,dst=192.168.1.10,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)']) + +dnl send ICMP that will mod TTL +AT_CHECK([ovs-appctl netdev-dummy/receive p1 
'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=192.168.1.1,dst=192.168.1.19,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)']) + +dnl send ICMP that will mod ECN +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=192.168.1.1,dst=192.168.1.18,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)']) + +dnl send ICMP that will mod TOS +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=192.168.1.1,dst=192.168.1.17,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)']) + +dnl send ICMP that will set DST +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=192.168.1.1,dst=192.168.1.16,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)']) + +dnl send ICMP that will set SRC +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=192.168.1.1,dst=192.168.1.15,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)']) + +dnl send ICMP that will set TOS +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=192.168.1.1,dst=192.168.1.14,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)']) + +AT_CHECK([ovs-appctl dpctl/dump-flows | sort], [0], [dnl +flow-dump from the main thread: +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.1.10,proto=1,ttl=64,frag=no), packets:0, bytes:0, used:never, actions:set(ipv4(ttl=63)),2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.1.14,proto=1,tos=0/0xfc,frag=no), packets:0, bytes:0, used:never, actions:set(ipv4(tos=0x40/0xfc)),2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.1.16,proto=1,frag=no), packets:0, bytes:0, 
used:never, actions:set(ipv4(dst=192.168.20.26)),2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.1.17,proto=1,tos=0/0xfc,frag=no), packets:0, bytes:0, used:never, actions:set(ipv4(tos=0x40/0xfc)),2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.1.18,proto=1,tos=0/0x3,frag=no), packets:0, bytes:0, used:never, actions:set(ipv4(tos=0x2/0x3)),2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.1.19,proto=1,ttl=64,frag=no), packets:0, bytes:0, used:never, actions:set(ipv4(ttl=8)),2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.1.20,proto=0,frag=no), packets:0, bytes:0, used:never, actions:2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.1.20,proto=1,frag=no), packets:0, bytes:0, used:never, actions:set(ipv4(dst=192.168.20.20)),2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(src=192.168.1.1,dst=192.168.1.15,proto=1,frag=no), packets:0, bytes:0, used:never, actions:set(ipv4(src=192.168.21.26)),2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(src=192.168.1.1,dst=192.168.1.21,proto=1,frag=no), packets:0, bytes:0, used:never, actions:set(ipv4(src=192.168.20.21)),2 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([ofproto - implicit mask of ipv6 proto with HOPOPT field]) +OVS_VSWITCHD_START +add_of_ports br0 1 2 + +AT_DATA([flows.txt], [dnl +table=0 in_port=1 priority=77,ip6,ipv6_dst=111:db8::3,actions=dec_ttl,output=2 +table=0 in_port=1 priority=76,ip6,ipv6_dst=111:db8::4,actions=mod_nw_ttl:8,output=2 +table=0 in_port=1 priority=75,ip6,ipv6_dst=111:db8::5,actions=mod_nw_ecn:2,output=2 +table=0 in_port=1 priority=74,ip6,ipv6_dst=111:db8::6,actions=mod_nw_tos:0x40,output=2 +table=0 in_port=1 priority=73,ip6,ipv6_dst=111:db8::7,actions=set_field:2112:db8::2->ipv6_dst,output=2 +table=0 in_port=1 
priority=72,ip6,ipv6_dst=111:db8::8,actions=set_field:2112:db8::3->ipv6_src,output=2 +table=0 in_port=1 priority=72,ip6,ipv6_dst=111:db8::9,actions=set_field:44->ipv6_label,output=2 +table=0 in_port=1 priority=0,actions=drop +]) +AT_CHECK([ovs-ofctl del-flows br0]) +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +dnl send a proto 0 packet to try and poison the DP flow path +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x86dd),ipv6(src=2001:db8::1,dst=111:db8::3,proto=0,tclass=0,hlimit=64,frag=no)']) + +AT_CHECK([ovs-appctl dpctl/dump-flows], [0], [dnl +flow-dump from the main thread: +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x86dd),ipv6(dst=111:db8::3,proto=0,hlimit=0,frag=no), packets:0, bytes:0, used:never, actions:userspace(pid=0,controller(reason=2,dont_send=0,continuation=0,recirc_id=1,rule_cookie=0,controller_id=0,max_len=65535)) +]) + +dnl Send ICMP for mod nw_src and mod nw_dst +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x86dd),ipv6(src=2001:db8::1,dst=111:db8::3,proto=1,tclass=0,hlimit=64,frag=no),icmpv6(type=0,code=8)']) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x86dd),ipv6(src=2001:db8::1,dst=111:db8::4,proto=1,tclass=0,hlimit=64,frag=no),icmpv6(type=0,code=8)']) + +dnl send ICMP that will dec TTL +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x86dd),ipv6(src=2001:db8::1,dst=111:db8::5,proto=1,tclass=0,hlimit=64,frag=no),icmpv6(type=0,code=8)']) + +dnl send ICMP that will mod TTL +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x86dd),ipv6(src=2001:db8::1,dst=111:db8::6,proto=1,tclass=0,hlimit=64,frag=no),icmpv6(type=0,code=8)']) + +dnl send ICMP that will mod ECN +AT_CHECK([ovs-appctl 
netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x86dd),ipv6(src=2001:db8::1,dst=111:db8::7,proto=1,tclass=0,hlimit=64,frag=no),icmpv6(type=0,code=8)']) + +dnl send ICMP that will mod TOS +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x86dd),ipv6(src=2001:db8::1,dst=111:db8::8,proto=1,tclass=0,hlimit=64,frag=no),icmpv6(type=0,code=8)']) + +dnl send ICMP that will set LABEL +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x86dd),ipv6(src=2001:db8::1,dst=111:db8::9,proto=1,tclass=0,hlimit=64,frag=no),icmpv6(type=0,code=8)']) + +AT_CHECK([ovs-appctl dpctl/dump-flows | sort], [0], [dnl +flow-dump from the main thread: +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x86dd),ipv6(dst=111:db8::3,proto=0,hlimit=0,frag=no), packets:0, bytes:0, used:never, actions:userspace(pid=0,controller(reason=2,dont_send=0,continuation=0,recirc_id=1,rule_cookie=0,controller_id=0,max_len=65535)) +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x86dd),ipv6(dst=111:db8::3,proto=1,hlimit=64,frag=no), packets:0, bytes:0, used:never, actions:set(ipv6(hlimit=63)),2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x86dd),ipv6(dst=111:db8::4,proto=1,hlimit=64,frag=no), packets:0, bytes:0, used:never, actions:set(ipv6(hlimit=8)),2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x86dd),ipv6(dst=111:db8::5,proto=1,tclass=0/0x3,frag=no), packets:0, bytes:0, used:never, actions:set(ipv6(tclass=0x2/0x3)),2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x86dd),ipv6(dst=111:db8::6,proto=1,tclass=0/0xfc,frag=no), packets:0, bytes:0, used:never, actions:set(ipv6(tclass=0x40/0xfc)),2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x86dd),ipv6(dst=111:db8::7,proto=1,frag=no), packets:0, bytes:0, used:never, actions:set(ipv6(dst=2112:db8::2)),2 
+recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x86dd),ipv6(dst=111:db8::9,label=0,proto=1,frag=no), packets:0, bytes:0, used:never, actions:set(ipv6(label=0x2c)),2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x86dd),ipv6(src=2001:db8::1,dst=111:db8::8,proto=1,frag=no), packets:0, bytes:0, used:never, actions:set(ipv6(src=2112:db8::3)),2 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([ofproto - implicit mask of ARP OPer field]) +OVS_VSWITCHD_START +add_of_ports br0 1 2 + +AT_DATA([flows.txt], [dnl +table=0 in_port=1 priority=77,arp,arp_sha=00:01:02:03:04:06,actions=set_field:0x1->arp_op,2 +table=0 in_port=1 priority=76,arp,arp_sha=00:01:02:03:04:07,actions=set_field:00:02:03:04:05:06->arp_sha,2 +table=0 in_port=1 priority=75,arp,arp_sha=00:01:02:03:04:08,actions=set_field:ff:00:00:00:00:ff->arp_tha,2 +table=0 in_port=1 priority=74,arp,arp_sha=00:01:02:03:04:09,actions=set_field:172.31.110.26->arp_spa,2 +table=0 in_port=1 priority=73,arp,arp_sha=00:01:02:03:04:0a,actions=set_field:172.31.110.10->arp_tpa,2 +table=0 in_port=1 priority=1,actions=drop +]) + +AT_CHECK([ovs-ofctl del-flows br0]) +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +dnl Send op == 0 packet +AT_CHECK([ovs-appctl netdev-dummy/receive p1 \ + 'ffffffffffffaa55aa550000080600010800060400000001020304070c0a00010000000000000c0a0002']) + +AT_CHECK([ovs-appctl dpctl/dump-flows], [0], [dnl +flow-dump from the main thread: +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0806),arp(op=0,sha=00:01:02:03:04:07), packets:0, bytes:0, used:never, actions:2 +]) + +dnl Send op 2 -> set op +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0806),arp(sip=172.31.110.1,tip=172.31.110.25,op=2,sha=00:01:02:03:04:06,tha=ff:ff:ff:ff:ff:ff)']) + +dnl Send op 1 -> set SHA +AT_CHECK([ovs-appctl netdev-dummy/receive p1 
'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0806),arp(sip=172.31.110.1,tip=172.31.110.25,op=1,sha=00:01:02:03:04:07,tha=ff:ff:ff:ff:ff:ff)']) + +dnl Send op 1 -> set THA +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0806),arp(sip=172.31.110.1,tip=172.31.110.25,op=1,sha=00:01:02:03:04:08,tha=ff:ff:ff:ff:ff:ff)']) + +dnl Send op 1 -> set SIP +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0806),arp(sip=172.31.110.1,tip=172.31.110.25,op=1,sha=00:01:02:03:04:09,tha=ff:ff:ff:ff:ff:ff)']) + +dnl Send op 1 -> set TIP +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0806),arp(sip=172.31.110.1,tip=172.31.110.25,op=1,sha=00:01:02:03:04:0a,tha=ff:ff:ff:ff:ff:ff)']) + +AT_CHECK([ovs-appctl dpctl/dump-flows | sort], [0], [dnl +flow-dump from the main thread: +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0806),arp(op=0,sha=00:01:02:03:04:07), packets:0, bytes:0, used:never, actions:2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0806),arp(op=1,sha=00:01:02:03:04:07), packets:0, bytes:0, used:never, actions:userspace(pid=0,slow_path(action)) +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0806),arp(op=1,sha=00:01:02:03:04:08,tha=ff:ff:ff:ff:ff:ff), packets:0, bytes:0, used:never, actions:userspace(pid=0,slow_path(action)) +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0806),arp(op=2,sha=00:01:02:03:04:06), packets:0, bytes:0, used:never, actions:userspace(pid=0,slow_path(action)) +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0806),arp(sip=172.31.110.1,op=1,sha=00:01:02:03:04:09), packets:0, bytes:0, used:never, actions:userspace(pid=0,slow_path(action)) +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0806),arp(tip=172.31.110.25,op=1,sha=00:01:02:03:04:0a), packets:0, 
bytes:0, used:never, actions:userspace(pid=0,slow_path(action)) +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([ofproto - configure inactivity probe interval]) + +# Set 6 second inactivity probe interval (default is 5 seconds). +OVS_VSWITCHD_START([set-controller br0 unix:testcontroller \ + -- set Controller br0 inactivity_probe=6000], [], [], + [-vfile:rconn:dbg]) + +# Start test openflow controller. +AT_CHECK([ovs-testcontroller -vsyslog:off --detach --no-chdir --pidfile punix:testcontroller], + [0], [ignore]) +on_exit 'kill `cat ovs-testcontroller.pid`' +OVS_WAIT_UNTIL([test -e testcontroller]) + +# After 6 seconds of inactivity there should be a log message. +OVS_WAIT_UNTIL([grep "idle 6 seconds, sending inactivity probe" ovs-vswitchd.log]) + +# Restart ovs-vswitchd with an empty ovs-vswitchd log file. +OVS_APP_EXIT_AND_WAIT([ovs-vswitchd]) +mv ovs-vswitchd.log ovs-vswitchd_1.log +AT_CHECK([ovs-vswitchd --enable-dummy --disable-system --disable-system-route --detach \ + --no-chdir --pidfile --log-file -vfile:rconn:dbg -vvconn -vofproto_dpif -vunixctl], + [0], [], [stderr]) + +# After 6 seconds of inactivity there should be a log message. 
+OVS_WAIT_UNTIL([grep "idle 6 seconds, sending inactivity probe" ovs-vswitchd.log]) +OVS_VSWITCHD_STOP(["/br0<->unix:testcontroller: connection failed/d"]) +AT_CLEANUP diff --git a/tests/ovs-macros.at b/tests/ovs-macros.at index 39fbfceeb81..06c97855548 100644 --- a/tests/ovs-macros.at +++ b/tests/ovs-macros.at @@ -211,14 +211,9 @@ m4_divert_pop([PREPARE_TESTS]) OVS_START_SHELL_HELPERS ovs_cleanup() { - if test "$(echo asan.*)" != 'asan.*'; then - echo "Address Sanitizer reported errors in:" asan.* - cat asan.* - AT_FAIL_IF([:]) - fi - if test "$(echo ubsan.*)" != 'ubsan.*'; then - echo "Undefined Behavior Sanitizer reported errors in:" ubsan.* - cat ubsan.* + if test "$(echo sanitizers.*)" != 'sanitizers.*'; then + echo "Undefined Behavior Sanitizer or Address Sanitizer reported errors in:" sanitizers.* + cat sanitizers.* AT_FAIL_IF([:]) fi } diff --git a/tests/ovs-ofctl.at b/tests/ovs-ofctl.at index a8934051ef1..e2f4429ae55 100644 --- a/tests/ovs-ofctl.at +++ b/tests/ovs-ofctl.at @@ -198,6 +198,8 @@ actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_ actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,ingress) actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,sampling_port=56789) actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,sampling_port=56789,egress) +actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[],obs_point_id=NXM_NX_CT_LABEL[32..63],sampling_port=56789,egress) +actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[],obs_point_id=NXM_NX_CT_LABEL[32..63]) ip,actions=ct(nat) ip,actions=ct(commit,nat(dst)) ip,actions=ct(commit,nat(src)) @@ -233,6 +235,8 @@ OFPT_FLOW_MOD: ADD actions=sample(probability=12345,collector_set_id=23456,obs_d OFPT_FLOW_MOD: ADD 
actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,ingress) OFPT_FLOW_MOD: ADD actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,sampling_port=56789) OFPT_FLOW_MOD: ADD actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,sampling_port=56789,egress) +OFPT_FLOW_MOD: ADD actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[],obs_point_id=NXM_NX_CT_LABEL[32..63],sampling_port=56789,egress) +OFPT_FLOW_MOD: ADD actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[],obs_point_id=NXM_NX_CT_LABEL[32..63]) OFPT_FLOW_MOD: ADD ip actions=ct(nat) OFPT_FLOW_MOD: ADD ip actions=ct(commit,nat(dst)) OFPT_FLOW_MOD: ADD ip actions=ct(commit,nat(src)) @@ -265,6 +269,7 @@ sctp actions=drop in_port=0 actions=resubmit:0 actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678) actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,sampling_port=56789) +actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[],obs_point_id=NXM_NX_CT_LABEL[32..63],sampling_port=0) ]]) AT_CHECK([ovs-ofctl --protocols OpenFlow11 parse-flows flows.txt @@ -286,6 +291,7 @@ OFPT_FLOW_MOD (OF1.1): ADD sctp actions=drop OFPT_FLOW_MOD (OF1.1): ADD in_port=0 actions=resubmit:0 OFPT_FLOW_MOD (OF1.1): ADD actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678) OFPT_FLOW_MOD (OF1.1): ADD actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,sampling_port=56789) +OFPT_FLOW_MOD (OF1.1): ADD actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[],obs_point_id=NXM_NX_CT_LABEL[32..63],sampling_port=0) ]]) AT_CLEANUP @@ -312,6 +318,7 @@ in_port=0 actions=mod_dl_src:11:22:33:44:55:66,mod_dl_dst:10:20:30:40:50:60 
in_port=0 actions=resubmit:0 actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678) actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,sampling_port=56789) +actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[],obs_point_id=NXM_NX_CT_LABEL[32..63],sampling_port=0) ]]) AT_CHECK([ovs-ofctl --protocols OpenFlow12 parse-flows flows.txt @@ -339,6 +346,7 @@ OFPT_FLOW_MOD (OF1.2): ADD in_port=0 actions=set_field:11:22:33:44:55:66->eth_sr OFPT_FLOW_MOD (OF1.2): ADD in_port=0 actions=resubmit:0 OFPT_FLOW_MOD (OF1.2): ADD actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678) OFPT_FLOW_MOD (OF1.2): ADD actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,sampling_port=56789) +OFPT_FLOW_MOD (OF1.2): ADD actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[],obs_point_id=NXM_NX_CT_LABEL[32..63],sampling_port=0) ]]) AT_CLEANUP @@ -441,6 +449,7 @@ tcp,actions=fin_timeout(idle_timeout=5,hard_timeout=15) actions=controller(max_len=123,reason=invalid_ttl,id=555) actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678) actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,sampling_port=56789) +actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[],obs_point_id=NXM_NX_CT_LABEL[32..63],sampling_port=56789) mpls,mpls_label=5,mpls_tc=1,mpls_ttl=1,mpls_bos=0,actions=drop ip,actions=ct(commit,zone=5) ip,actions=ct(commit,exec(load(1->NXM_NX_CT_MARK[]))) @@ -508,6 +517,7 @@ NXT_FLOW_MOD: ADD table:255 tcp actions=fin_timeout(idle_timeout=5,hard_timeout= NXT_FLOW_MOD: ADD table:255 actions=controller(reason=invalid_ttl,max_len=123,id=555) NXT_FLOW_MOD: ADD table:255 
actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678) NXT_FLOW_MOD: ADD table:255 actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,sampling_port=56789) +NXT_FLOW_MOD: ADD table:255 actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[],obs_point_id=NXM_NX_CT_LABEL[32..63],sampling_port=56789) NXT_FLOW_MOD: ADD table:255 mpls,mpls_label=5,mpls_tc=1,mpls_ttl=1,mpls_bos=0 actions=drop NXT_FLOW_MOD: ADD table:255 ip actions=ct(commit,zone=5) NXT_FLOW_MOD: ADD table:255 ip actions=ct(commit,exec(load:0x1->NXM_NX_CT_MARK[])) @@ -567,6 +577,7 @@ dl_dst=aa:bb:cc:dd:ee:ff/fe:ff:ff:ff:ff:ff,actions=drop dl_dst=aa:bb:cc:dd:ee:ff/00:00:00:00:00:00,actions=drop actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678) actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,sampling_port=56789) +actions=sample(probability=12341,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[[]],obs_point_id=NXM_NX_CT_LABEL[[32..63]],sampling_port=56789,egress) ip,actions=ct(commit,zone=5) ip,actions=ct(commit,exec(load(1->NXM_NX_CT_MARK[[]]))) ip,actions=ct(commit,exec(load(0x1->NXM_NX_CT_LABEL[[]]))) @@ -608,6 +619,7 @@ NXT_FLOW_MOD: ADD dl_dst=aa:bb:cc:dd:ee:ff/fe:ff:ff:ff:ff:ff actions=drop NXT_FLOW_MOD: ADD actions=drop NXT_FLOW_MOD: ADD actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678) NXT_FLOW_MOD: ADD actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,sampling_port=56789) +NXT_FLOW_MOD: ADD actions=sample(probability=12341,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[[]],obs_point_id=NXM_NX_CT_LABEL[[32..63]],sampling_port=56789,egress) NXT_FLOW_MOD: ADD ip actions=ct(commit,zone=5) NXT_FLOW_MOD: ADD ip actions=ct(commit,exec(load:0x1->NXM_NX_CT_MARK[[]])) NXT_FLOW_MOD: ADD ip 
actions=ct(commit,exec(load:0x1->NXM_NX_CT_LABEL[[0..63]],load:0->NXM_NX_CT_LABEL[[64..127]])) @@ -648,6 +660,7 @@ actions=push:reg0[0..31],pop:reg0 vlan_tci=0x1123/0x1fff,actions=drop actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678) actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,sampling_port=56789) +actions=sample(probability=12341,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[],obs_point_id=NXM_NX_CT_LABEL[32..63],sampling_port=56789,egress) ip,actions=ct(commit,zone=5) ip,actions=ct(commit,exec(load(1->NXM_NX_CT_MARK[]))) ip,actions=ct(commit,exec(load(1->NXM_NX_CT_LABEL[]))) @@ -688,6 +701,7 @@ NXT_FLOW_MOD: ADD actions=push:NXM_NX_REG0[],pop:NXM_NX_REG0[] NXT_FLOW_MOD: ADD NXM_OF_VLAN_TCI_W(1123/1fff) actions=drop NXT_FLOW_MOD: ADD actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678) NXT_FLOW_MOD: ADD actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,sampling_port=56789) +NXT_FLOW_MOD: ADD actions=sample(probability=12341,collector_set_id=23456,obs_domain_id=NXM_OF_IN_PORT[],obs_point_id=NXM_NX_CT_LABEL[32..63],sampling_port=56789,egress) NXT_FLOW_MOD: ADD NXM_OF_ETH_TYPE(0800) actions=ct(commit,zone=5) NXT_FLOW_MOD: ADD NXM_OF_ETH_TYPE(0800) actions=ct(commit,exec(load:0x1->NXM_NX_CT_MARK[])) NXT_FLOW_MOD: ADD NXM_OF_ETH_TYPE(0800) actions=ct(commit,exec(load:0x1->NXM_NX_CT_LABEL[0..63],load:0->NXM_NX_CT_LABEL[64..127])) @@ -3271,3 +3285,77 @@ AT_CHECK([ovs-ofctl -O OpenFlow15 dump-flows br0 | ofctl_strip | sed '/OFPST_FLO OVS_VSWITCHD_STOP(["/Flow exceeded the maximum flow statistics reply size and was excluded from the response set/d"]) AT_CLEANUP + +AT_SETUP([ovs-ofctl ct-flush]) +OVS_VSWITCHD_START + +AT_CHECK([ovs-appctl vlog/set ct_dpif:dbg]) + +# Check flush conntrack with both zone and tuple +AT_CHECK([ovs-ofctl ct-flush br0 zone=5 
'ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2,ct_nw_proto=17,ct_tp_src=1']) + +OVS_WAIT_UNTIL([test $(grep -c "|ct_dpif|DBG|.*ct_flush" ovs-vswitchd.log) -eq 1]) +AT_CHECK([grep -q "ct_dpif|DBG|.*ct_flush: zone=5 'ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2,ct_tp_src=1,ct_tp_dst=0,ct_nw_proto=17' 'ct_nw_src=::,ct_nw_dst=::,ct_tp_src=0,ct_tp_dst=0'" ovs-vswitchd.log]) + +# Check flush-conntrack just with tuple +AT_CHECK([ovs-ofctl ct-flush br0 'ct_nw_src=10.1.1.3,ct_nw_dst=10.1.1.4,ct_nw_proto=17,ct_tp_src=1']) + +OVS_WAIT_UNTIL([test $(grep -c "|ct_dpif|DBG|.*ct_flush" ovs-vswitchd.log) -eq 2]) +AT_CHECK([grep -q "ct_dpif|DBG|.*ct_flush: zone=0 'ct_nw_src=10.1.1.3,ct_nw_dst=10.1.1.4,ct_tp_src=1,ct_tp_dst=0,ct_nw_proto=17' 'ct_nw_src=::,ct_nw_dst=::,ct_tp_src=0,ct_tp_dst=0'" ovs-vswitchd.log]) + +# Check flush-conntrack with reply tuple +AT_CHECK([ovs-ofctl ct-flush br0 '' 'ct_nw_src=10.1.1.3,ct_nw_dst=10.1.1.4,ct_nw_proto=17,ct_tp_src=1']) + +OVS_WAIT_UNTIL([test $(grep -c "|ct_dpif|DBG|.*ct_flush" ovs-vswitchd.log) -eq 3]) +AT_CHECK([grep -q "ct_dpif|DBG|.*ct_flush: zone=0 'ct_nw_src=::,ct_nw_dst=::,ct_tp_src=0,ct_tp_dst=0,ct_nw_proto=17' 'ct_nw_src=10.1.1.3,ct_nw_dst=10.1.1.4,ct_tp_src=1,ct_tp_dst=0'" ovs-vswitchd.log]) + +# Check flush-conntrack with zone and reply tuple +AT_CHECK([ovs-ofctl ct-flush br0 zone=5 '' 'ct_nw_src=10.1.1.3,ct_nw_dst=10.1.1.4,ct_nw_proto=17,ct_tp_src=1']) + +OVS_WAIT_UNTIL([test $(grep -c "|ct_dpif|DBG|.*ct_flush" ovs-vswitchd.log) -eq 4]) +AT_CHECK([grep -q "ct_dpif|DBG|.*ct_flush: zone=5 'ct_nw_src=::,ct_nw_dst=::,ct_tp_src=0,ct_tp_dst=0,ct_nw_proto=17' 'ct_nw_src=10.1.1.3,ct_nw_dst=10.1.1.4,ct_tp_src=1,ct_tp_dst=0'" ovs-vswitchd.log]) + +# Check flush-conntrack without any tuple and zone +AT_CHECK([ovs-ofctl ct-flush br0]) + +OVS_WAIT_UNTIL([test $(grep -c "|ct_dpif|DBG|.*ct_flush" ovs-vswitchd.log) -eq 5]) +AT_CHECK([grep -q "ct_dpif|DBG|.*ct_flush: " ovs-vswitchd.log]) + +AT_CHECK([ovs-ofctl ct-flush br0 mark=0]) +OVS_WAIT_UNTIL([test $(grep 
-c "|ct_dpif|DBG|.*ct_flush" ovs-vswitchd.log) -eq 6]) +AT_CHECK([grep -q "ct_dpif|DBG|.*ct_flush: zone=0 mark=0" ovs-vswitchd.log]) + +AT_CHECK([ovs-ofctl ct-flush br0 mark=0/0x5]) +OVS_WAIT_UNTIL([test $(grep -c "|ct_dpif|DBG|.*ct_flush" ovs-vswitchd.log) -eq 7]) +AT_CHECK([grep -q "ct_dpif|DBG|.*ct_flush: zone=0 mark=0/0x5" ovs-vswitchd.log]) + +AT_CHECK([ovs-ofctl ct-flush br0 mark=0xabc/0xdef]) +OVS_WAIT_UNTIL([test $(grep -c "|ct_dpif|DBG|.*ct_flush" ovs-vswitchd.log) -eq 8]) +AT_CHECK([grep -q "ct_dpif|DBG|.*ct_flush: zone=0 mark=0xabc/0xdef" ovs-vswitchd.log]) + +AT_CHECK([ovs-ofctl ct-flush br0 labels=0]) +OVS_WAIT_UNTIL([test $(grep -c "|ct_dpif|DBG|.*ct_flush" ovs-vswitchd.log) -eq 9]) +AT_CHECK([grep -q "ct_dpif|DBG|.*ct_flush: zone=0 labels=0" ovs-vswitchd.log]) + +AT_CHECK([ovs-ofctl ct-flush br0 labels=0/0x5]) +OVS_WAIT_UNTIL([test $(grep -c "|ct_dpif|DBG|.*ct_flush" ovs-vswitchd.log) -eq 10]) +AT_CHECK([grep -q "ct_dpif|DBG|.*ct_flush: zone=0 labels=0/0x5" ovs-vswitchd.log]) + +AT_CHECK([ovs-ofctl ct-flush br0 labels=0xabc/0xdef]) +OVS_WAIT_UNTIL([test $(grep -c "|ct_dpif|DBG|.*ct_flush" ovs-vswitchd.log) -eq 11]) +AT_CHECK([grep -q "ct_dpif|DBG|.*ct_flush: zone=0 labels=0xabc/0xdef" ovs-vswitchd.log]) + +AT_CHECK([ovs-ofctl ct-flush br0 zone=5 mark=25 labels=25]) +OVS_WAIT_UNTIL([test $(grep -c "|ct_dpif|DBG|.*ct_flush" ovs-vswitchd.log) -eq 12]) +AT_CHECK([grep -q "ct_dpif|DBG|.*ct_flush: zone=5 mark=0x19 labels=0x19" ovs-vswitchd.log]) + +AT_CHECK([ovs-ofctl ct-flush br0 zone=5 mark=30/25 labels=30/25]) +OVS_WAIT_UNTIL([test $(grep -c "|ct_dpif|DBG|.*ct_flush" ovs-vswitchd.log) -eq 13]) +AT_CHECK([grep -q "ct_dpif|DBG|.*ct_flush: zone=5 mark=0x1e/0x19 labels=0x1e/0x19" ovs-vswitchd.log]) + +AT_CHECK([ovs-ofctl ct-flush br0 zone=6 mark=30/0 labels=30/0]) +OVS_WAIT_UNTIL([test $(grep -c "|ct_dpif|DBG|.*ct_flush" ovs-vswitchd.log) -eq 14]) +AT_CHECK([grep -q "ct_dpif|DBG|.*ct_flush: zone 6" ovs-vswitchd.log]) + +OVS_VSWITCHD_STOP +AT_CLEANUP diff 
--git a/tests/ovs-router.at b/tests/ovs-router.at index 6dacc2954bc..b3314b3dff0 100644 --- a/tests/ovs-router.at +++ b/tests/ovs-router.at @@ -1,14 +1,111 @@ AT_BANNER([ovs-router]) -AT_SETUP([appctl - route/add with gateway]) +AT_SETUP([appctl - route/add with gateway and pkt_mark]) AT_KEYWORDS([ovs_router]) -OVS_VSWITCHD_START([add-port br0 p2 -- set Interface p2 type=gre \ - options:local_ip=2.2.2.2 options:remote_ip=1.1.1.1 \ - -- add-port br0 p1 -- set interface p1 type=dummy]) +OVS_VSWITCHD_START([add-port br0 p1 -- set Interface p1 type=dummy]) AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 2.2.2.2/24], [0], [OK ]) +AT_CHECK([ovs-appctl ovs/route/add 2.2.2.3/32 br0 pkt_mark=1], [0], [OK +]) AT_CHECK([ovs-appctl ovs/route/add 1.1.1.0/24 br0 2.2.2.10], [0], [OK ]) +AT_CHECK([ovs-appctl ovs/route/add 1.1.2.0/24 br0 2.2.2.10 pkt_mark=2], [0], [OK +]) +AT_CHECK([ovs-appctl ovs/route/add 1.1.3.0/24 br0 pkt_mark=3], [2], [], [dnl +Error while inserting route. +ovs-appctl: ovs-vswitchd: server returned an error +]) +AT_CHECK([ovs-appctl ovs/route/add 1.1.foo.bar/24 br0 2.2.2.10], [2], [], [dnl +Invalid 'ip/plen' parameter +ovs-appctl: ovs-vswitchd: server returned an error +]) +AT_CHECK([ovs-appctl ovs/route/add 2.2.2.4/24 br0 pkt_mark=baz], [2], [], [dnl +Invalid pkt_mark, IP gateway or src_ip +ovs-appctl: ovs-vswitchd: server returned an error +]) +AT_CHECK([ovs-appctl ovs/route/show | grep User | sort], [0], [dnl +User: 1.1.1.0/24 dev br0 GW 2.2.2.10 SRC 2.2.2.2 +User: 1.1.2.0/24 MARK 2 dev br0 GW 2.2.2.10 SRC 2.2.2.2 +User: 2.2.2.3/32 MARK 1 dev br0 SRC 2.2.2.2 +]) +OVS_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([appctl - route/add with src - ipv4]) +AT_KEYWORDS([ovs_router]) +OVS_VSWITCHD_START([add-port br0 p1 -- set Interface p1 type=dummy]) +AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 192.168.9.2/24], [0], [OK +]) +AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 192.168.9.3/24], [0], [OK +]) +AT_CHECK([ovs-appctl ovs/route/add 192.168.9.11/32 br0 
src=192.168.9.3], [0], [OK +]) +AT_CHECK([ovs-appctl ovs/route/add 192.168.10.12/32 br0 192.168.9.1 src=192.168.9.3], [0], [OK +]) +AT_CHECK([ovs-appctl ovs/route/add 192.168.10.13/32 br0 192.168.9.1 pkt_mark=13 src=192.168.9.3], [0], [OK +]) +AT_CHECK([ovs-appctl ovs/route/add 192.168.10.14/32 br0 192.168.9.1 pkt_mark=14 src=192.168.9.2], [0], [OK +]) +AT_CHECK([ovs-appctl ovs/route/add 192.168.10.15/32 br0 192.168.9.1 src=foo.bar.9.200], [2], [], [dnl +Invalid pkt_mark, IP gateway or src_ip +ovs-appctl: ovs-vswitchd: server returned an error +]) +AT_CHECK([ovs-appctl ovs/route/add 192.168.10.16/32 br0 192.168.9.1 src=192.168.9.200], [2], [], [dnl +Error while inserting route. +ovs-appctl: ovs-vswitchd: server returned an error +]) +AT_CHECK([ovs-appctl ovs/route/add 192.168.10.17/32 br0 192.168.11.1 src=192.168.9.3], [2], [], [dnl +Error while inserting route. +ovs-appctl: ovs-vswitchd: server returned an error +]) +AT_CHECK([ovs-appctl ovs/route/add 192.168.10.18/32 br0 src=192.168.9.3], [2], [], [dnl +Error while inserting route. 
+ovs-appctl: ovs-vswitchd: server returned an error +]) +AT_CHECK([ovs-appctl ovs/route/show | grep User | grep 192.168.10 | sort], [0], [dnl +User: 192.168.10.12/32 dev br0 GW 192.168.9.1 SRC 192.168.9.3 +User: 192.168.10.13/32 MARK 13 dev br0 GW 192.168.9.1 SRC 192.168.9.3 +User: 192.168.10.14/32 MARK 14 dev br0 GW 192.168.9.1 SRC 192.168.9.2 +]) +OVS_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([appctl - route/add with src - ipv6]) +AT_KEYWORDS([ovs_router]) +OVS_VSWITCHD_START([add-port br0 p1 -- set Interface p1 type=dummy]) +AT_CHECK([ovs-appctl netdev-dummy/ip6addr br0 2001:db8:cafe::2/64], [0], [OK +]) +AT_CHECK([ovs-appctl netdev-dummy/ip6addr br0 2001:db8:cafe::3/64], [0], [OK +]) +AT_CHECK([ovs-appctl ovs/route/add 2001:db8:cafe::11/128 br0 src=2001:db8:cafe::3], [0], [OK +]) +AT_CHECK([ovs-appctl ovs/route/add 2001:db8:beef::12/128 br0 2001:db8:cafe::1 src=2001:db8:cafe::3], [0], [OK +]) +AT_CHECK([ovs-appctl ovs/route/add 2001:db8:beef::13/128 br0 2001:db8:cafe::1 pkt_mark=13 src=2001:db8:cafe::3], [0], [OK +]) +AT_CHECK([ovs-appctl ovs/route/add 2001:db8:beef::14/128 br0 2001:db8:cafe::1 pkt_mark=14 src=2001:db8:cafe::2], [0], [OK +]) +AT_CHECK([ovs-appctl ovs/route/add 2001:db8:beef::15/128 br0 2001:db8:cafe::1 src=foo:bar:2001:db8:cafe], [2], [], [dnl +Invalid pkt_mark, IP gateway or src_ip +ovs-appctl: ovs-vswitchd: server returned an error +]) +AT_CHECK([ovs-appctl ovs/route/add 2001:db8:beef::16/128 br0 2001:db8:cafe::1 src=2001:db8:cafe::200], [2], [], [dnl +Error while inserting route. +ovs-appctl: ovs-vswitchd: server returned an error +]) +AT_CHECK([ovs-appctl ovs/route/add 2001:db8:beef::17/128 br0 2001:db8:face::1 src=2001:db8:cafe::3], [2], [], [dnl +Error while inserting route. +ovs-appctl: ovs-vswitchd: server returned an error +]) +AT_CHECK([ovs-appctl ovs/route/add 2001:db8:beef::18/128 br0 src=2001:db8:cafe::3], [2], [], [dnl +Error while inserting route. 
+ovs-appctl: ovs-vswitchd: server returned an error +]) +AT_CHECK([ovs-appctl ovs/route/show | grep User | grep 2001:db8:beef | sort], [0], [dnl +User: 2001:db8:beef::12/128 dev br0 GW 2001:db8:cafe::1 SRC 2001:db8:cafe::3 +User: 2001:db8:beef::13/128 MARK 13 dev br0 GW 2001:db8:cafe::1 SRC 2001:db8:cafe::3 +User: 2001:db8:beef::14/128 MARK 14 dev br0 GW 2001:db8:cafe::1 SRC 2001:db8:cafe::2 +]) OVS_VSWITCHD_STOP AT_CLEANUP diff --git a/tests/ovs-vsctl.at b/tests/ovs-vsctl.at index d6cd2c0849a..febb9dadf19 100644 --- a/tests/ovs-vsctl.at +++ b/tests/ovs-vsctl.at @@ -425,6 +425,7 @@ AT_CHECK([RUN_OVS_VSCTL_ONELINE( [add-port a a1], [add-bond a bond0 a2 a3], [br-set-external-id a key0 value0], + [add Bridge a external_ids key0=value1], [set port a1 external-ids:key1=value1], [set interface a2 external-ids:key2=value2], [set interface a2 external-ids:key3=value3], @@ -446,6 +447,7 @@ AT_CHECK([RUN_OVS_VSCTL_ONELINE( + key0=value0 value0 @@ -973,6 +975,67 @@ AT_CHECK( [0], [stdout]) AT_CHECK([RUN_OVS_VSCTL([list-zone-tp netdev])], [0], [Zone:10, Timeout Policies: system default ]) +AT_CHECK([RUN_OVS_VSCTL([--if-exists del-zone-tp netdev zone=10])]) + +AT_CHECK([RUN_OVS_VSCTL([list-zone-limits netdev])]) + +AT_CHECK([RUN_OVS_VSCTL([set-zone-limit netdev 1 1])]) +AT_CHECK([RUN_OVS_VSCTL([list-zone-limits netdev])], [0], [dnl +Zone: 1, Limit: 1 +]) + +AT_CHECK([RUN_OVS_VSCTL([set-zone-limit netdev 1 5])]) +AT_CHECK([RUN_OVS_VSCTL([list-zone-limits netdev])], [0], [dnl +Zone: 1, Limit: 5 +]) + +AT_CHECK([RUN_OVS_VSCTL([del-zone-limit netdev 1])]) +AT_CHECK([RUN_OVS_VSCTL([list-zone-limits netdev])]) + +AT_CHECK([RUN_OVS_VSCTL([set-zone-limit netdev 10 5])]) +AT_CHECK([RUN_OVS_VSCTL([list-zone-limits netdev])], [0], [dnl +Zone: 10, Limit: 5 +]) + +AT_CHECK([RUN_OVS_VSCTL([add-zone-tp netdev zone=10 icmp_first=1 icmp_reply=2])]) +AT_CHECK([RUN_OVS_VSCTL([list-zone-tp netdev])], [0], [dnl +Zone:10, Timeout Policies: icmp_first=1 icmp_reply=2 +]) + 
+AT_CHECK([RUN_OVS_VSCTL([del-zone-limit netdev 10])]) +AT_CHECK([RUN_OVS_VSCTL([list-zone-limits netdev])]) +AT_CHECK([RUN_OVS_VSCTL([list-zone-tp netdev])], [0], [dnl +Zone:10, Timeout Policies: icmp_first=1 icmp_reply=2 +]) + +AT_CHECK([RUN_OVS_VSCTL([set-zone-limit netdev 10 5])]) +AT_CHECK([RUN_OVS_VSCTL([del-zone-tp netdev zone=10])]) +AT_CHECK([RUN_OVS_VSCTL([list-zone-limits netdev])], [0], [dnl +Zone: 10, Limit: 5 +]) +AT_CHECK([RUN_OVS_VSCTL([list-zone-tp netdev])], [0], [dnl +Zone:10, Timeout Policies: system default +]) + +AT_CHECK([RUN_OVS_VSCTL([set-zone-limit netdev default 5])]) +AT_CHECK([RUN_OVS_VSCTL([list-zone-limits netdev])], [0], [dnl +Default, Limit: 5 +Zone: 10, Limit: 5 +]) + +AT_CHECK([RUN_OVS_VSCTL([set-zone-limit netdev default 10])]) +AT_CHECK([RUN_OVS_VSCTL([list-zone-limits netdev])], [0], [dnl +Default, Limit: 10 +Zone: 10, Limit: 5 +]) + +AT_CHECK([RUN_OVS_VSCTL([del-zone-limit netdev default])]) +AT_CHECK([RUN_OVS_VSCTL([list-zone-limits netdev])], [0], [dnl +Zone: 10, Limit: 5 +]) + +AT_CHECK([RUN_OVS_VSCTL([--if-exists del-zone-limit netdev default])]) + AT_CHECK([RUN_OVS_VSCTL([-- --id=@m create Datapath datapath_version=0 'capabilities={recirc=true}' -- set Open_vSwitch . 
datapaths:"system"=@m])], [0], [stdout]) AT_CHECK([RUN_OVS_VSCTL([list-dp-cap system])], [0], [recirc=true @@ -1111,16 +1174,39 @@ AT_CHECK([RUN_OVS_VSCTL([add-zone-tp netdevxx zone=1 icmp_first=1 icmp_reply=2]) ]) AT_CHECK([RUN_OVS_VSCTL([add-zone-tp netdev zone=2 icmp_first=2 icmp_reply=3])]) AT_CHECK([RUN_OVS_VSCTL([add-zone-tp netdev zone=2 icmp_first=2 icmp_reply=3])], - [1], [], [ovs-vsctl: zone id 2 already exists + [1], [], [ovs-vsctl: zone id 2 already has a policy ]) AT_CHECK([RUN_OVS_VSCTL([list-zone-tp netdev])], [0], [Zone:2, Timeout Policies: icmp_first=2 icmp_reply=3 ]) AT_CHECK([RUN_OVS_VSCTL([del-zone-tp netdev zone=11])], - [1], [], [ovs-vsctl: zone id 11 does not exist + [1], [], [ovs-vsctl: zone id 11 does not have a policy ]) AT_CHECK([RUN_OVS_VSCTL([list-zone-tp netdev])], [0], [Zone:2, Timeout Policies: icmp_first=2 icmp_reply=3 ]) +AT_CHECK([RUN_OVS_VSCTL([set-zone-limit netdevxx 5 1])], + [1], [], [ovs-vsctl: datapath netdevxx does not exist +]) +AT_CHECK([RUN_OVS_VSCTL([set-zone-limit netdev 88888 1])], + [1], [], [ovs-vsctl: zone_id (88888) out of range +]) +AT_CHECK([RUN_OVS_VSCTL([set-zone-limit netdev 5 -1])], + [1], [], [ovs-vsctl: limit (-1) out of range +]) +AT_CHECK([RUN_OVS_VSCTL([del-zone-limit netdev 10])], + [1], [], [ovs-vsctl: zone_id 10 does not have a limit +]) + +AT_CHECK([RUN_OVS_VSCTL([set-zone-limit netdevxx default 1])], + [1], [], [ovs-vsctl: datapath netdevxx does not exist +]) +AT_CHECK([RUN_OVS_VSCTL([set-zone-limit netdev default -1])], + [1], [], [ovs-vsctl: limit (-1) out of range +]) +AT_CHECK([RUN_OVS_VSCTL([del-zone-limit netdev default])], + [1], [], [ovs-vsctl: datapath netdev does not have a limit +]) + AT_CHECK([RUN_OVS_VSCTL([-- --id=@m create Datapath datapath_version=0 'capabilities={recirc=true}' -- set Open_vSwitch . 
datapaths:"system"=@m])], [0], [stdout]) AT_CHECK([RUN_OVS_VSCTL([list-dp-cap nosystem])], [1], [], [ovs-vsctl: datapath "nosystem" record not found @@ -1710,3 +1796,28 @@ ingress_policing_kpkts_rate: 100 ]) OVS_VSCTL_CLEANUP AT_CLEANUP + +AT_SETUP([ovs-vsctl create bridge with uuid]) +AT_KEYWORDS([create bridge with uuid]) +OVS_VSCTL_SETUP + +AT_CHECK([ovs-vsctl --no-wait --id=c5cc12f8-eaa1-43a7-8a73-bccd18df1111 create bridge \ +name=tst0 -- add open . bridges c5cc12f8-eaa1-43a7-8a73-bccd18df1111], [0],[dnl +c5cc12f8-eaa1-43a7-8a73-bccd18df1111 +]) + +AT_CHECK([ovs-vsctl --no-wait --id=c5cc12f8-eaa1-43a7-8a73-bccd18df1111 create bridge \ +name=tst1 -- add open . bridges c5cc12f8-eaa1-43a7-8a73-bccd18df1111], [1], [ignore], [ignore]) + +AT_CHECK([ovs-vsctl --no-wait --bare --columns _uuid,name list bridge], [0], [dnl +c5cc12f8-eaa1-43a7-8a73-bccd18df1111 +tst0 +]) + +ovs-vsctl --no-wait --id=@a create bridge \ +name=tst1 -- add open . bridges @a + +AT_CHECK([ovs-vsctl --no-wait --bare --columns _uuid,name list bridge tst1], [0], [ignore]) + +OVS_VSCTL_CLEANUP +AT_CLEANUP diff --git a/tests/ovs-vswitchd.at b/tests/ovs-vswitchd.at index 977b2eba1f2..730363e8357 100644 --- a/tests/ovs-vswitchd.at +++ b/tests/ovs-vswitchd.at @@ -265,3 +265,36 @@ OFPT_FEATURES_REPLY: dpid:$orig_dpid OVS_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([ovs-vswitchd version]) +OVS_VSWITCHD_START + +AT_CHECK([ovs-appctl version], [0], [ignore]) +ovs_version=$(ovs-appctl version) + +AT_CHECK_UNQUOTED([ovs-appctl --format json version], [0], [dnl +{"reply":"$ovs_version","reply-format":"plain"} +]) + +AT_CHECK_UNQUOTED([ovs-appctl --format json --pretty version], [0], [dnl +{ + "reply": "$ovs_version", + "reply-format": "plain"} +]) + +AT_CLEANUP + +AT_SETUP([ovs-vswitchd list-commands]) +OVS_VSWITCHD_START + +AT_CHECK([ovs-appctl list-commands], [0], [ignore]) +AT_CHECK([ovs-appctl --format json list-commands], [0], [stdout]) + +# Check that ovs-appctl prints a single line with a trailing newline. 
+AT_CHECK([wc -l stdout], [0], [1 stdout +]) + +# Check that ovs-appctl prints a JSON document. +AT_CHECK([ovstest test-json stdout], [0], [ignore]) + +AT_CLEANUP diff --git a/tests/ovsdb-client.at b/tests/ovsdb-client.at index 2d14f1ac262..dcddb258745 100644 --- a/tests/ovsdb-client.at +++ b/tests/ovsdb-client.at @@ -5,7 +5,7 @@ AT_KEYWORDS([ovsdb client positive]) ordinal_schema > schema on_exit 'kill `cat *.pid`' AT_CHECK([ovsdb-tool create db schema], [0], [ignore], [ignore]) -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket db], [0], [ignore], [ignore]) +AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile --remote=punix:socket db], [0], [ignore], [ignore]) AT_CHECK([ovsdb-client get-schema-version unix:socket ordinals], [0], [5.1.3 ]) AT_CHECK([ovsdb-client get-schema-cksum unix:socket ordinals], [0], [12345678 9 @@ -19,7 +19,7 @@ on_exit 'kill `cat *.pid`' ordinal_schema > schema touch .db.~lock~ AT_CHECK([ovsdb-tool create db schema], [0], [], [ignore]) -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket db], [0], [ignore], [ignore]) +AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile --remote=punix:socket db], [0], [ignore], [ignore]) AT_CHECK([ovsdb-client needs-conversion unix:socket schema], [0], [no ]) OVSDB_SERVER_SHUTDOWN @@ -31,7 +31,7 @@ ordinal_schema > schema touch .db.~lock~ on_exit 'kill `cat *.pid`' AT_CHECK([ovsdb-tool create db schema], [0], [], [ignore]) -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket db], [0], [ignore], [ignore]) +AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile --remote=punix:socket db], [0], [ignore], [ignore]) sed 's/5\.1\.3/5.1.4/' < schema > schema2 AT_CHECK([diff schema schema2], [1], [ignore]) AT_CHECK([ovsdb-client needs-conversion unix:socket schema2], [0], [yes @@ -134,7 +134,7 @@ _uuid name number ]) dnl Stop the database server, then re-start it based on the backup. 
-OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN AT_CHECK([ovsdb-server -vfile -vvlog:off --detach --no-chdir --pidfile --log-file --remote=punix:db.sock backup], [0]) dnl Dump a new copy of the data. @@ -195,7 +195,7 @@ ordinals table _uuid,name,number ]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN AT_CLEANUP @@ -254,7 +254,7 @@ _uuid,name,number ]) dnl Stopping the server. -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN dnl ovsdb-client should exit by itself after disconnection form the server. OVS_WAIT_WHILE([test -e ovsdb-client.pid]) @@ -270,8 +270,8 @@ AT_CHECK([ovsdb-client --replay=./replay_dir dnl dnl Waiting for client to exit the same way as it exited during recording. OVS_WAIT_WHILE([test -e ovsdb-client.pid]) -AT_CHECK([diff monitor.stdout monitor-replay.stdout]) -AT_CHECK([diff monitor.stderr monitor-replay.stderr]) +AT_CHECK([diff -u monitor.stdout monitor-replay.stdout]) +AT_CHECK([diff -u monitor.stderr monitor-replay.stderr]) dnl Stripping out timestamps, PIDs and poll_loop warnings from the log. dnl Also stripping socket_util errors as sockets are not used in replay. 
@@ -284,6 +284,6 @@ m4_define([CLEAN_LOG_FILE], CLEAN_LOG_FILE([monitor.log], [monitor.log.clear]) CLEAN_LOG_FILE([monitor-replay.log], [monitor-replay.log.clear]) -AT_CHECK([diff monitor.log.clear monitor-replay.log.clear]) +AT_CHECK([diff -u monitor.log.clear monitor-replay.log.clear]) AT_CLEANUP diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at index 920b833b721..9d8b4d06a4a 100644 --- a/tests/ovsdb-cluster.at +++ b/tests/ovsdb-cluster.at @@ -8,7 +8,7 @@ ovsdb_check_cluster () { $schema_func > schema schema=`ovsdb-tool schema-name schema` AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db schema unix:s1.raft], [0], [], [stderr]) - if test X$local_config == X"yes"; then + if test X$local_config = X"yes"; then for i in `seq $n`; do AT_CHECK([ovsdb-tool create c$i.db $top_srcdir/ovsdb/local-config.ovsschema], [0], [], [stderr]) local ctxn="[[\"Local_Config\", @@ -30,7 +30,7 @@ ovsdb_check_cluster () { for i in `seq $n`; do local remote=punix:s$i.ovsdb local config_db= - if test X$local_config == X"yes"; then + if test X$local_config = X"yes"; then remote=db:Local_Config,Config,connections config_db=c$i.db fi @@ -104,8 +104,6 @@ ovsdb_test_cluster_disconnect () { n=$1 leader_or_follower=$2 check_flapping=$3 - schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` - ordinal_schema > schema AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr]) cid=`ovsdb-tool db-cid s1.db` schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` @@ -129,7 +127,7 @@ ovsdb_test_cluster_disconnect () { # When a node is disconnected from the cluster, the IDL should disconnect # and retry even if it uses a single remote, because the remote IP can be # a VIP on a load-balance. So we use single remote to test here. 
- if test $leader_or_follower == "leader"; then + if test $leader_or_follower = "leader"; then target=1 shutdown=`seq $(($n/2 + 1)) $n` cleanup=`seq $(($n/2))` @@ -188,13 +186,13 @@ ovsdb_test_cluster_disconnect () { count_old=`grep "raft_is_connected: true" raft_is_connected.log | wc -l` echo count_old $count_old - if test X$check_flapping == X"yes"; then + if test X$check_flapping = X"yes"; then sleep 10 fi # Make sure raft_is_connected didn't flap from false to true. count_new=`grep "raft_is_connected: true" raft_is_connected.log | wc -l` echo count_new $count_new - AT_CHECK([test $count_new == $count_old]) + AT_CHECK([test $count_new = $count_old]) for i in $cleanup; do OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid]) @@ -221,8 +219,6 @@ AT_SETUP([OVSDB cluster - initial status should be disconnected]) AT_KEYWORDS([ovsdb server negative unix cluster disconnect]) n=3 -schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` -ordinal_schema > schema AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr]) cid=`ovsdb-tool db-cid s1.db` schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` @@ -260,8 +256,6 @@ AT_SETUP([OVSDB cluster - election timer change]) AT_KEYWORDS([ovsdb server positive unix cluster timer]) n=3 -schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` -ordinal_schema > schema AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr]) cid=`ovsdb-tool db-cid s1.db` schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` @@ -353,8 +347,6 @@ AT_SETUP([OVSDB cluster - install snapshot RPC]) AT_KEYWORDS([ovsdb server positive unix cluster snapshot]) n=3 -schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` -ordinal_schema > schema AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema 
unix:s1.raft], [0], [], [stderr]) cid=`ovsdb-tool db-cid s1.db` schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` @@ -433,8 +425,6 @@ AT_SETUP([OVSDB cluster - follower crash while joining]) AT_KEYWORDS([ovsdb server negative unix cluster join]) n=3 -schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` -ordinal_schema > schema AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db dnl $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr]) cid=`ovsdb-tool db-cid s1.db` @@ -483,6 +473,112 @@ done AT_CLEANUP +AT_SETUP([OVSDB cluster - leadership change after replication while joining]) +AT_KEYWORDS([ovsdb server negative unix cluster join]) + +n=5 +AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db dnl + $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr]) +cid=$(ovsdb-tool db-cid s1.db) +schema_name=$(ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema) +for i in $(seq 2 $n); do + AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft]) +done + +on_exit 'kill $(cat *.pid)' +on_exit " + for i in \$(ls $(pwd)/s[[0-$n]]); do + ovs-appctl --timeout 1 -t \$i cluster/status $schema_name; + done +" + +dnl Starting servers one by one asking all existing servers to transfer +dnl leadership after append reply forcing the joining server to try another +dnl one that will also transfer leadership. Since transfer is happening +dnl after the servers update is replicated to other servers, one of the +dnl other servers will actually commit it. It may be a new leader from +dnl one of the old members or the new joining server itself. +for i in $(seq $n); do + dnl Make sure that all already started servers joined the cluster. 
+ for j in $(seq $((i - 1)) ); do + AT_CHECK([ovsdb_client_wait unix:s$j.ovsdb $schema_name connected]) + done + for j in $(seq $((i - 1)) ); do + OVS_WAIT_UNTIL([ovs-appctl -t "$(pwd)"/s$j \ + cluster/failure-test \ + transfer-leadership-after-sending-append-request \ + | grep -q "engaged"]) + done + + AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off \ + --detach --no-chdir --log-file=s$i.log \ + --pidfile=s$i.pid --unixctl=s$i \ + --remote=punix:s$i.ovsdb s$i.db]) +done + +dnl Make sure that all servers joined the cluster. +for i in $(seq $n); do + AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected]) +done + +for i in $(seq $n); do + OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s$i], [s$i.pid]) +done + +AT_CLEANUP + +AT_SETUP([OVSDB cluster - leadership change before replication while joining]) +AT_KEYWORDS([ovsdb server negative unix cluster join]) + +n=5 +AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db dnl + $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr]) +cid=$(ovsdb-tool db-cid s1.db) +schema_name=$(ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema) +for i in $(seq 2 $n); do + AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft]) +done + +on_exit 'kill $(cat *.pid)' +on_exit " + for i in \$(ls $(pwd)/s[[0-$n]]); do + ovs-appctl --timeout 1 -t \$i cluster/status $schema_name; + done +" + +dnl Starting servers one by one asking all existing servers to transfer +dnl leadership right after starting to add a server. Joining server will +dnl need to find a new leader that will also transfer leadership. +dnl This will continue until the same server will not become a leader +dnl for the second time and will be able to add a new server. +for i in $(seq $n); do + dnl Make sure that all already started servers joined the cluster. 
+ for j in $(seq $((i - 1)) ); do + AT_CHECK([ovsdb_client_wait unix:s$j.ovsdb $schema_name connected]) + done + for j in $(seq $((i - 1)) ); do + OVS_WAIT_UNTIL([ovs-appctl -t "$(pwd)"/s$j \ + cluster/failure-test \ + transfer-leadership-after-starting-to-add \ + | grep -q "engaged"]) + done + + AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off \ + --detach --no-chdir --log-file=s$i.log \ + --pidfile=s$i.pid --unixctl=s$i \ + --remote=punix:s$i.ovsdb s$i.db]) +done + +dnl Make sure that all servers joined the cluster. +for i in $(seq $n); do + AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected]) +done + +for i in $(seq $n); do + OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s$i], [s$i.pid]) +done + +AT_CLEANUP OVS_START_SHELL_HELPERS @@ -493,7 +589,7 @@ ovsdb_cluster_failure_test () { remote_2=$2 crash_node=$3 crash_command=$4 - if test "$crash_node" == "1"; then + if test "$crash_node" = "1"; then new_leader=$5 fi log_grep=$6 @@ -536,7 +632,7 @@ ovsdb_cluster_failure_test () { # To ensure $new_leader node the new leader, we delay election timer for # the other follower. 
if test -n "$new_leader"; then - if test "$new_leader" == "2"; then + if test "$new_leader" = "2"; then delay_election_node=3 else delay_election_node=2 @@ -665,8 +761,6 @@ AT_SETUP([OVSDB cluster - competing candidates]) AT_KEYWORDS([ovsdb server negative unix cluster competing-candidates]) n=3 -schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` -ordinal_schema > schema AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr]) cid=`ovsdb-tool db-cid s1.db` schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` @@ -715,6 +809,49 @@ done AT_CLEANUP + +AT_SETUP([OVSDB cluster - disruptive server]) +AT_KEYWORDS([ovsdb server negative unix cluster disruptive]) + +n=3 +AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster \ + s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr]) +cid=$(ovsdb-tool db-cid s1.db) +schema_name=$(ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema) +for i in $(seq 2 $n); do + AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft]) +done + +on_exit 'kill $(cat *.pid)' +for i in $(seq $n); do + AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir \ + --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i \ + --remote=punix:s$i.ovsdb s$i.db]) +done +for i in $(seq $n); do + AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected]) +done + +# An unstable follower shouldn't disrupt the healthy cluster - shouldn't +# trigger term change. +AT_CHECK([ovs-appctl -t $(pwd)/s2 cluster/failure-test stop-raft-rpc], [0], [ignore]) +OVS_WAIT_UNTIL([ovs-appctl -t $(pwd)/s2 cluster/status $schema_name | grep "Role: candidate"]) +AT_CHECK([ovs-appctl -t $(pwd)/s2 cluster/failure-test clear], [0], [ignore]) + +# Should step back to follower. +OVS_WAIT_UNTIL([ovs-appctl -t $(pwd)/s2 cluster/status $schema_name | grep "Role: follower"]) + +# No term change. 
+for i in $(seq $n); do + AT_CHECK([ovs-appctl -t $(pwd)/s$i cluster/status $schema_name | grep "Term: 1"], [0], [ignore]) +done + +for i in $(seq $n); do + OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s$i], [s$i.pid]) +done + +AT_CLEANUP + AT_BANNER([OVSDB - cluster tests]) diff --git a/tests/ovsdb-execution.at b/tests/ovsdb-execution.at index e72bf060697..1ffa2b73854 100644 --- a/tests/ovsdb-execution.at +++ b/tests/ovsdb-execution.at @@ -728,6 +728,53 @@ dnl collide (only) with their previous values (succeeds). [{"count":2},{"uuid":["uuid","<6>"]},{"uuid":["uuid","<7>"]},{"rows":[{"name":"new one","number":1},{"name":"new two","number":2},{"name":"old one","number":10},{"name":"old two","number":20}]}] ]]) +OVSDB_CHECK_EXECUTION([size constraints on sets], + [constraint_schema], + [ + [[["constraints", + {"op": "insert", + "table": "b", + "row": {"b": 1} + }]]], + [[["constraints", + {"op": "mutate", + "table": "b", + "where": [], + "mutations": [["x", "delete", 0]] + }]]], + [[["constraints", + {"op": "mutate", + "table": "b", + "where": [], + "mutations": [["x", "insert", 1]] + }]]], + [[["constraints", + {"op": "update", + "table": "b", + "where": [], + "row": {"x": ["set", [3, 4]]} + }]]], + [[["constraints", + {"op": "mutate", + "table": "b", + "where": [], + "mutations": [["x", "insert", 5]] + }]]], + [[["constraints", + {"op": "mutate", + "table": "b", + "where": [], + "mutations": [["x", "delete", 4], ["x", "insert", 5]] + }]]] + ], + [[[{"uuid":["uuid","<0>"]}] +[{"details":"Attempted to store 0 elements in set of 1 to 2 integers.","error":"constraint violation"}] +[{"count":1}] +[{"count":1}] +[{"details":"Attempted to store 3 elements in set of 1 to 2 integers.","error":"constraint violation"}] +[{"count":1}] +]]) + OVSDB_CHECK_EXECUTION([referential integrity -- simple], [constraint_schema], [[[["constraints", @@ -751,12 +798,6 @@ OVSDB_CHECK_EXECUTION([referential integrity -- simple], {"op": "delete", "table": "b", "where": []}]]], -dnl Check that 
"mutate" honors number-of-elements constraints on sets and maps. - [[["constraints", - {"op": "mutate", - "table": "b", - "where": [], - "mutations": [["x", "delete", 0]]}]]], [[["constraints", {"op": "delete", "table": "a", @@ -783,7 +824,6 @@ dnl Check that "mutate" honors number-of-elements constraints on sets and maps. "where": []}]]]], [[[{"uuid":["uuid","<0>"]},{"uuid":["uuid","<1>"]},{"uuid":["uuid","<2>"]},{"uuid":["uuid","<3>"]}] [{"count":1},{"details":"cannot delete b row <0> because of 3 remaining reference(s)","error":"referential integrity violation"}] -[{"details":"Attempted to store 0 elements in set of 1 to 2 integers.","error":"constraint violation"}] [{"count":1}] [{"count":1},{"details":"cannot delete b row <0> because of 2 remaining reference(s)","error":"referential integrity violation"}] [{"count":1}] @@ -1161,4 +1201,55 @@ OVSDB_CHECK_EXECUTION([garbage collection], [{"rows":[]}] ]])]) +OVSDB_CHECK_EXECUTION([insert rows, count with mutation], + [ordinal_schema], + [[[["ordinals", + {"op": "insert", + "table": "ordinals", + "row": {"number": 0, "name": "zero"}, + "uuid-name": "first"}]]], + [[["ordinals", + {"op": "insert", + "table": "ordinals", + "row": {"number": 1, "name": "one"}, + "uuid-name": "first"}]]], + [[["ordinals", + {"op": "mutate", + "table": "ordinals", + "where": [["name", "==", "zero"]], + "mutations": []}]]], + [[["ordinals", + {"op": "mutate", + "table": "ordinals", + "where": [["name", "==", "one"]], + "mutations": []}]]], + [[["ordinals", + {"op": "insert", + "table": "ordinals", + "row": {"number": 2, "name": "one"}, + "uuid-name": "first"}]]], + [[["ordinals", + {"op": "mutate", + "table": "ordinals", + "where": [["name", "==", "one"]], + "mutations": []}]]], + [[["ordinals", + {"op": "delete", + "table": "ordinals", + "where": [["name", "==", "zero"]]}]]], + [[["ordinals", + {"op": "mutate", + "table": "ordinals", + "where": [], + "mutations": []}]]]], + [[[{"uuid":["uuid","<0>"]}] +[{"uuid":["uuid","<1>"]}] 
+[{"count":1}] +[{"count":1}] +[{"uuid":["uuid","<2>"]}] +[{"count":2}] +[{"count":1}] +[{"count":2}] +]]) + EXECUTION_EXAMPLES diff --git a/tests/ovsdb-idl.at b/tests/ovsdb-idl.at index 8e75d00d7cc..9070ea051a6 100644 --- a/tests/ovsdb-idl.at +++ b/tests/ovsdb-idl.at @@ -1,17 +1,6 @@ AT_BANNER([OVSDB -- interface description language (IDL)]) m4_divert_text([PREPARE_TESTS], [ -# ovsdb_start_idltest [REMOTE] [SCHEMA] -# -# Creates a database using SCHEMA (default: idltest.ovsschema) and -# starts a database server listening on punix:socket and REMOTE (if -# specified). -ovsdb_start_idltest () { - ovsdb-tool create db ${2:-$abs_srcdir/idltest.ovsschema} || return $? - ovsdb-server -vconsole:warn --log-file --detach --no-chdir --pidfile --remote=punix:socket ${1:+--remote=$1} db || return $? - on_exit 'kill `cat ovsdb-server.pid`' -} - # ovsdb_cluster_leader [REMOTES] [DATABASE] # # Returns the leader of the DATABASE cluster. @@ -29,6 +18,29 @@ ovsdb_cluster_leader () { done }]) + +# OVSDB_START_IDLTEST([REMOTE], [SCHEMA]) +# +# Creates a database using SCHEMA (default: idltest.ovsschema) and +# starts a database server listening on punix:socket and REMOTE (if +# specified). 
+m4_define([OVSDB_START_IDLTEST], +[ + AT_CHECK([ovsdb-tool create db dnl + m4_if([$2], [], [$abs_srcdir/idltest.ovsschema], [$2])]) + PKIDIR=$abs_top_builddir/tests + AT_CHECK([ovsdb-server -vconsole:warn -vfile:dbg --log-file dnl + --detach --no-chdir --pidfile --remote=punix:socket dnl + m4_if(m4_substr($1, 0, 5), [pssl:], + [--private-key=$PKIDIR/testpki-privkey2.pem dnl + --certificate=$PKIDIR/testpki-cert2.pem dnl + --ca-cert=$PKIDIR/testpki-cacert.pem], []) dnl + m4_if([$1], [], [], [--remote=$1]) db dnl + ]) + on_exit 'kill `cat ovsdb-server.pid`' +]) + + # OVSDB_CLUSTER_START_IDLTEST([N], [REMOTE]) # # Creates a clustered database using idltest.ovsschema and starts a database @@ -45,9 +57,9 @@ m4_define([OVSDB_CLUSTER_START_IDLTEST], done on_exit 'kill $(cat s*.pid)' for i in $(seq $n); do - AT_CHECK([ovsdb-server -vraft -vconsole:warn --detach --no-chdir \ - --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i \ - --remote=punix:s$i.ovsdb \ + AT_CHECK([ovsdb-server -vraft -vconsole:warn -vfile:dbg --detach \ + --no-chdir --log-file=s$i.log --pidfile=s$i.pid \ + --unixctl=s$i --remote=punix:s$i.ovsdb \ m4_if([$2], [], [], [--remote=$2]) s$i.db]) done @@ -77,7 +89,7 @@ m4_define([OVSDB_CLUSTER_START_IDLTEST], m4_define([OVSDB_CHECK_IDL_C], [AT_SETUP([$1 - C]) AT_KEYWORDS([ovsdb server idl positive $5]) - AT_CHECK([ovsdb_start_idltest]) + OVSDB_START_IDLTEST m4_if([$2], [], [], [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])]) AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 idl unix:socket $3], @@ -91,10 +103,10 @@ m4_define([OVSDB_CHECK_IDL_C], m4_define([OVSDB_CHECK_IDL_WRITE_CHANGED_ONLY_C], [AT_SETUP([$1 - write-changed-only - C]) AT_KEYWORDS([ovsdb server idl positive $5]) - AT_CHECK([ovsdb_start_idltest]) + OVSDB_START_IDLTEST m4_if([$2], [], [], [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])]) - AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 idl 
unix:socket $3], + AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 --write-changed-only idl unix:socket $3], [0], [stdout], [ignore]) AT_CHECK([sort stdout | uuidfilt]m4_if([$6],,, [[| $6]]), [0], [$4]) @@ -105,7 +117,7 @@ m4_define([OVSDB_CHECK_IDL_WRITE_CHANGED_ONLY_C], m4_define([OVSDB_CHECK_IDL_TCP_C], [AT_SETUP([$1 - C - tcp]) AT_KEYWORDS([ovsdb server idl positive tcp socket $5]) - AT_CHECK([ovsdb_start_idltest "ptcp:0:127.0.0.1"]) + OVSDB_START_IDLTEST(["ptcp:0:127.0.0.1"]) PARSE_LISTENING_PORT([ovsdb-server.log], [TCP_PORT]) m4_if([$2], [], [], @@ -123,7 +135,7 @@ m4_define([OVSDB_CHECK_IDL_TCP6_C], AT_SKIP_IF([test "$IS_WIN32" = "yes"]) AT_SKIP_IF([test $HAVE_IPV6 = no]) AT_KEYWORDS([ovsdb server idl positive tcp6 socket $5]) - AT_CHECK([ovsdb_start_idltest "ptcp:0:[[::1]]"]) + OVSDB_START_IDLTEST(["ptcp:0:[[::1]]"]) PARSE_LISTENING_PORT([ovsdb-server.log], [TCP_PORT]) m4_if([$2], [], [], @@ -139,7 +151,7 @@ m4_define([OVSDB_CHECK_IDL_TCP6_C], m4_define([OVSDB_CHECK_IDL_PY], [AT_SETUP([$1 - Python3]) AT_KEYWORDS([ovsdb server idl positive Python $5]) - AT_CHECK([ovsdb_start_idltest]) + OVSDB_START_IDLTEST m4_if([$2], [], [], [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])]) AT_CHECK([$PYTHON3 $srcdir/test-ovsdb.py -t10 idl $srcdir/idltest.ovsschema unix:socket $3], @@ -152,11 +164,20 @@ m4_define([OVSDB_CHECK_IDL_PY], m4_define([OVSDB_CHECK_IDL_REGISTER_COLUMNS_PY], [AT_SETUP([$1 - Python3 - register_columns]) AT_KEYWORDS([ovsdb server idl positive Python register_columns $5]) - AT_CHECK([ovsdb_start_idltest]) + OVSDB_START_IDLTEST m4_if([$2], [], [], [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])]) - AT_CHECK([$PYTHON3 $srcdir/test-ovsdb.py -t10 idl $srcdir/idltest.ovsschema unix:socket ?simple:b,ba,i,ia,r,ra,s,sa,u,ua?simple3:name,uset,uref?simple4:name?simple6:name,weak_ref?link1:i,k,ka,l2?link2:i,l1?singleton:name $3], - [0], [stdout], [ignore]) + m4_define([REGISTER], 
m4_joinall([?], [], + [simple:b,ba,i,ia,r,ra,s,sa,u,ua], + [simple3:name,uset,uref], + [simple4:name], + [simple6:name,weak_ref], + [link1:i,k,ka,l2], + [link2:i,l1], + [indexed:i], + [singleton:name])) + AT_CHECK([$PYTHON3 $srcdir/test-ovsdb.py -t10 idl $srcdir/idltest.ovsschema \ + unix:socket REGISTER $3], [0], [stdout], [ignore]) AT_CHECK([sort stdout | uuidfilt]m4_if([$6],,, [[| $6]]), [0], [$4]) OVSDB_SERVER_SHUTDOWN @@ -166,7 +187,7 @@ m4_define([OVSDB_CHECK_IDL_REGISTER_COLUMNS_PY], m4_define([OVSDB_CHECK_IDL_TCP_PY], [AT_SETUP([$1 - Python3 - tcp]) AT_KEYWORDS([ovsdb server idl positive Python with tcp socket $5]) - AT_CHECK([ovsdb_start_idltest "ptcp:0:127.0.0.1"]) + OVSDB_START_IDLTEST(["ptcp:0:127.0.0.1"]) PARSE_LISTENING_PORT([ovsdb-server.log], [TCP_PORT]) m4_if([$2], [], [], @@ -183,7 +204,7 @@ m4_define([OVSDB_CHECK_IDL_TCP_PY], m4_define([OVSDB_CHECK_IDL_TCP_MULTIPLE_REMOTES_PY], [AT_SETUP([$1 - Python3 (multiple remotes) - tcp]) AT_KEYWORDS([ovsdb server idl positive Python with tcp socket $5]) - AT_CHECK([ovsdb_start_idltest "ptcp:0:127.0.0.1"]) + OVSDB_START_IDLTEST(["ptcp:0:127.0.0.1"]) PARSE_LISTENING_PORT([ovsdb-server.log], [TCP_PORT]) WRONG_PORT_1=$((TCP_PORT + 101)) WRONG_PORT_2=$((TCP_PORT + 102)) @@ -203,7 +224,7 @@ m4_define([OVSDB_CHECK_IDL_TCP6_PY], AT_SKIP_IF([test "$IS_WIN32" = "yes"]) AT_SKIP_IF([test $HAVE_IPV6 = no]) AT_KEYWORDS([ovsdb server idl positive Python with tcp6 socket $5]) - AT_CHECK([ovsdb_start_idltest "ptcp:0:[[::1]]"]) + OVSDB_START_IDLTEST(["ptcp:0:[[::1]]"]) PARSE_LISTENING_PORT([ovsdb-server.log], [TCP_PORT]) echo "TCP_PORT=$TCP_PORT" @@ -221,7 +242,7 @@ m4_define([OVSDB_CHECK_IDL_TCP6_MULTIPLE_REMOTES_PY], AT_SKIP_IF([test "$IS_WIN32" = "yes"]) AT_SKIP_IF([test $HAVE_IPV6 = no]) AT_KEYWORDS([ovsdb server idl positive Python with tcp6 socket $5]) - AT_CHECK([ovsdb_start_idltest "ptcp:0:[[::1]]"]) + OVSDB_START_IDLTEST(["ptcp:0:[[::1]]"]) PARSE_LISTENING_PORT([ovsdb-server.log], [TCP_PORT]) 
WRONG_PORT_1=$((TCP_PORT + 101)) WRONG_PORT_2=$((TCP_PORT + 102)) @@ -266,7 +287,10 @@ m4_define([OVSDB_CHECK_IDL_SSL_PY], [0], [stdout], [ignore]) AT_CHECK([sort stdout | uuidfilt]m4_if([$6],,, [[| $6]]), [0], [$4]) - OVSDB_SERVER_SHUTDOWN + OVSDB_SERVER_SHUTDOWN([" + /unexpected SSL connection close/d + /Protocol error/d + "]) AT_CLEANUP]) m4_define([OVSDB_CHECK_IDL], @@ -287,13 +311,13 @@ m4_define([OVSDB_CHECK_IDL_PASSIVE_TCP_PY], [AT_SETUP([$1 - Python3 - ptcp]) AT_KEYWORDS([ovsdb server idl positive Python with tcp socket $5]) # find free TCP port - AT_CHECK([ovsdb_start_idltest "ptcp:0:127.0.0.1"]) + OVSDB_START_IDLTEST(["ptcp:0:127.0.0.1"]) PARSE_LISTENING_PORT([ovsdb-server.log], [TCP_PORT]) OVSDB_SERVER_SHUTDOWN rm -f db # start OVSDB server in passive mode - AT_CHECK([ovsdb_start_idltest "tcp:127.0.0.1:$TCP_PORT"]) + OVSDB_START_IDLTEST(["tcp:127.0.0.1:$TCP_PORT"]) AT_CHECK([$PYTHON3 $srcdir/test-ovsdb.py -t10 idl_passive $srcdir/idltest.ovsschema ptcp:127.0.0.1:$TCP_PORT $3], [0], [stdout], [ignore]) AT_CHECK([sort stdout | uuidfilt]m4_if([$6],,, [[| $6]]), @@ -473,7 +497,7 @@ OVSDB_CHECK_IDL([simple idl, writing via IDL with unicode], m4_define([OVSDB_CHECK_IDL_PY_WITH_EXPOUT], [AT_SETUP([$1 - Python3]) AT_KEYWORDS([ovsdb server idl positive Python $5]) - AT_CHECK([ovsdb_start_idltest]) + OVSDB_START_IDLTEST m4_if([$2], [], [], [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])]) AT_CHECK([$PYTHON3 $srcdir/test-ovsdb.py -t10 idl $srcdir/idltest.ovsschema unix:socket $3], @@ -576,9 +600,9 @@ OVSDB_CHECK_IDL([simple idl, conditional, false condition], "b": true}}]']], [['condition simple []' \ 'condition simple [true]']], - [[000: change conditions + [[000: simple: change conditions 001: empty -002: change conditions +002: simple: change conditions 003: table simple: i=1 r=2 b=true s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> 004: done ]]) @@ -592,13 +616,40 @@ OVSDB_CHECK_IDL([simple idl, conditional, true condition], "b": 
true}}]']], [['condition simple []' \ 'condition simple [true]']], - [[000: change conditions + [[000: simple: change conditions 001: empty -002: change conditions +002: simple: change conditions 003: table simple: i=1 r=2 b=true s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> 004: done ]]) +dnl This test ensures that the first explicitly set monitor condition +dnl is sent to the server. +OVSDB_CHECK_IDL([simple idl, conditional, wait for condition], + [], + [['["idltest", + {"op": "insert", + "table": "simple", + "row": {"i": 1, + "r": 2.0, + "b": true}}]' \ + 'condition simple [true]' \ + '^["idltest", + {"op": "insert", + "table": "simple", + "row": {"i": 2, + "r": 4.0, + "b": true}}]']], + [[000: empty +001: {"error":null,"result":[{"uuid":["uuid","<0>"]}]} +002: table simple: i=1 r=2 b=true s= u=<1> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<0> +003: simple: conditions unchanged +004: {"error":null,"result":[{"uuid":["uuid","<2>"]}]} +005: table simple: i=1 r=2 b=true s= u=<1> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<0> +005: table simple: i=2 r=4 b=true s= u=<1> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<2> +006: done +]]) + OVSDB_CHECK_IDL([simple idl, conditional, multiple clauses in condition], [['["idltest", {"op": "insert", @@ -613,9 +664,9 @@ OVSDB_CHECK_IDL([simple idl, conditional, multiple clauses in condition], "b": true}}]']], [['condition simple []' \ 'condition simple [["i","==",1],["i","==",2]]']], - [[000: change conditions + [[000: simple: change conditions 001: empty -002: change conditions +002: simple: change conditions 003: table simple: i=1 r=2 b=true s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> 003: table simple: i=2 r=3 b=true s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<2> 004: done @@ -630,9 +681,9 @@ OVSDB_CHECK_IDL([simple idl, conditional, modify as insert due to condition], "b": true}}]']], [['condition simple []' \ 'condition simple [["i","==",1]]']], - [[000: change conditions + [[000: simple: change conditions 001: empty -002: change 
conditions +002: simple: change conditions 003: table simple: i=1 r=2 b=true s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> 004: done ]]) @@ -653,11 +704,11 @@ OVSDB_CHECK_IDL([simple idl, conditional, modify as delete due to condition], "row": {"i": 2, "r": 3.0, "b": true}}]']], - [[000: change conditions + [[000: simple: change conditions 001: empty -002: change conditions +002: simple: change conditions 003: table simple: i=1 r=2 b=true s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> -004: change conditions +004: simple: change conditions 005: empty 006: {"error":null,"result":[{"uuid":["uuid","<2>"]}]} 007: table simple: i=2 r=3 b=true s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<2> @@ -688,14 +739,16 @@ OVSDB_CHECK_IDL([simple idl, conditional, multiple tables], "table": "link2", "row": {"i": 3}, "uuid-name": "row0"}]']], - [[000: change conditions + [[000: link1: change conditions +000: link2: change conditions +000: simple: change conditions 001: empty -002: change conditions +002: simple: change conditions 003: table simple: i=1 r=2 b=true s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> -004: change conditions +004: link1: change conditions 005: table link1: i=0 k=0 ka=[] l2= uuid=<2> 005: table simple: i=1 r=2 b=true s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> -006: change conditions +006: link2: change conditions 007: {"error":null,"result":[{"uuid":["uuid","<3>"]}]} 008: table link1: i=0 k=0 ka=[] l2= uuid=<2> 008: table link2: i=3 l1= uuid=<3> @@ -703,6 +756,31 @@ OVSDB_CHECK_IDL([simple idl, conditional, multiple tables], 009: done ]]) +OVSDB_CHECK_IDL([indexed idl, modification and removal], + [], + [['["idltest", + {"op": "insert", + "table": "indexed", + "row": {"i": 123 }}]' \ + '["idltest", + {"op": "update", + "table": "indexed", + "where": [["i", "==", 123]], + "row": {"i": 456}}]' \ + '["idltest", + {"op": "delete", + "table": "indexed", + "where": [["i", "==", 456]]}]']], + [[000: empty +001: 
{"error":null,"result":[{"uuid":["uuid","<0>"]}]} +002: table indexed: i=123 uuid=<0> +003: {"error":null,"result":[{"count":1}]} +004: table indexed: i=456 uuid=<0> +005: {"error":null,"result":[{"count":1}]} +006: empty +007: done +]]) + OVSDB_CHECK_IDL([self-linking idl, consistent ops], [], [['["idltest", @@ -961,7 +1039,7 @@ AT_KEYWORDS([ovsdb server idl positive]) # table link2 and column l2 have been deleted. But the IDL still # expects them to be there, so this test checks that it properly # tolerates them being missing. -AT_CHECK([ovsdb_start_idltest "" "$abs_srcdir/idltest2.ovsschema"]) +OVSDB_START_IDLTEST([], ["$abs_srcdir/idltest2.ovsschema"]) AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 idl unix:socket ['["idltest", {"op": "insert", "table": "link1", @@ -1034,7 +1112,7 @@ AT_CLEANUP m4_define([OVSDB_CHECK_IDL_FETCH_COLUMNS_PY], [AT_SETUP([$1 - Python3 - fetch]) AT_KEYWORDS([ovsdb server idl positive Python increment fetch $6]) - AT_CHECK([ovsdb_start_idltest]) + OVSDB_START_IDLTEST m4_if([$2], [], [], [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])]) AT_CHECK([$PYTHON3 $srcdir/test-ovsdb.py -t10 idl $srcdir/idltest.ovsschema unix:socket [$3] $4], @@ -1075,10 +1153,23 @@ OVSDB_CHECK_IDL_FETCH_COLUMNS([simple idl, initially populated], 003: done ]]) +m4_define([OVSDB_CHECK_IDL_WO_MONITOR_COND_C], + [AT_SETUP([$1 - C]) + AT_KEYWORDS([ovsdb server idl monitor $4]) + OVSDB_START_IDLTEST + AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/disable-monitor-cond]) + + AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 idl unix:socket $2], + [0], [stdout], [ignore]) + AT_CHECK([sort stdout | uuidfilt]m4_if([$5],,, [[| $5]]), + [0], [$3]) + OVSDB_SERVER_SHUTDOWN + AT_CLEANUP]) + m4_define([OVSDB_CHECK_IDL_WO_MONITOR_COND_PY], [AT_SETUP([$1 - Python3]) AT_KEYWORDS([ovsdb server idl Python monitor $4]) - AT_CHECK([ovsdb_start_idltest]) + OVSDB_START_IDLTEST AT_CHECK([ovs-appctl -t 
ovsdb-server ovsdb-server/disable-monitor-cond]) AT_CHECK([$PYTHON3 $srcdir/test-ovsdb.py -t10 idl $srcdir/idltest.ovsschema unix:socket $2], [0], [stdout], [ignore]) @@ -1088,7 +1179,8 @@ m4_define([OVSDB_CHECK_IDL_WO_MONITOR_COND_PY], AT_CLEANUP]) m4_define([OVSDB_CHECK_IDL_WO_MONITOR_COND], - [OVSDB_CHECK_IDL_WO_MONITOR_COND_PY($@)]) + [OVSDB_CHECK_IDL_WO_MONITOR_COND_C($@) + OVSDB_CHECK_IDL_WO_MONITOR_COND_PY($@)]) OVSDB_CHECK_IDL_WO_MONITOR_COND([simple idl disable monitor-cond], @@ -1171,7 +1263,7 @@ OVSDB_CHECK_IDL_WO_MONITOR_COND([simple idl disable monitor-cond], m4_define([OVSDB_CHECK_IDL_TRACK_C], [AT_SETUP([$1 - C]) AT_KEYWORDS([ovsdb server idl tracking positive $5]) - AT_CHECK([ovsdb_start_idltest]) + OVSDB_START_IDLTEST m4_if([$2], [], [], [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])]) AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 -c idl unix:socket $3], @@ -1184,10 +1276,10 @@ m4_define([OVSDB_CHECK_IDL_TRACK_C], m4_define([OVSDB_CHECK_IDL_TRACK_WRITE_CHANGED_ONLY_C], [AT_SETUP([$1 - write-changed-only - C]) AT_KEYWORDS([ovsdb server idl tracking positive $5]) - AT_CHECK([ovsdb_start_idltest]) + OVSDB_START_IDLTEST m4_if([$2], [], [], [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])]) - AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 -c -w idl unix:socket $3], + AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 -c --write-changed-only idl unix:socket $3], [0], [stdout], [ignore]) AT_CHECK([sort stdout | uuidfilt]m4_if([$6],,, [[| $6]]), [0], [$4]) @@ -1230,6 +1322,33 @@ OVSDB_CHECK_IDL_TRACK([track, simple idl, initially populated], 003: done ]]) +OVSDB_CHECK_IDL_TRACK([track, indexed idl, modification and removal], + [], + [['["idltest", + {"op": "insert", + "table": "indexed", + "row": {"i": 123 }}]' \ + '["idltest", + {"op": "update", + "table": "indexed", + "where": [["i", "==", 123]], + "row": {"i": 456}}]' \ + 
'["idltest", + {"op": "delete", + "table": "indexed", + "where": [["i", "==", 456]]}]']], + [[000: empty +001: {"error":null,"result":[{"uuid":["uuid","<0>"]}]} +002: table indexed: inserted row: i=123 uuid=<0> +002: table indexed: updated columns: i +003: {"error":null,"result":[{"count":1}]} +004: table indexed: i=456 uuid=<0> +004: table indexed: updated columns: i +005: {"error":null,"result":[{"count":1}]} +006: empty +007: done +]]) + dnl This test creates database with weak references and checks that orphan dnl rows created for weak references are not available for iteration via dnl list of tracked changes. @@ -1266,10 +1385,10 @@ OVSDB_CHECK_IDL_TRACK([track, simple idl, initially populated, orphan weak refer {"op": "delete", "table": "simple6", "where": []}]']], - [[000: change conditions + [[000: simple: change conditions 001: table simple6: inserted row: name=first_row weak_ref=[] uuid=<0> 001: table simple6: updated columns: name weak_ref -002: change conditions +002: simple: change conditions 003: table simple6: name=first_row weak_ref=[<1>] uuid=<0> 003: table simple: inserted row: i=0 r=0 b=false s=row1_s u=<2> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> 003: table simple: updated columns: s @@ -1308,19 +1427,19 @@ OVSDB_CHECK_IDL_TRACK([track, simple idl, initially populated, orphan rows, cond {"op": "delete", "table": "simple6", "where": []}]']], - [[000: change conditions + [[000: simple: change conditions 001: table simple6: inserted row: name=first_row weak_ref=[] uuid=<0> 001: table simple6: updated columns: name weak_ref -002: change conditions +002: simple: change conditions 003: table simple6: name=first_row weak_ref=[<1>] uuid=<0> 003: table simple: inserted row: i=0 r=0 b=false s=row0_s u=<2> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> 003: table simple: updated columns: s -004: change conditions +004: simple: change conditions 005: table simple6: name=first_row weak_ref=[] uuid=<0> 005: table simple: deleted row: i=0 r=0 b=false s=row0_s u=<2> 
ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> 005: table simple: inserted row: i=0 r=0 b=false s=row1_s u=<2> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<3> 005: table simple: updated columns: s -006: change conditions +006: simple: change conditions 007: table simple6: name=first_row weak_ref=[<1>] uuid=<0> 007: table simple: deleted row: i=0 r=0 b=false s=row1_s u=<2> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<3> 007: table simple: inserted row: i=0 r=0 b=false s=row0_s u=<2> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> @@ -1362,14 +1481,14 @@ OVSDB_CHECK_IDL_TRACK([track, simple idl, initially populated, references, condi {"op": "delete", "table": "simple6", "where": []}]']], - [[000: change conditions + [[000: simple: change conditions 001: table simple6: inserted row: name=first_row weak_ref=[] uuid=<0> 001: table simple6: updated columns: name weak_ref -002: change conditions +002: simple: change conditions 003: table simple6: name=first_row weak_ref=[<1>] uuid=<0> 003: table simple: inserted row: i=0 r=0 b=false s=row0_s u=<2> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> 003: table simple: updated columns: s -004: change conditions +004: simple: change conditions 005: table simple6: name=first_row weak_ref=[<3>] uuid=<0> 005: table simple: deleted row: i=0 r=0 b=false s=row0_s u=<2> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> 005: table simple: inserted row: i=1 r=0 b=false s=row1_s u=<2> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<3> @@ -1405,7 +1524,8 @@ OVSDB_CHECK_IDL_TRACK([track, simple idl, initially populated, references, singl {"op": "insert", "table": "simple", "row": {"s": "row0_s"}}]']], - [[000: change conditions + [[000: simple6: conditions unchanged +000: simple: conditions unchanged 001: table simple6: inserted row: name=row0_s6 weak_ref=[<0>] uuid=<1> 001: table simple6: updated columns: name weak_ref 001: table simple: inserted row: i=0 r=0 b=false s=row0_s u=<2> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<0> @@ -1421,6 +1541,56 @@ OVSDB_CHECK_IDL_TRACK([track, simple idl, initially populated, 
references, singl 006: done ]]) +OVSDB_CHECK_IDL_TRACK([track, simple idl, initially populated, weak references, insert+delete batch], + [['["idltest", + {"op": "insert", + "table": "simple", + "row": {"s": "row0_s"}, + "uuid-name": "uuid_row0_s"}, + {"op": "insert", + "table": "simple6", + "row": {"name": "row0_s6", + "weak_ref": ["set", + [["named-uuid", "uuid_row0_s"]] + ]}}]']], + [['condition simple [true];simple6 [true]' \ + '["idltest", + {"op": "insert", + "table": "simple", + "row": {"s": "row1_s"}, + "uuid-name": "uuid_row1_s"}, + {"op": "mutate", + "table": "simple6", + "where": [["name", "==", "row0_s6"]], + "mutations": [["weak_ref", "insert", ["set", [["named-uuid", "uuid_row1_s"]]]]]}]' \ + '+["idltest", + {"op": "delete", + "table": "simple", + "where": [["s", "==", "row1_s"]]}]' \ + '["idltest", + {"op": "insert", + "table": "simple", + "row": {"s": "row2_s"}}]']], + [[000: simple6: conditions unchanged +000: simple: conditions unchanged +001: table simple6: inserted row: name=row0_s6 weak_ref=[<0>] uuid=<1> +001: table simple6: updated columns: name weak_ref +001: table simple: inserted row: i=0 r=0 b=false s=row0_s u=<2> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<0> +001: table simple: updated columns: s +002: {"error":null,"result":[{"uuid":["uuid","<3>"]},{"count":1}]} +003: {"error":null,"result":[{"count":1}]} +004: table simple6: name=row0_s6 weak_ref=[<0>] uuid=<1> +004: table simple6: updated columns: weak_ref +004: table simple: inserted/deleted row: i=0 r=0 b=false s=row1_s u=<2> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<3> +004: table simple: updated columns: s +005: {"error":null,"result":[{"uuid":["uuid","<4>"]}]} +006: table simple6: name=row0_s6 weak_ref=[<0>] uuid=<1> +006: table simple: i=0 r=0 b=false s=row0_s u=<2> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<0> +006: table simple: inserted row: i=0 r=0 b=false s=row2_s u=<2> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<4> +006: table simple: updated columns: s +007: done +]]) + dnl This test checks that 
deleting both the destination and source of the dnl reference doesn't remove the reference in the source tracked record. OVSDB_CHECK_IDL_TRACK([track, simple idl, initially populated, weak references, multiple deletes], @@ -1447,7 +1617,8 @@ OVSDB_CHECK_IDL_TRACK([track, simple idl, initially populated, weak references, {"op": "insert", "table": "simple", "row": {"s": "row0_s"}}]']], - [[000: change conditions + [[000: simple6: conditions unchanged +000: simple: conditions unchanged 001: table simple6: inserted row: name=row0_s6 weak_ref=[<0>] uuid=<1> 001: table simple6: updated columns: name weak_ref 001: table simple: inserted row: i=0 r=0 b=false s=row0_s u=<2> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<0> @@ -1487,7 +1658,9 @@ OVSDB_CHECK_IDL_TRACK([track, simple idl, initially populated, strong references {"op": "insert", "table": "simple", "row": {"s": "row0_s"}}]']], - [[000: change conditions + [[000: simple3: conditions unchanged +000: simple4: conditions unchanged +000: simple: conditions unchanged 001: table simple3: inserted row: name=row0_s3 uset=[] uref=[<0>] uuid=<1> 001: table simple3: updated columns: name uref 001: table simple4: inserted row: name=row0_s4 uuid=<0> @@ -1522,12 +1695,14 @@ OVSDB_CHECK_IDL_TRACK([track, simple idl, initially populated, strong references {"op": "insert", "table": "simple", "row": {"s": "row0_s"}}]']], - [[000: change conditions + [[000: simple3: conditions unchanged +000: simple4: conditions unchanged +000: simple: conditions unchanged 001: table simple3: inserted row: name=row0_s3 uset=[] uref=[<0>] uuid=<1> 001: table simple3: updated columns: name uref 001: table simple4: inserted row: name=row0_s4 uuid=<0> 001: table simple4: updated columns: name -002: change conditions +002: simple4: change conditions 003: table simple3: name=row0_s3 uset=[] uref=[] uuid=<1> 003: table simple4: deleted row: name=row0_s4 uuid=<0> 004: {"error":null,"result":[{"uuid":["uuid","<2>"]}]} @@ -1558,10 +1733,12 @@ OVSDB_CHECK_IDL([simple 
idl, initially populated, strong references, conditional {"op": "insert", "table": "simple", "row": {"s": "row0_s"}}]']], - [[000: change conditions + [[000: simple3: conditions unchanged +000: simple4: conditions unchanged +000: simple: conditions unchanged 001: table simple3: name=row0_s3 uset=[] uref=[<0>] uuid=<1> 001: table simple4: name=row0_s4 uuid=<0> -002: change conditions +002: simple4: change conditions 003: table simple3: name=row0_s3 uset=[] uref=[] uuid=<1> 004: {"error":null,"result":[{"uuid":["uuid","<2>"]}]} 005: table simple3: name=row0_s3 uset=[] uref=[] uuid=<1> @@ -1679,7 +1856,7 @@ OVSDB_CHECK_IDL_TRACK([track, simple idl, initially empty, various ops], m4_define([OVSDB_CHECK_IDL_PARTIAL_UPDATE_MAP_COLUMN], [AT_SETUP([$1 - C]) AT_KEYWORDS([ovsdb server idl partial update map column positive $5]) - AT_CHECK([ovsdb_start_idltest]) + OVSDB_START_IDLTEST m4_if([$2], [], [], [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])]) AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 -c idl-partial-update-map-column unix:socket $3], @@ -1740,7 +1917,7 @@ OVSDB_CHECK_IDL_PY([partial-map update set refmap idl], m4_define([OVSDB_CHECK_IDL_PARTIAL_UPDATE_SET_COLUMN], [AT_SETUP([$1 - C]) AT_KEYWORDS([ovsdb server idl partial update set column positive $5]) - AT_CHECK([ovsdb_start_idltest]) + OVSDB_START_IDLTEST m4_if([$2], [], [], [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])]) AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 -c idl-partial-update-set-column unix:socket $3], @@ -1920,6 +2097,36 @@ OVSDB_CHECK_IDL_NOTIFY([simple idl verify notify], 015: done ]]) +OVSDB_CHECK_IDL_NOTIFY([indexed idl, modification and removal notify], + [['track-notify' \ + '["idltest", + {"op": "insert", + "table": "indexed", + "row": {"i": 123 }}]' \ + '["idltest", + {"op": "update", + "table": "indexed", + "where": [["i", "==", 123]], + "row": {"i": 456}}]' \ + '["idltest", + 
{"op": "delete", + "table": "indexed", + "where": [["i", "==", 456]]}]']], + [[000: empty +000: event:create, row={}, uuid=<0>, updates=None +000: event:create, row={}, uuid=<1>, updates=None +001: {"error":null,"result":[{"uuid":["uuid","<2>"]}]} +002: event:create, row={i=123}, uuid=<2>, updates=None +002: table indexed: i=123 uuid=<2> +003: {"error":null,"result":[{"count":1}]} +004: event:update, row={i=456}, uuid=<2>, updates={i=123} +004: table indexed: i=456 uuid=<2> +005: {"error":null,"result":[{"count":1}]} +006: empty +006: event:delete, row={i=456}, uuid=<2>, updates=None +007: done +]]) + # Tests to verify the functionality of the one column compound index. # It tests index for one column string and integer indexes. # The run of test-ovsdb generates the output of the display of data using the different indexes defined in @@ -1929,7 +2136,7 @@ OVSDB_CHECK_IDL_NOTIFY([simple idl verify notify], m4_define([OVSDB_CHECK_IDL_COMPOUND_INDEX_SINGLE_COLUMN_C], [AT_SETUP([$1 - C]) AT_KEYWORDS([ovsdb server idl compound_index_single_column compound_index positive $5]) - AT_CHECK([ovsdb_start_idltest]) + OVSDB_START_IDLTEST m4_if([$2], [], [], [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])]) # Generate the data to be tested. @@ -2076,7 +2283,7 @@ OVSDB_CHECK_IDL_COMPOUND_INDEX_SINGLE_COLUMN_C([Compound_index, single column te m4_define([OVSDB_CHECK_IDL_COMPOUND_INDEX_DOUBLE_COLUMN_C], [AT_SETUP([$1 - C]) AT_KEYWORDS([ovsdb server idl compound_index_double_column compound_index positive $5]) - AT_CHECK([ovsdb_start_idltest]) + OVSDB_START_IDLTEST m4_if([$2], [], [], [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])]) # Generate the data to be tested. 
@@ -2215,7 +2422,7 @@ OVSDB_CHECK_IDL_COMPOUND_INDEX_DOUBLE_COLUMN_C([Compound_index, double column te m4_define([OVSDB_CHECK_IDL_COMPOUND_INDEX_WITH_REF], [AT_SETUP([$1 - C]) AT_KEYWORDS([ovsdb server idl compound_index compound_index_with_ref positive $5]) - AT_CHECK([ovsdb_start_idltest]) + OVSDB_START_IDLTEST m4_if([$2], [], [], [AT_CHECK([ovsdb-client transact unix:socket $2], [0], [ignore], [ignore])]) AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 -c idl-compound-index-with-ref unix:socket $3], @@ -2242,14 +2449,29 @@ m4_define([CHECK_STREAM_OPEN_BLOCK], [AT_SETUP([Check stream open block - $1 - $3]) AT_SKIP_IF([test "$3" = "tcp6" && test "$IS_WIN32" = "yes"]) AT_SKIP_IF([test "$3" = "tcp6" && test "$HAVE_IPV6" = "no"]) + AT_SKIP_IF([test "$3" = "ssl6" && test "$IS_WIN32" = "yes"]) + AT_SKIP_IF([test "$3" = "ssl6" && test "$HAVE_IPV6" = "no"]) + AT_SKIP_IF([test "$3" = "ssl" && test "$HAVE_OPENSSL" = "no"]) + $PYTHON3 -c "import ssl" + SSL_PRESENT=$? + AT_SKIP_IF([test "$3" = "ssl" && test $SSL_PRESENT != 0]) + AT_SKIP_IF([test "$3" = "ssl6" && test "$HAVE_OPENSSL" = "no"]) + AT_SKIP_IF([test "$3" = "ssl6" && test $SSL_PRESENT != 0]) AT_KEYWORDS([ovsdb server stream open_block $3]) - AT_CHECK([ovsdb_start_idltest "ptcp:0:$4"]) + PKIDIR=$abs_top_builddir/tests + m4_define([PROTOCOL], [m4_substr([$3], [0], [3])]) + OVSDB_START_IDLTEST([m4_join([], [p], PROTOCOL, [:0:], $4)]) PARSE_LISTENING_PORT([ovsdb-server.log], [TCP_PORT]) WRONG_PORT=$(($TCP_PORT + 101)) - AT_CHECK([$2 tcp:$4:$TCP_PORT], [0], [ignore]) - AT_CHECK([$2 tcp:$4:$WRONG_PORT], [1], [ignore], [ignore]) - OVSDB_SERVER_SHUTDOWN - AT_CHECK([$2 tcp:$4:$TCP_PORT], [1], [ignore], [ignore]) + SSL_KEY_ARGS="$PKIDIR/testpki-privkey.pem $PKIDIR/testpki-cert.pem $PKIDIR/testpki-cacert.pem" + AT_CHECK([$2 PROTOCOL:$4:$TCP_PORT $SSL_KEY_ARGS], [0], [ignore]) + AT_CHECK([$2 PROTOCOL:$4:$WRONG_PORT $SSL_KEY_ARGS], [1], [ignore], + [ignore]) + OVSDB_SERVER_SHUTDOWN([" + /unexpected SSL 
connection close/d + /Protocol error/d + "]) + AT_CHECK([$2 PROTOCOL:$4:$TCP_PORT $SSL_KEY_ARGS], [1], [ignore], [ignore]) AT_CLEANUP]) CHECK_STREAM_OPEN_BLOCK([C], [test-stream], [tcp], [127.0.0.1]) @@ -2258,6 +2480,29 @@ CHECK_STREAM_OPEN_BLOCK([Python3], [$PYTHON3 $srcdir/test-stream.py], [tcp], [127.0.0.1]) CHECK_STREAM_OPEN_BLOCK([Python3], [$PYTHON3 $srcdir/test-stream.py], [tcp6], [[[::1]]]) +CHECK_STREAM_OPEN_BLOCK([C], [test-stream], [ssl], [127.0.0.1]) +CHECK_STREAM_OPEN_BLOCK([C], [test-stream], [ssl6], [[[::1]]]) +CHECK_STREAM_OPEN_BLOCK([Python3], [$PYTHON3 $srcdir/test-stream.py], + [ssl], [127.0.0.1]) +CHECK_STREAM_OPEN_BLOCK([Python3], [$PYTHON3 $srcdir/test-stream.py], + [ssl6], [[[::1]]]) + +dnl OVSDB_CLUSTER_CHECK_MONITOR_COND_SINCE_TXN_IDS(LOG) +dnl +dnl Looks up transaction IDs in the log of OVSDB client application. +dnl All-zero UUID should not be sent within a monitor request more than once, +dnl unless some database requests were lost (not replied). +m4_define([OVSDB_CLUSTER_CHECK_MONITOR_COND_SINCE_TXN_IDS], +[ + requests=$(grep -c 'send request' $1) + replies=$(grep -c 'received reply' $1) + + if test "$requests" -eq "$replies"; then + AT_CHECK([grep 'monitor_cond_since' $1 \ + | grep -c "00000000-0000-0000-0000-000000000000" | tr -d '\n'], + [0], [1]) + fi +]) # same as OVSDB_CHECK_IDL but uses Python IDL implementation with tcp # with multiple remotes to assert the idl connects to the leader of the Raft cluster @@ -2274,10 +2519,11 @@ m4_define([OVSDB_CHECK_IDL_LEADER_ONLY_PY], pids=$(cat s2.pid s3.pid s1.pid | tr '\n' ',') echo $pids AT_CHECK([$PYTHON3 $srcdir/test-ovsdb.py -t30 idl-cluster $srcdir/idltest.ovsschema $remotes $pids $3], - [0], [stdout], [ignore]) + [0], [stdout], [stderr]) remote=$(ovsdb_cluster_leader $remotes "idltest") leader=$(echo $remote | cut -d'|' -f 1) AT_CHECK([grep -F -- "${leader}" stdout], [0], [ignore]) + OVSDB_CLUSTER_CHECK_MONITOR_COND_SINCE_TXN_IDS([stderr]) AT_CLEANUP]) 
OVSDB_CHECK_IDL_LEADER_ONLY_PY([Check Python IDL connects to leader], 3, ['remote']) @@ -2320,6 +2566,7 @@ m4_define([OVSDB_CHECK_CLUSTER_IDL_C], AT_CHECK([sort stdout | uuidfilt]m4_if([$7],,, [[| $7]]), [0], [$5]) m4_ifval([$8], [AT_CHECK([grep '$8' stderr], [1])], [], []) + OVSDB_CLUSTER_CHECK_MONITOR_COND_SINCE_TXN_IDS([stderr]) AT_CLEANUP]) # Same as OVSDB_CHECK_CLUSTER_IDL_C but uses the Python IDL implementation. @@ -2340,6 +2587,7 @@ m4_define([OVSDB_CHECK_CLUSTER_IDL_PY], AT_CHECK([sort stdout | uuidfilt]m4_if([$7],,, [[| $7]]), [0], [$5]) m4_if([$8], [AT_CHECK([grep '$8' stderr], [1])], [], []) + OVSDB_CLUSTER_CHECK_MONITOR_COND_SINCE_TXN_IDS([stderr]) AT_CLEANUP]) m4_define([OVSDB_CHECK_CLUSTER_IDL], @@ -2370,11 +2618,11 @@ OVSDB_CHECK_CLUSTER_IDL([simple idl, monitor_cond_since, cluster disconnect], "table": "simple", "where": [["i", "==", 1]], "row": {"r": 2.0 }}]']], - [[000: change conditions + [[000: simple: change conditions 001: empty -002: change conditions +002: simple: change conditions 003: table simple: i=2 r=1 b=true s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> -004: change conditions +004: simple: change conditions 005: reconnect 006: table simple 007: {"error":null,"result":[{"count":1}]} @@ -2431,7 +2679,7 @@ reconnect.*waiting .* seconds before reconnect) AT_SETUP([idl table and column presence check]) AT_KEYWORDS([ovsdb server idl table column check]) -AT_CHECK([ovsdb_start_idltest "" "$abs_srcdir/idltest2.ovsschema"]) +OVSDB_START_IDLTEST([], ["$abs_srcdir/idltest2.ovsschema"]) AT_CHECK(ovsdb-tool create db2 $abs_srcdir/idltest.ovsschema) AT_CHECK(ovsdb-server -vconsole:warn --log-file=ovsdb-server2.log --detach dnl @@ -2555,3 +2803,178 @@ OVSDB_CHECK_IDL_TRACK([track, insert and delete, refs to link2], 005: table link2: i=1 l1= uuid=<1> 006: done ]]) + +m4_define([OVSDB_CHECK_IDL_PERS_UUID_INSERT_C], + [AT_SETUP([$1 - C]) + AT_KEYWORDS([idl persistent uuid insert]) + OVSDB_START_IDLTEST([], ["$abs_srcdir/idltest.ovsschema"]) + 
AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 idl unix:socket $2], + [0], [stdout], [stderr]) + AT_CHECK([sort stdout], + [0], [$3]) + AT_CHECK([grep $4 stderr], [0], [ignore]) + OVSDB_SERVER_SHUTDOWN + AT_CLEANUP]) + +m4_define([OVSDB_CHECK_IDL_PERS_UUID_INSERT_PY], + [AT_SETUP([$1 - Python3]) + AT_KEYWORDS([idl python persistent uuid insert]) + OVSDB_START_IDLTEST([], ["$abs_srcdir/idltest.ovsschema"]) + AT_CHECK([$PYTHON3 $srcdir/test-ovsdb.py -t10 idl $srcdir/idltest.ovsschema unix:socket $2], + [0], [stdout], [stderr]) + AT_CHECK([sort stdout], + [0], [$3]) + AT_CHECK([grep $4 stderr], [0], [ignore]) + OVSDB_SERVER_SHUTDOWN + AT_CLEANUP]) + + +m4_define([OVSDB_CHECK_IDL_PERS_UUID_INSERT], + [OVSDB_CHECK_IDL_PERS_UUID_INSERT_C($@) + OVSDB_CHECK_IDL_PERS_UUID_INSERT_PY($@)]) + +OVSDB_CHECK_IDL_PERS_UUID_INSERT([simple idl, persistent uuid insert], + [['insert_uuid c5cc12f8-eaa1-43a7-8a73-bccd18df2222 2, insert_uuid c5cc12f8-eaa1-43a7-8a73-bccd18df3333 3' \ + 'insert_uuid c5cc12f8-eaa1-43a7-8a73-bccd18df4444 4, insert_uuid c5cc12f8-eaa1-43a7-8a73-bccd18df2222 5' \ + 'insert_uuid c5cc12f8-eaa1-43a7-8a73-bccd18df4444 4' \ + 'delete 2' \ + 'insert_uuid c5cc12f8-eaa1-43a7-8a73-bccd18df2222 5' + ]], + [[000: empty +001: commit, status=success +002: table simple: i=2 r=0 b=false s= u=00000000-0000-0000-0000-000000000000 ia=[] ra=[] ba=[] sa=[] ua=[] uuid=c5cc12f8-eaa1-43a7-8a73-bccd18df2222 +002: table simple: i=3 r=0 b=false s= u=00000000-0000-0000-0000-000000000000 ia=[] ra=[] ba=[] sa=[] ua=[] uuid=c5cc12f8-eaa1-43a7-8a73-bccd18df3333 +003: commit, status=error +004: table simple: i=2 r=0 b=false s= u=00000000-0000-0000-0000-000000000000 ia=[] ra=[] ba=[] sa=[] ua=[] uuid=c5cc12f8-eaa1-43a7-8a73-bccd18df2222 +004: table simple: i=3 r=0 b=false s= u=00000000-0000-0000-0000-000000000000 ia=[] ra=[] ba=[] sa=[] ua=[] uuid=c5cc12f8-eaa1-43a7-8a73-bccd18df3333 +005: commit, status=success +006: table simple: i=2 r=0 b=false s= 
u=00000000-0000-0000-0000-000000000000 ia=[] ra=[] ba=[] sa=[] ua=[] uuid=c5cc12f8-eaa1-43a7-8a73-bccd18df2222 +006: table simple: i=3 r=0 b=false s= u=00000000-0000-0000-0000-000000000000 ia=[] ra=[] ba=[] sa=[] ua=[] uuid=c5cc12f8-eaa1-43a7-8a73-bccd18df3333 +006: table simple: i=4 r=0 b=false s= u=00000000-0000-0000-0000-000000000000 ia=[] ra=[] ba=[] sa=[] ua=[] uuid=c5cc12f8-eaa1-43a7-8a73-bccd18df4444 +007: commit, status=success +008: table simple: i=3 r=0 b=false s= u=00000000-0000-0000-0000-000000000000 ia=[] ra=[] ba=[] sa=[] ua=[] uuid=c5cc12f8-eaa1-43a7-8a73-bccd18df3333 +008: table simple: i=4 r=0 b=false s= u=00000000-0000-0000-0000-000000000000 ia=[] ra=[] ba=[] sa=[] ua=[] uuid=c5cc12f8-eaa1-43a7-8a73-bccd18df4444 +009: commit, status=success +010: table simple: i=3 r=0 b=false s= u=00000000-0000-0000-0000-000000000000 ia=[] ra=[] ba=[] sa=[] ua=[] uuid=c5cc12f8-eaa1-43a7-8a73-bccd18df3333 +010: table simple: i=4 r=0 b=false s= u=00000000-0000-0000-0000-000000000000 ia=[] ra=[] ba=[] sa=[] ua=[] uuid=c5cc12f8-eaa1-43a7-8a73-bccd18df4444 +010: table simple: i=5 r=0 b=false s= u=00000000-0000-0000-0000-000000000000 ia=[] ra=[] ba=[] sa=[] ua=[] uuid=c5cc12f8-eaa1-43a7-8a73-bccd18df2222 +011: done +]], + [['This UUID would duplicate a UUID already present within the table or deleted within the same transaction']]) + + +OVSDB_CHECK_IDL_PY([simple idl, python, add_op], + [], + [['insert 1, insert 2, insert 3, insert 1' \ + 'add_op {"op": "delete", "table": "simple", "where": [["i", "==", 1]]}' \ + 'add_op {"op": "insert", "table": "simple", "row": {"i": 2}}, delete 3' \ + 'insert 2, add_op {"op": "update", "table": "simple", "row": {"i": 1}, "where": [["i", "==", 2]]}' + ]], + [[000: empty +001: commit, status=success +002: table simple: i=1 r=0 b=false s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> +002: table simple: i=1 r=0 b=false s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<2> +002: table simple: i=2 r=0 b=false s= u=<0> ia=[] ra=[] ba=[] sa=[] 
ua=[] uuid=<3> +002: table simple: i=3 r=0 b=false s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<4> +003: commit, status=success +004: table simple: i=2 r=0 b=false s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<3> +004: table simple: i=3 r=0 b=false s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<4> +005: commit, status=success +006: table simple: i=2 r=0 b=false s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<3> +006: table simple: i=2 r=0 b=false s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<5> +007: commit, status=success +008: table simple: i=1 r=0 b=false s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<3> +008: table simple: i=1 r=0 b=false s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<5> +008: table simple: i=1 r=0 b=false s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<6> +009: done +]],[],sort) + + +m4_define([OVSDB_CHECK_IDL_CHANGE_AWARE], + [AT_SETUP([simple idl, database change aware, online conversion - $1]) + AT_KEYWORDS([ovsdb server idl db_change_aware conversion $1]) + + m4_if([$1], [clustered], + [OVSDB_CLUSTER_START_IDLTEST([1], [punix:socket])], + [OVSDB_START_IDLTEST]) + + dnl Add some data. + AT_CHECK([[ovsdb-client transact unix:socket '["idltest", + {"op": "insert", + "table": "simple", + "row": {"i": 1, + "r": 2.0, + "b": true, + "s": "first row", + "u": ["uuid", "84f5c8f5-ac76-4dbc-a24f-8860eb407fc1"], + "ia": ["set", [1, 2, 3]], + "ra": ["set", [-0.5]], + "ba": ["set", [true]], + "sa": ["set", ["abc", "def"]], + "ua": ["set", [["uuid", "69443985-7806-45e2-b35f-574a04e720f9"], + ["uuid", "aad11ef0-816a-4b01-93e6-03b8b4256b98"]]]}}, + {"op": "insert", + "table": "simple", + "row": {"b": false, "s": "second row"}}, + {"op": "insert", + "table": "simple", + "row": {"b": true, "s": "third row"}} + ]']], [0], [stdout]) + + dnl Create a new schema by adding 'extra_column' to the 'simple' table. 
+ AT_CHECK([sed 's/"ua": {/"extra_column":{"type": "string"},"ua": {/ + s/1.2.3/1.2.4/' \ + $abs_srcdir/idltest.ovsschema > new-idltest.ovsschema]) + dnl Try "needs-conversion". + AT_CHECK([ovsdb-client needs-conversion unix:socket $abs_srcdir/idltest.ovsschema], [0], [no +]) + AT_CHECK([ovsdb-client needs-conversion unix:socket new-idltest.ovsschema], [0], [yes +]) + + dnl Conditionally exclude the second row from monitoring. + m4_define([COND], [['condition simple [["b","==",true]]']]) + + dnl Start monitoring. + OVS_DAEMONIZE([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t30 \ + idl unix:socket COND monitor \ + >idl-c.out 2>idl-c.err], [idl-c.pid]) + AT_CAPTURE_FILE([idl-c.out]) + AT_CAPTURE_FILE([idl-c.err]) + + OVS_DAEMONIZE([$PYTHON3 $srcdir/test-ovsdb.py -t30 \ + idl $srcdir/idltest.ovsschema unix:socket COND monitor \ + >idl-python.out 2>idl-python.err], [idl-python.pid]) + AT_CAPTURE_FILE([idl-python.out]) + AT_CAPTURE_FILE([idl-python.err]) + + dnl Wait for monitors to receive the data. + OVS_WAIT_UNTIL([grep -q 'third row' idl-c.err]) + OVS_WAIT_UNTIL([grep -q 'third row' idl-python.err]) + + dnl Convert the database. + AT_CHECK([ovsdb-client convert unix:socket new-idltest.ovsschema]) + + dnl Check for the monitor cancellation and the data being requested again. + m4_foreach([FILE], [[idl-c], [idl-python]], + [OVS_WAIT_UNTIL([grep -q 'monitor_canceled' FILE.err]) + OVS_WAIT_UNTIL([test 2 -eq $(grep -c 'send request, method="monitor_cond_since", params=."idltest"' FILE.err)]) + + dnl XXX: Checking for the new schema bits conditionally because standalone + dnl databases are not updating the schema in the _Server database properly. + m4_if([$1], [clustered], [OVS_WAIT_UNTIL([grep -q 'extra_column' FILE.err])]) + + dnl Check that there were no unexpected messages. + AT_CHECK([! grep 'unexpected' FILE.err]) + + dnl Check that the data is received twice and the condition is working. 
+ AT_CHECK([sort FILE.out | uuidfilt], [0], +[[000: simple: change conditions +001: table simple: i=0 r=0 b=true s=third row u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> +001: table simple: i=1 r=2 b=true s=first row u=<2> ia=[1 2 3] ra=[-0.5] ba=[true] sa=[abc def] ua=[<3> <4>] uuid=<5> +002: table simple: i=0 r=0 b=true s=third row u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1> +002: table simple: i=1 r=2 b=true s=first row u=<2> ia=[1 2 3] ra=[-0.5] ba=[true] sa=[abc def] ua=[<3> <4>] uuid=<5> +]])]) + AT_CLEANUP]) + +OVSDB_CHECK_IDL_CHANGE_AWARE([standalone]) +OVSDB_CHECK_IDL_CHANGE_AWARE([clustered]) diff --git a/tests/ovsdb-lock.at b/tests/ovsdb-lock.at index a3acd2f27a0..6bc24730273 100644 --- a/tests/ovsdb-lock.at +++ b/tests/ovsdb-lock.at @@ -12,8 +12,8 @@ m4_define([OVSDB_CHECK_LOCK_SETUP], AT_KEYWORDS([ovsdb lock $2]) ordinal_schema > schema AT_CHECK([ovsdb-tool create db schema], [0], [stdout], [ignore]) - AT_CAPTURE_FILE([ovsdb-server-log]) - AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file="`pwd`"/ovsdb-server-log db >/dev/null 2>&1], [0], [], [])]) + AT_CAPTURE_FILE([ovsdb-server.log]) + AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file db], [0], [ignore], [ignore])]) # # Two sessions create two locks. 
Both sessions should be able to get their @@ -23,7 +23,7 @@ AT_CHECK([ovsdb-client --detach --no-chdir lock unix:socket lock0 >c1-output 2>& [0], [], []) AT_CHECK([ovsdb-client --detach --no-chdir lock unix:socket lock1 >c2-output 2>&1], [0], [], []) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN AT_CHECK([cat c1-output], 0, [{"locked":true} ], []) AT_CHECK([cat c2-output], 0, [{"locked":true} @@ -40,7 +40,7 @@ AT_CHECK([ovsdb-client --detach --no-chdir --pidfile lock unix:socket lock0 >c1- AT_CHECK([ovsdb-client --detach --no-chdir lock unix:socket lock0 >c2-output 2>&1], [0], [], []) AT_CHECK([ovs-appctl -t ovsdb-client unlock lock0], [0], [], []) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN AT_CHECK([cat c1-output], 0, [{"locked":true} {} ]) @@ -60,7 +60,7 @@ AT_CHECK([ovsdb-client --detach --no-chdir lock unix:socket lock0 >c1-output 2>& AT_CHECK([ovsdb-client --detach --no-chdir --pidfile steal unix:socket lock0 >c2-output 2>&1], [0], [], []) AT_CHECK([ovs-appctl -t ovsdb-client unlock lock0], [0], [], []) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN AT_CHECK([cat c1-output], 0, [{"locked":true} stolen [["lock0"]] diff --git a/tests/ovsdb-macros.at b/tests/ovsdb-macros.at index 0f8e4bd20b3..503b8b722e5 100644 --- a/tests/ovsdb-macros.at +++ b/tests/ovsdb-macros.at @@ -13,6 +13,18 @@ m4_define([OVSDB_INIT], "row": {}}]']], [0], [ignore], [ignore])]) +dnl OVSDB_SERVER_SHUTDOWN([ALLOWLIST]) +dnl +dnl Gracefully stops ovsdb-server, checking log files for messages with +dnl severity WARN or higher and signaling an error if any is present. 
+dnl The optional ALLOWLIST may contain shell-quoted "sed" commands to +dnl delete any warnings that are actually expected, e.g.: +dnl +dnl OVSDB_SERVER_SHUTDOWN(["/expected error/d"]) +m4_define([OVSDB_SERVER_SHUTDOWN], + [AT_CHECK([check_logs $1]) + OVS_APP_EXIT_AND_WAIT_BY_TARGET([ovsdb-server], [ovsdb-server.pid])]) + # OVSDB_CHECK_POSITIVE(TITLE, TEST-OVSDB-ARGS, OUTPUT, [KEYWORDS], [PREREQ]) # # Runs "test-ovsdb TEST-OVSDB-ARGS" and checks that it exits with diff --git a/tests/ovsdb-monitor.at b/tests/ovsdb-monitor.at index 3b622b3ec05..3e1df18a112 100644 --- a/tests/ovsdb-monitor.at +++ b/tests/ovsdb-monitor.at @@ -28,7 +28,7 @@ ovsdb_check_monitor () { for txn in ${1+"$@"} '[["'$db'"]]'; do AT_CHECK([ovsdb-client transact unix:socket "$txn"], [0], [ignore], [ignore]) done - OVS_APP_EXIT_AND_WAIT_BY_TARGET([ovsdb-server], [ovsdb-server.pid]) + OVSDB_SERVER_SHUTDOWN OVS_WAIT_UNTIL([test ! -e ovsdb-client.pid]) AT_CHECK_UNQUOTED([$PYTHON3 $srcdir/ovsdb-monitor-sort.py < output | uuidfilt], [0], [$output], [ignore]) } @@ -88,10 +88,10 @@ m4_define([OVSDB_CHECK_MONITOR_COND], for txn in m4_foreach([txn], [$3], ['txn' ]); do AT_CHECK([ovsdb-tool transact db "$txn"], [0], [ignore], [ignore]) done - AT_CAPTURE_FILE([ovsdb-server-log]) - AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file="`pwd`"/ovsdb-server-log db >/dev/null 2>&1]) + AT_CAPTURE_FILE([ovsdb-server.log]) + AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file db], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-server.pid`' - AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond --format=csv unix:socket $4 '[$8]' $5 $9 > output], + AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond --format=csv unix:socket $4 '[$8]' $5 $9 > output 2> ovsdb-client.stderr], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-client.pid`' for txn in m4_foreach([txn], [$6], ['txn' ]); do @@ 
-103,7 +103,7 @@ m4_define([OVSDB_CHECK_MONITOR_COND], done AT_CHECK([ovsdb-client transact unix:socket '[["$4"]]'], [0], [ignore], [ignore]) - AT_CHECK([ovs-appctl -t ovsdb-server -e exit], [0], [ignore], [ignore]) + OVSDB_SERVER_SHUTDOWN OVS_WAIT_UNTIL([test ! -e ovsdb-server.pid && test ! -e ovsdb-client.pid]) AT_CHECK([$PYTHON3 $srcdir/ovsdb-monitor-sort.py < output | uuidfilt], [0], [$7], [ignore]) AT_CLEANUP]) @@ -586,6 +586,7 @@ row,action,name,number,_version [[]], [], [[[[["name","==","one"],["name","==","two"]]]], + [[[["name","==","two"],["name","==","one"]]]], [[[["name","==","one"]]]], [[[false]]], [[[true]]]]) @@ -595,9 +596,9 @@ AT_SETUP(monitor-cond-change with many sessions pending) AT_KEYWORDS([ovsdb server monitor monitor-cond negative]) ordinal_schema > schema AT_CHECK([ovsdb-tool create db schema], [0], [stdout], [ignore]) -AT_CAPTURE_FILE([ovsdb-server-log]) +AT_CAPTURE_FILE([ovsdb-server.log]) -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file="`pwd`"/ovsdb-server-log db >/dev/null 2>&1]) +AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file db], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-server.pid`' for txn in m4_foreach([txn], [[[["ordinals", {"op": "insert", @@ -619,14 +620,14 @@ done cond='[[["name","==","ten"]]]' for i in `seq 1 990`; do - AT_CHECK([ovsdb-client -vjsonrpc --pidfile=ovsdb-client$i.pid --detach --no-chdir -d json monitor-cond --format=csv unix:socket ordinals $cond ordinals ["name"]], [0], [ignore], [ignore]) + AT_CHECK([ovsdb-client -vjsonrpc --pidfile=ovsdb-client$i.pid --detach --no-chdir -d json monitor-cond --format=csv unix:socket ordinals $cond ordinals ["name"] >ovsdb-client$i.out 2>&1], [0], [ignore], [ignore]) done -AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond --format=csv unix:socket ordinals $cond ordinals ["name"] > output], +AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json 
monitor-cond --format=csv unix:socket ordinals $cond ordinals ["name"] > output 2> ovsdb-client.stderr], [0], [ignore], [ignore]) for i in `seq 991 1000`; do - AT_CHECK([ovsdb-client -vjsonrpc --pidfile=ovsdb-client$i.pid --detach --no-chdir -d json monitor-cond --format=csv unix:socket ordinals $cond ordinals ["name"]], [0], [ignore], [ignore]) + AT_CHECK([ovsdb-client -vjsonrpc --pidfile=ovsdb-client$i.pid --detach --no-chdir -d json monitor-cond --format=csv unix:socket ordinals $cond ordinals ["name"] >ovsdb-client$i.out 2>&1 ], [0], [ignore], [ignore]) done for txn in m4_foreach([txn], [[[["ordinals", @@ -647,7 +648,7 @@ sleep 1 AT_CHECK([ovsdb-client transact unix:socket '[["ordinals"]]'], [0], [ignore], [ignore]) -AT_CHECK([ovs-appctl -t ovsdb-server -e exit], [0], [ignore], [ignore]) +OVSDB_SERVER_SHUTDOWN("/Too many open files/d") OVS_WAIT_UNTIL([test ! -e ovsdb-server.pid && test ! -e ovsdb-client.pid]) AT_CHECK([$PYTHON3 $srcdir/ovsdb-monitor-sort.py < output | uuidfilt], [0], [[row,action,name <0>,insert,"""ten""" @@ -666,8 +667,8 @@ AT_SETUP([monitor-cond-since not found]) AT_KEYWORDS([ovsdb server monitor monitor-cond-since positive]) ordinal_schema > schema AT_CHECK([ovsdb-tool create-cluster db schema unix:db.raft], [0], [stdout], [ignore]) -AT_CAPTURE_FILE([ovsdb-server-log]) -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file="`pwd`"/ovsdb-server-log db >/dev/null 2>&1]) +AT_CAPTURE_FILE([ovsdb-server.log]) +AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file db], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-server.pid`' for txn in m4_foreach([txn], [[[["ordinals", {"op": "insert", @@ -684,7 +685,7 @@ done # Omitting the last_id parameter in ovsdb-client monitor-cond-since command # will by default using all zero uuid, which doesn't exist in any history txn. 
-AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals '[[["name","==","one"],["name","==","ten"]]]' ordinals > output], +AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals '[[["name","==","one"],["name","==","ten"]]]' ordinals > output 2> ovsdb-client.stderr], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-client.pid`' for txn in m4_foreach([txn], [[[["ordinals", @@ -699,7 +700,7 @@ for txn in m4_foreach([txn], [[[["ordinals", done AT_CHECK([ovsdb-client transact unix:socket '[["ordinals"]]'], [0], [ignore], [ignore]) -AT_CHECK([ovs-appctl -t ovsdb-server -e exit], [0], [ignore], [ignore]) +OVSDB_SERVER_SHUTDOWN OVS_WAIT_UNTIL([test ! -e ovsdb-server.pid && test ! -e ovsdb-client.pid]) AT_CHECK([$PYTHON3 $srcdir/ovsdb-monitor-sort.py < output | uuidfilt], [0], [[found: false, last_id: <0> @@ -720,8 +721,8 @@ AT_SETUP([monitor-cond-since db restart]) AT_KEYWORDS([ovsdb server monitor monitor-cond-since positive]) ordinal_schema > schema AT_CHECK([ovsdb-tool create-cluster db schema unix:db.raft], [0], [stdout], [ignore]) -AT_CAPTURE_FILE([ovsdb-server-log]) -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file="`pwd`"/ovsdb-server-log db >/dev/null 2>&1]) +AT_CAPTURE_FILE([ovsdb-server.log]) +AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file db], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-server.pid`' for txn in m4_foreach([txn], [[[["ordinals", {"op": "insert", @@ -736,19 +737,18 @@ for txn in m4_foreach([txn], [[[["ordinals", AT_CHECK([ovsdb-client transact unix:socket "$txn"], [0], [ignore], [ignore]) done -AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals '[[["name","==","one"],["name","==","ten"]]]' ordinals > output], +AT_CHECK([ovsdb-client -vjsonrpc --pidfile 
--detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals '[[["name","==","one"],["name","==","ten"]]]' ordinals > output 2> ovsdb-client.stderr], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-client.pid`' OVS_WAIT_UNTIL([grep last_id output]) -kill `cat ovsdb-client.pid` -kill `cat ovsdb-server.pid` +OVSDB_SERVER_SHUTDOWN OVS_WAIT_UNTIL([test ! -e ovsdb-server.pid && test ! -e ovsdb-client.pid]) # Remember the last_id, which will be used for monitor-cond-since later. last_id=`grep last_id output | awk '{print $4}'` -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file="`pwd`"/ovsdb-server-log db >/dev/null 2>&1]) +AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file db], [0], [ignore], [ignore]) # Some new changes made to db after restarting the server. for txn in m4_foreach([txn], [[[["ordinals", @@ -763,12 +763,12 @@ for txn in m4_foreach([txn], [[[["ordinals", done # Use last_id to monitor and get only the new changes. -AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals $last_id '[[["name","==","one"],["name","==","ten"]]]' ordinals > output], +AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals $last_id '[[["name","==","one"],["name","==","ten"]]]' ordinals > output 2> ovsdb-client.stderr], [0], [ignore], [ignore]) AT_CHECK([ovsdb-client transact unix:socket '[["ordinals"]]'], [0], [ignore], [ignore]) -AT_CHECK([ovs-appctl -t ovsdb-server -e exit], [0], [ignore], [ignore]) +OVSDB_SERVER_SHUTDOWN OVS_WAIT_UNTIL([test ! -e ovsdb-server.pid && test ! 
-e ovsdb-client.pid]) AT_CHECK([$PYTHON3 $srcdir/ovsdb-monitor-sort.py < output | uuidfilt], [0], [[found: true, last_id: <0> @@ -784,8 +784,8 @@ AT_SETUP([monitor-cond-since found but no new rows]) AT_KEYWORDS([ovsdb server monitor monitor-cond-since positive]) ordinal_schema > schema AT_CHECK([ovsdb-tool create-cluster db schema unix:db.raft], [0], [stdout], [ignore]) -AT_CAPTURE_FILE([ovsdb-server-log]) -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file="`pwd`"/ovsdb-server-log db >/dev/null 2>&1]) +AT_CAPTURE_FILE([ovsdb-server.log]) +AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file db], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-server.pid`' for txn in m4_foreach([txn], [[[["ordinals", {"op": "insert", @@ -799,7 +799,7 @@ for txn in m4_foreach([txn], [[[["ordinals", "row": {"number": 2, "name": "two"}}]]]], ['txn' ]); do AT_CHECK([ovsdb-client transact unix:socket "$txn"], [0], [ignore], [ignore]) done -AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals '[[["name","==","one"],["name","==","ten"]]]' ordinals > output], +AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals '[[["name","==","one"],["name","==","ten"]]]' ordinals > output 2> ovsdb-client.stderr], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-client.pid`' OVS_WAIT_UNTIL([grep last_id output]) @@ -807,12 +807,12 @@ OVS_WAIT_UNTIL([grep last_id output]) kill `cat ovsdb-client.pid` OVS_WAIT_UNTIL([test ! 
-e ovsdb-client.pid]) last_id=`grep last_id output | awk '{print $4}'` -AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals $last_id '[[["name","==","one"],["name","==","ten"]]]' ordinals > output], +AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals $last_id '[[["name","==","one"],["name","==","ten"]]]' ordinals > output 2> ovsdb-client.stderr], [0], [ignore], [ignore]) AT_CHECK([ovsdb-client transact unix:socket '[["ordinals"]]'], [0], [ignore], [ignore]) -AT_CHECK([ovs-appctl -t ovsdb-server -e exit], [0], [ignore], [ignore]) +OVSDB_SERVER_SHUTDOWN OVS_WAIT_UNTIL([test ! -e ovsdb-server.pid && test ! -e ovsdb-client.pid]) AT_CHECK([$PYTHON3 $srcdir/ovsdb-monitor-sort.py < output | uuidfilt], [0], [[found: true, last_id: <0> @@ -825,17 +825,17 @@ AT_SETUP([monitor-cond-since empty db]) AT_KEYWORDS([ovsdb server monitor monitor-cond-since positive]) ordinal_schema > schema AT_CHECK([ovsdb-tool create-cluster db schema unix:db.raft], [0], [stdout], [ignore]) -AT_CAPTURE_FILE([ovsdb-server-log]) -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file="`pwd`"/ovsdb-server-log db >/dev/null 2>&1]) +AT_CAPTURE_FILE([ovsdb-server.log]) +AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file db], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-server.pid`' -AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals '[[["name","==","one"],["name","==","ten"]]]' ordinals > output], +AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals '[[["name","==","one"],["name","==","ten"]]]' ordinals > output 2> ovsdb-client.stderr], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-client.pid`' OVS_WAIT_UNTIL([grep last_id output]) 
AT_CHECK([ovsdb-client transact unix:socket '[["ordinals"]]'], [0], [ignore], [ignore]) -AT_CHECK([ovs-appctl -t ovsdb-server -e exit], [0], [ignore], [ignore]) +OVSDB_SERVER_SHUTDOWN OVS_WAIT_UNTIL([test ! -e ovsdb-server.pid && test ! -e ovsdb-client.pid]) AT_CHECK([$PYTHON3 $srcdir/ovsdb-monitor-sort.py < output | uuidfilt], [0], [[found: false, last_id: <0> @@ -848,8 +848,8 @@ AT_SETUP([monitor-cond-since condition change]) AT_KEYWORDS([ovsdb server monitor monitor-cond-since positive]) ordinal_schema > schema AT_CHECK([ovsdb-tool create-cluster db schema unix:db.raft], [0], [stdout], [ignore]) -AT_CAPTURE_FILE([ovsdb-server-log]) -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file="`pwd`"/ovsdb-server-log db >/dev/null 2>&1]) +AT_CAPTURE_FILE([ovsdb-server.log]) +AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file -vjsonrpc:file:dbg db], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-server.pid`' for txn in m4_foreach([txn], [[[["ordinals", {"op": "insert", @@ -863,7 +863,8 @@ for txn in m4_foreach([txn], [[[["ordinals", "row": {"number": 2, "name": "two"}}]]]], ['txn' ]); do AT_CHECK([ovsdb-client transact unix:socket "$txn"], [0], [ignore], [ignore]) done -AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals '[[]]' ordinals > output], [0], [ignore], [ignore]) +AT_CAPTURE_FILE([ovsdb-client.log]) +AT_CHECK([ovsdb-client -vjsonrpc --log-file --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals '[[]]' ordinals > output 2> ovsdb-client.stderr]) on_exit 'kill `cat ovsdb-client.pid`' for cond in m4_foreach([cond], [[[[["name","==","one"],["name","==","two"]]]], @@ -874,7 +875,7 @@ for cond in m4_foreach([cond], done AT_CHECK([ovsdb-client transact unix:socket '[["ordinals"]]'], [0], [ignore], [ignore]) -AT_CHECK([ovs-appctl -t ovsdb-server -e exit], [0], [ignore], [ignore]) 
+OVSDB_SERVER_SHUTDOWN OVS_WAIT_UNTIL([test ! -e ovsdb-server.pid && test ! -e ovsdb-client.pid]) AT_CHECK([$PYTHON3 $srcdir/ovsdb-monitor-sort.py < output | uuidfilt], [0], [[found: false, last_id: <0> @@ -909,8 +910,8 @@ AT_SETUP([monitor-cond-since non-cluster]) AT_KEYWORDS([ovsdb server monitor monitor-cond-since positive]) ordinal_schema > schema AT_CHECK([ovsdb-tool create db schema], [0], [stdout], [ignore]) -AT_CAPTURE_FILE([ovsdb-server-log]) -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file="`pwd`"/ovsdb-server-log db >/dev/null 2>&1]) +AT_CAPTURE_FILE([ovsdb-server.log]) +AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file db], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-server.pid`' for txn in m4_foreach([txn], [[[["ordinals", {"op": "insert", @@ -925,7 +926,7 @@ for txn in m4_foreach([txn], [[[["ordinals", AT_CHECK([ovsdb-client transact unix:socket "$txn"], [0], [ignore], [ignore]) done -AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals '[[["name","==","one"],["name","==","ten"]]]' ordinals > output], +AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals '[[["name","==","one"],["name","==","ten"]]]' ordinals > output 2> ovsdb-client.stderr], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-client.pid`' for txn in m4_foreach([txn], [[[["ordinals", @@ -940,7 +941,7 @@ for txn in m4_foreach([txn], [[[["ordinals", done AT_CHECK([ovsdb-client transact unix:socket '[["ordinals"]]'], [0], [ignore], [ignore]) -AT_CHECK([ovs-appctl -t ovsdb-server -e exit], [0], [ignore], [ignore]) +OVSDB_SERVER_SHUTDOWN OVS_WAIT_UNTIL([test ! -e ovsdb-server.pid && test ! 
-e ovsdb-client.pid]) # Transaction shouldn't be found, and last_id returned should always @@ -962,8 +963,8 @@ AT_SETUP([monitor-cond-since non-cluster non-zero last_id]) AT_KEYWORDS([ovsdb server monitor monitor-cond-since negative]) ordinal_schema > schema AT_CHECK([ovsdb-tool create db schema], [0], [stdout], [ignore]) -AT_CAPTURE_FILE([ovsdb-server-log]) -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file="`pwd`"/ovsdb-server-log db >/dev/null 2>&1]) +AT_CAPTURE_FILE([ovsdb-server.log]) +AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file db], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-server.pid`' for txn in m4_foreach([txn], [[[["ordinals", {"op": "insert", @@ -980,7 +981,7 @@ done # A non-zero uuid last_id=11111111-1111-1111-1111-111111111111 -AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals $last_id '[[["name","==","one"],["name","==","ten"]]]' ordinals > output], +AT_CHECK([ovsdb-client -vjsonrpc --pidfile --detach --no-chdir -d json monitor-cond-since --format=csv unix:socket ordinals $last_id '[[["name","==","one"],["name","==","ten"]]]' ordinals > output 2> ovsdb-client.stderr], [0], [ignore], [ignore]) on_exit 'kill `cat ovsdb-client.pid`' for txn in m4_foreach([txn], [[[["ordinals", @@ -995,7 +996,7 @@ for txn in m4_foreach([txn], [[[["ordinals", done AT_CHECK([ovsdb-client transact unix:socket '[["ordinals"]]'], [0], [ignore], [ignore]) -AT_CHECK([ovs-appctl -t ovsdb-server -e exit], [0], [ignore], [ignore]) +OVSDB_SERVER_SHUTDOWN OVS_WAIT_UNTIL([test ! -e ovsdb-server.pid && test ! 
-e ovsdb-client.pid]) # Transaction shouldn't be found, and last_id returned should always @@ -1011,3 +1012,69 @@ row,action,name,number,_version ]], [ignore]) AT_CLEANUP +AT_SETUP([monitor-cond initial reply with condition on non-monitored column]) +AT_KEYWORDS([ovsdb server monitor monitor-cond positive initial non-monitored]) + +ordinal_schema > schema +AT_CHECK([ovsdb-tool create db schema], [0], [stdout], [ignore]) +on_exit 'kill `cat ovsdb-server.pid`' +AT_CAPTURE_FILE([ovsdb-server.log]) +AT_CHECK([ovsdb-server --detach --no-chdir --pidfile \ + --remote=punix:socket --log-file db], [0], [ignore], [ignore]) + +dnl Initialize the database content. +for txn in m4_foreach([txn], [[[["ordinals", + {"op": "insert", + "table": "ordinals", + "row": {"number": 0, "name": "zero"}}, + {"op": "insert", + "table": "ordinals", + "row": {"number": 1, "name": "one"}}, + {"op": "insert", + "table": "ordinals", + "row": {"number": 2, "name": "two"}}]]]], ['txn' ]); do + AT_CHECK([ovsdb-client transact unix:socket "$txn"], [0], [ignore], [ignore]) +done + +dnl Start a first client that monitors only the column 'name'. +on_exit 'kill `cat client-1.pid`' +AT_CAPTURE_FILE([client-1.out]) +AT_CHECK([ovsdb-client -vjsonrpc --pidfile=client-1.pid --detach --no-chdir \ + -d json monitor-cond --format=csv unix:socket \ + ordinals '[[true]]' ordinals ["name"] \ + > client-1.out 2> client-1.err], [0], [ignore], [ignore]) +dnl Wait for the initial monitor reply. +OVS_WAIT_UNTIL([grep -q 'initial' client-1.out]) + +dnl Start a second client that monitors the column 'name', but has a condition +dnl on column 'number'. +on_exit 'kill `cat client-2.pid`' +AT_CAPTURE_FILE([client-2.out]) +AT_CHECK([ovsdb-client -vjsonrpc --pidfile=client-2.pid --detach --no-chdir \ + -d json monitor-cond --format=csv unix:socket \ + ordinals '[[["number", "!=", 1]]]' ordinals ["name"] \ + > client-2.out 2> client-2.err], [0], [ignore], [ignore]) +dnl Wait for the initial monitor reply. 
+OVS_WAIT_UNTIL([grep -q 'initial' client-2.out]) + +OVSDB_SERVER_SHUTDOWN +OVS_WAIT_UNTIL([test ! -e ovsdb-server.pid && \ + test ! -e client-1.pid && test ! -e client-2.pid]) + +dnl The first client should have all the names. +AT_CHECK([$PYTHON3 $srcdir/ovsdb-monitor-sort.py < client-1.out | uuidfilt], + [0], [dnl +row,action,name +<0>,initial,"""one""" +<1>,initial,"""two""" +<2>,initial,"""zero""" +]) + +dnl The second client should not have the name 'one'. +AT_CHECK([$PYTHON3 $srcdir/ovsdb-monitor-sort.py < client-2.out | uuidfilt], + [0], [dnl +row,action,name +<0>,initial,"""two""" +<1>,initial,"""zero""" +]) +AT_CLEANUP diff --git a/tests/ovsdb-rbac.at b/tests/ovsdb-rbac.at index 7de3711fbd0..c1e5a9134eb 100644 --- a/tests/ovsdb-rbac.at +++ b/tests/ovsdb-rbac.at @@ -355,6 +355,29 @@ AT_CHECK([uuidfilt stdout], [0], [[[{"details":"RBAC rules for client \"client-2 ], [ignore]) # Test 14: +# Count the rows in other_colors. This should pass even though the RBAC +# authorization would fail because "client-2" does not match the +# "creator" column for this row. Because the RBAC check is bypassed when +# mutation is empty. +AT_CHECK([ovsdb-client transact ssl:127.0.0.1:$SSL_PORT \ + --private-key=$RBAC_PKIDIR/client-2-privkey.pem \ + --certificate=$RBAC_PKIDIR/client-2-cert.pem \ + --ca-cert=$RBAC_PKIDIR/pki/switchca/cacert.pem \ + ['["mydb", + {"op": "mutate", + "table": "other_colors", + "where": [], + "mutations": []}, + {"op": "mutate", + "table": "other_colors", + "where": [["name", "==", "seafoam"]], + "mutations": []} + ]']], [0], [stdout], [ignore]) +cat stdout >> output +AT_CHECK([uuidfilt stdout], [0], [[[{"count":1},{"count":1}]] +], [ignore]) + +# Test 15: # Attempt to delete a row from the "other_colors" table. This should pass # the RBAC authorization test because "client-1" does matches the # "creator" column for this row. 
@@ -371,5 +394,7 @@ cat stdout >> output AT_CHECK([uuidfilt stdout], [0], [[[{"count":1}]] ], [ignore]) -OVSDB_SERVER_SHUTDOWN +OVSDB_SERVER_SHUTDOWN([" + /No status column present in the Connection table/d +"]) AT_CLEANUP diff --git a/tests/ovsdb-server.at b/tests/ovsdb-server.at index c7b2fe3ae6e..ce6d32aee1d 100644 --- a/tests/ovsdb-server.at +++ b/tests/ovsdb-server.at @@ -1,15 +1,17 @@ AT_BANNER([OVSDB -- ovsdb-server transactions (Unix sockets)]) -m4_define([OVSDB_SERVER_SHUTDOWN], - [OVS_APP_EXIT_AND_WAIT_BY_TARGET([ovsdb-server], [ovsdb-server.pid])]) - +dnl OVSDB_SERVER_SHUTDOWN_N(N, [ALLOWLIST]) +dnl +dnl Similar to OVSDB_SERVER_SHUTDOWN, but stops the server started with N.pid +dnl pidfile and unixctlN socket. m4_define([OVSDB_SERVER_SHUTDOWN_N], - [cp $1.pid savepid$1 + [AT_CHECK([check_logs $2]) + cp $1.pid savepid$1 AT_CHECK([ovs-appctl -t "`pwd`"/unixctl$1 -e exit], [0], [ignore], [ignore]) OVS_WAIT_WHILE([kill -0 `cat savepid$1`], [kill `cat savepid$1`])]) m4_define([OVSDB_SERVER_SHUTDOWN2], - [OVSDB_SERVER_SHUTDOWN_N([2])]) + [OVSDB_SERVER_SHUTDOWN_N([2], $1)]) # OVSDB_CHECK_EXECUTION(TITLE, SCHEMA, TRANSACTIONS, OUTPUT, [KEYWORDS]) # @@ -24,6 +26,9 @@ m4_define([OVSDB_SERVER_SHUTDOWN2], # If a given UUID appears more than once it is always replaced by the # same marker. # +# Additionally, checks that records written to a database file can be +# read back producing the same in-memory database content. +# # TITLE is provided to AT_SETUP and KEYWORDS to AT_KEYWORDS. 
m4_define([OVSDB_CHECK_EXECUTION], [AT_SETUP([$1]) @@ -31,12 +36,22 @@ m4_define([OVSDB_CHECK_EXECUTION], $2 > schema AT_CHECK([ovsdb-tool create db schema], [0], [stdout], [ignore]) on_exit 'kill `cat *.pid`' - AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket db], [0], [ignore], [ignore]) + AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile \ + --remote=punix:socket db], [0], [ignore], [ignore]) m4_foreach([txn], [$3], [AT_CHECK([ovsdb-client transact unix:socket 'txn'], [0], [stdout], [ignore]) cat stdout >> output ]) AT_CHECK([uuidfilt output], [0], [$4], [ignore]) + + AT_CHECK([ovsdb-client dump unix:socket], [0], [stdout], [ignore]) + + OVSDB_SERVER_SHUTDOWN + + AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile \ + --remote=punix:socket db], [0], [ignore], [ignore]) + OVS_WAIT_UNTIL([ovsdb-client dump unix:socket > dump2; diff stdout dump2]) + OVSDB_SERVER_SHUTDOWN AT_CLEANUP]) @@ -157,7 +172,7 @@ constraint_schema > schema2 AT_CHECK([ovsdb-tool create db1 schema1], [0], [ignore], [ignore]) AT_CHECK([ovsdb-tool create db2 schema2], [0], [ignore], [ignore]) on_exit 'kill `cat *.pid`' -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:db.sock db1 db2], [0], [ignore], [ignore]) +AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile --remote=punix:db.sock db1 db2], [0], [ignore], [ignore]) CHECK_DBS([constraints ordinals ]) @@ -168,6 +183,31 @@ AT_CHECK( OVSDB_SERVER_SHUTDOWN AT_CLEANUP +AT_SETUP([database multiplexing implementation with config file]) +AT_KEYWORDS([ovsdb server positive config-file]) +ordinal_schema > schema1 +constraint_schema > schema2 +AT_CHECK([ovsdb-tool create db1 schema1], [0], [ignore], [ignore]) +AT_CHECK([ovsdb-tool create db2 schema2], [0], [ignore], [ignore]) +on_exit 'kill $(cat *.pid)' + +AT_DATA([config.json], [ +{"remotes" : { "punix:db.sock": {} }, + "databases": { "db1": {}, "db2": { "service-model": "standalone" } } } +]) + +AT_CHECK([ovsdb-server 
--detach --no-chdir --log-file --pidfile \ + --config-file=config.json], [0], [ignore], [ignore]) +CHECK_DBS([constraints +ordinals +]) +AT_CHECK( + [[ovstest test-jsonrpc request unix:db.sock get_schema [\"nonexistent\"]]], [0], + [[{"error":{"details":"get_schema request specifies unknown database nonexistent","error":"unknown database","syntax":"[\"nonexistent\"]"},"id":0,"result":null} +]]) +OVSDB_SERVER_SHUTDOWN +AT_CLEANUP + AT_SETUP([ovsdb-server/add-db and remove-db]) AT_KEYWORDS([ovsdb server positive]) on_exit 'kill `cat *.pid`' @@ -177,7 +217,7 @@ AT_CHECK([ovsdb-tool create db1 schema1], [0], [ignore], [ignore]) AT_CHECK([ovsdb-tool create db2 schema2], [0], [ignore], [ignore]) # Start ovsdb-server with just a single database - db1. -AT_CHECK([ovsdb-server -vfile -vvlog:off --log-file --detach --no-chdir --pidfile --remote=punix:db.sock db1], [0]) +AT_CHECK([ovsdb-server -vfile -vvlog:off --log-file --detach --no-chdir --pidfile --remote=punix:db.sock db1], [0], [ignore], [ignore]) CHECK_DBS([ordinals ]) @@ -280,25 +320,171 @@ AT_CHECK([uuidfilt db-change-unaware.stdout], [0], [dnl <0> initial _Server ]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN(["/no database named ordinals/d"]) +AT_CLEANUP + +AT_SETUP([ovsdb-server/add-db and remove-db with a config file]) +AT_KEYWORDS([ovsdb server positive config-file]) +on_exit 'kill $(cat *.pid)' +ordinal_schema > schema1 +constraint_schema > schema2 +AT_CHECK([ovsdb-tool create db1 schema1], [0], [ignore], [ignore]) +AT_CHECK([ovsdb-tool create db2 schema2], [0], [ignore], [ignore]) + +dnl Start ovsdb-server with just a single database - db1. +AT_DATA([config.json], [ +{ + "remotes": { + "punix:db.sock": {} + }, + "databases": { + "db1": {} + } +} +]) +AT_CAPTURE_FILE([config.json]) +AT_CHECK([ovsdb-server -vfile -vvlog:off --log-file --detach --no-chdir \ + --pidfile --config-file=config.json], [0], [ignore], [ignore]) +CHECK_DBS([ordinals +]) + +dnl Remove the database. 
+AT_CHECK([sed -i'back' '/db1/d' config.json]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/reload]) +CHECK_DBS([]) + +dnl Start monitoring processes. +AT_CHECK([ovsdb-client --detach --no-chdir --pidfile=ovsdb-client-1.pid \ + --no-db-change-aware --no-headings monitor _Server Database name \ + > db-change-unaware.stdout 2> db-change-unaware.stderr]) +AT_CHECK([ovsdb-client --detach --no-chdir --pidfile=ovsdb-client-2.pid \ + --db-change-aware --no-headings monitor _Server Database name \ + > db-change-aware.stdout 2> db-change-aware.stderr]) +AT_CAPTURE_FILE([db-change-unaware.stdout]) +AT_CAPTURE_FILE([db-change-unaware.stderr]) +AT_CAPTURE_FILE([db-change-aware.stdout]) +AT_CAPTURE_FILE([db-change-aware.stderr]) + +dnl Add the first database back. +AT_CHECK([sed -i'back' '/"databases"/a\ + "db1": {} + ' config.json]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/reload]) +CHECK_DBS([ordinals +]) + +dnl Add the second database. +AT_CHECK([sed -i'back' '/"databases"/a\ + "db2": {}, + ' config.json]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/reload]) +CHECK_DBS([constraints +ordinals +]) + +dnl The databases are responsive. +AT_CHECK([ovsdb-client list-tables unix:db.sock constraints], [0], [ignore], [ignore]) +AT_CHECK([ovsdb-client list-tables unix:db.sock ordinals], [0], [ignore], [ignore]) + +dnl Add an already added database. +AT_CHECK([sed -i'back' '/"databases"/a\ + "db2": {}, + ' config.json]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/reload]) + +dnl Fix the config back. +AT_CHECK([sed -i'back' '/db2/d' config.json]) +AT_CHECK([sed -i'back' '/"databases"/a\ + "db2": {}, + ' config.json]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/reload]) + +dnl Add a non-existing database. 
+AT_CHECK([sed -i'back' '/"databases"/a\ + "db3": {}, + ' config.json]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/reload], [2], [ignore], [ignore]) +OVS_WAIT_UNTIL([grep -q 'failed to configure databases' ovsdb-server.log]) +AT_CHECK([sed -i'back' '/db3/d' config.json]) + +dnl Add a remote through a db path in db1. +AT_CHECK([sed -i'back' '/"remotes"/a\ + "db:ordinals,ordinals,name": {}, + ' config.json]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/reload]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/list-remotes], + [0], [db:ordinals,ordinals,name +punix:db.sock +]) + +dnl Removing db1 has no effect on its remote. +AT_CHECK([sed -i'back' '/db1/d' config.json]) +AT_CHECK([sed -i'back' 's/"db2": {},/"db2": {}/' config.json]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/reload], [2], [ignore], [ignore]) +CHECK_DBS([constraints +]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/list-remotes], + [0], [db:ordinals,ordinals,name +punix:db.sock +]) +AT_CHECK([ovsdb-client list-tables unix:db.sock ordinals], [1], [ignore], [ignore]) + +dnl Remove now missing remote. +AT_CHECK([sed -i'back' '/db:ordinals,ordinals,name/d' config.json]) + +dnl Remove db2. +AT_CHECK([sed -i'back' '/db2/d' config.json]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/reload]) +CHECK_DBS() +AT_CHECK([ovsdb-client list-tables unix:db.sock constraints], [1], [ignore], [ignore]) + +dnl Add a removed database. +AT_CHECK([sed -i'back' '/"databases"/a\ + "db2": {} + ' config.json]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/reload]) +CHECK_DBS([constraints +]) +AT_CHECK([ovsdb-client list-tables unix:db.sock constraints], [0], [ignore], [ignore]) + +# Check the monitoring results. 
+AT_CHECK([uuidfilt db-change-aware.stdout], [0], [dnl +<0> initial _Server + +<1> insert ordinals + +<2> insert constraints + +<1> delete ordinals + +<2> delete constraints + +<3> insert constraints +]) +AT_CHECK([uuidfilt db-change-unaware.stdout], [0], [dnl +<0> initial _Server +]) + +OVSDB_SERVER_SHUTDOWN([" + /no database named ordinals/d + /failed to open database 'db3'/d + /failed to configure databases/d +"]) AT_CLEANUP AT_SETUP([ovsdb-server/add-db with --monitor]) AT_KEYWORDS([ovsdb server positive]) AT_SKIP_IF([test "$IS_WIN32" = "yes"]) -# This test intentionally causes SIGSEGV, so make Address Sanitizer ignore it. +# This test intentionally causes SIGSEGV, so make sanitizers ignore it. ASAN_OPTIONS=$ASAN_OPTIONS:handle_segv=0; export ASAN_OPTIONS - -# Skip it if UB Sanitizer is being used. There's no way to disable the -# SEGV check at runtime. -AT_SKIP_IF([test $TESTS_WITH_UBSAN = yes]) +UBSAN_OPTIONS=$UBSAN_OPTIONS:handle_segv=0; export UBSAN_OPTIONS # Start ovsdb-server, initially with one db. ordinal_schema > schema AT_CHECK([ovsdb-tool create db1 schema], [0], [ignore], [ignore]) on_exit 'kill `cat *.pid`' -AT_CHECK([ovsdb-server -vfile -vvlog:off --monitor --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db1]) +AT_CHECK([ovsdb-server -vfile -vvlog:off --monitor --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db1], [0], [ignore], [ignore]) # Add the second database. constraint_schema > schema2 @@ -319,19 +505,19 @@ OVS_WAIT_UNTIL([ovs-appctl -t ovsdb-server version]) CHECK_DBS([constraints ordinals ]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN([" + /backtrace/d + /killed/d +"]) AT_CLEANUP AT_SETUP([ovsdb-server/add-db and remove-db with --monitor]) AT_KEYWORDS([ovsdb server positive]) AT_SKIP_IF([test "$IS_WIN32" = "yes"]) -# This test intentionally causes SIGSEGV, so make Address Sanitizer ignore it. +# This test intentionally causes SIGSEGV, so make sanitizers ignore it. 
ASAN_OPTIONS=$ASAN_OPTIONS:handle_segv=0; export ASAN_OPTIONS - -# Skip it if UB Sanitizer is being used. There's no way to disable the -# SEGV check at runtime. -AT_SKIP_IF([test $TESTS_WITH_UBSAN = yes]) +UBSAN_OPTIONS=$UBSAN_OPTIONS:handle_segv=0; export UBSAN_OPTIONS # Start ovsdb-server, initially with one db. ordinal_schema > schema @@ -339,7 +525,7 @@ AT_CHECK([ovsdb-tool create db1 schema], [0], [ignore], [ignore]) constraint_schema > schema2 AT_CHECK([ovsdb-tool create db2 schema2], [0], [ignore], [ignore]) on_exit 'kill `cat *.pid`' -AT_CHECK([ovsdb-server -vfile -vvlog:off --monitor --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db1 db2]) +AT_CHECK([ovsdb-server -vfile -vvlog:off --monitor --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db1 db2], [0], [ignore], [ignore]) # Remove the second database. AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/remove-db constraints]) @@ -356,7 +542,10 @@ OVS_WAIT_UNTIL( OVS_WAIT_UNTIL([ovs-appctl -t ovsdb-server version]) CHECK_DBS([ordinals ]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN([" + /backtrace/d + /killed/d +"]) AT_CLEANUP AT_SETUP([--remote=db: implementation]) @@ -400,7 +589,7 @@ AT_CHECK( "uuid-name": "x", "row": {"target": "punix:socket2"}}]']], [0], [ignore], [ignore]) on_exit 'kill `cat *.pid`' -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=db:mydb,Root,managers --remote=db:mydb,Root,manager_options --log-file db], [0], [ignore], [ignore]) +AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile --remote=db:mydb,Root,managers --remote=db:mydb,Root,manager_options db], [0], [ignore], [ignore]) ovs-appctl -t ovsdb-server time/warp 6000 1000 AT_CHECK( [[ovsdb-client transact unix:socket1 \ @@ -420,7 +609,9 @@ AT_CHECK( [[[{"rows":[{"managers":"punix:socket1"}]},{"rows":[{"is_connected":false,"target":"punix:socket2"}]}] ]], [ignore]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN([" + /No status column present in 
the Manager table/d +"]) AT_CLEANUP AT_SETUP([ovsdb-server/add-remote and remove-remote]) @@ -428,7 +619,7 @@ AT_KEYWORDS([ovsdb server positive]) ordinal_schema > schema AT_CHECK([ovsdb-tool create db schema], [0], [ignore], [ignore]) on_exit 'kill `cat *.pid`' -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile db]) +AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile db], [0], [ignore], [ignore]) AT_CHECK([test ! -e socket1]) AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/add-remote punix:socket1]) @@ -473,25 +664,97 @@ AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/remove-remote punix:socket2]) OVS_WAIT_UNTIL([test ! -e socket2]) AT_CHECK([test ! -e socket1]) AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/list-remotes]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN +AT_CLEANUP + +AT_SETUP([ovsdb-server/add-remote and remove-remote with config file]) +AT_KEYWORDS([ovsdb server positive config-file]) +ordinal_schema > schema +AT_CHECK([ovsdb-tool create db schema], [0], [ignore], [ignore]) +on_exit 'kill $(cat *.pid)' + +AT_DATA([config.json], [ +{ + "remotes": { + }, + "databases": { "db": {} } +} +]) +AT_CAPTURE_FILE([config.json]) + +AT_CHECK([ovsdb-server -vfile --detach --no-chdir --log-file --pidfile \ + --config-file=config.json], [0], [ignore], [ignore]) + +AT_CHECK([test ! -e socket1]) +AT_CHECK([sed -i'back' '/"remotes"/a\ + "punix:socket1": {} + ' config.json]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/reload]) +if test "$IS_WIN32" = "yes"; then + OVS_WAIT_UNTIL([test -e socket1]) +else + OVS_WAIT_UNTIL([test -S socket1]) +fi +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/list-remotes], + [0], [punix:socket1 +]) + +AT_CHECK([test ! 
-e socket2]) +AT_CHECK([sed -i'back' '/"remotes"/a\ + "punix:socket2": {}, + ' config.json]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/reload]) +if test "$IS_WIN32" = "yes"; then + OVS_WAIT_UNTIL([test -e socket2]) +else + OVS_WAIT_UNTIL([test -S socket2]) +fi +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/list-remotes], + [0], [punix:socket1 +punix:socket2 +]) + +AT_CHECK([sed -i'back' '/"remotes"/a\ + "db:x,y,z": {}, + ' config.json]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/reload], [2], [ignore], [ignore]) +OVS_WAIT_UNTIL([grep -q '"db:x,y,z": no database named x' ovsdb-server.log]) +AT_CHECK([sed -i'back' '/db:x,y,z/d' config.json]) + +AT_CHECK([sed -i'back' '/punix:socket1/d' config.json]) +AT_CHECK([sed -i'back' 's/"punix:socket2": {},/"punix:socket2": {}/' config.json]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/reload]) +OVS_WAIT_UNTIL([test ! -e socket1]) +if test "$IS_WIN32" = "yes"; then + AT_CHECK([test -e socket2]) +else + AT_CHECK([test -S socket2]) +fi +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/list-remotes], + [0], [punix:socket2 +]) + +AT_CHECK([sed -i'back' '/punix:socket2/d' config.json]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/reload]) +OVS_WAIT_UNTIL([test ! -e socket2]) +AT_CHECK([test ! -e socket1]) +AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/list-remotes]) +OVSDB_SERVER_SHUTDOWN(['/"db:x,y,z": no database named x/d']) AT_CLEANUP AT_SETUP([ovsdb-server/add-remote with --monitor]) AT_KEYWORDS([ovsdb server positive]) AT_SKIP_IF([test "$IS_WIN32" = "yes"]) -# This test intentionally causes SIGSEGV, so make Address Sanitizer ignore it. +# This test intentionally causes SIGSEGV, so make sanitizers ignore it. ASAN_OPTIONS=$ASAN_OPTIONS:handle_segv=0; export ASAN_OPTIONS - -# Skip it if UB Sanitizer is being used. There's no way to disable the -# SEGV check at runtime. 
-AT_SKIP_IF([test $TESTS_WITH_UBSAN = yes]) +UBSAN_OPTIONS=$UBSAN_OPTIONS:handle_segv=0; export UBSAN_OPTIONS # Start ovsdb-server, initially with no remotes. ordinal_schema > schema AT_CHECK([ovsdb-tool create db schema], [0], [ignore], [ignore]) on_exit 'kill `cat *.pid`' -AT_CHECK([ovsdb-server -vfile -vvlog:off --monitor --detach --no-chdir --pidfile --log-file db]) +AT_CHECK([ovsdb-server -vfile -vvlog:off --monitor --detach --no-chdir --pidfile --log-file db], [0], [ignore], [ignore]) # Add a remote. AT_CHECK([test ! -e socket1]) @@ -512,25 +775,25 @@ OVS_WAIT_UNTIL( [test -s ovsdb-server.pid && test `cat ovsdb-server.pid` != `cat old.pid`]) OVS_WAIT_UNTIL([ovs-appctl -t ovsdb-server version]) OVS_WAIT_UNTIL([test -S socket1]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN([" + /backtrace/d + /killed/d +"]) AT_CLEANUP AT_SETUP([ovsdb-server/add-remote and remove-remote with --monitor]) AT_KEYWORDS([ovsdb server positive]) AT_SKIP_IF([test "$IS_WIN32" = "yes"]) -# This test intentionally causes SIGSEGV, so make Address Sanitizer ignore it. +# This test intentionally causes SIGSEGV, so make sanitizers ignore it. ASAN_OPTIONS=$ASAN_OPTIONS:handle_segv=0; export ASAN_OPTIONS - -# Skip it if UB Sanitizer is being used. There's no way to disable the -# SEGV check at runtime. -AT_SKIP_IF([test $TESTS_WITH_UBSAN = yes]) +UBSAN_OPTIONS=$UBSAN_OPTIONS:handle_segv=0; export UBSAN_OPTIONS # Start ovsdb-server, initially with no remotes. ordinal_schema > schema AT_CHECK([ovsdb-tool create db schema], [0], [ignore], [ignore]) on_exit 'kill `cat *.pid`' -AT_CHECK([ovsdb-server -vfile -vvlog:off --monitor --detach --no-chdir --pidfile --log-file db]) +AT_CHECK([ovsdb-server -vfile -vvlog:off --monitor --detach --no-chdir --pidfile --log-file db], [0], [ignore], [ignore]) # Add a remote. AT_CHECK([test ! 
-e socket1]) @@ -555,7 +818,10 @@ OVS_WAIT_UNTIL( [test -s ovsdb-server.pid && test `cat ovsdb-server.pid` != `cat old.pid`]) OVS_WAIT_UNTIL([ovs-appctl -t ovsdb-server version]) AT_CHECK([test ! -e socket1]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN([" + /backtrace/d + /killed/d +"]) AT_CLEANUP AT_SETUP([SSL db: implementation]) @@ -670,11 +936,16 @@ AT_CHECK_UNQUOTED( [ignore]) # The error message for being unable to negotiate a shared ciphersuite # is 'sslv3 alert handshake failure'. This is not the clearest message. +# In openssl 3.2.0 all the error messages were updated to replace 'sslv3' +# with 'ssl/tls'. AT_CHECK_UNQUOTED( - [grep "sslv3 alert handshake failure" output], [0], + [grep -E "(sslv3|ssl/tls) alert handshake failure" output], [0], [stdout], [ignore]) -OVSDB_SERVER_SHUTDOWN +OVSDB_SERVER_SHUTDOWN([" + /stream_ssl|WARN/d + /Protocol error/d +"]) AT_CLEANUP OVS_START_SHELL_HELPERS @@ -701,7 +972,7 @@ ovsdb_check_online_compaction() { fi]) dnl Start ovsdb-server. on_exit 'kill `cat *.pid`' - AT_CHECK([ovsdb-server -vvlog:off -vconsole:off --detach --no-chdir --pidfile --remote=punix:socket --log-file db], [0]) + AT_CHECK([ovsdb-server -vvlog:off -vconsole:off --detach --no-chdir --pidfile --remote=punix:socket --log-file db], [0], [ignore], [ignore]) AT_CHECK([ovsdb_client_wait unix:socket ordinals connected]) AT_CAPTURE_FILE([ovsdb-server.log]) dnl Do a bunch of random transactions that put crap in the database log. @@ -837,8 +1108,8 @@ _uuid name number dnl Then check that the dumped data is correct. This time first kill dnl and restart the database server to ensure that the data is correct on dnl disk as well as in memory. 
- OVS_APP_EXIT_AND_WAIT([ovsdb-server]) - AT_CHECK([ovsdb-server -vvlog:off -vconsole:off --detach --no-chdir --pidfile --remote=punix:socket --log-file db]) + OVSDB_SERVER_SHUTDOWN + AT_CHECK([ovsdb-server -vvlog:off -vconsole:off --detach --no-chdir --pidfile --remote=punix:socket --log-file db], [0], [ignore], [ignore]) AT_CHECK([ovsdb-client dump unix:socket ordinals], [0], [stdout]) AT_CHECK([uuidfilt stdout], [0], [dnl ordinals table @@ -893,7 +1164,7 @@ ovsdb_check_online_conversion() { fi]) dnl Start the database server. - AT_CHECK([ovsdb-server -vfile -vvlog:off -vconsole:off --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db], [0]) + AT_CHECK([ovsdb-server -vfile -vvlog:off -vconsole:off --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db], [0], [ignore], [ignore]) AT_CAPTURE_FILE([ovsdb-server.log]) dnl Put some data in the database. @@ -1109,9 +1380,9 @@ _uuid number ]) dnl Now kill and restart the database server to ensure that the data is dnl correct on disk as well as in memory. - OVS_APP_EXIT_AND_WAIT([ovsdb-server]) + OVSDB_SERVER_SHUTDOWN AT_CHECK([[ovsdb-server -vfile -vvlog:off -vconsole:off --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db]], - [0]) + [0], [ignore], [ignore]) AT_CHECK([ovsdb-client dump unix:db.sock ordinals | uuidfilt], [0], [dnl ordinals table _uuid number @@ -1134,7 +1405,7 @@ _uuid number AT_CHECK([test -f dir/.db.~lock~]) fi - OVS_APP_EXIT_AND_WAIT([ovsdb-server]) + OVSDB_SERVER_SHUTDOWN } OVS_END_SHELL_HELPERS @@ -1243,7 +1514,7 @@ AT_CHECK([test $logged_updates -lt $logged_nonblock_updates]) AT_CHECK_UNQUOTED([ovs-vsctl get open_vswitch . system_version], [0], [xyzzy$counter ]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN AT_CLEANUP AT_SETUP([ovsdb-server transaction history size]) @@ -1270,7 +1541,7 @@ dnl a case where there is only one transaction in a history. 
get_memory_value () { n=$(ovs-appctl -t ovsdb-server memory/show dnl | tr ' ' '\n' | grep "^$1:" | cut -d ':' -f 2) - if test X"$n" == "X"; then + if test X"$n" = "X"; then n=0 fi echo $n @@ -1308,7 +1579,25 @@ dnl After removing all the bridges, the number of atoms in the database dnl should return to its initial value. AT_CHECK([test $(get_memory_value atoms) -eq $initial_db_atoms]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +dnl Add a few more resources. +for i in $(seq 1 10); do + cmd=$(add_ports $i $(($i / 4 + 1))) + AT_CHECK([ovs-vsctl --no-wait add-br br$i $cmd]) +done +check_atoms + +db_atoms_before_conversion=$(get_memory_value atoms) + +dnl Trigger online conversion. +AT_CHECK([ovsdb-client convert $abs_top_srcdir/vswitchd/vswitch.ovsschema], + [0], [ignore], [ignore]) + +dnl Check that conversion didn't change the number of atoms and the history +dnl still has a reasonable size. +check_atoms +AT_CHECK([test $(get_memory_value atoms) -eq $db_atoms_before_conversion]) + +OVSDB_SERVER_SHUTDOWN AT_CLEANUP AT_BANNER([OVSDB -- ovsdb-server transactions (SSL IPv4 sockets)]) @@ -1543,12 +1832,30 @@ m4_define([OVSDB_CHECK_EXECUTION_RELAY], ], [0], [ignore], [ignore]) for i in $(seq 2 ${n_servers}); do - AT_CHECK([ovsdb-server --detach --no-chdir dnl - --log-file=ovsdb-server$i.log dnl - --pidfile=${i}.pid --remote=punix:db${i}.sock dnl - --unixctl=unixctl${i} -vjsonrpc:file:dbg dnl - relay:${schema_name}:unix:db$((i-1)).sock - ], [0], [ignore], [ignore]) + dnl Run every second relay with a config file. 
+ if test $(expr $i % 2) -eq 0; then + echo "{ + \"remotes\": { \"punix:db${i}.sock\": {} }, + \"databases\": { + \"${schema_name}\": { + \"service-model\": \"relay\", + \"source\": { \"unix:db$((i-1)).sock\": {} } + } + } + }" > config${i}.json + AT_CHECK([ovsdb-server --detach --no-chdir --pidfile=${i}.pid \ + --log-file=ovsdb-server$i.log \ + --unixctl=unixctl${i} -vjsonrpc:file:dbg \ + --config-file=config${i}.json + ], [0], [ignore], [ignore]) + else + AT_CHECK([ovsdb-server --detach --no-chdir \ + --log-file=ovsdb-server$i.log \ + --pidfile=${i}.pid --remote=punix:db${i}.sock \ + --unixctl=unixctl${i} -vjsonrpc:file:dbg \ + relay:${schema_name}:unix:db$((i-1)).sock + ], [0], [ignore], [ignore]) + fi done m4_foreach([txn], [$4], @@ -1595,13 +1902,14 @@ AT_BANNER([OVSDB -- ovsdb-server replication]) # OVSDB_CHECK_EXECUTION(TITLE, SCHEMA, TRANSACTIONS, OUTPUT, [KEYWORDS]) # -# Creates two databases with the given SCHEMA, and starts an ovsdb-server on +# Creates three databases with the given SCHEMA, and starts an ovsdb-server on # each database. # Runs each of the TRANSACTIONS (which should be a quoted list of # quoted strings) against one of the servers with ovsdb-client one at a -# time. The server replicates its database to the other ovsdb-server. +# time. The server replicates its database to the other two ovsdb-servers, +# one of which is configured via command line and the other via --config-file. # -# Checks that the dump of both databases are the same. +# Checks that the dump of all databases are the same. # # TITLE is provided to AT_SETUP and KEYWORDS to AT_KEYWORDS. 
m4_define([OVSDB_CHECK_EXECUTION], @@ -1610,22 +1918,43 @@ m4_define([OVSDB_CHECK_EXECUTION], $2 > schema AT_CHECK([ovsdb-tool create db1 schema], [0], [stdout], [ignore]) AT_CHECK([ovsdb-tool create db2 schema], [0], [stdout], [ignore]) - - on_exit 'kill `cat *.pid`' - AT_CHECK([ovsdb-server --detach --no-chdir --log-file=ovsdb-server1.log --pidfile --remote=punix:db.sock db1], [0], [ignore], [ignore]) - i - - AT_CHECK([ovsdb-server --detach --no-chdir --log-file=ovsdb-server2.log --pidfile=2.pid --remote=punix:db2.sock --unixctl=unixctl2 --sync-from=unix:db.sock db2], [0], [ignore], [ignore]) + AT_CHECK([ovsdb-tool create db3 schema], [0], [stdout], [ignore]) + + on_exit 'kill $(cat *.pid)' + AT_CHECK([ovsdb-server -vfile --detach --no-chdir --log-file=ovsdb-server1.log \ + --pidfile --remote=punix:db.sock db1], [0], [ignore], [ignore]) + + AT_CHECK([ovsdb-server -vfile --detach --no-chdir --log-file=ovsdb-server2.log \ + --pidfile=2.pid --remote=punix:db2.sock --unixctl=unixctl2 \ + --sync-from=unix:db.sock db2], [0], [ignore], [ignore]) + + AT_DATA([config3.json], [ + { + "remotes": { "punix:db3.sock": {} }, + "databases": { + "db3": { + "service-model": "active-backup", + "backup": true, + "source": { "unix:db.sock": {} } + } + } + } +]) + AT_CHECK([ovsdb-server -vfile --detach --no-chdir --log-file=ovsdb-server3.log \ + --pidfile=3.pid --unixctl=unixctl3 --config-file=config3.json], + [0], [ignore], [ignore]) m4_foreach([txn], [$3], [AT_CHECK([ovsdb-client transact 'txn'], [0], [stdout], [ignore]) ]) AT_CHECK([ovsdb-client dump], [0], [stdout], [ignore]) - OVS_WAIT_UNTIL([ ovsdb-client dump unix:db2.sock > dump2; diff stdout dump2]) + OVS_WAIT_UNTIL([ ovsdb-client dump unix:db2.sock > dump2; diff -u stdout dump2]) + OVS_WAIT_UNTIL([ ovsdb-client dump unix:db3.sock > dump3; diff -u stdout dump3]) OVSDB_SERVER_SHUTDOWN OVSDB_SERVER_SHUTDOWN2 + OVSDB_SERVER_SHUTDOWN_N([3]) AT_CLEANUP]) EXECUTION_EXAMPLES @@ -1634,19 +1963,22 @@ AT_BANNER([OVSDB -- ovsdb-server 
replication table-exclusion]) # OVSDB_CHECK_REPLICATION(TITLE, SCHEMA, TRANSACTIONS, OUTPUT, [KEYWORDS]) # -# Creates two databases with the given SCHEMA, and starts an +# Creates three databases with the given SCHEMA, and starts an # ovsdb-server on each database. # Runs each of the TRANSACTIONS (which should be a quoted list of # quoted strings) against one of the servers with ovsdb-client one at a -# time. The server replicates its database to the other ovsdb-server. +# time. The server replicates its database to the other two ovsdb-servers, +# one of which is configured via command line and the other via --config-file. # -# Checks that the difference between the dump of the databases is -# OUTPUT, but UUIDs in the output are replaced by markers of the form -# <N> where N is a number. The first unique UUID is replaced by <0>, +# Checks that the difference between the dump of the first and the other two +# databases is OUTPUT, but UUIDs in the output are replaced by markers of the +# form <N> where N is a number. The first unique UUID is replaced by <0>, # the next by <1>, and so on. # If a given UUID appears more than once it is always replaced by the # same marker. # +# Also checks that the dumps of the second and third databases are the same. +# # TITLE is provided to AT_SETUP and KEYWORDS to AT_KEYWORDS. 
m4_define([OVSDB_CHECK_REPLICATION], [AT_SETUP([$1]) @@ -1655,11 +1987,33 @@ m4_define([OVSDB_CHECK_REPLICATION], $2 > schema AT_CHECK([ovsdb-tool create db1 schema], [0], [stdout], [ignore]) AT_CHECK([ovsdb-tool create db2 schema], [0], [stdout], [ignore]) - - on_exit 'kill `cat *.pid`' - AT_CHECK([ovsdb-server --detach --no-chdir --log-file=ovsdb-server1.log --pidfile --remote=punix:db.sock db1], [0], [ignore], [ignore]) - - AT_CHECK([ovsdb-server --detach --no-chdir --log-file=ovsdb-server2.log --pidfile=2.pid --remote=punix:db2.sock --unixctl=unixctl2 --sync-from=unix:db.sock --sync-exclude-tables=mydb:b db2], [0], [ignore], [ignore]) + AT_CHECK([ovsdb-tool create db3 schema], [0], [stdout], [ignore]) + + on_exit 'kill $(cat *.pid)' + AT_CHECK([ovsdb-server -vfile --detach --no-chdir --log-file=ovsdb-server1.log \ + --pidfile --remote=punix:db.sock db1], [0], [ignore], [ignore]) + + AT_CHECK([ovsdb-server -vfile --detach --no-chdir --log-file=ovsdb-server2.log \ + --pidfile=2.pid --remote=punix:db2.sock --unixctl=unixctl2 \ + --sync-from=unix:db.sock --sync-exclude-tables=mydb:b db2], + [0], [ignore], [ignore]) + + AT_DATA([config3.json], [ + { + "remotes": { "punix:db3.sock": {} }, + "databases": { + "db3": { + "service-model": "active-backup", + "backup": true, + "source": { "unix:db.sock": {} }, + "exclude-tables": [["b"]] + } + } + } +]) + AT_CHECK([ovsdb-server -vfile --detach --no-chdir --log-file=ovsdb-server3.log \ + --pidfile=3.pid --unixctl=unixctl3 --config-file=config3.json], + [0], [ignore], [ignore]) m4_foreach([txn], [$3], [AT_CHECK([ ovsdb-client transact 'txn' ], [0], [stdout], [ignore]) @@ -1672,6 +2026,11 @@ m4_define([OVSDB_CHECK_REPLICATION], AT_CHECK([ovsdb-client dump unix:db2.sock], [0], [stdout], [ignore]) cat stdout > dump2 + OVS_WAIT_UNTIL([ ovsdb-client dump unix:db3.sock | grep one ]) + AT_CHECK([ovsdb-client dump unix:db3.sock], [0], [stdout], [ignore]) + cat stdout > dump3 + AT_CHECK([diff -u dump2 dump3]) + AT_CHECK([diff dump1 
dump2], [1], [stdout], [ignore]) cat stdout > output @@ -1679,6 +2038,7 @@ m4_define([OVSDB_CHECK_REPLICATION], OVSDB_SERVER_SHUTDOWN OVSDB_SERVER_SHUTDOWN2 + OVSDB_SERVER_SHUTDOWN_N([3]) AT_CLEANUP]) REPLICATION_EXAMPLES @@ -1691,7 +2051,7 @@ AT_KEYWORDS([ovsdb server replication get-active]) ordinal_schema > schema AT_CHECK([ovsdb-tool create db schema], [0], [ignore], [ignore]) on_exit 'kill `cat *.pid`' -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --sync-from=tcp:127.0.0.1:9999 db]) +AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile --sync-from=tcp:127.0.0.1:9999 db], [0], [ignore], [ignore]) AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/get-active-ovsdb-server], [0], [tcp:127.0.0.1:9999 @@ -1704,7 +2064,7 @@ AT_KEYWORDS([ovsdb server replication set-active]) ordinal_schema > schema AT_CHECK([ovsdb-tool create db schema], [0], [ignore], [ignore]) on_exit 'kill `cat *.pid`' -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile db]) +AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile db], [0], [ignore], [ignore]) AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/set-active-ovsdb-server tcp:127.0.0.1:9999]) AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/get-active-ovsdb-server], @@ -1718,7 +2078,7 @@ AT_KEYWORDS([ovsdb server replication get-exclude-tables]) ordinal_schema > schema AT_CHECK([ovsdb-tool create db schema], [0], [ignore], [ignore]) on_exit 'kill `cat *.pid`' -AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --sync-exclude-tables=mydb:db1,mydb:db2 db]) +AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile --sync-exclude-tables=mydb:db1,mydb:db2 db], [0], [ignore], [ignore]) AT_CHECK([ovs-appctl -t ovsdb-server ovsdb-server/get-sync-exclude-tables], [0], [mydb:db1,mydb:db2 @@ -1780,9 +2140,14 @@ replication_schema > schema AT_CHECK([ovsdb-tool create db1 schema], [0], [stdout], [ignore]) AT_CHECK([ovsdb-tool create db2 schema], [0], [stdout], [ignore]) -AT_CHECK([ovsdb-server --detach --no-chdir 
--log-file=ovsdb-server1.log --pidfile --remote=punix:db.sock db1], [0], [ignore], [ignore]) +AT_CHECK([ovsdb-server -vfile --detach --no-chdir \ + --log-file=ovsdb-server1.log --pidfile --remote=punix:db.sock db1], + [0], [ignore], [ignore]) -AT_CHECK([ovsdb-server --detach --no-chdir --log-file=ovsdb-server2.log --pidfile=2.pid --remote=punix:db2.sock --unixctl=unixctl2 db2], [0], [ignore], [ignore]) +AT_CHECK([ovsdb-server -vfile --detach --no-chdir \ + --log-file=ovsdb-server2.log --pidfile=2.pid \ + --remote=punix:db2.sock --unixctl=unixctl2 db2], + [0], [ignore], [ignore]) dnl Try to connect without specifying the active server. AT_CHECK([ovs-appctl -t "`pwd`"/unixctl2 ovsdb-server/connect-active-ovsdb-server], [0], @@ -1933,7 +2298,9 @@ OVS_WAIT_UNTIL([ovs-appctl -t "`pwd`"/unixctl2 ovsdb-server/sync-status |grep re dnl Switch the 'db1' to active AT_CHECK([ovs-appctl -t "`pwd`"/unixctl ovsdb-server/disconnect-active-ovsdb-server]) -AT_CHECK([ovs-appctl -t "`pwd`"/unixctl ovsdb-server/sync-status], [0], [state: active +AT_CHECK([ovs-appctl -t "`pwd`"/unixctl ovsdb-server/sync-status], [0], [dnl +database: mydb +state: active ]) dnl Issue a transaction to 'db1' @@ -1952,7 +2319,9 @@ AT_CHECK([ovs-appctl -t "`pwd`"/unixctl ovsdb-server/connect-active-ovsdb-server dnl Verify the change happend OVS_WAIT_UNTIL([ovs-appctl -t "`pwd`"/unixctl ovsdb-server/sync-status |grep replicating]) -AT_CHECK([ovs-appctl -t "`pwd`"/unixctl2 ovsdb-server/sync-status], [0], [state: active +AT_CHECK([ovs-appctl -t "`pwd`"/unixctl2 ovsdb-server/sync-status], [0], [dnl +database: mydb +state: active ]) dnl Issue an transaction to 'db2' which is now active. 
@@ -1978,6 +2347,140 @@ dnl OVSDB_SERVER_SHUTDOWN dnl OVSDB_SERVER_SHUTDOWN2 AT_CLEANUP +AT_SETUP([ovsdb-server/active-backup-role-switching with config file]) +AT_KEYWORDS([ovsdb server replication active-backup-switching config-file]) +replication_schema > schema +AT_CHECK([ovsdb-tool create db1 schema], [0], [stdout], [ignore]) +AT_CHECK([ovsdb-tool create db2 schema], [0], [stdout], [ignore]) + +dnl Add some data to both DBs. +AT_CHECK([ovsdb-tool transact db1 \ +'[["mydb", + {"op": "insert", + "table": "a", + "row": {"number": 9, "name": "nine"}}]]'], [0], [ignore], [ignore]) + +AT_CHECK([ovsdb-tool transact db2 \ +'[["mydb", + {"op": "insert", + "table": "a", + "row": {"number": 9, "name": "nine"}}]]'], [0], [ignore], [ignore]) + +dnl Start both 'db1' and 'db2' in backup mode. Let them backup from each +dnl other. This is not a supported operation state, but to simulate a start +dnl up condition where an HA manager can select which one to be an active +dnl server soon after. +on_exit 'kill $(cat *.pid)' + +AT_DATA([config1.json], [ +{ + "remotes": { "punix:db.sock": {} }, + "databases": { + "db1": { + "service-model": "active-backup", + "backup": true, + "source": { "unix:db2.sock": {} } + } + } +} +]) + +AT_CHECK([ovsdb-server -vfile --detach --no-chdir --log-file=ovsdb-server1.log \ + --pidfile=1.pid --unixctl=unixctl1 --config-file=config1.json], + [0], [ignore], [ignore]) + +AT_DATA([config2.json], [ +{ + "remotes": { "punix:db2.sock": {} }, + "databases": { + "db2": { + "service-model": "active-backup", + "backup": true, + "source": { "unix:db.sock": {} } + } + } +} +]) +AT_CHECK([ovsdb-server -vfile --detach --no-chdir --log-file=ovsdb-server2.log \ + --pidfile=2.pid --unixctl=unixctl2 --config-file=config2.json], + [0], [ignore], [ignore]) + +dnl Make sure both servers reached the replication state. 
+OVS_WAIT_UNTIL([ovs-appctl -t $(pwd)/unixctl1 ovsdb-server/sync-status | grep replicating]) +OVS_WAIT_UNTIL([ovs-appctl -t $(pwd)/unixctl2 ovsdb-server/sync-status | grep replicating]) + +dnl Switch the 'db1' to active. +AT_CHECK([sed -i'back' 's/"backup": true/"backup": false/' config1.json]) +AT_CHECK([ovs-appctl -t $(pwd)/unixctl1 ovsdb-server/reload]) +AT_CHECK([ovs-appctl -t $(pwd)/unixctl1 ovsdb-server/sync-status], [0], [dnl +database: mydb +state: active +]) + +dnl Issue a transaction to 'db1'. +AT_CHECK([ovsdb-client transact unix:db.sock \ +'[["mydb", + {"op": "insert", + "table": "a", + "row": {"number": 0, "name": "zero"}}]]'], [0], [ignore]) + +dnl It should be replicated to 'db2'. +OVS_WAIT_UNTIL([ovsdb-client dump unix:db2.sock | grep zero]) + +dnl Issue a transaction to 'db2', it should fail. +AT_CHECK([ovsdb-client transact unix:db2.sock \ +'[["mydb", + {"op": "insert", + "table": "a", + "row": {"number": 1, "name": "one"}}]]'], [0], [dnl +[[{"details":"insert operation not allowed when database server is in read only mode","error":"not allowed"}]] +]) + +dnl Flip the role of 'db1' and 'db2'. 'db1' becomes backup, and 'db2' becomes active. +AT_CHECK([sed -i'back' 's/"backup": true/"backup": false/' config2.json]) +AT_CHECK([ovs-appctl -t $(pwd)/unixctl2 ovsdb-server/reload]) +AT_CHECK([sed -i'back' 's/"backup": false/"backup": true/' config1.json]) +AT_CHECK([ovs-appctl -t $(pwd)/unixctl1 ovsdb-server/reload]) + +dnl Verify the change happened. +OVS_WAIT_UNTIL([ovs-appctl -t $(pwd)/unixctl1 ovsdb-server/sync-status | grep replicating]) +AT_CHECK([ovs-appctl -t $(pwd)/unixctl2 ovsdb-server/sync-status], [0], [dnl +database: mydb +state: active +]) + +dnl Issue a transaction to 'db2' which is now active. +AT_CHECK([ovsdb-client transact unix:db2.sock \ +'[["mydb", + {"op": "insert", + "table": "b", + "row": {"number": 1, "name": "one"}}]]'], [0], [ignore]) + +dnl The transaction should be replicated to 'db1'. 
+OVS_WAIT_UNTIL([ovsdb-client dump unix:db.sock | grep one]) + +dnl Issue a transaction to 'db1', it should fail. +AT_CHECK([ovsdb-client transact unix:db.sock \ +'[["mydb", + {"op": "insert", + "table": "a", + "row": {"number": 2, "name": "two"}}]]'], [0], [dnl +[[{"details":"insert operation not allowed when database server is in read only mode","error":"not allowed"}]] +]) + +dnl Both servers should have the same content. +AT_CHECK([ovsdb-client dump unix:db.sock], [0], [stdout]) +cat stdout > dump1 + +AT_CHECK([ovsdb-client dump unix:db2.sock], [0], [stdout]) +cat stdout > dump2 + +AT_CHECK([diff -u dump1 dump2]) + +OVSDB_SERVER_SHUTDOWN_N([1]) +OVSDB_SERVER_SHUTDOWN2 +AT_CLEANUP + #ovsdb-server prevent self replicating AT_SETUP([ovsdb-server prevent self replicating]) AT_KEYWORDS([ovsdb server replication]) @@ -2061,7 +2564,7 @@ AT_CHECK( "row": {"target": "ptcp:0:127.0.0.1", "read_only": true}}]']], [0], [ignore], [ignore]) -AT_CHECK([ovsdb-server --log-file --detach --no-chdir --pidfile --remote=db:mydb,Root,managers db], [0], [ignore], [ignore]) +AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile --remote=db:mydb,Root,managers db], [0], [ignore], [ignore]) PARSE_LISTENING_PORT([ovsdb-server.log], [TCP_PORT]) AT_CHECK([ovsdb-client get-schema-version tcp:127.0.0.1:$TCP_PORT mydb], [0], [5.1.3 ]) @@ -2075,7 +2578,9 @@ AT_CHECK([ovsdb-client transact tcp:127.0.0.1:$TCP_PORT \ cat stdout >> output AT_CHECK([uuidfilt output], [0], [[[{"details":"insert operation not allowed when database server is in read only mode","error":"not allowed"}]] ], [ignore]) -OVSDB_SERVER_SHUTDOWN +OVSDB_SERVER_SHUTDOWN([" + /No status column present in the Manager table/d +"]) AT_CLEANUP AT_SETUP([ovsdb-server replication with schema mismatch]) @@ -2101,9 +2606,16 @@ AT_CHECK([ovsdb-tool transact db2 \ dnl Start both 'db1' and 'db2'. 
on_exit 'kill `cat *.pid`' -AT_CHECK([ovsdb-server --detach --no-chdir --log-file=ovsdb-server1.log --pidfile --remote=punix:db.sock --unixctl="`pwd`"/unixctl db1 --active ], [0], [ignore], [ignore]) +AT_CHECK([ovsdb-server -vfile --detach --no-chdir \ + --log-file=ovsdb-server1.log --pidfile \ + --remote=punix:db.sock \ + --unixctl="$(pwd)"/unixctl db1 --active ], + [0], [ignore], [ignore]) -AT_CHECK([ovsdb-server --detach --no-chdir --log-file=ovsdb-server2.log --pidfile=2.pid --remote=punix:db2.sock --unixctl="`pwd`"/unixctl2 db2], [0], [ignore], [ignore]) +AT_CHECK([ovsdb-server -vfile --detach --no-chdir \ + --log-file=ovsdb-server2.log --pidfile=2.pid \ + --remote=punix:db2.sock --unixctl="$(pwd)"/unixctl2 db2], + [0], [ignore], [ignore]) OVS_WAIT_UNTIL([ovs-appctl -t "`pwd`"/unixctl ovsdb-server/sync-status |grep active]) OVS_WAIT_UNTIL([ovs-appctl -t "`pwd`"/unixctl2 ovsdb-server/sync-status |grep active]) @@ -2292,8 +2804,8 @@ AT_CHECK([uuidfilt monitor.stdout | sed '/^$/d'], [0], [dnl <8> delete 4 four <9> insert 4 four ]) -OVSDB_SERVER_SHUTDOWN -OVSDB_SERVER_SHUTDOWN2 +OVSDB_SERVER_SHUTDOWN(["/Address already in use/d"]) +OVSDB_SERVER_SHUTDOWN2(["/Address already in use/d"]) dnl Starting a replay. AT_CHECK([ovsdb-server --replay=./replay_dir dnl @@ -2323,6 +2835,7 @@ m4_define([CLEAN_LOG_FILE], [sed 's/[[0-9\-]]*T[[0-9:\.]]*Z|[[0-9]]*\(|.*$\)/\1/g' $1 | dnl sed '/|poll_loop|/d' | dnl sed '/|socket_util|/d' | dnl + sed '/|cooperative_multitasking|DBG|/d' | dnl sed 's/[[0-9]]*\.ctl/\.ctl/g'> $2]) CLEAN_LOG_FILE([1.log], [1.log.clear]) @@ -2330,6 +2843,276 @@ CLEAN_LOG_FILE([2.log], [2.log.clear]) dnl Checking that databases and logs are equal. 
AT_CHECK([diff db.clear ./replay_dir/db.copy.clear]) -AT_CHECK([diff 1.log.clear 2.log.clear]) +AT_CHECK([diff -u 1.log.clear 2.log.clear]) AT_CLEANUP + +AT_BANNER([OVSDB -- ovsdb-server configuration file]) + +dnl TEST_CONFIG_FILE([NAME], [config], [EXIT_CODE], [FAILURE_STRINGS]) +dnl +dnl Tries the config as a data for --config-file, checks the EXIT_CODE +dnl of the ovsdb-server and checks the stderr for FAILURE_STRINGS. +dnl NAME is added to the test name and keywords. +m4_define([TEST_CONFIG_FILE], +[ + AT_SETUP([ovsdb-server config-file - $1]) + AT_KEYWORDS([ovsdb server config-file $1]) + on_exit 'kill $(cat *.pid)' + echo '$2' > config.json + AT_CAPTURE_FILE([config.json]) + ordinal_schema > schema + constraint_schema > schema2 + AT_CHECK([ovsdb-tool create db schema], [0], [ignore], [ignore]) + AT_CHECK([ovsdb-tool create db2 schema], [0], [ignore], [ignore]) + AT_CHECK([ovsdb-tool create-cluster db_cluster schema2 unix:s1.raft], + [0], [ignore], [ignore]) + AT_CHECK([ovsdb-server -vfile -vPATTERN:console:'%p|%m' -vvlog:off \ + --log-file --detach --no-chdir --pidfile \ + --config-file=config.json], [$3], [ignore], [stderr]) + m4_if([$4], [], [], [ + AT_CHECK([cat stderr | grep -v -E 'INFO|DBG' \ + | grep -v 'failed to load configuration from' \ + | sed -e "/duplicate database name/ s/'db'/'db2'/" \ + > warnings]) + AT_CHECK([cat warnings], [0], [m4_if([$3], [0], [$4], [$4 +ovsdb-server: server configuration failed +])])]) + m4_if([$3$4], [0], [OVSDB_SERVER_SHUTDOWN]) + AT_CLEANUP +]) + +TEST_CONFIG_FILE([simple], [ +{ + "remotes": { "punix:db.sock": {} }, + "databases": { "db": null, "db_cluster": {} } +} +], [0]) + +TEST_CONFIG_FILE([standalone], [ +{ + "remotes": { "punix:db.sock": {} }, + "databases": { "db": { "service-model": "standalone" } } +} +], [0]) + +TEST_CONFIG_FILE([clustered], [ +{ + "remotes": { "punix:db.sock": {} }, + "databases": { "db_cluster": { "service-model": "clustered" } } +} +], [0]) + +TEST_CONFIG_FILE([unknown service model], [ 
+{ + "remotes": { "punix:db.sock": {} }, + "databases": { "db": { "service-model": "not-a-service-model" } } +} +], [1], [dnl +WARN|Unrecognized database service model: 'not-a-service-model' +WARN|syntax "{"service-model":"not-a-service-model"}": syntax error:dnl + Parsing database db failed: 'not-a-service-model' is not a valid service model +WARN|config: failed to parse 'databases']) + +TEST_CONFIG_FILE([same schema], [ +{ + "remotes": { "punix:db.sock": {} }, + "databases": { "db": null, "db2": {} } +} +], [1], [dnl +WARN|failed to open database 'db2': ovsdb error: ordinals: duplicate database name +WARN|failed to configure databases]) + +TEST_CONFIG_FILE([model mismatch], [ +{ + "remotes": { "punix:db.sock": {} }, + "databases": { "db": { "service-model": "clustered" } } +} +], [1], [dnl +WARN|failed to open database 'db': ovsdb error: db: database is standalone and not clustered +WARN|failed to configure databases]) + +TEST_CONFIG_FILE([model mismatch clustered], [ +{ + "remotes": { "punix:db.sock": {} }, + "databases": { "db_cluster": { "service-model": "standalone" } } +} +], [1], [dnl +WARN|failed to open database 'db_cluster': ovsdb error: db_cluster: database is clustered and not standalone +WARN|failed to configure databases]) + +TEST_CONFIG_FILE([relay], [ +{ + "remotes": { "punix:db.sock": {} }, + "databases": { + "RelaySchema": { + "service-model": "relay", + "source": { "unix:db2.sock": {} } + } + } +} +], [0]) + +TEST_CONFIG_FILE([relay without source], [ +{ + "remotes": { "punix:db.sock": {} }, + "databases": { + "RelaySchema": { + "service-model": "relay" + } + } +} +], [1], [dnl +WARN|syntax "{"service-model":"relay"}": syntax error: Parsing database RelaySchema failed:dnl + Required 'source' member is missing. 
+WARN|config: failed to parse 'databases']) + +TEST_CONFIG_FILE([relay with options], [ +{ + "remotes": { "punix:db.sock": {} }, + "databases": { + "RelaySchema": { + "service-model": "relay", + "source": { + "punix:db2.sock": { + "inactivity-probe": 10000, + "max-backoff": 8000, + "dscp": 42 + } + } + } + } +} +], [0]) + +TEST_CONFIG_FILE([relay with unrelated options], [ +{ + "remotes": { "punix:db.sock": {} }, + "databases": { + "RelaySchema": { + "service-model": "relay", + "source": { + "punix:db2.sock": { + "inactivity-probe": 10000, + "max-backoff": 8000, + "dscp": 42, + "role": "My-RBAC-role" + } + } + } + } +} +], [0], [dnl +WARN|syntax "{"dscp":42,"inactivity-probe":10000,"max-backoff":8000,"role":"My-RBAC-role"}":dnl + syntax error: Parsing JSON-RPC options failed: Member 'role' is present but not allowed here. +]) + +TEST_CONFIG_FILE([unknown config], [ +{ + "remotes": { "punix:db.sock": {} }, + "databases": { + "db": { "unknnown": "unknown" } + } +} +], [1], [dnl +WARN|syntax "{"unknnown":"unknown"}": syntax error: Parsing database db failed:dnl + Member 'unknnown' is present but not allowed here. 
+WARN|config: failed to parse 'databases']) + +TEST_CONFIG_FILE([active-backup active], [ +{ + "remotes": { "punix:db.sock": {} }, + "databases": { + "db": { + "service-model": "active-backup", + "backup": false + } + } +} +], [0]) + +TEST_CONFIG_FILE([active-backup backup], [ +{ + "remotes": { "punix:db.sock": {} }, + "databases": { + "db": { + "service-model": "active-backup", + "backup": true, + "source": { + "punix:db2.sock": { + "inactivity-probe": 100000, + "max-backoff": 16000, + "dscp": 42 + } + } + } + } +} +], [0]) + +TEST_CONFIG_FILE([active-backup backup without source], [ +{ + "remotes": { "punix:db.sock": {} }, + "databases": { + "db": { + "service-model": "active-backup", + "backup": true + } + } +} +], [1], [dnl +WARN|syntax "{"backup":true,"service-model":"active-backup"}": syntax error:dnl + Parsing database db failed: Required 'source' member is missing. +WARN|config: failed to parse 'databases']) + +TEST_CONFIG_FILE([syntax error], [ +{ + "remotes": { "punix:db.sock": {}, }, + "databases": { "db": {}, "db_cluster": {} } +} +], [1], [dnl +WARN|config: reading JSON failed (line 2, column 38, byte 41: syntax error parsing object expecting string)]) + +TEST_CONFIG_FILE([complex config], [ +{ + "remotes": { + "punix:db.sock": { + "inactivity-probe": 0, + "read-only": false + }, + "pssl:0:127.0.0.1": { + "inactivity-probe": 5000, + "max-backoff": 8000, + "read-only": true, + "role": "ovn-controller", + "dscp": 48 + }, + "db:ordinals,ordinals,name": null + }, + "databases": { + "db_cluster": { + "service-model": "clustered" + }, + "OVN_Northbound": { + "service-model": "relay", + "source": { + "unix:nb.sock": { + "max-backoff": 3000, + "inactivity-probe": 16000 + } + } + }, + "db": { + "service-model": "active-backup", + "backup": true, + "source": { + "unix:active.sock": { + "max-backoff": 16000, + "inactivity-probe": 180000 + } + }, + "exclude-tables": [["IC_SB_Global", "Availability_Zone"]] + } + } +} +], [0]) diff --git a/tests/ovsdb-tool.at 
b/tests/ovsdb-tool.at index 12ad6fb3fc6..d8d2b1c9990 100644 --- a/tests/ovsdb-tool.at +++ b/tests/ovsdb-tool.at @@ -118,11 +118,11 @@ AT_CHECK([[uuidfilt db | grep -v ^OVSDB | sed 's/"_date":[0-9]*/"_date":0/' | \ dnl Dump out and check the actual database contents. on_exit 'kill `cat ovsdb-server.pid`' -AT_CHECK([[ovsdb-server --detach --pidfile --no-chdir --remote=punix:socket db]], +AT_CHECK([[ovsdb-server --detach --pidfile --log-file --no-chdir --remote=punix:socket db]], [0], [stdout], [ignore]) AT_CHECK([[ovsdb-client dump unix:socket ordinals]], [0], [stdout], [ignore]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN AT_CHECK([uuidfilt stdout], [0], [dnl ordinals table @@ -151,11 +151,11 @@ dnl in it now. AT_CAPTURE_FILE([db]) AT_CHECK([test `wc -l < db` -eq 4]) dnl And check that the dumped data is the same too: -AT_CHECK([[ovsdb-server --detach --pidfile --no-chdir --remote=punix:socket db]], +AT_CHECK([[ovsdb-server --detach --pidfile --log-file --no-chdir --remote=punix:socket db]], [0], [stdout], [ignore]) AT_CHECK([[ovsdb-client dump unix:socket ordinals]], [0], [stdout], [ignore]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN AT_CHECK([uuidfilt stdout], [0], [dnl ordinals table @@ -196,8 +196,8 @@ AT_CHECK( done]], [0], [stdout], [ignore]) dnl Dump out and check the actual database contents. -AT_CHECK([[ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket db]], - [0]) +AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile --remote=punix:socket db], + [0], [ignore], [ignore]) AT_CHECK([ovsdb-client dump unix:socket ordinals], [0], [stdout], [ignore]) AT_CHECK([uuidfilt stdout], [0], [dnl ordinals table @@ -210,7 +210,7 @@ _uuid name number <4> two 2 <5> zero 0 ]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN dnl Now convert the database in-place. touch .db.tmp.~lock~ AT_CHECK([[ovsdb-tool convert db new-schema]], [0], [], [ignore]) @@ -220,8 +220,8 @@ dnl in it now. 
AT_CAPTURE_FILE([db]) AT_CHECK([test `wc -l < db` -eq 4]) dnl And check that the dumped data is the same except for the removed column: -AT_CHECK([[ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket db]], - [0]) +AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile --remote=punix:socket db], + [0], [ignore], [ignore]) AT_CHECK([ovsdb-client dump unix:socket ordinals], [0], [stdout], [ignore]) AT_CHECK([uuidfilt stdout], [0], [dnl ordinals table @@ -234,7 +234,7 @@ _uuid number <4> 4 <5> 5 ]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN AT_CLEANUP AT_SETUP([ovsdb-tool convert -- adding a column]) @@ -262,8 +262,8 @@ AT_CHECK( done]], [0], [stdout], [ignore]) dnl Dump out and check the actual database contents. -AT_CHECK([[ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket db]], - [0]) +AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile --remote=punix:socket db], + [0], [ignore], [ignore]) AT_CHECK([ovsdb-client dump unix:socket ordinals], [0], [stdout], [ignore]) AT_CHECK([uuidfilt stdout], [0], [dnl ordinals table @@ -276,7 +276,7 @@ _uuid number <4> 4 <5> 5 ]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN dnl Now convert the database in-place. touch .db.tmp.~lock~ AT_CHECK([[ovsdb-tool convert db new-schema]], [0], [], [ignore]) @@ -286,8 +286,8 @@ dnl in it now. 
AT_CAPTURE_FILE([db]) AT_CHECK([test `wc -l < db` -eq 4]) dnl And check that the dumped data is the same except for the added column: -AT_CHECK([[ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket db]], - [0]) +AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile --remote=punix:socket db], + [0], [ignore], [ignore]) AT_CHECK([ovsdb-client dump unix:socket ordinals], [0], [stdout], [ignore]) AT_CHECK([uuidfilt stdout], [0], [dnl ordinals table @@ -300,7 +300,7 @@ _uuid name number <4> "" 4 <5> "" 5 ]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN AT_CLEANUP AT_SETUP([ovsdb-tool unsupported cluster operations]) @@ -446,7 +446,7 @@ AT_CHECK( # Dump the data. AT_CHECK([ovsdb-server -vfile -vvlog:off --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db1]) AT_CHECK([ovsdb-client dump > expout]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN # Create a clustered database from the standalone one. ovsdb-tool create-cluster db2 db1 unix:s1.raft @@ -455,7 +455,7 @@ ovsdb-tool create-cluster db2 db1 unix:s1.raft AT_CHECK([ovsdb-server -vconsole:off -vfile -vvlog:off --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db2]) AT_CHECK([ovsdb_client_wait ordinals connected]) AT_CHECK([ovsdb-client dump > dump2]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN # Make sure that the clustered data matched the standalone data. 
AT_CHECK([cat dump2], [0], [expout]) @@ -465,6 +465,7 @@ AT_SETUP([ovsdb-tool convert-to-standalone]) AT_KEYWORDS([ovsdb file positive]) ordinal_schema > schema AT_CHECK([ovsdb-tool create-cluster db schema unix:s1.raft], [0], [stdout], [ignore]) +on_exit 'kill `cat ovsdb-server.pid`' AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket --log-file db >/dev/null 2>&1]) for txn in m4_foreach([txn], [[[["ordinals", {"op": "insert", @@ -481,7 +482,7 @@ done AT_CHECK([ovsdb-client transact unix:socket '[["ordinals"]]'], [0], [ignore], [ignore]) AT_CHECK([ovsdb-client dump unix:socket > clusterdump]) -AT_CHECK([ovs-appctl -t ovsdb-server -e exit], [0], [ignore], [ignore]) +OVSDB_SERVER_SHUTDOWN # Convert to standalone database from clustered database. AT_CHECK(ovsdb-tool cluster-to-standalone db1 db) @@ -493,8 +494,76 @@ AT_CHECK([ovsdb-tool db-is-standalone db1]) AT_CHECK([ovsdb-server -vconsole:off -vfile -vvlog:off --detach --no-chdir --pidfile --log-file --remote=punix:db.sock db1]) AT_CHECK([ovsdb_client_wait ordinals connected]) AT_CHECK([ovsdb-client dump > standalonedump]) -OVS_APP_EXIT_AND_WAIT([ovsdb-server]) +OVSDB_SERVER_SHUTDOWN # Make sure both standalone and cluster db data matches. 
AT_CHECK([diff standalonedump clusterdump]) AT_CLEANUP + +AT_SETUP([ovsdb-tool convert-to-standalone after schema conversion]) +AT_KEYWORDS([ovsdb file positive]) +ordinal_schema > schema +AT_CHECK([ovsdb-tool create-cluster db schema unix:s1.raft], [0], [stdout], [ignore]) +on_exit 'kill `cat ovsdb-server.pid`' +AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket dnl + --log-file db >/dev/null 2>&1]) +for txn in m4_foreach([txn], [[[["ordinals", + {"op": "insert", + "table": "ordinals", + "row": {"number": 0, "name": "zero"}}, + {"op": "insert", + "table": "ordinals", + "row": {"number": 1, "name": "one"}}, + {"op": "insert", + "table": "ordinals", + "row": {"number": 2, "name": "two"}}]]]], ['txn' ]); do + AT_CHECK([ovsdb-client transact unix:socket "$txn"], [0], [ignore], [ignore]) +done + +dnl Change the schema. +AT_CHECK([sed 's/5\.1\.3/5.1.4/' < schema > schema2]) +AT_CHECK([sed -i'back' -e '/.*"number":.*/a \ + "is_seven": {"type": "boolean"}, + ' schema2]) + +dnl Convert the database. +AT_CHECK([ovsdb-client convert unix:socket schema2]) + +dnl Add a new row with a new column. +AT_CHECK([ovsdb-client transact unix:socket dnl + '[["ordinals", + {"op": "insert", + "table": "ordinals", + "row": {"number": 7, "name": "seven", "is_seven": true} + }]]'], [0], [ignore], [ignore]) + +AT_CHECK([ovsdb-client dump unix:socket > clusterdump]) + +AT_CHECK([uuidfilt clusterdump], [0], [dnl +ordinals table +_uuid is_seven name number +------------------------------------ -------- ----- ------ +<0> false one 1 +<1> false two 2 +<2> false zero 0 +<3> true seven 7 +]) + +OVSDB_SERVER_SHUTDOWN + +dnl Convert to standalone database from clustered database. +AT_CHECK(ovsdb-tool cluster-to-standalone db1 db) + +dnl Check it's a standalone db. +AT_CHECK([ovsdb-tool db-is-standalone db1]) + +dnl Dump the standalone db data. 
+AT_CHECK([ovsdb-server -vconsole:off -vfile -vvlog:off --detach --no-chdir dnl + --pidfile --log-file --remote=punix:db.sock db1]) +AT_CHECK([ovsdb_client_wait ordinals connected]) +AT_CHECK([ovsdb-client dump > standalonedump]) +OVSDB_SERVER_SHUTDOWN + +dnl Make sure both standalone and cluster db data matches. +AT_CHECK([diff standalonedump clusterdump]) +AT_CLEANUP diff --git a/tests/packet-type-aware.at b/tests/packet-type-aware.at index 3b5c66fe526..d634930fd52 100644 --- a/tests/packet-type-aware.at +++ b/tests/packet-type-aware.at @@ -142,30 +142,27 @@ AT_CHECK([ ### Setup GRE tunnels AT_CHECK([ ovs-appctl netdev-dummy/ip4addr br-p1 10.0.0.1/24 && - ovs-appctl ovs/route/add 10.0.0.0/24 br-p1 && ovs-appctl tnl/arp/set br-p1 10.0.0.1 $HWADDR_BRP1 && ovs-appctl tnl/arp/set br-p1 10.0.0.2 $HWADDR_BRP2 && ovs-appctl tnl/arp/set br-p1 10.0.0.3 $HWADDR_BRP3 && ovs-appctl netdev-dummy/ip4addr br-p2 20.0.0.2/24 && - ovs-appctl ovs/route/add 20.0.0.0/24 br-p2 && ovs-appctl tnl/arp/set br-p2 20.0.0.1 $HWADDR_BRP1 && ovs-appctl tnl/arp/set br-p2 20.0.0.2 $HWADDR_BRP2 && ovs-appctl tnl/arp/set br-p2 20.0.0.3 $HWADDR_BRP3 && ovs-appctl netdev-dummy/ip4addr br-p3 30.0.0.3/24 && - ovs-appctl ovs/route/add 30.0.0.0/24 br-p3 && ovs-appctl tnl/arp/set br-p3 30.0.0.1 $HWADDR_BRP1 && ovs-appctl tnl/arp/set br-p3 30.0.0.2 $HWADDR_BRP2 && ovs-appctl tnl/arp/set br-p3 30.0.0.3 $HWADDR_BRP3 ], [0], [ignore]) AT_CHECK([ - ovs-appctl ovs/route/show | grep User: + ovs-appctl ovs/route/show | grep Cached: | sort ], [0], [dnl -User: 10.0.0.0/24 dev br-p1 SRC 10.0.0.1 -User: 20.0.0.0/24 dev br-p2 SRC 20.0.0.2 -User: 30.0.0.0/24 dev br-p3 SRC 30.0.0.3 +Cached: 10.0.0.0/24 dev br-p1 SRC 10.0.0.1 local +Cached: 20.0.0.0/24 dev br-p2 SRC 20.0.0.2 local +Cached: 30.0.0.0/24 dev br-p3 SRC 30.0.0.3 local ]) AT_CHECK([ @@ -327,7 +324,7 @@ AT_CHECK([ ovs-appctl dpctl/dump-flows --names dummy@ovs-dummy | strip_used | grep -v ipv6 | sort ], [0], [flow-dump from the main thread: 
recirc_id(0),in_port(n1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.10.30,tos=0/0x3,frag=no), packets:1, bytes:98, used:0.0s, actions:tnl_push(tnl_port(gre_sys),header(size=38,type=3,eth(dst=aa:55:00:00:00:03,src=aa:55:00:00:00:01,dl_type=0x0800),ipv4(src=10.0.0.1,dst=10.0.0.3,proto=47,tos=0,ttl=64,frag=0x4000),gre((flags=0x0,proto=0x6558))),out_port(br-p1)),set(ipv4(src=30.0.0.1,dst=30.0.0.3)),tnl_pop(gre_sys) -tunnel(src=30.0.0.1,dst=30.0.0.3,flags(-df-csum)),recirc_id(0),in_port(gre_sys),packet_type(ns=0,id=0),eth(dst=1e:2c:e9:2a:66:9e),eth_type(0x0800),ipv4(dst=192.168.10.30,frag=no), packets:1, bytes:98, used:0.0s, actions:set(eth(dst=aa:55:aa:55:00:03)),n3 +recirc_id(0),tunnel(src=30.0.0.1,dst=30.0.0.3,flags(-df-csum)),in_port(gre_sys),packet_type(ns=0,id=0),eth(dst=1e:2c:e9:2a:66:9e),eth_type(0x0800),ipv4(dst=192.168.10.30,frag=no), packets:1, bytes:98, used:0.0s, actions:set(eth(dst=aa:55:aa:55:00:03)),n3 ]) # Clear up megaflow cache @@ -345,7 +342,7 @@ AT_CHECK([ ovs-appctl dpctl/dump-flows --names dummy@ovs-dummy | strip_used | grep -v ipv6 | sort ], [0], [flow-dump from the main thread: recirc_id(0),in_port(n1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.10.20,tos=0/0x3,frag=no), packets:1, bytes:98, used:0.0s, actions:tnl_push(tnl_port(gre_sys),header(size=38,type=3,eth(dst=aa:55:00:00:00:02,src=aa:55:00:00:00:01,dl_type=0x0800),ipv4(src=10.0.0.1,dst=10.0.0.2,proto=47,tos=0,ttl=64,frag=0x4000),gre((flags=0x0,proto=0x6558))),out_port(br-p1)),set(ipv4(src=20.0.0.1,dst=20.0.0.2)),tnl_pop(gre_sys) -tunnel(src=20.0.0.1,dst=20.0.0.2,flags(-df-csum)),recirc_id(0),in_port(gre_sys),packet_type(ns=0,id=0),eth(dst=46:1e:7d:1a:95:a1),eth_type(0x0800),ipv4(dst=192.168.10.20,frag=no), packets:1, bytes:98, used:0.0s, actions:set(eth(dst=aa:55:aa:55:00:02)),n2 +recirc_id(0),tunnel(src=20.0.0.1,dst=20.0.0.2,flags(-df-csum)),in_port(gre_sys),packet_type(ns=0,id=0),eth(dst=46:1e:7d:1a:95:a1),eth_type(0x0800),ipv4(dst=192.168.10.20,frag=no), 
packets:1, bytes:98, used:0.0s, actions:set(eth(dst=aa:55:aa:55:00:02)),n2 ]) # Clear up megaflow cache @@ -363,7 +360,7 @@ AT_CHECK([ ovs-appctl dpctl/dump-flows --names dummy@ovs-dummy | strip_used | grep -v ipv6 | sort ], [0], [flow-dump from the main thread: recirc_id(0),in_port(n2),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.10.10,tos=0/0x3,frag=no), packets:1, bytes:98, used:0.0s, actions:tnl_push(tnl_port(gre_sys),header(size=38,type=3,eth(dst=aa:55:00:00:00:01,src=aa:55:00:00:00:02,dl_type=0x0800),ipv4(src=20.0.0.2,dst=20.0.0.1,proto=47,tos=0,ttl=64,frag=0x4000),gre((flags=0x0,proto=0x6558))),out_port(br-p2)),set(ipv4(src=10.0.0.2,dst=10.0.0.1)),tnl_pop(gre_sys) -tunnel(src=10.0.0.2,dst=10.0.0.1,flags(-df-csum)),recirc_id(0),in_port(gre_sys),packet_type(ns=0,id=0),eth(dst=3a:6d:d2:09:9c:ab),eth_type(0x0800),ipv4(dst=192.168.10.10,frag=no), packets:1, bytes:98, used:0.0s, actions:set(eth(dst=aa:55:aa:55:00:01)),n1 +recirc_id(0),tunnel(src=10.0.0.2,dst=10.0.0.1,flags(-df-csum)),in_port(gre_sys),packet_type(ns=0,id=0),eth(dst=3a:6d:d2:09:9c:ab),eth_type(0x0800),ipv4(dst=192.168.10.10,frag=no), packets:1, bytes:98, used:0.0s, actions:set(eth(dst=aa:55:aa:55:00:01)),n1 ]) # Clear up megaflow cache @@ -381,8 +378,8 @@ AT_CHECK([ ovs-appctl dpctl/dump-flows --names dummy@ovs-dummy | strip_used | grep -v ipv6 | sort ], [0], [flow-dump from the main thread: recirc_id(0),in_port(n2),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.10.30,tos=0/0x3,frag=no), packets:1, bytes:98, used:0.0s, actions:tnl_push(tnl_port(gre_sys),header(size=38,type=3,eth(dst=aa:55:00:00:00:01,src=aa:55:00:00:00:02,dl_type=0x0800),ipv4(src=20.0.0.2,dst=20.0.0.1,proto=47,tos=0,ttl=64,frag=0x4000),gre((flags=0x0,proto=0x6558))),out_port(br-p2)),set(ipv4(src=10.0.0.2,dst=10.0.0.1)),tnl_pop(gre_sys) -tunnel(src=10.0.0.2,dst=10.0.0.1,flags(-df-csum)),recirc_id(0),in_port(gre_sys),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.10.30,tos=0/0x3,frag=no), packets:1, 
bytes:98, used:0.0s, actions:tnl_push(tnl_port(gre_sys),header(size=38,type=3,eth(dst=aa:55:00:00:00:03,src=aa:55:00:00:00:01,dl_type=0x0800),ipv4(src=10.0.0.1,dst=10.0.0.3,proto=47,tos=0,ttl=64,frag=0x4000),gre((flags=0x0,proto=0x6558))),out_port(br-p1)),set(ipv4(src=30.0.0.1,dst=30.0.0.3)),tnl_pop(gre_sys) -tunnel(src=30.0.0.1,dst=30.0.0.3,flags(-df-csum)),recirc_id(0),in_port(gre_sys),packet_type(ns=0,id=0),eth(dst=1e:2c:e9:2a:66:9e),eth_type(0x0800),ipv4(dst=192.168.10.30,frag=no), packets:1, bytes:98, used:0.0s, actions:set(eth(dst=aa:55:aa:55:00:03)),n3 +recirc_id(0),tunnel(src=10.0.0.2,dst=10.0.0.1,flags(-df-csum)),in_port(gre_sys),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.10.30,tos=0/0x3,frag=no), packets:1, bytes:98, used:0.0s, actions:tnl_push(tnl_port(gre_sys),header(size=38,type=3,eth(dst=aa:55:00:00:00:03,src=aa:55:00:00:00:01,dl_type=0x0800),ipv4(src=10.0.0.1,dst=10.0.0.3,proto=47,tos=0,ttl=64,frag=0x4000),gre((flags=0x0,proto=0x6558))),out_port(br-p1)),set(ipv4(src=30.0.0.1,dst=30.0.0.3)),tnl_pop(gre_sys) +recirc_id(0),tunnel(src=30.0.0.1,dst=30.0.0.3,flags(-df-csum)),in_port(gre_sys),packet_type(ns=0,id=0),eth(dst=1e:2c:e9:2a:66:9e),eth_type(0x0800),ipv4(dst=192.168.10.30,frag=no), packets:1, bytes:98, used:0.0s, actions:set(eth(dst=aa:55:aa:55:00:03)),n3 ]) # Clear up megaflow cache @@ -400,8 +397,8 @@ AT_CHECK([ ovs-appctl dpctl/dump-flows --names dummy@ovs-dummy | strip_used | grep -v ipv6 | sort ], [0], [flow-dump from the main thread: recirc_id(0),in_port(n3),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.10.10,tos=0/0x3,frag=no), packets:1, bytes:98, used:0.0s, actions:pop_eth,tnl_push(tnl_port(gre_sys),header(size=38,type=3,eth(dst=aa:55:00:00:00:02,src=aa:55:00:00:00:03,dl_type=0x0800),ipv4(src=30.0.0.3,dst=30.0.0.2,proto=47,tos=0,ttl=64,frag=0x4000),gre((flags=0x0,proto=0x800))),out_port(br-p3)),set(ipv4(src=20.0.0.3,dst=20.0.0.2)),tnl_pop(gre_sys) 
-tunnel(src=10.0.0.2,dst=10.0.0.1,flags(-df-csum)),recirc_id(0),in_port(gre_sys),packet_type(ns=1,id=0x800),eth_type(0x0800),ipv4(dst=192.168.10.10,frag=no), packets:1, bytes:84, used:0.0s, actions:push_eth(src=00:00:00:00:00:00,dst=aa:55:aa:55:00:01),n1 -tunnel(src=20.0.0.3,dst=20.0.0.2,flags(-df-csum)),recirc_id(0),in_port(gre_sys),packet_type(ns=1,id=0x800),eth_type(0x0800),ipv4(dst=192.168.10.10,tos=0/0x3,frag=no), packets:1, bytes:84, used:0.0s, actions:tnl_push(tnl_port(gre_sys),header(size=38,type=3,eth(dst=aa:55:00:00:00:01,src=aa:55:00:00:00:02,dl_type=0x0800),ipv4(src=20.0.0.2,dst=20.0.0.1,proto=47,tos=0,ttl=64,frag=0x4000),gre((flags=0x0,proto=0x800))),out_port(br-p2)),set(ipv4(src=10.0.0.2,dst=10.0.0.1)),tnl_pop(gre_sys) +recirc_id(0),tunnel(src=10.0.0.2,dst=10.0.0.1,flags(-df-csum)),in_port(gre_sys),packet_type(ns=1,id=0x800),eth_type(0x0800),ipv4(dst=192.168.10.10,frag=no), packets:1, bytes:84, used:0.0s, actions:push_eth(src=00:00:00:00:00:00,dst=aa:55:aa:55:00:01),n1 +recirc_id(0),tunnel(src=20.0.0.3,dst=20.0.0.2,flags(-df-csum)),in_port(gre_sys),packet_type(ns=1,id=0x800),eth_type(0x0800),ipv4(dst=192.168.10.10,tos=0/0x3,frag=no), packets:1, bytes:84, used:0.0s, actions:tnl_push(tnl_port(gre_sys),header(size=38,type=3,eth(dst=aa:55:00:00:00:01,src=aa:55:00:00:00:02,dl_type=0x0800),ipv4(src=20.0.0.2,dst=20.0.0.1,proto=47,tos=0,ttl=64,frag=0x4000),gre((flags=0x0,proto=0x800))),out_port(br-p2)),set(ipv4(src=10.0.0.2,dst=10.0.0.1)),tnl_pop(gre_sys) ]) # Clear up megaflow cache @@ -419,7 +416,7 @@ AT_CHECK([ ovs-appctl dpctl/dump-flows --names dummy@ovs-dummy | strip_used | grep -v ipv6 | sort ], [0], [flow-dump from the main thread: recirc_id(0),in_port(n3),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.10.20,tos=0/0x3,frag=no), packets:1, bytes:98, used:0.0s, 
actions:tnl_push(tnl_port(gre_sys),header(size=38,type=3,eth(dst=aa:55:00:00:00:02,src=aa:55:00:00:00:03,dl_type=0x0800),ipv4(src=30.0.0.3,dst=30.0.0.2,proto=47,tos=0,ttl=64,frag=0x4000),gre((flags=0x0,proto=0x6558))),out_port(br-p3)),set(ipv4(src=20.0.0.3,dst=20.0.0.2)),tnl_pop(gre_sys) -tunnel(src=20.0.0.3,dst=20.0.0.2,flags(-df-csum)),recirc_id(0),in_port(gre_sys),packet_type(ns=0,id=0),eth(dst=46:1e:7d:1a:95:a1),eth_type(0x0800),ipv4(dst=192.168.10.20,frag=no), packets:1, bytes:98, used:0.0s, actions:set(eth(dst=aa:55:aa:55:00:02)),n2 +recirc_id(0),tunnel(src=20.0.0.3,dst=20.0.0.2,flags(-df-csum)),in_port(gre_sys),packet_type(ns=0,id=0),eth(dst=46:1e:7d:1a:95:a1),eth_type(0x0800),ipv4(dst=192.168.10.20,frag=no), packets:1, bytes:98, used:0.0s, actions:set(eth(dst=aa:55:aa:55:00:02)),n2 ]) ### Check the received packets @@ -505,7 +502,7 @@ AT_CHECK([ ovs-appctl dpctl/dump-flows --names dummy@ovs-dummy | strip_used | grep -v ipv6 | sort ], [0], [flow-dump from the main thread: recirc_id(0),in_port(n3),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=192.168.10.20,tos=0/0x3,frag=no), packets:1, bytes:98, used:0.0s, actions:pop_eth,tnl_push(tnl_port(gre_sys),header(size=38,type=3,eth(dst=aa:55:00:00:00:02,src=aa:55:00:00:00:03,dl_type=0x0800),ipv4(src=30.0.0.3,dst=30.0.0.2,proto=47,tos=0,ttl=64,frag=0x4000),gre((flags=0x0,proto=0x800))),out_port(br-p3)),set(ipv4(src=20.0.0.3,dst=20.0.0.2)),tnl_pop(gre_sys) -tunnel(src=20.0.0.3,dst=20.0.0.2,flags(-df-csum)),recirc_id(0),in_port(gre_sys),packet_type(ns=1,id=0x800),eth_type(0x0800),ipv4(dst=192.168.10.20,frag=no), packets:1, bytes:84, used:0.0s, actions:drop +recirc_id(0),tunnel(src=20.0.0.3,dst=20.0.0.2,flags(-df-csum)),in_port(gre_sys),packet_type(ns=1,id=0x800),eth_type(0x0800),ipv4(dst=192.168.10.20,frag=no), packets:1, bytes:84, used:0.0s, actions:drop ]) OVS_VSWITCHD_STOP(["/The Open vSwitch kernel module is probably not loaded/d"]) @@ -681,14 +678,13 @@ AT_CHECK([ AT_CHECK([ ovs-appctl netdev-dummy/ip4addr br2 
10.0.0.1/24 && - ovs-appctl ovs/route/add 10.0.0.0/24 br2 && ovs-appctl tnl/arp/set br2 10.0.0.2 de:af:be:ef:ba:be ], [0], [ignore]) AT_CHECK([ - ovs-appctl ovs/route/show | grep User: + ovs-appctl ovs/route/show | grep Cached: ], [0], [dnl -User: 10.0.0.0/24 dev br2 SRC 10.0.0.1 +Cached: 10.0.0.0/24 dev br2 SRC 10.0.0.1 local ]) @@ -955,7 +951,6 @@ AT_CHECK([ AT_CHECK([ ovs-appctl netdev-dummy/ip4addr br0 20.0.0.1/24 && - ovs-appctl ovs/route/add 20.0.0.2/24 br0 && ovs-appctl tnl/neigh/set br0 20.0.0.1 aa:bb:cc:00:00:01 && ovs-appctl tnl/neigh/set br0 20.0.0.2 aa:bb:cc:00:00:02 ], [0], [ignore]) @@ -963,9 +958,9 @@ AT_CHECK([ ovs-appctl time/warp 1000 AT_CHECK([ - ovs-appctl ovs/route/show | grep User + ovs-appctl ovs/route/show | grep Cached: ],[0], [dnl -User: 20.0.0.0/24 dev br0 SRC 20.0.0.1 +Cached: 20.0.0.0/24 dev br0 SRC 20.0.0.1 local ]) AT_CHECK([ @@ -1020,8 +1015,8 @@ AT_CHECK([ ovs-appctl dpctl/dump-flows --names dummy@ovs-dummy | strip_used | grep -v ipv6 | sort ], [0], [flow-dump from the main thread: recirc_id(0),in_port(p0),packet_type(ns=0,id=0),eth(src=aa:bb:cc:00:00:02,dst=aa:bb:cc:00:00:01),eth_type(0x0800),ipv4(dst=20.0.0.1,proto=47,frag=no), packets:3, bytes:378, used:0.0s, actions:tnl_pop(gre_sys) -tunnel(src=20.0.0.2,dst=20.0.0.1,flags(-df-csum)),recirc_id(0),in_port(gre_sys),packet_type(ns=1,id=0x8847),eth_type(0x8847),mpls(label=999/0x0,tc=0/0,ttl=64/0x0,bos=1/1), packets:3, bytes:264, used:0.0s, actions:push_eth(src=00:00:00:00:00:00,dst=00:00:00:00:00:00),pop_mpls(eth_type=0x800),recirc(0x1) -tunnel(src=20.0.0.2,dst=20.0.0.1,flags(-df-csum)),recirc_id(0x1),in_port(gre_sys),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(ttl=64,frag=no), packets:3, bytes:294, used:0.0s, actions:set(ipv4(ttl=63)),int-br +recirc_id(0),tunnel(src=20.0.0.2,dst=20.0.0.1,flags(-df-csum)),in_port(gre_sys),packet_type(ns=1,id=0x8847),eth_type(0x8847),mpls(label=999/0x0,tc=0/0,ttl=64/0x0,bos=1/1), packets:3, bytes:264, used:0.0s, 
actions:push_eth(src=00:00:00:00:00:00,dst=00:00:00:00:00:00),pop_mpls(eth_type=0x800),recirc(0x1) +recirc_id(0x1),tunnel(src=20.0.0.2,dst=20.0.0.1,flags(-df-csum)),in_port(gre_sys),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(proto=1,ttl=64,frag=no), packets:3, bytes:294, used:0.0s, actions:set(ipv4(ttl=63)),int-br ]) ovs-appctl time/warp 1000 diff --git a/tests/pmd.at b/tests/pmd.at index 10879a349b9..35a44b4dfee 100644 --- a/tests/pmd.at +++ b/tests/pmd.at @@ -60,6 +60,34 @@ m4_define([CHECK_PMD_THREADS_CREATED], [ fi ]) +dnl CHECK_DP_SLEEP_MAX([max_sleep], [+line]) +dnl +dnl Checks correct pmd load based sleep value for the datapath. +dnl Checking starts from line number 'line' in ovs-vswitchd.log. +m4_define([CHECK_DP_SLEEP_MAX], [ + SLEEP_TIME="Default PMD thread max sleep: *[$1] us." + line_st=$2 + if [[ -z "$line_st" ]] + then + line_st="+0" + fi + OVS_WAIT_UNTIL([tail -n $line_st ovs-vswitchd.log | grep "$SLEEP_TIME"]) +]) + +dnl CHECK_PMD_SLEEP_MAX([numa_id], [core_id], [max_sleep], [+line]) +dnl +dnl Checks the max sleep time of the pmd with core_id on numa_id. +dnl Checking starts from line number 'line' in ovs-vswitchd.log. +m4_define([CHECK_PMD_SLEEP_MAX], [ + PATTERN="PMD thread on numa_id: *[$1], core id: *[$2], max sleep: *[$3] us."
+ line_st=$4 + if [[ -z "$line_st" ]] + then + line_st="+0" + fi + OVS_WAIT_UNTIL([tail -n $line_st ovs-vswitchd.log | grep "$PATTERN"]) +]) + m4_define([SED_NUMA_CORE_PATTERN], ["s/\(numa_id \)[[0-9]]*\( core_id \)[[0-9]]*:/\1\2:/"]) m4_define([DUMMY_NUMA], [--dummy-numa="0,0,0,0"]) @@ -70,17 +98,18 @@ CHECK_CPU_DISCOVERED() CHECK_PMD_THREADS_CREATED() AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | sed SED_NUMA_CORE_PATTERN], [0], [dnl +Displaying last 60 seconds pmd usage % pmd thread numa_id core_id : isolated : false port: p0 queue-id: 0 (enabled) pmd usage: NOT AVAIL overhead: NOT AVAIL ]) -AT_CHECK([ovs-appctl dpif/show | sed 's/\(tx_queues=\)[[0-9]]*/\1/g'], [0], [dnl +AT_CHECK([ovs-appctl dpif/show], [0], [dnl dummy@ovs-dummy: hit:0 missed:0 br0: br0 65534/100: (dummy-internal) - p0 1/1: (dummy-pmd: configured_rx_queues=1, configured_tx_queues=, requested_rx_queues=1, requested_tx_queues=) + p0 1/1: (dummy-pmd: n_rxq=1, n_txq=1, numa_id=0) ]) OVS_VSWITCHD_STOP @@ -94,14 +123,15 @@ CHECK_PMD_THREADS_CREATED() AT_CHECK([ovs-vsctl set interface p0 options:n_rxq=8]) -AT_CHECK([ovs-appctl dpif/show | sed 's/\(tx_queues=\)[[0-9]]*/\1/g'], [0], [dnl +AT_CHECK([ovs-appctl dpif/show], [0], [dnl dummy@ovs-dummy: hit:0 missed:0 br0: br0 65534/100: (dummy-internal) - p0 1/1: (dummy-pmd: configured_rx_queues=8, configured_tx_queues=, requested_rx_queues=8, requested_tx_queues=) + p0 1/1: (dummy-pmd: n_rxq=8, n_txq=1, numa_id=0) ]) AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | sed SED_NUMA_CORE_PATTERN], [0], [dnl +Displaying last 60 seconds pmd usage % pmd thread numa_id core_id : isolated : false port: p0 queue-id: 0 (enabled) pmd usage: NOT AVAIL @@ -126,14 +156,15 @@ OVS_VSWITCHD_START([add-port br0 p0 -- set Interface p0 type=dummy-pmd options:n CHECK_CPU_DISCOVERED(2) CHECK_PMD_THREADS_CREATED() -AT_CHECK([ovs-appctl dpif/show | sed 's/\(tx_queues=\)[[0-9]]*/\1/g'], [0], [dnl +AT_CHECK([ovs-appctl dpif/show], [0], [dnl dummy@ovs-dummy: hit:0 missed:0 br0: br0 
65534/100: (dummy-internal) - p0 1/1: (dummy-pmd: configured_rx_queues=8, configured_tx_queues=, requested_rx_queues=8, requested_tx_queues=) + p0 1/1: (dummy-pmd: n_rxq=8, n_txq=1, numa_id=0) ]) AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | sed SED_NUMA_CORE_PATTERN], [0], [dnl +Displaying last 60 seconds pmd usage % pmd thread numa_id core_id : isolated : false port: p0 queue-id: 0 (enabled) pmd usage: NOT AVAIL @@ -183,6 +214,7 @@ AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x1]) CHECK_PMD_THREADS_CREATED([1], [], [+$TMP]) AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | sed SED_NUMA_CORE_PATTERN], [0], [dnl +Displaying last 60 seconds pmd usage % pmd thread numa_id core_id : isolated : false port: p0 queue-id: 0 (enabled) pmd usage: NOT AVAIL @@ -207,14 +239,15 @@ TMP=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1)) CHECK_CPU_DISCOVERED(4) CHECK_PMD_THREADS_CREATED() -AT_CHECK([ovs-appctl dpif/show | sed 's/\(tx_queues=\)[[0-9]]*/\1/g'], [0], [dnl +AT_CHECK([ovs-appctl dpif/show], [0], [dnl dummy@ovs-dummy: hit:0 missed:0 br0: br0 65534/100: (dummy-internal) - p0 1/1: (dummy-pmd: configured_rx_queues=8, configured_tx_queues=, requested_rx_queues=8, requested_tx_queues=) + p0 1/1: (dummy-pmd: n_rxq=8, n_txq=1, numa_id=1) ]) AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | sed SED_NUMA_CORE_PATTERN], [0], [dnl +Displaying last 60 seconds pmd usage % pmd thread numa_id core_id : isolated : false port: p0 queue-id: 0 (enabled) pmd usage: NOT AVAIL @@ -280,6 +313,7 @@ CHECK_PMD_THREADS_CREATED([1], [1], [+$TMP]) OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "Performing pmd to rx queue assignment using group algorithm"]) AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show], [0], [dnl +Displaying last 60 seconds pmd usage % pmd thread numa_id 1 core_id 1: isolated : false port: p0 queue-id: 0 (enabled) pmd usage: NOT AVAIL @@ -302,6 +336,7 @@ AT_CHECK([ovs-vsctl set Open_vSwitch . 
other_config:pmd-rxq-assign=roundrobin]) OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "Performing pmd to rx queue assignment using roundrobin algorithm"]) AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show], [0], [dnl +Displaying last 60 seconds pmd usage % pmd thread numa_id 1 core_id 1: isolated : false port: p0 queue-id: 0 (enabled) pmd usage: NOT AVAIL @@ -322,6 +357,7 @@ AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-rxq-assign=cycles]) OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "Performing pmd to rx queue assignment using cycles algorithm"]) AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show], [0], [dnl +Displaying last 60 seconds pmd usage % pmd thread numa_id 1 core_id 1: isolated : false port: p0 queue-id: 0 (enabled) pmd usage: NOT AVAIL @@ -343,6 +379,7 @@ AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x1]) CHECK_PMD_THREADS_CREATED([1], [1], [+$TMP]) AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show], [0], [dnl +Displaying last 60 seconds pmd usage % pmd thread numa_id 1 core_id 0: isolated : false port: p0 queue-id: 0 (enabled) pmd usage: NOT AVAIL @@ -411,11 +448,11 @@ AT_CHECK([ovs-vsctl set Open_vSwitch . 
other_config:smc-enable=true]) sleep 1 -AT_CHECK([ovs-appctl dpif/show | sed 's/\(tx_queues=\)[[0-9]]*/\1/g'], [0], [dnl +AT_CHECK([ovs-appctl dpif/show], [0], [dnl dummy@ovs-dummy: hit:0 missed:0 br0: br0 65534/100: (dummy-internal) - p0 7/1: (dummy-pmd: configured_rx_queues=4, configured_tx_queues=, requested_rx_queues=4, requested_tx_queues=) + p0 7/1: (dummy-pmd: n_rxq=4, n_txq=1, numa_id=0) ]) AT_CHECK([ovs-appctl dpif-netdev/pmd-stats-show | sed SED_NUMA_CORE_PATTERN | sed '/cycles/d' | grep pmd -A 12], [0], [dnl @@ -446,7 +483,7 @@ for i in `seq 0 19`; ovs-appctl time/warp 100 AT_CHECK([grep -A 1 'miss upcall' ovs-vswitchd.log | tail -n 1], [0], [dnl -skb_priority(0),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),recirc_id(0),dp_hash(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:77,dst=50:54:00:00:01:78),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0) +recirc_id(0),dp_hash(0),skb_priority(0),in_port(1),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:77,dst=50:54:00:00:01:78),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0) ]) AT_CHECK([cat ovs-vswitchd.log | filter_flow_install | strip_xout], [0], [dnl recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:77,dst=50:54:00:00:01:78),eth_type(0x0800),ipv4(frag=no), actions: @@ -471,6 +508,59 @@ pmd thread numa_id core_id : OVS_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([PMD - pmd-rxq-show pmd usage time]) +OVS_VSWITCHD_START([add-port br0 p0 -- set Interface p0 type=dummy-pmd], [], [], [DUMMY_NUMA]) + +#CHECK_CPU_DISCOVERED() +#CHECK_PMD_THREADS_CREATED() + +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | grep Displaying], [0], [dnl +Displaying last 60 seconds pmd usage % +]) + +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show -secs -1 | grep Displaying], [0], [dnl +Displaying last 60 seconds pmd usage % +]) + 
+AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show -secs 0 | grep Displaying], [0], [dnl +Displaying last 60 seconds pmd usage % +]) + +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show -secs 1 | grep Displaying], [0], [dnl +Displaying last 5 seconds pmd usage % +]) + +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show -secs 5 | grep Displaying], [0], [dnl +Displaying last 5 seconds pmd usage % +]) + +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show -secs 6 | grep Displaying], [0], [dnl +Displaying last 10 seconds pmd usage % +]) + +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show -secs 51 | grep Displaying], [0], [dnl +Displaying last 55 seconds pmd usage % +]) + +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show -secs 55 | grep Displaying], [0], [dnl +Displaying last 55 seconds pmd usage % +]) + +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show -secs 56 | grep Displaying], [0], [dnl +Displaying last 60 seconds pmd usage % +]) + +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show -secs 60 | grep Displaying], [0], [dnl +Displaying last 60 seconds pmd usage % +]) + +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show -secs 61 | grep Displaying], [0], [dnl +Displaying last 60 seconds pmd usage % +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + dnl Reconfigure the number of rx queues of a port, make sure that all the dnl queues are polled by the datapath and try to send a couple of packets. AT_SETUP([PMD - reconfigure n_rxq]) @@ -526,8 +616,8 @@ icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10 dnl Check resetting to default number of rx queues after removal from the db. 
AT_CHECK([ovs-vsctl remove interface p1 options n_rxq]) -AT_CHECK([ovs-appctl dpif/show | grep p1 | sed 's/\(tx_queues=\)[[0-9]]*/\1/g'], [0], [dnl - p1 1/1: (dummy-pmd: configured_rx_queues=1, configured_tx_queues=, requested_rx_queues=1, requested_tx_queues=) +AT_CHECK([ovs-appctl dpif/show | grep p1], [0], [dnl + p1 1/1: (dummy-pmd: n_rxq=1, n_txq=1, numa_id=0) ]) OVS_VSWITCHD_STOP @@ -1074,7 +1164,7 @@ dummy@dp0: lookups: hit:0 missed:0 lost:0 flows: 0 port 0: dp0 (dummy-internal) - port 1: p1 (dummy-pmd: configured_rx_queues=1, configured_tx_queues=1, requested_rx_queues=1, requested_tx_queues=1) + port 1: p1 (dummy-pmd: n_rxq=1, n_txq=1, numa_id=0) port 2: p2 (dummy) ]) @@ -1192,3 +1282,425 @@ ovs-appctl: ovs-vswitchd: server returned an error OVS_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([PMD - pmd sleep]) +OVS_VSWITCHD_START([add-port br0 p0 -- set Interface p0 type=dummy-pmd options:n_rxq=8 options:numa_id=1], [], [], [--dummy-numa 0,0,0,1,1,8,8]) + +dnl Check default +CHECK_DP_SLEEP_MAX([0], []) +CHECK_PMD_SLEEP_MAX([0], [0], [0], []) +CHECK_PMD_SLEEP_MAX([1], [3], [0], []) +CHECK_PMD_SLEEP_MAX([8], [5], [0], []) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 0 us +pmd thread numa_id 0 core_id 0: + max sleep: 0 us +pmd thread numa_id 1 core_id 3: + max sleep: 0 us +pmd thread numa_id 8 core_id 5: + max sleep: 0 us +]) + +dnl Check low value max sleep +get_log_next_line_num +AT_CHECK([ovs-vsctl set open_vswitch . 
other_config:pmd-sleep-max="1"]) +CHECK_DP_SLEEP_MAX([1], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [0], [1], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [3], [1], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [5], [1], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 1 us +pmd thread numa_id 0 core_id 0: + max sleep: 1 us +pmd thread numa_id 1 core_id 3: + max sleep: 1 us +pmd thread numa_id 8 core_id 5: + max sleep: 1 us +]) + +dnl Check high value max sleep +get_log_next_line_num +AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-sleep-max="10000"]) +CHECK_DP_SLEEP_MAX([10000], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [0], [10000], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [3], [10000], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [5], [10000], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 10000 us +pmd thread numa_id 0 core_id 0: + max sleep: 10000 us +pmd thread numa_id 1 core_id 3: + max sleep: 10000 us +pmd thread numa_id 8 core_id 5: + max sleep: 10000 us +]) + +dnl Check setting max sleep to zero +get_log_next_line_num +AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-sleep-max="0"]) +CHECK_DP_SLEEP_MAX([0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [0], [0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [3], [0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [5], [0], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 0 us +pmd thread numa_id 0 core_id 0: + max sleep: 0 us +pmd thread numa_id 1 core_id 3: + max sleep: 0 us +pmd thread numa_id 8 core_id 5: + max sleep: 0 us +]) + +dnl Check above high value max sleep +get_log_next_line_num +AT_CHECK([ovs-vsctl set open_vswitch . 
other_config:pmd-sleep-max="10001"]) +CHECK_DP_SLEEP_MAX([10000], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [0], [10000], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [3], [10000], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [5], [10000], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 10000 us +pmd thread numa_id 0 core_id 0: + max sleep: 10000 us +pmd thread numa_id 1 core_id 3: + max sleep: 10000 us +pmd thread numa_id 8 core_id 5: + max sleep: 10000 us +]) + +dnl Check rounding +get_log_next_line_num +AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-sleep-max="490"]) +CHECK_DP_SLEEP_MAX([490], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [0], [490], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [3], [490], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [5], [490], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 490 us +pmd thread numa_id 0 core_id 0: + max sleep: 490 us +pmd thread numa_id 1 core_id 3: + max sleep: 490 us +pmd thread numa_id 8 core_id 5: + max sleep: 490 us +]) + +dnl Check rounding +get_log_next_line_num +AT_CHECK([ovs-vsctl set open_vswitch . other_config:pmd-sleep-max="499"]) +CHECK_DP_SLEEP_MAX([499], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [0], [499], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [3], [499], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [5], [499], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 499 us +pmd thread numa_id 0 core_id 0: + max sleep: 499 us +pmd thread numa_id 1 core_id 3: + max sleep: 499 us +pmd thread numa_id 8 core_id 5: + max sleep: 499 us +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([PMD - per PMD sleep]) +OVS_VSWITCHD_START([add-port br0 p0 -- set Interface p0 type=dummy-pmd options:n_rxq=8 options:numa_id=1], + [], [], [--dummy-numa 0,0,0,1,1,8,8]) + +dnl Check system default. 
+CHECK_DP_SLEEP_MAX([0], []) +CHECK_PMD_SLEEP_MAX([0], [0], [0], []) +CHECK_PMD_SLEEP_MAX([1], [3], [0], []) +CHECK_PMD_SLEEP_MAX([8], [5], [0], []) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 0 us +pmd thread numa_id 0 core_id 0: + max sleep: 0 us +pmd thread numa_id 1 core_id 3: + max sleep: 0 us +pmd thread numa_id 8 core_id 5: + max sleep: 0 us +]) + +dnl Only per PMD. +get_log_next_line_num +AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-sleep-max=3:300,0:100,5:400]) +CHECK_DP_SLEEP_MAX([0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [0], [100], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [3], [300], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [5], [400], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 0 us +pmd thread numa_id 0 core_id 0: + max sleep: 100 us +pmd thread numa_id 1 core_id 3: + max sleep: 300 us +pmd thread numa_id 8 core_id 5: + max sleep: 400 us +]) + +dnl Mix of not used default and per-PMD. +get_log_next_line_num +AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-sleep-max=50,3:300,0:100,5:200]) +CHECK_DP_SLEEP_MAX([50], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [0], [100], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [3], [300], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [5], [200], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 50 us +pmd thread numa_id 0 core_id 0: + max sleep: 100 us +pmd thread numa_id 1 core_id 3: + max sleep: 300 us +pmd thread numa_id 8 core_id 5: + max sleep: 200 us +]) + +dnl Remove a per-pmd entry and use default. +get_log_next_line_num +AT_CHECK([ovs-vsctl set Open_vSwitch .
other_config:pmd-sleep-max=50,3:300]) +CHECK_DP_SLEEP_MAX([50], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [0], [50], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [3], [300], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [5], [50], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 50 us +pmd thread numa_id 0 core_id 0: + max sleep: 50 us +pmd thread numa_id 1 core_id 3: + max sleep: 300 us +pmd thread numa_id 8 core_id 5: + max sleep: 50 us +]) + +dnl Mix and change values. +get_log_next_line_num +AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-sleep-max=3:400,200]) +CHECK_DP_SLEEP_MAX([200], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [0], [200], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [3], [400], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [5], [200], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 200 us +pmd thread numa_id 0 core_id 0: + max sleep: 200 us +pmd thread numa_id 1 core_id 3: + max sleep: 400 us +pmd thread numa_id 8 core_id 5: + max sleep: 200 us +]) + +dnl Add values for pmds that don't exist yet. +get_log_next_line_num +AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-sleep-max=2:600,50,3:300,0:100,6:400,5:200]) +CHECK_DP_SLEEP_MAX([50], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [0], [100], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [3], [300], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [5], [200], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 50 us +pmd thread numa_id 0 core_id 0: + max sleep: 100 us +pmd thread numa_id 1 core_id 3: + max sleep: 300 us +pmd thread numa_id 8 core_id 5: + max sleep: 200 us +]) + +dnl Add more cores. +get_log_next_line_num +AT_CHECK([ovs-vsctl set Open_vSwitch . 
other_config:pmd-cpu-mask=7f]) +CHECK_PMD_SLEEP_MAX([0], [1], [50], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [2], [600], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [4], [50], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [6],[400], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 50 us +pmd thread numa_id 0 core_id 0: + max sleep: 100 us +pmd thread numa_id 0 core_id 1: + max sleep: 50 us +pmd thread numa_id 0 core_id 2: + max sleep: 600 us +pmd thread numa_id 1 core_id 3: + max sleep: 300 us +pmd thread numa_id 1 core_id 4: + max sleep: 50 us +pmd thread numa_id 8 core_id 5: + max sleep: 200 us +pmd thread numa_id 8 core_id 6: + max sleep: 400 us +]) + +dnl Go back to just a global value. +get_log_next_line_num +AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-sleep-max=90]) +CHECK_DP_SLEEP_MAX([90], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [0], [90], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [1], [90], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [2], [90], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [3], [90], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [4], [90], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [5], [90], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [6], [90], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 90 us +pmd thread numa_id 0 core_id 0: + max sleep: 90 us +pmd thread numa_id 0 core_id 1: + max sleep: 90 us +pmd thread numa_id 0 core_id 2: + max sleep: 90 us +pmd thread numa_id 1 core_id 3: + max sleep: 90 us +pmd thread numa_id 1 core_id 4: + max sleep: 90 us +pmd thread numa_id 8 core_id 5: + max sleep: 90 us +pmd thread numa_id 8 core_id 6: + max sleep: 90 us +]) + +dnl Try invalid value. +get_log_next_line_num +AT_CHECK([ovs-vsctl set Open_vSwitch . 
other_config:pmd-sleep-max=qwe]) +CHECK_DP_SLEEP_MAX([0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [0], [0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [1], [0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [2], [0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [3], [0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [4], [0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [5], [0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [6], [0], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 0 us +pmd thread numa_id 0 core_id 0: + max sleep: 0 us +pmd thread numa_id 0 core_id 1: + max sleep: 0 us +pmd thread numa_id 0 core_id 2: + max sleep: 0 us +pmd thread numa_id 1 core_id 3: + max sleep: 0 us +pmd thread numa_id 1 core_id 4: + max sleep: 0 us +pmd thread numa_id 8 core_id 5: + max sleep: 0 us +pmd thread numa_id 8 core_id 6: + max sleep: 0 us +]) + +dnl Try invalid key:value. +get_log_next_line_num +AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-sleep-max=50,1:qwe,2:0]) +CHECK_DP_SLEEP_MAX([50], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [0], [50], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [1], [50], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [2], [0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [3], [50], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [4], [50], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [5], [50], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [6], [50], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 50 us +pmd thread numa_id 0 core_id 0: + max sleep: 50 us +pmd thread numa_id 0 core_id 1: + max sleep: 50 us +pmd thread numa_id 0 core_id 2: + max sleep: 0 us +pmd thread numa_id 1 core_id 3: + max sleep: 50 us +pmd thread numa_id 1 core_id 4: + max sleep: 50 us +pmd thread numa_id 8 core_id 5: + max sleep: 50 us +pmd thread numa_id 8 core_id 6: + max sleep: 50 us +]) + +dnl Remove config. +get_log_next_line_num +AT_CHECK([ovs-vsctl remove Open_vSwitch . 
other_config pmd-sleep-max]) +CHECK_DP_SLEEP_MAX([0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [0], [0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [1], [0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([0], [2], [0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [3], [0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([1], [4], [0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [5], [0], [+$LINENUM]) +CHECK_PMD_SLEEP_MAX([8], [6], [0], [+$LINENUM]) +AT_CHECK([ovs-appctl dpif-netdev/pmd-sleep-show], [0], [dnl +Default max sleep: 0 us +pmd thread numa_id 0 core_id 0: + max sleep: 0 us +pmd thread numa_id 0 core_id 1: + max sleep: 0 us +pmd thread numa_id 0 core_id 2: + max sleep: 0 us +pmd thread numa_id 1 core_id 3: + max sleep: 0 us +pmd thread numa_id 1 core_id 4: + max sleep: 0 us +pmd thread numa_id 8 core_id 5: + max sleep: 0 us +pmd thread numa_id 8 core_id 6: + max sleep: 0 us +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([PMD - revalidator modify overlapping flows]) + +OVS_VSWITCHD_START( +[add-port br0 p1 \ + -- set bridge br0 datapath-type=dummy \ + -- set interface p1 type=dummy-pmd \ + -- add-port br0 p2 \ + -- set interface p2 type=dummy-pmd +], [], [], [DUMMY_NUMA]) + +dnl Add one OpenFlow rule and generate a megaflow. +AT_CHECK([ovs-ofctl add-flow br0 'table=0,in_port=p1,ip,nw_dst=10.1.2.0/24,actions=p2']) +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ipv4(src=10.0.0.1,dst=10.1.2.2,proto=6),tcp(src=1,dst=2)']) + +OVS_WAIT_UNTIL_EQUAL([ovs-appctl dpctl/dump-flows | sed 's/.*core: [[0-9]]*//'], [ +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=10.1.2.2/255.255.255.0,frag=no), packets:0, bytes:0, used:never, actions:2]) + +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ipv4(src=10.0.0.1,dst=10.1.2.2,proto=6),tcp(src=1,dst=2)']) +dnl Replace OpenFlow rules, trigger the revalidation. 
+AT_CHECK([echo 'table=0,in_port=p1,ip,nw_dst=10.1.0.0/16 actions=ct(commit)' | dnl + ovs-ofctl --bundle replace-flows br0 -]) +AT_CHECK([ovs-appctl revalidator/wait]) + +dnl Prevent flows from expiring. +AT_CHECK([ovs-appctl time/stop]) + +AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ipv4(src=10.0.0.1,dst=10.1.0.2,proto=6),tcp(src=1,dst=2)']) +OVS_WAIT_UNTIL_EQUAL([ovs-appctl dpctl/dump-flows | sed 's/.*core: [[0-9]]*//' | strip_xout_keep_actions], [ +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=10.1.0.2/255.255.0.0,frag=no), packets:0, bytes:0, used:never, actions:ct(commit) +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=10.1.2.2/255.255.255.0,frag=no), packets:0, bytes:0, used:0.0s, actions:ct(commit)]) + +dnl Send more 10.1.0.2 to make 10.1.0.0/16 tuple prepend 10.1.2.0/24 tuple in the pvector of subtables. +for i in $(seq 0 256); do + AT_CHECK([ovs-appctl netdev-dummy/receive p1 'ipv4(src=10.0.0.1,dst=10.1.0.2,proto=6),tcp(src=1,dst=2)']) +done + +dnl Warp time enough to trigger subtable optimization. 
+AT_CHECK([ovs-appctl time/warp 500 2000], [0], [ignore]) + +AT_CHECK([echo 'table=0,in_port=p1,ip,nw_dst=10.1.0.0/16 actions=p2' | dnl + ovs-ofctl --bundle replace-flows br0 -]) + +AT_CHECK([ovs-appctl revalidator/wait]) +AT_CHECK([ovs-appctl dpctl/dump-flows | sed 's/.*core: [[0-9]]*//' | strip_xout_keep_actions], [0], [ +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=10.1.0.2/255.255.0.0,frag=no), packets:0, bytes:0, used:0.0s, actions:2 +recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(dst=10.1.2.2/255.255.255.0,frag=no), packets:0, bytes:0, used:0.0s, actions:2 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP diff --git a/tests/rstp.at b/tests/rstp.at index 600e85dabde..e0d4bed4f05 100644 --- a/tests/rstp.at +++ b/tests/rstp.at @@ -253,3 +253,60 @@ AT_CHECK([ovs-vsctl del-port br0 p1]) OVS_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([RSTP - patch ports]) +# Create br0 with interfaces p1 and p7 +# and br1 with interfaces p2 and p8 +# with p1 and p2 being connected patch ports. 
+OVS_VSWITCHD_START( + [set port br0 other_config:rstp-enable=false -- \ + set bridge br0 rstp-enable=true +]) + +AT_CHECK([add_of_br 1 \ + set port br1 other_config:rstp-enable=false -- \ + set bridge br1 rstp-enable=true]) + +ovs-appctl time/stop + +AT_CHECK([ovs-vsctl \ + add-port br0 p1 -- \ + set interface p1 type=patch options:peer=p2 ofport_request=1 -- \ + set port p1 other_config:rstp-enable=true -- \ + add-port br1 p2 -- \ + set interface p2 type=patch options:peer=p1 ofport_request=2 -- \ + set port p2 other_config:rstp-enable=true -- \ +]) + +AT_CHECK([ovs-vsctl \ + add-port br0 p7 -- \ + set interface p7 ofport_request=7 type=dummy -- \ + set port p7 other_config:rstp-enable=false -- \ + add-port br1 p8 -- \ + set interface p8 ofport_request=8 type=dummy -- \ + set port p8 other_config:rstp-enable=false -- \ +]) + +AT_CHECK([ovs-ofctl add-flow br0 "in_port=7 icmp actions=1"]) +AT_CHECK([ovs-ofctl add-flow br0 "in_port=1 icmp actions=7"]) +AT_CHECK([ovs-ofctl add-flow br1 "in_port=8 icmp actions=2"]) +AT_CHECK([ovs-ofctl add-flow br1 "in_port=2 icmp actions=8"]) + +# Give time for RSTP to synchronize. 
+ovs-appctl time/warp 5000 500 + +OVS_WAIT_UNTIL_EQUAL([cat ovs-vswitchd.log | FILTER_STP_TOPOLOGY], [dnl +port p1: RSTP state changed from Disabled to Discarding +port p2: RSTP state changed from Disabled to Discarding +port p2: RSTP state changed from Discarding to Forwarding +port p1: RSTP state changed from Discarding to Forwarding]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(7),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)' | grep Datapath], [0], [dnl +Datapath actions: 8 +]) +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(8),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=10.0.0.3,dst=10.0.0.4,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)' | grep Datapath], [0], [dnl +Datapath actions: 7 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP diff --git a/tests/sendpkt.py b/tests/sendpkt.py index 49ac45275a9..7cbea516548 100755 --- a/tests/sendpkt.py +++ b/tests/sendpkt.py @@ -48,28 +48,10 @@ if options.packet_type != "eth": parser.error('invalid argument to "-t"/"--type". Allowed value is "eth".') -# store the hex bytes with 0x appended at the beginning -# if not present in the user input and validate the hex bytes -hex_list = [] -for a in args[1:]: - if a[:2] != "0x": - hex_byte = "0x" + a - else: - hex_byte = a - try: - temp = int(hex_byte, 0) - except: - parser.error("invalid hex byte " + a) - - if temp > 0xff: - parser.error("hex byte " + a + " cannot be greater than 0xff!") - - hex_list.append(temp) - -if sys.version_info < (3, 0): - pkt = "".join(map(chr, hex_list)) -else: - pkt = bytes(hex_list) +# Strip '0x' prefixes from hex input, combine into a single string and +# convert to bytes. 
+hex_str = "".join([a[2:] if a.startswith("0x") else a for a in args[1:]]) +pkt = bytes.fromhex(hex_str) try: sockfd = socket.socket(socket.AF_PACKET, socket.SOCK_RAW) diff --git a/tests/stp.at b/tests/stp.at index 7ddacfc3a0e..75abe8e5ca0 100644 --- a/tests/stp.at +++ b/tests/stp.at @@ -368,7 +368,7 @@ AT_CLEANUP # Strips out uninteresting parts of flow output, as well as parts # that vary from one run to another (e.g., timing and bond actions). m4_define([STRIP_USED], [[sed ' - s/used:[0-9]*\.[0-9]*/used:0.0/ + s/used:[0-9\.][0-9\.]*/used:0.0/ s/duration=[0-9.]*s*/duration=Xs/ s/idle_age=[0-9]*,/idle_age=X,/ ']]) @@ -464,6 +464,65 @@ Datapath actions: 2 AT_CLEANUP +AT_SETUP([STP - patch ports]) +# Create br0 with interfaces p1 and p7 +# and br1 with interfaces p2 and p8 +# with p1 and p2 being connected patch ports. +OVS_VSWITCHD_START( + [set port br0 other_config:stp-enable=false -- \ + set bridge br0 stp-enable=true +]) + +AT_CHECK([add_of_br 1 \ + set port br1 other_config:stp-enable=false -- \ + set bridge br1 stp-enable=true]) + +ovs-appctl time/stop + +AT_CHECK([ovs-vsctl \ + add-port br0 p1 -- \ + set interface p1 type=patch options:peer=p2 ofport_request=1 -- \ + set port p1 other_config:stp-enable=true -- \ + add-port br1 p2 -- \ + set interface p2 type=patch options:peer=p1 ofport_request=2 -- \ + set port p2 other_config:stp-enable=true -- \ +]) + +AT_CHECK([ovs-vsctl \ + add-port br0 p7 -- \ + set interface p7 ofport_request=7 type=dummy -- \ + set port p7 other_config:stp-enable=false -- \ + add-port br1 p8 -- \ + set interface p8 ofport_request=8 type=dummy -- \ + set port p8 other_config:stp-enable=false -- \ +]) + +AT_CHECK([ovs-ofctl add-flow br0 "in_port=7 icmp actions=1"]) +AT_CHECK([ovs-ofctl add-flow br0 "in_port=1 icmp actions=7"]) +AT_CHECK([ovs-ofctl add-flow br1 "in_port=8 icmp actions=2"]) +AT_CHECK([ovs-ofctl add-flow br1 "in_port=2 icmp actions=8"]) + +# Give time for STP to synchronize. 
+ovs-appctl time/warp 30000 3000 + +OVS_WAIT_UNTIL_EQUAL([cat ovs-vswitchd.log | FILTER_STP_TOPOLOGY], [dnl +port <>: STP state changed from disabled to listening +port <>: STP state changed from disabled to listening +port <>: STP state changed from listening to learning +port <>: STP state changed from listening to learning +port <>: STP state changed from learning to forwarding +port <>: STP state changed from learning to forwarding]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(7),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)' | grep Datapath], [0], [dnl +Datapath actions: 8 +]) +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(8),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=10.0.0.3,dst=10.0.0.4,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)' | grep Datapath], [0], [dnl +Datapath actions: 7 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + AT_SETUP([STP - flush the fdb and mdb when topology changed]) OVS_VSWITCHD_START([]) @@ -583,13 +642,13 @@ AT_CHECK([ovs-appctl fdb/show br2], [0], [dnl ]) AT_CHECK([ovs-appctl mdb/show br0], [0], [dnl - port VLAN GROUP Age + port VLAN protocol GROUP Age ]) AT_CHECK([ovs-appctl mdb/show br1], [0], [dnl - port VLAN GROUP Age + port VLAN protocol GROUP Age ]) AT_CHECK([ovs-appctl mdb/show br2], [0], [dnl - port VLAN GROUP Age + port VLAN protocol GROUP Age ]) AT_CLEANUP @@ -620,10 +679,10 @@ ovs-appctl time/stop ovs-appctl time/warp 31000 1000 AT_CHECK([ovs-appctl stp/show br0 | grep p1], [0], [dnl - p1 designated forwarding 19 128.1 + p1 designated forwarding 2 128.1 ]) AT_CHECK([ovs-appctl stp/show br0 | grep p2], [0], [dnl - p2 designated forwarding 19 128.2 + p2 designated forwarding 2 128.2 ]) # add a stp port @@ -637,10 +696,10 @@ ovs-appctl netdev-dummy/set-admin-state p3 down # We should not show the p3 because its link-state is down AT_CHECK([ovs-appctl stp/show br0 | grep p1], 
[0], [dnl - p1 designated forwarding 19 128.1 + p1 designated forwarding 2 128.1 ]) AT_CHECK([ovs-appctl stp/show br0 | grep p2], [0], [dnl - p2 designated forwarding 19 128.2 + p2 designated forwarding 2 128.2 ]) AT_CHECK([ovs-appctl stp/show br0 | grep p3], [1], [dnl ]) @@ -648,13 +707,13 @@ AT_CHECK([ovs-appctl stp/show br0 | grep p3], [1], [dnl ovs-appctl netdev-dummy/set-admin-state p3 up AT_CHECK([ovs-appctl stp/show br0 | grep p1], [0], [dnl - p1 designated forwarding 19 128.1 + p1 designated forwarding 2 128.1 ]) AT_CHECK([ovs-appctl stp/show br0 | grep p2], [0], [dnl - p2 designated forwarding 19 128.2 + p2 designated forwarding 2 128.2 ]) AT_CHECK([ovs-appctl stp/show br0 | grep p3], [0], [dnl - p3 designated listening 19 128.3 + p3 designated listening 2 128.3 ]) diff --git a/tests/system-afxdp.at b/tests/system-afxdp.at index 0d09906fb6c..88f66056630 100644 --- a/tests/system-afxdp.at +++ b/tests/system-afxdp.at @@ -39,7 +39,7 @@ AT_CHECK([ovs-vsctl add-port br0 ovs-p0 -- \ set interface ovs-p0 type=afxdp-nonpmd options:n_rxq=1], [0], [], [stderr]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) diff --git a/tests/system-common-macros.at b/tests/system-common-macros.at index 8b9f5c75254..e9be021f3ff 100644 --- a/tests/system-common-macros.at +++ b/tests/system-common-macros.at @@ -126,6 +126,22 @@ m4_define([ADD_VETH_BOND], ] ) +# ADD_VETH_NS([ns1], [port1], [ip_addr1], [ns2], [port2], [ip_addr2]) +# +# Add a pair of veth ports in 'ns1' and 'ns2'. The port names are 'port1' +# and 'port2' respectively, and the IP addresses 'ip_addr1' and 'ip_addr2' +# are assigned to each port. 
+m4_define([ADD_VETH_NS], + [ AT_CHECK([ip link add $2 type veth peer name $5]), + AT_CHECK([ip link set $2 netns $1]) + AT_CHECK([ip link set $5 netns $4]) + NS_CHECK_EXEC([$1], [ip link set $2 up]) + NS_CHECK_EXEC([$4], [ip link set $5 up]) + NS_CHECK_EXEC([$1], [ip addr add $3 dev $2]) + NS_CHECK_EXEC([$4], [ip addr add $6 dev $5]) + ] +) + # ADD_VLAN([port], [namespace], [vlan-id], [ip-addr]) # # Add a VLAN device named 'port' within 'namespace'. It will be configured @@ -260,18 +276,23 @@ m4_define([NETNS_DAEMONIZE], m4_define([OVS_CHECK_FIREWALL], [AT_SKIP_IF([systemctl status firewalld 2>&1 | grep running > /dev/null])]) -# OVS_START_L7([namespace], [protocol]) +# OVS_START_L7([namespace], [protocol], [port]) # -# Start a server serving 'protocol' within 'namespace'. The server will exit -# when the test finishes. +# Start a server serving 'protocol' on port 'port' within 'namespace'. +# If 'port' is not specified, the standard one for 'protocol' will be used. +# The server will exit when the test finishes. # m4_define([OVS_START_L7], [PIDFILE=$(mktemp $2XXX.pid) - NETNS_DAEMONIZE([$1], [[$PYTHON3 $srcdir/test-l7.py $2]], [$PIDFILE]) + NETNS_DAEMONIZE([$1], [[$PYTHON3 $srcdir/test-l7.py $2 $3]], [$PIDFILE]) dnl netstat doesn't print http over IPv6 as "http6"; drop the number. PROTO=$(echo $2 | sed -e 's/\([[a-zA-Z]]*\).*/\1/') - OVS_WAIT_UNTIL([NS_EXEC([$1], [netstat -l | grep $PROTO])]) + if test -z "$3"; then + OVS_WAIT_UNTIL([NS_EXEC([$1], [netstat -l | grep $PROTO])]) + else + OVS_WAIT_UNTIL([NS_EXEC([$1], [netstat -ln | grep :$3])]) + fi ] ) @@ -281,6 +302,12 @@ m4_define([OVS_START_L7], # m4_define([OFPROTO_CLEAR_DURATION_IDLE], [[sed -e 's/duration=.*s,/duration=,/g' -e 's/idle_age=[0-9]*,/idle_age=,/g']]) +# OVS_CHECK_TC_QDISC() +# +# Macro to skip tests when tc qdisc can't be applied on a OVS port. 
+m4_define([OVS_CHECK_TC_QDISC], + [AT_SKIP_IF([test $HAVE_TC = no])]) + # OVS_CHECK_TUNNEL_TSO() # # Macro to be used in general tunneling tests that could be also @@ -343,3 +370,15 @@ m4_define([OVS_CHECK_IPROUTE_ENCAP], # OVS_CHECK_CT_CLEAR() m4_define([OVS_CHECK_CT_CLEAR], [AT_SKIP_IF([! grep -q "Datapath supports ct_clear action" ovs-vswitchd.log])]) + +# OVS_CHECK_GITHUB_ACTION +m4_define([OVS_CHECK_GITHUB_ACTION], + [AT_SKIP_IF([test "$GITHUB_ACTIONS" = "true"])]) + +# OVS_CHECK_DROP_ACTION() +m4_define([OVS_CHECK_DROP_ACTION], + [AT_SKIP_IF([! grep -q "Datapath supports explicit drop action" ovs-vswitchd.log])]) + +# OVS_CHECK_PSAMPLE() +m4_define([OVS_CHECK_PSAMPLE], + [AT_SKIP_IF([! grep -q "Datapath supports psample action" ovs-vswitchd.log])]) diff --git a/tests/system-dpdk-find-device.py b/tests/system-dpdk-find-device.py new file mode 100755 index 00000000000..ced74e7f310 --- /dev/null +++ b/tests/system-dpdk-find-device.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 +# Copyright (c) 2024 Red Hat, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from pathlib import Path +import os +import sys + +# The tester might want to select a PCI device, if so, trust it. 
+if 'DPDK_PCI_ADDR' in os.environ: + print(os.environ['DPDK_PCI_ADDR']) + sys.exit(0) + +for device in sorted(Path('/sys/bus/pci/devices').iterdir()): + class_path = device / 'class' + # Only consider Network class devices + if class_path.read_text().strip() != '0x020000': + continue + kmod_path = device / 'driver' / 'module' + kmod_name = kmod_path.resolve().name + # Only care about devices bound to vfio_pci or igb_uio. + if kmod_name not in ['vfio_pci', 'igb_uio']: + continue + print(device.resolve().name) + sys.exit(0) + +sys.exit(1) diff --git a/tests/system-dpdk-macros.at b/tests/system-dpdk-macros.at index 53fbc13206c..f8ba7667390 100644 --- a/tests/system-dpdk-macros.at +++ b/tests/system-dpdk-macros.at @@ -7,9 +7,6 @@ m4_define([OVS_DPDK_PRE_CHECK], [dnl Check Hugepages AT_CHECK([cat /proc/meminfo], [], [stdout]) AT_SKIP_IF([grep -E 'HugePages_Free: *0' stdout], [], [stdout]) - AT_CHECK([mount], [], [stdout]) - AT_CHECK([grep 'hugetlbfs' stdout], [], [stdout], []) - ]) @@ -22,14 +19,8 @@ m4_define([OVS_DPDK_PRE_PHY_SKIP], [dnl Perform the precheck OVS_DPDK_PRE_CHECK() - dnl Check if VFIO or UIO driver is loaded - AT_SKIP_IF([ ! (lsmod | grep -E "igb_uio|vfio") ], [], [stdout]) - - dnl Find PCI address candidate, skip if there is no DPDK-compatible NIC - AT_CHECK([$DPDK_DIR/usertools/dpdk-devbind.py -s | head -n +4 | tail -1], [], [stdout]) - AT_CHECK([cat stdout | cut -d" " -s -f1 > PCI_ADDR]) - AT_SKIP_IF([ ! test -s PCI_ADDR ]) - + dnl Check if a device is available for DPDK + AT_SKIP_IF([ ! $abs_top_srcdir/tests/system-dpdk-find-device.py > DPDK_PCI_ADDR ]) ]) @@ -39,12 +30,13 @@ m4_define([OVS_DPDK_PRE_PHY_SKIP], # m4_define([OVS_DPDK_START], [dnl start ovs dpdk - OVS_DPDK_START_OVSDB() + OVS_DPDK_START_OVSDB($3) dnl Enable DPDK functionality AT_CHECK([ovs-vsctl --no-wait set Open_vSwitch . 
other_config:dpdk-init=true]) - OVS_DPDK_START_VSWITCHD() + OVS_DPDK_START_VSWITCHD([$1], [$2]) ]) + # OVS_DPDK_START_OVSDB() # # Create an empty database and start ovsdb-server. @@ -63,19 +55,147 @@ m4_define([OVS_DPDK_START_OVSDB], AT_CAPTURE_FILE([ovsdb-server.log]) dnl Initialize database. - AT_CHECK([ovs-vsctl --no-wait init]) + AT_CHECK([ovs-vsctl --no-wait init $1]) ]) + # OVS_DPDK_START_VSWITCHD() # # Add special configuration for dpdk-init. Start ovs-vswitchd. # m4_define([OVS_DPDK_START_VSWITCHD], [dnl Change DPDK drivers log levels so that tests only catch errors - AT_CHECK([ovs-vsctl --no-wait set Open_vSwitch . other_config:dpdk-extra=--log-level=pmd.*:error]) + AT_CHECK([ovs-vsctl --no-wait set Open_vSwitch . other_config:dpdk-extra="--log-level=pmd.*:error $1"]) dnl Start ovs-vswitchd. - AT_CHECK([ovs-vswitchd --detach --no-chdir --pidfile --log-file -vvconn -vofproto_dpif -vunixctl], [0], [stdout], [stderr]) + AT_CHECK([ovs-vswitchd $2 --detach --no-chdir --pidfile --log-file -vvconn -vofproto_dpif -vunixctl], [0], [stdout], [stderr]) AT_CAPTURE_FILE([ovs-vswitchd.log]) on_exit "kill_ovs_vswitchd `cat ovs-vswitchd.pid`" ]) + + +m4_define([OVS_DPDK_STOP_VSWITCHD], + [OVS_VSWITCHD_STOP([dnl +$1";/does not exist. The Open vSwitch kernel module is probably not loaded./d +/does not support MTU configuration,/d +/EAL: No \(available\|free\) .*hugepages reported/d +/Failed to enable flow control/d +/ice_vsi_config_outer_vlan_stripping(): Single VLAN mode (SVM) does not support qinq/d +/Rx checksum offload is not supported on/d +/TELEMETRY: No legacy callbacks, legacy socket not created/d"]) +]) + + +# OVS_DPDK_CHECK_TESTPMD() +# +# Check dpdk-testpmd availability. +# +m4_define([OVS_DPDK_CHECK_TESTPMD], + [AT_SKIP_IF([! which dpdk-testpmd >/dev/null 2>/dev/null]) +]) + + +# OVS_DPDK_START_TESTPMD() +# +# Start dpdk-testpmd in background. 
+# +m4_define([OVS_DPDK_START_TESTPMD], + [AT_CHECK([lscpu], [], [stdout]) + AT_CHECK([cat stdout | grep "NUMA node(s)" | awk '{c=1; while (c++<$(3)) {printf "512,"}; print "512"}' > NUMA_NODE]) + eal_options="$DPDK_EAL_OPTIONS --in-memory --socket-mem="$(cat NUMA_NODE)" --single-file-segments --no-pci --file-prefix testpmd" + options="$1" + test "$options" != "${options%% -- *}" || options="$options -- " + eal_options="$eal_options ${options%% -- *}" + testpmd_options="-a --stats-period 2 ${options#* -- }" + echo "dpdk-testpmd $eal_options -- $testpmd_options" >testpmd.cmd + dpdk-testpmd $eal_options -- $testpmd_options >testpmd.log 2>&1 & \ + echo $! > testpmd.pid + on_exit "kill -9 `cat testpmd.pid`" +]) + + +# OVS_DPDK_STOP_TESTPMD() +# +# Stop background dpdk-testpmd. +# +m4_define([OVS_DPDK_STOP_TESTPMD], + [AT_CHECK([kill `cat testpmd.pid`]) + OVS_WAIT([kill -0 `cat testpmd.pid`], [kill -9 `cat testpmd.pid`]) +]) + + +# OVS_TRAFFIC_VSWITCHD_START([vsctl-args], [vsctl-output], [dbinit-aux-args]) +# +# Creates a database and starts ovsdb-server, starts ovs-vswitchd +# connected to that database, calls ovs-vsctl to create a bridge named +# br0 with predictable settings, passing 'vsctl-args' as additional +# commands to ovs-vsctl. If 'vsctl-args' causes ovs-vsctl to provide +# output (e.g. because it includes "create" commands) then 'vsctl-output' +# specifies the expected output after filtering through uuidfilt. +# 'dbinit-aux-args' are passed as additional commands to 'ovs-vsctl init' +# before starting ovs-vswitchd. +m4_define([OVS_TRAFFIC_VSWITCHD_START], + [ + OVS_DPDK_PRE_CHECK() + OVS_WAIT_WHILE([ip link show ovs-netdev]) + dnl For functional tests, no need for DPDK PCI probing. + OVS_DPDK_START([--no-pci], [--disable-system], [$3]) + dnl Add bridges, ports, etc. 
+ OVS_WAIT_WHILE([ip link show br0]) + AT_CHECK([ovs-vsctl -- _ADD_BR([br0]) -- $1 m4_if([$2], [], [], [| uuidfilt])], [0], [$2]) +]) + + +# OVS_TRAFFIC_VSWITCHD_STOP([ALLOWLIST], [extra_cmds]) +# +# Gracefully stops ovs-vswitchd and ovsdb-server, checking their log files +# for messages with severity WARN or higher and signaling an error if any +# is present. The optional ALLOWLIST may contain shell-quoted "sed" +# commands to delete any warnings that are actually expected, e.g.: +# +# OVS_TRAFFIC_VSWITCHD_STOP(["/expected error/d"]) +# +# 'extra_cmds' are shell commands to be executed after OVS_VSWITCHD_STOP() is +# invoked. They can be used to perform additional cleanups such as name space +# removal. +m4_define([OVS_TRAFFIC_VSWITCHD_STOP], + [OVS_DPDK_STOP_VSWITCHD([$1]) + AT_CHECK([:; $2]) +]) + + +# Plug a veth into OVS via DPDK net/af_xdp. +m4_define([ADD_VETH], + [ AT_CHECK([ip link add $1 type veth peer name ovs-$1 || return 77]) + CONFIGURE_VETH_OFFLOADS([$1]) + AT_CHECK([ip link set $1 netns $2]) + AT_CHECK([ip link set dev ovs-$1 up]) + AT_CHECK([ovs-vsctl add-port $3 ovs-$1 -- \ + set interface ovs-$1 external-ids:iface-id="$1" -- \ + set interface ovs-$1 type=dpdk -- \ + set interface ovs-$1 options:dpdk-devargs=net_af_xdp$1,iface=ovs-$1]) + NS_CHECK_EXEC([$2], [ip addr add $4 dev $1 $7]) + NS_CHECK_EXEC([$2], [ip link set dev $1 up]) + if test -n "$5"; then + NS_CHECK_EXEC([$2], [ip link set dev $1 address $5]) + fi + if test -n "$6"; then + NS_CHECK_EXEC([$2], [ip route add default via $6]) + fi + on_exit 'ip link del ovs-$1' + ] +) + + +m4_define([OVS_CHECK_8021AD], + [AT_SKIP_IF([:])]) + + +m4_define([OVS_CHECK_TC_QDISC], + [AT_SKIP_IF([:])]) + + +m4_define([CONFIGURE_VETH_OFFLOADS], + [AT_CHECK([ethtool -K $1 tx off], [0], [ignore], [ignore]) + AT_CHECK([ethtool -K $1 txvlan off], [0], [ignore], [ignore])] +) diff --git a/tests/system-dpdk-testsuite.at b/tests/system-dpdk-testsuite.at index 382f09e9ff2..f61fbf9212a 100644 --- 
a/tests/system-dpdk-testsuite.at +++ b/tests/system-dpdk-testsuite.at @@ -20,6 +20,8 @@ m4_include([tests/ovs-macros.at]) m4_include([tests/ovsdb-macros.at]) m4_include([tests/ofproto-macros.at]) m4_include([tests/system-common-macros.at]) +m4_include([tests/system-userspace-macros.at]) m4_include([tests/system-dpdk-macros.at]) m4_include([tests/system-dpdk.at]) +m4_include([tests/system-traffic.at]) diff --git a/tests/system-dpdk.at b/tests/system-dpdk.at index fd7884e0f8c..1c97bf77720 100644 --- a/tests/system-dpdk.at +++ b/tests/system-dpdk.at @@ -1,17 +1,5 @@ -m4_define([CONFIGURE_VETH_OFFLOADS], - [AT_CHECK([ethtool -K $1 tx off], [0], [ignore], [ignore])]) - AT_BANNER([OVS-DPDK unit tests]) -m4_define([SYSTEM_DPDK_ALLOWED_LOGS],[ -\@does not exist. The Open vSwitch kernel module is probably not loaded.@d -\@does not support MTU configuration,@d -\@EAL: No \(available\|free\) .*hugepages reported@d -\@Failed to enable flow control@d -\@Rx checksum offload is not supported on@d -\@TELEMETRY: No legacy callbacks, legacy socket not created@d -]) - dnl CHECK_MEMPOOL_PARAM([mtu], [numa], [+line]) dnl dnl Waits for logs to indicate that the user has configured a mempool @@ -27,16 +15,39 @@ m4_define([CHECK_MEMPOOL_PARAM], [ | grep "User configured shared mempool set for: MTU $1, NUMA $2."]) ]) +dnl ADD_VHOST_USER_CLIENT_PORT([bridge], [port], [socket]) +dnl Add a dpdk vhost-user client port to a bridge and check this port is ready +dnl to be used by looking at the logs. +m4_define([ADD_VHOST_USER_CLIENT_PORT], [ + AT_CHECK([ovs-vsctl add-port $1 $2 -- \ + set Interface $2 type=dpdkvhostuserclient options:vhost-server-path=$3], + [], [stdout], [stderr]) + OVS_WAIT_UNTIL([grep "VHOST_CONFIG: ($3) vhost-user client: socket created" ovs-vswitchd.log]) + OVS_WAIT_UNTIL([grep "vHost User device '$2' created in 'client' mode, using client socket" ovs-vswitchd.log]) + OVS_WAIT_UNTIL([grep "VHOST_CONFIG: ($3) reconnecting..." 
ovs-vswitchd.log]) +]) + +dnl ADD_VHOST_USER_PORT([bridge], [port], [socket]) +dnl Add a dpdk vhost-user port to a bridge and check this port is ready +dnl to be used by looking at the logs. +m4_define([ADD_VHOST_USER_PORT], [ + AT_CHECK([ovs-vsctl add-port $1 $2 -- set Interface $2 type=dpdkvhostuser], [], + [stdout], [stderr]) + OVS_WAIT_UNTIL([grep "VHOST_CONFIG: ($3) vhost-user server: socket created" ovs-vswitchd.log]) + OVS_WAIT_UNTIL([grep "Socket $3 created for vhost-user port $2" ovs-vswitchd.log]) + OVS_WAIT_UNTIL([grep "VHOST_CONFIG: ($3) binding succeeded" ovs-vswitchd.log]) +]) + dnl -------------------------------------------------------------------------- dnl Check if EAL init is successful AT_SETUP([OVS-DPDK - EAL init]) AT_KEYWORDS([dpdk]) OVS_DPDK_PRE_CHECK() -OVS_DPDK_START() +OVS_DPDK_START([--no-pci]) AT_CHECK([grep "DPDK Enabled - initializing..." ovs-vswitchd.log], [], [stdout]) AT_CHECK([grep "EAL" ovs-vswitchd.log], [], [stdout]) AT_CHECK([grep "DPDK Enabled - initialized" ovs-vswitchd.log], [], [stdout]) -OVS_VSWITCHD_STOP("[SYSTEM_DPDK_ALLOWED_LOGS]") +OVS_DPDK_STOP_VSWITCHD AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -52,13 +63,13 @@ OVS_DPDK_START() dnl Add userspace bridge and attach it to OVS AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 phy0 -- set Interface phy0 type=dpdk options:dpdk-devargs=$(cat PCI_ADDR)], [], [stdout], [stderr]) +AT_CHECK([ovs-vsctl add-port br10 phy0 -- set Interface phy0 type=dpdk options:dpdk-devargs=$(cat DPDK_PCI_ADDR)], [], [stdout], [stderr]) AT_CHECK([ovs-vsctl show], [], [stdout]) sleep 2 dnl Clean up AT_CHECK([ovs-vsctl del-port br10 phy0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("[SYSTEM_DPDK_ALLOWED_LOGS]") +OVS_DPDK_STOP_VSWITCHD AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -69,24 +80,18 @@ dnl Add vhost-user-client port 
AT_SETUP([OVS-DPDK - add vhost-user-client port]) AT_KEYWORDS([dpdk]) OVS_DPDK_PRE_CHECK() -OVS_DPDK_START() +OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuserclient0 -- set Interface dpdkvhostuserclient0 type=dpdkvhostuserclient options:vhost-server-path=$OVS_RUNDIR/dpdkvhostclient0], [], [stdout], [stderr]) +ADD_VHOST_USER_CLIENT_PORT([br10], [dpdkvhostuserclient0], [$OVS_RUNDIR/dpdkvhostclient0]) AT_CHECK([ovs-vsctl show], [], [stdout]) sleep 2 -dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: $OVS_RUNDIR/dpdkvhostclient0: reconnecting..." ovs-vswitchd.log], [], [stdout]) - dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: failed to connect to $OVS_RUNDIR/dpdkvhostclient0: No such file or directory@d -])") +OVS_DPDK_STOP_VSWITCHD(["dnl +/VHOST_CONFIG: (.*dpdkvhostclient0) failed to connect: No such file or directory/d"]) AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -97,40 +102,22 @@ dnl Ping vhost-user port AT_SETUP([OVS-DPDK - ping vhost-user ports]) AT_KEYWORDS([dpdk]) OVS_DPDK_PRE_CHECK() -AT_SKIP_IF([! 
which dpdk-testpmd >/dev/null 2>/dev/null]) -OVS_DPDK_START() - -dnl Find number of sockets -AT_CHECK([lscpu], [], [stdout]) -AT_CHECK([cat stdout | grep "NUMA node(s)" | awk '{c=1; while (c++<$(3)) {printf "512,"}; print "512"}' > NUMA_NODE]) +OVS_DPDK_CHECK_TESTPMD() +OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuser0 -- set Interface dpdkvhostuser0 \ - type=dpdkvhostuser], [], - [stdout], [stderr]) +ADD_VHOST_USER_PORT([br10], [dpdkvhostuser0], [$OVS_RUNDIR/dpdkvhostuser0]) AT_CHECK([ovs-vsctl show], [], [stdout]) -dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: vhost-user server: socket created" \ - ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "Socket $OVS_RUNDIR/dpdkvhostuser0 created for vhost-user port dpdkvhostuser0" \ - ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: bind to $OVS_RUNDIR/dpdkvhostuser0" ovs-vswitchd.log], [], - [stdout]) - dnl Set up namespaces ADD_NAMESPACES(ns1, ns2) dnl Add veth device ADD_VETH(tap1, ns2, br10, "172.31.110.12/24") -dnl Execute testpmd in background -on_exit "pkill -f -x -9 'tail -f /dev/null'" -tail -f /dev/null | dpdk-testpmd --socket-mem="$(cat NUMA_NODE)" --no-pci\ - --vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostuser0" \ - --vdev="net_tap0,iface=tap0" --file-prefix page0 \ - --single-file-segments -- -a >$OVS_RUNDIR/testpmd-dpdkvhostuser0.log 2>&1 & +OVS_DPDK_START_TESTPMD([--vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostuser0" \ + --vdev="net_tap0,iface=tap0"]) OVS_WAIT_UNTIL([grep "virtio is now ready for processing" ovs-vswitchd.log]) OVS_WAIT_UNTIL([ip link show dev tap0 | grep -qw LOWER_UP]) @@ -151,17 +138,18 @@ AT_CHECK([ip netns exec ns2 ip link show], [], [stdout], [stderr]) AT_CHECK([ip netns exec ns1 ping -c 4 -I tap0 172.31.110.12], [], [stdout], [stderr]) -dnl Clean up the testpmd now -pkill -f -x -9 'tail -f /dev/null' 
+OVS_DPDK_STOP_TESTPMD() + +dnl Wait for vhost-user handling the socket disconnect. +OVS_WAIT_UNTIL([grep "vHost Device '$OVS_RUNDIR/dpdkvhostuser0' has been removed" ovs-vswitchd.log]) dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuser0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: recvmsg failed@d -\@VHOST_CONFIG: failed to connect to $OVS_RUNDIR/dpdkvhostuser0: No such file or directory@d -\@dpdkvhostuser ports are considered deprecated; please migrate to dpdkvhostuserclient ports.@d -\@failed to enumerate system datapaths: No such file or directory@d -])") +OVS_DPDK_STOP_VSWITCHD(["dnl +/VHOST_CONFIG: (.*dpdkvhostuser0) recvmsg failed/d +/VHOST_CONFIG: (.*dpdkvhostuser0) failed to connect: No such file or directory/d +/dpdkvhostuser ports are considered deprecated; please migrate to dpdkvhostuserclient ports./d +/failed to enumerate system datapaths: No such file or directory/d"]) AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -170,39 +158,22 @@ dnl Ping vhost-user-client port AT_SETUP([OVS-DPDK - ping vhost-user-client ports]) AT_KEYWORDS([dpdk]) OVS_DPDK_PRE_CHECK() -AT_SKIP_IF([! 
which dpdk-testpmd >/dev/null 2>/dev/null]) -OVS_DPDK_START() - -dnl Find number of sockets -AT_CHECK([lscpu], [], [stdout]) -AT_CHECK([cat stdout | grep "NUMA node(s)" | awk '{c=1; while (c++<$(3)) {printf "512,"}; print "512"}' > NUMA_NODE]) +OVS_DPDK_CHECK_TESTPMD() +OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuserclient0 -- set Interface \ - dpdkvhostuserclient0 \ - type=dpdkvhostuserclient \ - options:vhost-server-path=$OVS_RUNDIR/dpdkvhostclient0], [], - [stdout], [stderr]) +ADD_VHOST_USER_CLIENT_PORT([br10], [dpdkvhostuserclient0], [$OVS_RUNDIR/dpdkvhostclient0]) AT_CHECK([ovs-vsctl show], [], [stdout]) -dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: $OVS_RUNDIR/dpdkvhostclient0: reconnecting..." 
ovs-vswitchd.log], [], [stdout]) - dnl Set up namespaces ADD_NAMESPACES(ns1, ns2) dnl Add veth device ADD_VETH(tap1, ns2, br10, "172.31.110.12/24") -dnl Execute testpmd in background -on_exit "pkill -f -x -9 'tail -f /dev/null'" -tail -f /dev/null | dpdk-testpmd --socket-mem="$(cat NUMA_NODE)" --no-pci\ - --vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,server=1" \ - --vdev="net_tap0,iface=tap0" --file-prefix page0 \ - --single-file-segments -- -a >$OVS_RUNDIR/testpmd-dpdkvhostuserclient0.log 2>&1 & +OVS_DPDK_START_TESTPMD([--vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,queues=2,server=1" \ + --vdev="net_tap0,iface=tap0" -- --nb-cores 2 --rxq 2 --txq 2]) OVS_WAIT_UNTIL([grep "virtio is now ready for processing" ovs-vswitchd.log]) OVS_WAIT_UNTIL([ip link show dev tap0 | grep -qw LOWER_UP]) @@ -220,20 +191,40 @@ AT_CHECK([ip netns exec ns1 ip addr add 172.31.110.11/24 dev tap0], [], AT_CHECK([ip netns exec ns1 ip link show], [], [stdout], [stderr]) AT_CHECK([ip netns exec ns2 ip link show], [], [stdout], [stderr]) -AT_CHECK([ip netns exec ns1 ping -c 4 -I tap0 172.31.110.12], [], [stdout], +AT_CHECK([ip netns exec ns1 ping -i 0.1 -c 10 -I tap0 172.31.110.12], [], [stdout], [stderr]) -dnl Clean up the testpmd now -pkill -f -x -9 'tail -f /dev/null' +AT_CHECK([ip netns exec ns1 ip link set tap0 down], [], [stdout], [stderr]) + +# Wait for stats to be queried ("stats-update-interval") +sleep 5 +AT_CHECK([ovs-vsctl get interface dpdkvhostuserclient0 statistics], [], [stdout], [stderr]) + +AT_CHECK([test `ovs-vsctl get interface dpdkvhostuserclient0 statistics:rx_packets` -gt 0 -a dnl + `ovs-vsctl get interface dpdkvhostuserclient0 statistics:rx_bytes` -gt 0]) +AT_CHECK([test `ovs-vsctl get interface dpdkvhostuserclient0 statistics:rx_packets` -eq dnl + $((`ovs-vsctl get interface dpdkvhostuserclient0 statistics:rx_q0_good_packets` + dnl + `ovs-vsctl get interface dpdkvhostuserclient0 statistics:rx_q1_good_packets`))]) +AT_CHECK([test `ovs-vsctl get 
interface dpdkvhostuserclient0 statistics:rx_bytes` -eq dnl + $((`ovs-vsctl get interface dpdkvhostuserclient0 statistics:rx_q0_good_bytes` + dnl + `ovs-vsctl get interface dpdkvhostuserclient0 statistics:rx_q1_good_bytes`))]) + +AT_CHECK([test `ovs-vsctl get interface dpdkvhostuserclient0 statistics:tx_packets` -gt 0 -a dnl + `ovs-vsctl get interface dpdkvhostuserclient0 statistics:tx_bytes` -gt 0]) +AT_CHECK([test `ovs-vsctl get interface dpdkvhostuserclient0 statistics:tx_packets` -eq dnl + $((`ovs-vsctl get interface dpdkvhostuserclient0 statistics:tx_q0_good_packets` + dnl + `ovs-vsctl get interface dpdkvhostuserclient0 statistics:tx_q1_good_packets`))]) +AT_CHECK([test `ovs-vsctl get interface dpdkvhostuserclient0 statistics:tx_bytes` -eq dnl + $((`ovs-vsctl get interface dpdkvhostuserclient0 statistics:tx_q0_good_bytes` + dnl + `ovs-vsctl get interface dpdkvhostuserclient0 statistics:tx_q1_good_bytes`))]) dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: recvmsg failed@d -\@VHOST_CONFIG: failed to connect to $OVS_RUNDIR/dpdkvhostclient0: No such file or directory@d -\@dpdkvhostuser ports are considered deprecated; please migrate to dpdkvhostuserclient ports.@d -\@failed to enumerate system datapaths: No such file or directory@d -])") +OVS_DPDK_STOP_TESTPMD() +OVS_DPDK_STOP_VSWITCHD(["dnl +/VHOST_CONFIG: (.*dpdkvhostclient0) recvmsg failed/d +/VHOST_CONFIG: (.*dpdkvhostclient0) failed to connect: No such file or directory/d +/failed to enumerate system datapaths: No such file or directory/d"]) AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -249,7 +240,7 @@ OVS_DPDK_START() dnl Add userspace bridge and attach it to OVS and add policer AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 phy0 -- set Interface phy0 type=dpdk 
options:dpdk-devargs=$(cat PCI_ADDR)], [], [stdout], [stderr]) +AT_CHECK([ovs-vsctl add-port br10 phy0 -- set Interface phy0 type=dpdk options:dpdk-devargs=$(cat DPDK_PCI_ADDR)], [], [stdout], [stderr]) AT_CHECK([ovs-vsctl set interface phy0 ingress_policing_rate=10000 ingress_policing_burst=1000]) AT_CHECK([ovs-vsctl show], [], [stdout]) sleep 2 @@ -269,7 +260,7 @@ AT_CHECK([grep -E 'ingress_policing_rate: 0' stdout], [], [stdout]) dnl Clean up AT_CHECK([ovs-vsctl del-port br10 phy0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("[SYSTEM_DPDK_ALLOWED_LOGS]") +OVS_DPDK_STOP_VSWITCHD AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -281,11 +272,11 @@ AT_SETUP([OVS-DPDK - Ingress policing create delete vport port]) AT_KEYWORDS([dpdk]) OVS_DPDK_PRE_CHECK() -OVS_DPDK_START() +OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS and add ingress policer AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuserclient0 -- set Interface dpdkvhostuserclient0 type=dpdkvhostuserclient options:vhost-server-path=$OVS_RUNDIR/dpdkvhostclient0], [], [stdout], [stderr]) +ADD_VHOST_USER_CLIENT_PORT([br10], [dpdkvhostuserclient0], [$OVS_RUNDIR/dpdkvhostclient0]) AT_CHECK([ovs-vsctl set interface dpdkvhostuserclient0 ingress_policing_rate=10000 ingress_policing_burst=1000]) AT_CHECK([ovs-vsctl show], [], [stdout]) sleep 2 @@ -303,16 +294,10 @@ AT_CHECK([grep -E 'ingress_policing_burst: 0' stdout], [], [stdout]) AT_CHECK([ovs-vsctl list interface dpdkvhostuserclient0], [], [stdout]) AT_CHECK([grep -E 'ingress_policing_rate: 0' stdout], [], [stdout]) -dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: $OVS_RUNDIR/dpdkvhostclient0: 
reconnecting..." ovs-vswitchd.log], [], [stdout]) - dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: failed to connect to $OVS_RUNDIR/dpdkvhostclient0: No such file or directory@d -])") +OVS_DPDK_STOP_VSWITCHD(["dnl +/VHOST_CONFIG: (.*dpdkvhostclient0) failed to connect: No such file or directory/d"]) AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -324,11 +309,11 @@ AT_SETUP([OVS-DPDK - Ingress policing no policing rate]) AT_KEYWORDS([dpdk]) OVS_DPDK_PRE_CHECK() -OVS_DPDK_START() +OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS and add ingress policer AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuserclient0 -- set Interface dpdkvhostuserclient0 type=dpdkvhostuserclient options:vhost-server-path=$OVS_RUNDIR/dpdkvhostclient0], [], [stdout], [stderr]) +ADD_VHOST_USER_CLIENT_PORT([br10], [dpdkvhostuserclient0], [$OVS_RUNDIR/dpdkvhostclient0]) AT_CHECK([ovs-vsctl set interface dpdkvhostuserclient0 ingress_policing_burst=1000]) AT_CHECK([ovs-vsctl show], [], [stdout]) sleep 2 @@ -343,17 +328,10 @@ AT_CHECK([grep -E 'ingress_policing_burst: 1000' stdout], [], [stdout]) AT_CHECK([ovs-vsctl list interface dpdkvhostuserclient0], [], [stdout]) AT_CHECK([grep -E 'ingress_policing_rate: 0' stdout], [], [stdout]) - -dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: $OVS_RUNDIR/dpdkvhostclient0: reconnecting..." 
ovs-vswitchd.log], [], [stdout]) - dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: failed to connect to $OVS_RUNDIR/dpdkvhostclient0: No such file or directory@d -])") +OVS_DPDK_STOP_VSWITCHD(["dnl +/VHOST_CONFIG: (.*dpdkvhostclient0) failed to connect: No such file or directory/d"]) AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -365,11 +343,11 @@ AT_SETUP([OVS-DPDK - Ingress policing no policing burst]) AT_KEYWORDS([dpdk]) OVS_DPDK_PRE_CHECK() -OVS_DPDK_START() +OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS and add ingress policer AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuserclient0 -- set Interface dpdkvhostuserclient0 type=dpdkvhostuserclient options:vhost-server-path=$OVS_RUNDIR/dpdkvhostclient0], [], [stdout], [stderr]) +ADD_VHOST_USER_CLIENT_PORT([br10], [dpdkvhostuserclient0], [$OVS_RUNDIR/dpdkvhostclient0]) AT_CHECK([ovs-vsctl set interface dpdkvhostuserclient0 ingress_policing_rate=10000]) AT_CHECK([ovs-vsctl show], [], [stdout]) sleep 2 @@ -384,16 +362,10 @@ AT_CHECK([grep -E 'ingress_policing_burst: 0' stdout], [], [stdout]) AT_CHECK([ovs-vsctl list interface dpdkvhostuserclient0], [], [stdout]) AT_CHECK([grep -E 'ingress_policing_rate: 10000' stdout], [], [stdout]) -dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: $OVS_RUNDIR/dpdkvhostclient0: reconnecting..." 
ovs-vswitchd.log], [], [stdout]) - dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: failed to connect to $OVS_RUNDIR/dpdkvhostclient0: No such file or directory@d -])") +OVS_DPDK_STOP_VSWITCHD(["dnl +/VHOST_CONFIG: (.*dpdkvhostclient0) failed to connect: No such file or directory/d"]) AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -408,7 +380,7 @@ OVS_DPDK_START() dnl Add userspace bridge and attach it to OVS and add egress policer AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 phy0 -- set Interface phy0 type=dpdk options:dpdk-devargs=$(cat PCI_ADDR)], [], [stdout], [stderr]) +AT_CHECK([ovs-vsctl add-port br10 phy0 -- set Interface phy0 type=dpdk options:dpdk-devargs=$(cat DPDK_PCI_ADDR)], [], [stdout], [stderr]) OVS_WAIT_UNTIL([ovs-vsctl set port phy0 qos=@newqos -- --id=@newqos create qos type=egress-policer other-config:cir=1250000 other-config:cbs=2048]) AT_CHECK([ovs-appctl -t ovs-vswitchd qos/show phy0], [], [stdout]) sleep 2 @@ -425,7 +397,7 @@ AT_CHECK([grep -E 'QoS not configured on phy0' stdout], [], [stdout]) dnl Clean up AT_CHECK([ovs-vsctl del-port br10 phy0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("[SYSTEM_DPDK_ALLOWED_LOGS]") +OVS_DPDK_STOP_VSWITCHD AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -437,21 +409,16 @@ AT_SETUP([OVS-DPDK - QoS create delete vport port]) AT_KEYWORDS([dpdk]) OVS_DPDK_PRE_CHECK() -OVS_DPDK_START() +OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS and add egress policer AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuserclient0 -- set Interface dpdkvhostuserclient0 type=dpdkvhostuserclient options:vhost-server-path=$OVS_RUNDIR/dpdkvhostclient0], [], 
[stdout], [stderr]) +ADD_VHOST_USER_CLIENT_PORT([br10], [dpdkvhostuserclient0], [$OVS_RUNDIR/dpdkvhostclient0]) OVS_WAIT_UNTIL([ovs-vsctl set port dpdkvhostuserclient0 qos=@newqos -- --id=@newqos create qos type=egress-policer other-config:cir=1250000 \ other-config:cbs=2048]) AT_CHECK([ovs-appctl -t ovs-vswitchd qos/show dpdkvhostuserclient0], [], [stdout]) sleep 2 -dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: $OVS_RUNDIR/dpdkvhostclient0: reconnecting..." ovs-vswitchd.log], [], [stdout]) - dnl Fail if egress policer could not be created AT_FAIL_IF([grep "Could not create rte meter for egress policer" ovs-vswitchd.log], [], [stdout]) @@ -464,9 +431,8 @@ AT_CHECK([grep -E 'QoS not configured on dpdkvhostuserclient0' stdout], [], [std dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: failed to connect to $OVS_RUNDIR/dpdkvhostclient0: No such file or directory@d -])") +OVS_DPDK_STOP_VSWITCHD(["dnl +/VHOST_CONFIG: (.*dpdkvhostclient0) failed to connect: No such file or directory/d"]) AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -478,30 +444,24 @@ AT_SETUP([OVS-DPDK - QoS no cir]) AT_KEYWORDS([dpdk]) OVS_DPDK_PRE_CHECK() -OVS_DPDK_START() +OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS and add egress policer AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuserclient0 -- set Interface dpdkvhostuserclient0 type=dpdkvhostuserclient options:vhost-server-path=$OVS_RUNDIR/dpdkvhostclient0], [], [stdout], [stderr]) +ADD_VHOST_USER_CLIENT_PORT([br10], [dpdkvhostuserclient0], 
[$OVS_RUNDIR/dpdkvhostclient0]) OVS_WAIT_UNTIL([ovs-vsctl set port dpdkvhostuserclient0 qos=@newqos -- --id=@newqos create qos type=egress-policer other-config:cbs=2048]) sleep 2 -dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: $OVS_RUNDIR/dpdkvhostclient0: reconnecting..." ovs-vswitchd.log], [], [stdout]) - dnl Check egress policer was not created AT_CHECK([ovs-appctl -t ovs-vswitchd qos/show dpdkvhostuserclient0], [], [stdout]) AT_CHECK([grep -E 'QoS not configured on dpdkvhostuserclient0' stdout], [], [stdout]) dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: failed to connect to $OVS_RUNDIR/dpdkvhostclient0: No such file or directory@d -\@Could not create rte meter for egress policer@d -\@Failed to set QoS type egress-policer on port dpdkvhostuserclient0: Invalid argument@d -])") +OVS_DPDK_STOP_VSWITCHD(["dnl +/VHOST_CONFIG: (.*dpdkvhostclient0) failed to connect: No such file or directory/d +/Could not create rte meter for egress policer/d +/Failed to set QoS type egress-policer on port dpdkvhostuserclient0: Invalid argument/d"]) AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -513,30 +473,24 @@ AT_SETUP([OVS-DPDK - QoS no cbs]) AT_KEYWORDS([dpdk]) OVS_DPDK_PRE_CHECK() -OVS_DPDK_START() +OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS and add egress policer AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuserclient0 -- set Interface dpdkvhostuserclient0 type=dpdkvhostuserclient options:vhost-server-path=$OVS_RUNDIR/dpdkvhostclient0], [], [stdout], [stderr]) 
+ADD_VHOST_USER_CLIENT_PORT([br10], [dpdkvhostuserclient0], [$OVS_RUNDIR/dpdkvhostclient0]) OVS_WAIT_UNTIL([ovs-vsctl set port dpdkvhostuserclient0 qos=@newqos -- --id=@newqos create qos type=egress-policer other-config:cir=1250000]) sleep 2 -dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: $OVS_RUNDIR/dpdkvhostclient0: reconnecting..." ovs-vswitchd.log], [], [stdout]) - dnl Check egress policer was not created AT_CHECK([ovs-appctl -t ovs-vswitchd qos/show dpdkvhostuserclient0], [], [stdout]) AT_CHECK([grep -E 'QoS not configured on dpdkvhostuserclient0' stdout], [], [stdout]) dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: failed to connect to $OVS_RUNDIR/dpdkvhostclient0: No such file or directory@d -\@Could not create rte meter for egress policer@d -\@Failed to set QoS type egress-policer on port dpdkvhostuserclient0: Invalid argument@d -])") +OVS_DPDK_STOP_VSWITCHD(["dnl +/VHOST_CONFIG: (.*dpdkvhostclient0) failed to connect: No such file or directory/d +/Could not create rte meter for egress policer/d +/Failed to set QoS type egress-policer on port dpdkvhostuserclient0: Invalid argument/d"]) AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -555,16 +509,15 @@ dnl First set MTU to its default value and confirm that value, then increase the dnl Add userspace bridge and attach it to OVS with default MTU value AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 phy0 -- set Interface phy0 type=dpdk options:dpdk-devargs=$(cat PCI_ADDR)], [], [stdout], [stderr]) +AT_CHECK([ovs-vsctl add-port br10 phy0 -- set 
Interface phy0 type=dpdk options:dpdk-devargs=$(cat DPDK_PCI_ADDR)], [], [stdout], [stderr]) AT_CHECK([ovs-vsctl show], [], [stdout]) -sleep 2 dnl Check default MTU value in the datapath -AT_CHECK([ovs-appctl dpctl/show], [], [stdout]) -AT_CHECK([grep -E 'mtu=1500' stdout], [], [stdout]) +OVS_WAIT_UNTIL_EQUAL([ovs-vsctl get Interface phy0 mtu], [1500]) dnl Increase MTU value and check in the datapath AT_CHECK([ovs-vsctl set Interface phy0 mtu_request=9000]) +OVS_WAIT_UNTIL_EQUAL([ovs-vsctl get Interface phy0 mtu], [9000]) dnl Fail if MTU is not supported AT_FAIL_IF([grep "Interface phy0 does not support MTU configuration" ovs-vswitchd.log], [], [stdout]) @@ -572,13 +525,10 @@ AT_FAIL_IF([grep "Interface phy0 does not support MTU configuration" ovs-vswitch dnl Fail if error is encountered during MTU setup AT_FAIL_IF([grep "Interface phy0 MTU (9000) setup error" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([ovs-appctl dpctl/show], [], [stdout]) -AT_CHECK([grep -E 'mtu=9000' stdout], [], [stdout]) - dnl Clean up AT_CHECK([ovs-vsctl del-port br10 phy0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("[SYSTEM_DPDK_ALLOWED_LOGS]") +OVS_DPDK_STOP_VSWITCHD AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -596,10 +546,12 @@ dnl First set an increased MTU value and confirm that value, then decrease the M dnl Add userspace bridge and attach it to OVS and modify MTU value AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 phy0 -- set Interface phy0 type=dpdk options:dpdk-devargs=$(cat PCI_ADDR)], [], [stdout], [stderr]) +AT_CHECK([ovs-vsctl add-port br10 phy0 -- set Interface phy0 type=dpdk options:dpdk-devargs=$(cat DPDK_PCI_ADDR)], [], [stdout], [stderr]) AT_CHECK([ovs-vsctl set Interface phy0 mtu_request=9000]) AT_CHECK([ovs-vsctl show], [], [stdout]) -sleep 2 + +dnl Check MTU value in the datapath +OVS_WAIT_UNTIL_EQUAL([ovs-vsctl get Interface phy0 mtu], [9000]) dnl Fail if MTU 
is not supported AT_FAIL_IF([grep "Interface phy0 does not support MTU configuration" ovs-vswitchd.log], [], [stdout]) @@ -607,20 +559,14 @@ AT_FAIL_IF([grep "Interface phy0 does not support MTU configuration" ovs-vswitch dnl Fail if error is encountered during MTU setup AT_FAIL_IF([grep "Interface phy0 MTU (9000) setup error" ovs-vswitchd.log], [], [stdout]) -dnl Check MTU value in the datapath -AT_CHECK([ovs-appctl dpctl/show], [], [stdout]) -AT_CHECK([grep -E 'mtu=9000' stdout], [], [stdout]) - dnl Decrease MTU value and check in the datapath AT_CHECK([ovs-vsctl set Interface phy0 mtu_request=2000]) - -AT_CHECK([ovs-appctl dpctl/show], [], [stdout]) -AT_CHECK([grep -E 'mtu=2000' stdout], [], [stdout]) +OVS_WAIT_UNTIL_EQUAL([ovs-vsctl get Interface phy0 mtu], [2000]) dnl Clean up AT_CHECK([ovs-vsctl del-port br10 phy0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("[SYSTEM_DPDK_ALLOWED_LOGS]") +OVS_DPDK_STOP_VSWITCHD AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -631,52 +577,37 @@ dnl MTU increase vport port AT_SETUP([OVS-DPDK - MTU increase vport port]) AT_KEYWORDS([dpdk]) -AT_SKIP_IF([! 
which dpdk-testpmd >/dev/null 2>/dev/null]) +OVS_DPDK_CHECK_TESTPMD() OVS_DPDK_PRE_CHECK() -OVS_DPDK_START() - -dnl Find number of sockets -AT_CHECK([lscpu], [], [stdout]) -AT_CHECK([cat stdout | grep "NUMA node(s)" | awk '{c=1; while (c++<$(3)) {printf "512,"}; print "512"}' > NUMA_NODE]) +OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS with default MTU value AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuserclient0 -- set Interface dpdkvhostuserclient0 type=dpdkvhostuserclient options:vhost-server-path=$OVS_RUNDIR/dpdkvhostclient0], [], [stdout], [stderr]) +ADD_VHOST_USER_CLIENT_PORT([br10], [dpdkvhostuserclient0], [$OVS_RUNDIR/dpdkvhostclient0]) AT_CHECK([ovs-vsctl show], [], [stdout]) sleep 2 -dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: $OVS_RUNDIR/dpdkvhostclient0: reconnecting..." 
ovs-vswitchd.log], [], [stdout]) - -dnl Execute testpmd in background -on_exit "pkill -f -x -9 'tail -f /dev/null'" -tail -f /dev/null | dpdk-testpmd --socket-mem="$(cat NUMA_NODE)" --no-pci\ - --vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,server=1" \ - --vdev="net_tap0,iface=tap0" --file-prefix page0 \ - --single-file-segments -- -a >$OVS_RUNDIR/testpmd-dpdkvhostuserclient0.log 2>&1 & - +OVS_DPDK_START_TESTPMD([--vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,server=1"]) OVS_WAIT_UNTIL([grep "virtio is now ready for processing" ovs-vswitchd.log]) +OVS_WAIT_UNTIL([ovs-vsctl get Interface dpdkvhostuserclient0 link_state | grep -w up]) dnl Check default MTU value in the datapath -AT_CHECK([ovs-appctl dpctl/show], [], [stdout]) -AT_CHECK([grep -E 'mtu=1500' stdout], [], [stdout]) +AT_CHECK([ovs-vsctl get Interface dpdkvhostuserclient0 mtu], [0], [dnl +1500 +]) dnl Increase MTU value and check in the datapath AT_CHECK([ovs-vsctl set Interface dpdkvhostuserclient0 mtu_request=9000]) -AT_CHECK([ovs-appctl dpctl/show], [], [stdout]) -AT_CHECK([grep -E 'mtu=9000' stdout], [], [stdout]) - -dnl Clean up the testpmd now -pkill -f -x -9 'tail -f /dev/null' +AT_CHECK([ovs-vsctl get Interface dpdkvhostuserclient0 mtu], [0], [dnl +9000 +]) dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: failed to connect to $OVS_RUNDIR/dpdkvhostclient0: No such file or directory@d -])") +OVS_DPDK_STOP_TESTPMD() +OVS_DPDK_STOP_VSWITCHD(["dnl +/VHOST_CONFIG: (.*dpdkvhostclient0) failed to connect: No such file or directory/d"]) AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -687,53 +618,38 @@ dnl MTU decrease vport port AT_SETUP([OVS-DPDK - MTU decrease vport port]) AT_KEYWORDS([dpdk]) -AT_SKIP_IF([! 
which dpdk-testpmd >/dev/null 2>/dev/null]) +OVS_DPDK_CHECK_TESTPMD() OVS_DPDK_PRE_CHECK() -OVS_DPDK_START() - -dnl Find number of sockets -AT_CHECK([lscpu], [], [stdout]) -AT_CHECK([cat stdout | grep "NUMA node(s)" | awk '{c=1; while (c++<$(3)) {printf "512,"}; print "512"}' > NUMA_NODE]) +OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS and modify MTU value AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuserclient0 -- set Interface dpdkvhostuserclient0 type=dpdkvhostuserclient options:vhost-server-path=$OVS_RUNDIR/dpdkvhostclient0], [], [stdout], [stderr]) +ADD_VHOST_USER_CLIENT_PORT([br10], [dpdkvhostuserclient0], [$OVS_RUNDIR/dpdkvhostclient0]) AT_CHECK([ovs-vsctl set Interface dpdkvhostuserclient0 mtu_request=9000]) AT_CHECK([ovs-vsctl show], [], [stdout]) sleep 2 -dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: $OVS_RUNDIR/dpdkvhostclient0: reconnecting..." 
ovs-vswitchd.log], [], [stdout]) - -dnl Execute testpmd in background -on_exit "pkill -f -x -9 'tail -f /dev/null'" -tail -f /dev/null | dpdk-testpmd --socket-mem="$(cat NUMA_NODE)" --no-pci\ - --vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,server=1" \ - --vdev="net_tap0,iface=tap0" --file-prefix page0 \ - --single-file-segments -- -a >$OVS_RUNDIR/testpmd-dpdkvhostuserclient0.log 2>&1 & - +OVS_DPDK_START_TESTPMD([--vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,server=1"]) OVS_WAIT_UNTIL([grep "virtio is now ready for processing" ovs-vswitchd.log]) +OVS_WAIT_UNTIL([ovs-vsctl get Interface dpdkvhostuserclient0 link_state | grep -w up]) dnl Check MTU value in the datapath -AT_CHECK([ovs-appctl dpctl/show], [], [stdout]) -AT_CHECK([grep -E 'mtu=9000' stdout], [], [stdout]) +AT_CHECK([ovs-vsctl get Interface dpdkvhostuserclient0 mtu], [0], [dnl +9000 +]) dnl Decrease MTU value and check in the datapath AT_CHECK([ovs-vsctl set Interface dpdkvhostuserclient0 mtu_request=2000]) -AT_CHECK([ovs-appctl dpctl/show], [], [stdout]) -AT_CHECK([grep -E 'mtu=2000' stdout], [], [stdout]) - -dnl Clean up the testpmd now -pkill -f -x -9 'tail -f /dev/null' +AT_CHECK([ovs-vsctl get Interface dpdkvhostuserclient0 mtu], [0], [dnl +2000 +]) dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: failed to connect to $OVS_RUNDIR/dpdkvhostclient0: No such file or directory@d -])") +OVS_DPDK_STOP_TESTPMD() +OVS_DPDK_STOP_VSWITCHD(["dnl +/VHOST_CONFIG: (.*dpdkvhostclient0) failed to connect: No such file or directory/d"]) AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -749,10 +665,12 @@ OVS_DPDK_START() dnl Add userspace bridge and attach it to OVS and set MTU value to max upper bound AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 phy0 -- set 
Interface phy0 type=dpdk options:dpdk-devargs=$(cat PCI_ADDR)], [], [stdout], [stderr]) +AT_CHECK([ovs-vsctl add-port br10 phy0 -- set Interface phy0 type=dpdk options:dpdk-devargs=$(cat DPDK_PCI_ADDR)], [], [stdout], [stderr]) AT_CHECK([ovs-vsctl set Interface phy0 mtu_request=9702]) AT_CHECK([ovs-vsctl show], [], [stdout]) -sleep 2 + +dnl Check MTU value in the datapath +OVS_WAIT_UNTIL_EQUAL([ovs-vsctl get Interface phy0 mtu], [9702]) dnl Fail if MTU is not supported AT_FAIL_IF([grep "Interface phy0 does not support MTU configuration" ovs-vswitchd.log], [], [stdout]) @@ -760,10 +678,6 @@ AT_FAIL_IF([grep "Interface phy0 does not support MTU configuration" ovs-vswitch dnl Fail if error is encountered during MTU setup AT_FAIL_IF([grep "Interface phy0 MTU (9702) setup error" ovs-vswitchd.log], [], [stdout]) -dnl Check MTU value in the datapath -AT_CHECK([ovs-appctl dpctl/show], [], [stdout]) -AT_CHECK([grep -E 'mtu=9702' stdout], [], [stdout]) - dnl Set MTU value above upper bound and check for error AT_CHECK([ovs-vsctl set Interface phy0 mtu_request=9711]) AT_CHECK([grep "phy0: unsupported MTU 9711" ovs-vswitchd.log], [], [stdout]) @@ -771,10 +685,9 @@ AT_CHECK([grep "phy0: unsupported MTU 9711" ovs-vswitchd.log], [], [stdout]) dnl Clean up AT_CHECK([ovs-vsctl del-port br10 phy0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@phy0: unsupported MTU 9711@d -\@failed to set MTU for network device phy0: Invalid argument@d -])") +OVS_DPDK_STOP_VSWITCHD(["dnl +/phy0: unsupported MTU 9711/d +/failed to set MTU for network device phy0: Invalid argument/d"]) AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -790,10 +703,12 @@ OVS_DPDK_START() dnl Add userspace bridge and attach it to OVS and set MTU value to min lower bound AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 phy0 -- set Interface phy0 type=dpdk 
options:dpdk-devargs=$(cat PCI_ADDR)], [], [stdout], [stderr]) +AT_CHECK([ovs-vsctl add-port br10 phy0 -- set Interface phy0 type=dpdk options:dpdk-devargs=$(cat DPDK_PCI_ADDR)], [], [stdout], [stderr]) AT_CHECK([ovs-vsctl set Interface phy0 mtu_request=68]) AT_CHECK([ovs-vsctl show], [], [stdout]) -sleep 2 + +dnl Check MTU value in the datapath +OVS_WAIT_UNTIL_EQUAL([ovs-vsctl get Interface phy0 mtu], [68]) dnl Fail if MTU is not supported AT_FAIL_IF([grep "Interface phy0 does not support MTU configuration" ovs-vswitchd.log], [], [stdout]) @@ -801,10 +716,6 @@ AT_FAIL_IF([grep "Interface phy0 does not support MTU configuration" ovs-vswitch dnl Fail if error is encountered during MTU setup AT_FAIL_IF([grep "Interface phy0 MTU (68) setup error" ovs-vswitchd.log], [], [stdout]) -dnl Check MTU value in the datapath -AT_CHECK([ovs-appctl dpctl/show], [], [stdout]) -AT_CHECK([grep -E 'mtu=68' stdout], [], [stdout]) - dnl Set MTU value below lower bound and check for error AT_CHECK([ovs-vsctl set Interface phy0 mtu_request=67]) AT_CHECK([grep "phy0: unsupported MTU 67" ovs-vswitchd.log], [], [stdout]) @@ -812,10 +723,9 @@ AT_CHECK([grep "phy0: unsupported MTU 67" ovs-vswitchd.log], [], [stdout]) dnl Clean up AT_CHECK([ovs-vsctl del-port br10 phy0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@phy0: unsupported MTU 67@d -\@failed to set MTU for network device phy0: Invalid argument@d -])") +OVS_DPDK_STOP_VSWITCHD(["dnl +/phy0: unsupported MTU 67/d +/failed to set MTU for network device phy0: Invalid argument/d"]) AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -826,48 +736,38 @@ dnl MTU upper bound vport port AT_SETUP([OVS-DPDK - MTU upper bound vport port]) AT_KEYWORDS([dpdk]) -AT_SKIP_IF([! 
which dpdk-testpmd >/dev/null 2>/dev/null]) +OVS_DPDK_CHECK_TESTPMD() OVS_DPDK_PRE_CHECK() -OVS_DPDK_START() - -dnl Find number of sockets -AT_CHECK([lscpu], [], [stdout]) -AT_CHECK([cat stdout | grep "NUMA node(s)" | awk '{c=1; while (c++<$(3)) {printf "512,"}; print "512"}' > NUMA_NODE]) +OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS and set MTU value to max upper bound AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuserclient0 -- set Interface dpdkvhostuserclient0 type=dpdkvhostuserclient options:vhost-server-path=$OVS_RUNDIR/dpdkvhostclient0], [], [stdout], [stderr]) +ADD_VHOST_USER_CLIENT_PORT([br10], [dpdkvhostuserclient0], [$OVS_RUNDIR/dpdkvhostclient0]) AT_CHECK([ovs-vsctl set Interface dpdkvhostuserclient0 mtu_request=9702]) AT_CHECK([ovs-vsctl show], [], [stdout]) sleep 2 -dnl Execute testpmd in background -on_exit "pkill -f -x -9 'tail -f /dev/null'" -tail -f /dev/null | dpdk-testpmd --socket-mem="$(cat NUMA_NODE)" --no-pci\ - --vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,server=1" \ - --vdev="net_tap0,iface=tap0" --file-prefix page0 \ - --single-file-segments -- -a >$OVS_RUNDIR/testpmd-dpdkvhostuserclient0.log 2>&1 & +OVS_DPDK_START_TESTPMD([--vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,server=1"]) OVS_WAIT_UNTIL([grep "virtio is now ready for processing" ovs-vswitchd.log]) +OVS_WAIT_UNTIL([ovs-vsctl get Interface dpdkvhostuserclient0 link_state | grep -w up]) dnl Check MTU value in the datapath -AT_CHECK([ovs-appctl dpctl/show], [], [stdout]) -AT_CHECK([grep -E 'mtu=9702' stdout], [], [stdout]) +AT_CHECK([ovs-vsctl get Interface dpdkvhostuserclient0 mtu], [0], [dnl +9702 +]) dnl Set MTU value above upper bound and check for error AT_CHECK([ovs-vsctl set Interface dpdkvhostuserclient0 mtu_request=9711]) AT_CHECK([grep "dpdkvhostuserclient0: unsupported MTU 9711" ovs-vswitchd.log], [], [stdout]) -dnl Clean up the testpmd now -pkill -f 
-x -9 'tail -f /dev/null' - dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: failed to connect to $OVS_RUNDIR/dpdkvhostclient0: No such file or directory@d -\@dpdkvhostuserclient0: unsupported MTU 9711@d -\@failed to set MTU for network device dpdkvhostuserclient0: Invalid argument@d -])") +OVS_DPDK_STOP_TESTPMD() +OVS_DPDK_STOP_VSWITCHD(["dnl +/VHOST_CONFIG: (.*dpdkvhostclient0) failed to connect: No such file or directory/d +/dpdkvhostuserclient0: unsupported MTU 9711/d +/failed to set MTU for network device dpdkvhostuserclient0: Invalid argument/d"]) AT_CLEANUP dnl -------------------------------------------------------------------------- @@ -878,254 +778,48 @@ dnl MTU lower bound vport port AT_SETUP([OVS-DPDK - MTU lower bound vport port]) AT_KEYWORDS([dpdk]) -AT_SKIP_IF([! which dpdk-testpmd >/dev/null 2>/dev/null]) +OVS_DPDK_CHECK_TESTPMD() OVS_DPDK_PRE_CHECK() -OVS_DPDK_START() - -dnl Find number of sockets -AT_CHECK([lscpu], [], [stdout]) -AT_CHECK([cat stdout | grep "NUMA node(s)" | awk '{c=1; while (c++<$(3)) {printf "512,"}; print "512"}' > NUMA_NODE]) +OVS_DPDK_START([--no-pci]) dnl Add userspace bridge and attach it to OVS and set MTU value to min lower bound AT_CHECK([ovs-vsctl add-br br10 -- set bridge br10 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br10 dpdkvhostuserclient0 -- set Interface dpdkvhostuserclient0 type=dpdkvhostuserclient options:vhost-server-path=$OVS_RUNDIR/dpdkvhostclient0], [], [stdout], [stderr]) +ADD_VHOST_USER_CLIENT_PORT([br10], [dpdkvhostuserclient0], [$OVS_RUNDIR/dpdkvhostclient0]) AT_CHECK([ovs-vsctl set Interface dpdkvhostuserclient0 mtu_request=68]) AT_CHECK([ovs-vsctl show], [], [stdout]) sleep 2 -dnl Parse log file -AT_CHECK([grep "VHOST_CONFIG: vhost-user client: socket created" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "vHost User device 'dpdkvhostuserclient0' created in 'client' 
mode, using client socket" ovs-vswitchd.log], [], [stdout]) -AT_CHECK([grep "VHOST_CONFIG: $OVS_RUNDIR/dpdkvhostclient0: reconnecting..." ovs-vswitchd.log], [], [stdout]) - -dnl Execute testpmd in background -on_exit "pkill -f -x -9 'tail -f /dev/null'" -tail -f /dev/null | dpdk-testpmd --socket-mem="$(cat NUMA_NODE)" --no-pci\ - --vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,server=1" \ - --vdev="net_tap0,iface=tap0" --file-prefix page0 \ - --single-file-segments -- -a >$OVS_RUNDIR/testpmd-dpdkvhostuserclient0.log 2>&1 & +OVS_DPDK_START_TESTPMD([--vdev="net_virtio_user,path=$OVS_RUNDIR/dpdkvhostclient0,server=1"]) OVS_WAIT_UNTIL([grep "virtio is now ready for processing" ovs-vswitchd.log]) +OVS_WAIT_UNTIL([ovs-vsctl get Interface dpdkvhostuserclient0 link_state | grep -w up]) dnl Check MTU value in the datapath -AT_CHECK([ovs-appctl dpctl/show], [], [stdout]) -AT_CHECK([grep -E 'mtu=68' stdout], [], [stdout]) +AT_CHECK([ovs-vsctl get Interface dpdkvhostuserclient0 mtu], [0], [dnl +68 +]) dnl Set MTU value below lower bound and check for error AT_CHECK([ovs-vsctl set Interface dpdkvhostuserclient0 mtu_request=67]) AT_CHECK([grep "dpdkvhostuserclient0: unsupported MTU 67" ovs-vswitchd.log], [], [stdout]) -dnl Clean up the testpmd now -pkill -f -x -9 'tail -f /dev/null' - dnl Clean up AT_CHECK([ovs-vsctl del-port br10 dpdkvhostuserclient0], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@VHOST_CONFIG: failed to connect to $OVS_RUNDIR/dpdkvhostclient0: No such file or directory@d -\@dpdkvhostuserclient0: unsupported MTU 67@d -\@failed to set MTU for network device dpdkvhostuserclient0: Invalid argument@d -])") -AT_CLEANUP -dnl -------------------------------------------------------------------------- - - - -dnl -------------------------------------------------------------------------- -dnl MFEX Autovalidator -AT_SETUP([OVS-DPDK - MFEX Autovalidator]) -AT_KEYWORDS([dpdk]) -OVS_DPDK_PRE_CHECK() -OVS_DPDK_START() 
-AT_CHECK([ovs-vsctl add-br br0 -- set bridge br0 datapath_type=netdev]) -AT_SKIP_IF([! ovs-appctl dpif-netdev/miniflow-parser-get | sed 1,4d | grep "True"], [], [dnl -]) - -AT_SKIP_IF([! $PYTHON3 -c "import scapy"], [], []) -AT_CHECK([$PYTHON3 $srcdir/mfex_fuzzy.py test_traffic.pcap 2000], [], [stdout]) - -dnl Add userspace bridge and attach it to OVS -AT_CHECK([ovs-vsctl add-port br0 p1 -- set Interface p1 type=dpdk options:dpdk-devargs=net_pcap1,rx_pcap=test_traffic.pcap,infinite_rx=1], [], [stdout], [stderr]) -AT_CHECK([ovs-vsctl show], [], [stdout]) - -AT_CHECK([ovs-appctl dpif-netdev/dpif-impl-set dpif_avx512], [0], [dnl -DPIF implementation set to dpif_avx512. -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set autovalidator], [0], [dnl -Miniflow extract implementation set to autovalidator. -]) - -OVS_WAIT_UNTIL([test `ovs-vsctl get interface p1 statistics | grep -oP 'rx_packets=\s*\K\d+'` -ge 16000]) - -dnl Clean up -AT_CHECK([ovs-vsctl del-port br0 p1], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("[SYSTEM_DPDK_ALLOWED_LOGS]") -AT_CLEANUP -dnl -------------------------------------------------------------------------- - -dnl -------------------------------------------------------------------------- -dnl MFEX Autovalidator Fuzzy -AT_SETUP([OVS-DPDK - MFEX Autovalidator Fuzzy]) -AT_KEYWORDS([dpdk]) -OVS_DPDK_PRE_CHECK() -OVS_DPDK_START() -AT_CHECK([ovs-vsctl add-br br0 -- set bridge br0 datapath_type=netdev]) -AT_SKIP_IF([! ovs-appctl dpif-netdev/miniflow-parser-get | sed 1,4d | grep "True"], [], [dnl -]) - -AT_SKIP_IF([! 
$PYTHON3 -c "import scapy"], [], []) -AT_CHECK([$PYTHON3 $srcdir/mfex_fuzzy.py fuzzy.pcap 2000 fuzzy], [], [stdout]) - -dnl Add userspace bridge and attach it to OVS -AT_CHECK([ovs-vsctl add-port br0 p1 -- set Interface p1 type=dpdk options:dpdk-devargs=net_pcap1,rx_pcap=fuzzy.pcap,infinite_rx=1], [], [stdout], [stderr]) -AT_CHECK([ovs-vsctl show], [], [stdout]) - -AT_CHECK([ovs-appctl dpif-netdev/dpif-impl-set dpif_avx512], [0], [dnl -DPIF implementation set to dpif_avx512. -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set autovalidator], [0], [dnl -Miniflow extract implementation set to autovalidator. -]) - -OVS_WAIT_UNTIL([test `ovs-vsctl get interface p1 statistics | grep -oP 'rx_packets=\s*\K\d+'` -ge 16000]) - -dnl Clean up -AT_CHECK([ovs-vsctl del-port br0 p1], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@upcall: datapath reached the dynamic limit of .* flows.@d -])") +OVS_DPDK_STOP_TESTPMD() +OVS_DPDK_STOP_VSWITCHD(["dnl +/VHOST_CONFIG: (.*dpdkvhostclient0) failed to connect: No such file or directory/d +/dpdkvhostuserclient0: unsupported MTU 67/d +/failed to set MTU for network device dpdkvhostuserclient0: Invalid argument/d"]) AT_CLEANUP dnl -------------------------------------------------------------------------- -dnl -------------------------------------------------------------------------- -AT_SETUP([OVS-DPDK - MFEX Configuration]) -AT_KEYWORDS([dpdk]) -OVS_DPDK_PRE_CHECK() -AT_SKIP_IF([! $PYTHON3 -c "import scapy"], [], []) -AT_CHECK([$PYTHON3 $srcdir/mfex_fuzzy.py test_traffic.pcap 1], [], [stdout]) -OVS_DPDK_START() -AT_CHECK([ovs-vsctl --no-wait set Open_vSwitch . 
other_config:pmd-cpu-mask=0x1]) -dnl Add userspace bridge and attach it to OVS -AT_CHECK([ovs-vsctl add-br br0 -- set bridge br0 datapath_type=netdev]) -AT_CHECK([ovs-vsctl add-port br0 p1 -- set Interface p1 type=dpdk options:dpdk-devargs=net_pcap1,rx_pcap=test_traffic.pcap,infinite_rx=1], [], [stdout], [stderr]) -AT_CHECK([ovs-vsctl show], [], [stdout]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set scalar 1], [2], -[], [dnl -Error: unknown argument 1. -ovs-appctl: ovs-vswitchd: server returned an error -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 6 study 300 xyz], [2], -[], [dnl -Error: invalid study_pkt_cnt value: xyz. -ovs-appctl: ovs-vswitchd: server returned an error -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set scalar abcd], [2], -[], [dnl -Error: unknown argument abcd. -ovs-appctl: ovs-vswitchd: server returned an error -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 0 scalar abcd], [2], -[], [dnl -Error: unknown argument abcd. -ovs-appctl: ovs-vswitchd: server returned an error -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd], [2], -[], [dnl -Error: -pmd option requires a thread id argument. -ovs-appctl: ovs-vswitchd: server returned an error -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set tudy abcd], [2], -[], [dnl -Error: unknown argument abcd. -ovs-appctl: ovs-vswitchd: server returned an error -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 7 study abcd], [2], -[], [dnl -Error: invalid study_pkt_cnt value: abcd. -ovs-appctl: ovs-vswitchd: server returned an error -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 0 study], [0], [dnl -Miniflow extract implementation set to study, on pmd thread 0, studying 128 packets. -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 0 study 512], [0], [dnl -Miniflow extract implementation set to study, on pmd thread 0, studying 512 packets. 
-]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set study 512], [0], [dnl -Miniflow extract implementation set to study, studying 512 packets. -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set study], [0], [dnl -Miniflow extract implementation set to study, studying 128 packets. -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 0 autovalidator], [0], [dnl -Miniflow extract implementation set to autovalidator, on pmd thread 0. -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd zero study], [2], -[], [dnl -Error: miniflow extract parser not changed, PMD thread passed is not valid: 'zero'. Pass a valid pmd thread ID. -ovs-appctl: ovs-vswitchd: server returned an error -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 1], [2], -[], [dnl -Error: no miniflow extract name provided. Output of miniflow-parser-get shows implementation list. -ovs-appctl: ovs-vswitchd: server returned an error -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 1 superstudy], [2], -[], [dnl -Error: unknown miniflow extract implementation superstudy. -ovs-appctl: ovs-vswitchd: server returned an error -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set superstudy], [2], -[], [dnl -Error: unknown miniflow extract implementation superstudy. -ovs-appctl: ovs-vswitchd: server returned an error -]) - -AT_CHECK([ovs-appctl dpif-netdev/miniflow-parser-set -pmd 1 study -pmd], [2], -[], [dnl -Error: invalid study_pkt_cnt value: -pmd. 
-ovs-appctl: ovs-vswitchd: server returned an error -]) - -dnl Clean up -AT_CHECK([ovs-vsctl del-port br0 p1], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -\@Error: unknown argument 1.@d -\@Error: invalid study_pkt_cnt value: xyz.@d -\@Error: unknown argument abcd.@d -\@Error: -pmd option requires a thread id argument.@d -\@Error: invalid study_pkt_cnt value: abcd.@d -\@Error: miniflow extract parser not changed, PMD thread passed is not valid: 'zero'. Pass a valid pmd thread ID.@d -\@Error: no miniflow extract name provided. Output of miniflow-parser-get shows implementation list.@d -\@Error: unknown miniflow extract implementation superstudy.@d -\@Error: invalid study_pkt_cnt value: -pmd.@d -])") -AT_CLEANUP dnl -dnl -------------------------------------------------------------------------- - dnl -------------------------------------------------------------------------- dnl Setup user configured mempools AT_SETUP([OVS-DPDK - user configured mempool]) AT_KEYWORDS([dpdk]) OVS_DPDK_PRE_CHECK() OVS_DPDK_START_OVSDB() -OVS_DPDK_START_VSWITCHD() +OVS_DPDK_START_VSWITCHD([--no-pci]) AT_CHECK([ovs-vsctl --no-wait set Open_vSwitch . other_config:shared-mempool-config=8000,6000,1500]) AT_CHECK([ovs-vsctl --no-wait set Open_vSwitch . other_config:dpdk-init=true]) @@ -1166,7 +860,6 @@ OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "Port p1: Requesting a mem dnl Clean up AT_CHECK([ovs-vsctl del-port br10 p1], [], [stdout], [stderr]) -OVS_VSWITCHD_STOP("m4_join([], [SYSTEM_DPDK_ALLOWED_LOGS], [ -])") +OVS_DPDK_STOP_VSWITCHD AT_CLEANUP dnl -------------------------------------------------------------------------- diff --git a/tests/system-interface.at b/tests/system-interface.at index 784bada12cb..d4ee5c46bad 100644 --- a/tests/system-interface.at +++ b/tests/system-interface.at @@ -63,3 +63,149 @@ AT_CHECK([ [stdout], [Device "br-p1" does not exist.] 
) AT_CLEANUP + +AT_SETUP([interface - datapath ports garbage collection]) +OVS_CHECK_GENEVE() +OVS_TRAFFIC_VSWITCHD_START() + +dnl Not relevant for userspace datapath. +AT_SKIP_IF([! ovs-appctl dpctl/show | grep -q ovs-system]) + +AT_CHECK([ovs-vsctl add-port br0 tunnel_port dnl + -- set Interface tunnel_port dnl + type=geneve options:remote_ip=flow options:key=123]) + +AT_CHECK([ip link add ovs-veth0 type veth peer name ovs-veth1]) +on_exit 'ip link del ovs-veth0' + +AT_CHECK([ovs-vsctl add-port br0 ovs-veth0]) + +OVS_WAIT_UNTIL([ip link show | grep -q " genev_sys_[[0-9]]*: .* ovs-system "]) + +dnl Store the output of ip link for geneve port to compare ifindex later. +AT_CHECK([ip link show | grep " genev_sys_[[0-9]]*: .* ovs-system " > geneve.0]) + +AT_CHECK([ovs-appctl dpctl/show | grep port], [0], [dnl + port 0: ovs-system (internal) + port 1: br0 (internal) + port 2: genev_sys_6081 (geneve: packet_type=ptap) + port 3: ovs-veth0 +]) + +OVS_APP_EXIT_AND_WAIT_BY_TARGET([ovs-vswitchd], [ovs-vswitchd.pid]) + +dnl Check that geneve backing interface is still in the datapath. +AT_CHECK([ip link show | grep " genev_sys_[[0-9]]*: .* ovs-system " | diff -u - geneve.0]) + +dnl Remove the veth port from the database while ovs-vswitchd is down. +AT_CHECK([ovs-vsctl --no-wait del-port ovs-veth0]) + +dnl Check that it is still tied to the OVS datapath. +AT_CHECK([ip link show ovs-veth0 | grep -q ovs-system]) + +dnl Bring ovs-vswitchd back up. +AT_CHECK([ovs-vswitchd --detach --no-chdir --pidfile --log-file -vdpif:dbg], + [0], [], [stderr]) + +dnl Wait for the veth port to be removed from the datapath. +OVS_WAIT_WHILE([ip link show ovs-veth0 | grep -q ovs-system]) + +AT_CHECK([ovs-appctl dpctl/show | grep port], [0], [dnl + port 0: ovs-system (internal) + port 1: br0 (internal) + port 2: genev_sys_6081 (geneve: packet_type=ptap) +]) + +dnl Check that geneve backing interface is still in the datapath and it wasn't +dnl re-created, i.e. the ifindex is the same. 
+AT_CHECK([ip link show | grep " genev_sys_[[0-9]]*: .* ovs-system " | diff -u - geneve.0]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([interface - datapath port rename]) +OVS_TRAFFIC_VSWITCHD_START() + +dnl Not relevant for userspace datapath. +AT_SKIP_IF([! ovs-appctl dpctl/show | grep -q ovs-system]) + +AT_CHECK([ip link add ovs-veth0 type veth peer name ovs-veth1]) +dnl We will rename ovs-veth0, so removing the peer on exit. +on_exit 'ip link del ovs-veth1' + +AT_CHECK([ovs-vsctl add-port br0 ovs-veth0]) + +OVS_WAIT_UNTIL([ip link show | grep -q "ovs-veth0.* ovs-system "]) + +AT_CHECK([ovs-appctl dpctl/show | grep port], [0], [dnl + port 0: ovs-system (internal) + port 1: br0 (internal) + port 2: ovs-veth0 +]) + +dnl Rename the interface while attached to OVS. +AT_CHECK([ip l set ovs-veth0 name ovs-new-port]) + +dnl Wait for the port to be detached from the OVS datapath. +OVS_WAIT_UNTIL([ip link show | grep "ovs-new-port" | grep -v "ovs-system"]) + +dnl Check that database indicates the error. +AT_CHECK([ovs-vsctl get interface ovs-veth0 error], [0], [dnl +"could not open network device ovs-veth0 (No such device)" +]) + +dnl Check that the port is no longer in the datapath. +AT_CHECK([ovs-appctl dpctl/show | grep port], [0], [dnl + port 0: ovs-system (internal) + port 1: br0 (internal) +]) + +dnl Rename the interface back and check that it is in use again. 
+AT_CHECK([ip l set ovs-new-port name ovs-veth0]) + +OVS_WAIT_UNTIL([ip link show | grep -q "ovs-veth0.* ovs-system "]) + +AT_CHECK([ovs-vsctl get interface ovs-veth0 error], [0], [dnl +[[]] +]) + +AT_CHECK([ovs-appctl dpctl/show | grep port], [0], [dnl + port 0: ovs-system (internal) + port 1: br0 (internal) + port 2: ovs-veth0 +]) + +OVS_TRAFFIC_VSWITCHD_STOP([" + /could not open network device ovs-veth0 (No such device)/d +"]) +AT_CLEANUP + +AT_SETUP([interface - current speed]) +AT_SKIP_IF([test $HAVE_ETHTOOL = "no"]) +OVS_TRAFFIC_VSWITCHD_START() + +AT_CHECK([ip tuntap add tap0 mode tap]) +on_exit 'ip tuntap del tap0 mode tap' + +AT_CHECK([ip link set dev tap0 address aa:55:aa:55:00:01]) +AT_CHECK([ethtool -s tap0 speed 50000 duplex full]) +AT_CHECK([ip link set dev tap0 up]) + +AT_CHECK([ovs-vsctl add-port br0 tap0 -- set int tap0 type=tap]) + +AT_CHECK([ovs-ofctl -O OpenFlow15 -vwarn dump-ports-desc br0 tap0], [0], [stdout]) +AT_CHECK([strip_xids < stdout], [0], [dnl +OFPST_PORT_DESC reply (OF1.5): + 1(tap0): addr:aa:55:aa:55:00:01 + config: 0 + state: LIVE + current: COPPER + speed: 50000 Mbps now, 0 Mbps max +]) + +AT_CHECK([ovs-vsctl get interface tap0 link_speed], [0], [dnl +50000000000 +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP diff --git a/tests/system-ipsec.at b/tests/system-ipsec.at index 07f2b8fd0e8..1e155fecea3 100644 --- a/tests/system-ipsec.at +++ b/tests/system-ipsec.at @@ -110,16 +110,16 @@ m4_define([CHECK_LIBRESWAN], dnl IPSEC_STATUS_LOADED([]) dnl dnl Get number of loaded connections from ipsec status -m4_define([IPSEC_STATUS_LOADED], [ipsec status --rundir $ovs_base/$1 | \ +m4_define([IPSEC_STATUS_LOADED], [ipsec --rundir $ovs_base/$1 status | \ grep "Total IPsec connections" | \ - sed 's/[[0-9]]* Total IPsec connections: loaded \([[0-2]]\), active \([[0-2]]\).*/\1/m']) + sed 's/[[0-9]]* *Total IPsec connections: loaded \([[0-2]]\), active \([[0-2]]\).*/\1/m']) dnl IPSEC_STATUS_ACTIVE([]) dnl dnl Get number of active connections from 
ipsec status -m4_define([IPSEC_STATUS_ACTIVE], [ipsec status --rundir $ovs_base/$1 | \ +m4_define([IPSEC_STATUS_ACTIVE], [ipsec --rundir $ovs_base/$1 status | \ grep "Total IPsec connections" | \ - sed 's/[[0-9]]* Total IPsec connections: loaded \([[0-2]]\), active \([[0-2]]\).*/\2/m']) + sed 's/[[0-9]]* *Total IPsec connections: loaded \([[0-2]]\), active \([[0-2]]\).*/\2/m']) dnl CHECK_ESP_TRAFFIC() dnl @@ -141,10 +141,10 @@ m4_define([CHECK_ESP_TRAFFIC], OVS_WAIT_UNTIL([test `IPSEC_STATUS_LOADED(right)` -eq `IPSEC_STATUS_ACTIVE(right)`]) dnl Ping over IPsec tunnel - NS_CHECK_EXEC([left], [ping -q -c 3 -i 0.3 -w 2 192.0.0.2 | FORMAT_PING], [0], [dnl + NS_CHECK_EXEC([left], [ping -q -c 3 -i 0.3 -W 2 192.0.0.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) - NS_CHECK_EXEC([right], [ping -q -c 3 -i 0.3 -w 2 192.0.0.1 | FORMAT_PING], [0], [dnl + NS_CHECK_EXEC([right], [ping -q -c 3 -i 0.3 -W 2 192.0.0.1 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) diff --git a/tests/system-kmod-macros.at b/tests/system-kmod-macros.at index 11920e60b66..5203b1df808 100644 --- a/tests/system-kmod-macros.at +++ b/tests/system-kmod-macros.at @@ -112,6 +112,17 @@ m4_define([CHECK_CONNTRACK_ZEROIP_SNAT], AT_SKIP_IF([test "$IS_WIN32" = "yes"]) ]) +# CHECK_CONNTRACK_SCTP() +# +# Perform requirements checks for running conntrack SCTP. The kernel +# optionally supports nf proto sctp. +# +m4_define([CHECK_CONNTRACK_SCTP], +[ + AT_SKIP_IF([test "$IS_WIN32" = "yes"]) + AT_SKIP_IF([! test -e /proc/sys/net/netfilter/nf_conntrack_sctp_timeout_closed]) +]) + # CHECK_CONNTRACK_TIMEOUT() # # Perform requirements checks for running conntrack customized timeout tests. @@ -123,6 +134,15 @@ m4_define([CHECK_CONNTRACK_TIMEOUT], on_exit 'modprobe -r nfnetlink_cttimeout' ]) +# CHECK_CONNTRACK_DUMP_EXPECTATIONS() +# +# Perform requirements checks for dumping conntrack expectations. 
+# +m4_define([CHECK_CONNTRACK_DUMP_EXPECTATIONS], +[ + AT_SKIP_IF([:]) +]) + # CHECK_CT_DPIF_SET_GET_MAXCONNS() # # Perform requirements checks for running ovs-dpctl ct-set-maxconns or @@ -202,6 +222,14 @@ m4_define([OVS_CHECK_KERNEL_EXCL], AT_SKIP_IF([ ! ( test $version -lt $1 || ( test $version -eq $1 && test $sublevel -lt $2 ) || test $version -gt $3 || ( test $version -eq $3 && test $sublevel -gt $4 ) ) ]) ]) +# OVS_CHECK_SRV6() +# +# The kernel datapath does not support this feature. +m4_define([OVS_CHECK_SRV6], +[ + AT_SKIP_IF([:]) +]) + # CHECK_LATER_IPV6_FRAGMENTS() # # Upstream kernels beetween 4.20 and 5.19 are not parsing IPv6 fragments @@ -224,3 +252,18 @@ m4_define([VSCTL_ADD_DATAPATH_TABLE], # or necessary for the userspace datapath as it is checking for a kernel # specific regression. m4_define([CHECK_L3L4_CONNTRACK_REASM]) + +# CHECK_NO_TC_OFFLOAD +# +# The kernel module tests do not use TC offload. +m4_define([CHECK_NO_TC_OFFLOAD]) + +# OVS_CHECK_BAREUDP() +# +# The feature needs to be enabled in the kernel configuration (CONFIG_BAREUDP) +# to work. +m4_define([OVS_CHECK_BAREUDP], +[ + AT_SKIP_IF([! 
ip link add dev ovs_bareudp0 type bareudp dstport 6635 ethertype mpls_uc 2>&1 >/dev/null]) + AT_CHECK([ip link del dev ovs_bareudp0]) +]) diff --git a/tests/system-layer3-tunnels.at b/tests/system-layer3-tunnels.at index c37852b2163..5dcdd2afae0 100644 --- a/tests/system-layer3-tunnels.at +++ b/tests/system-layer3-tunnels.at @@ -34,15 +34,15 @@ AT_CHECK([ovs-ofctl add-flow br0 "priority=100 ip,nw_dst=10.1.1.2 action=mod_dl_ OVS_WAIT_UNTIL([ip netns exec at_ns0 ping -c 1 10.1.1.2]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the overlay with different packet sizes -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -83,78 +83,23 @@ AT_CHECK([ovs-ofctl add-flow br0 "priority=100 ip,nw_dst=10.1.1.2 action=mod_dl_ OVS_WAIT_UNTIL([ip netns exec at_ns0 ping -c 1 10.1.1.2]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the overlay with different packet sizes -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 
received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP -AT_SETUP([layer3 - use non-local port as tunnel endpoint]) - -OVS_VSWITCHD_START([add-port br0 p0 -- set Interface p0 type=dummy ofport_request=1]) -AT_CHECK([ovs-vsctl add-port br0 vtep0 -- set int vtep0 type=dummy], [0]) -AT_CHECK([ovs-vsctl add-br int-br -- set bridge int-br datapath_type=dummy], [0]) -AT_CHECK([ovs-vsctl add-port int-br t1 -- set Interface t1 type=gre \ - options:remote_ip=1.1.2.92 ofport_request=3], [0]) - -AT_CHECK([ovs-appctl dpif/show], [0], [dnl -dummy@ovs-dummy: hit:0 missed:0 - br0: - br0 65534/100: (dummy-internal) - p0 1/1: (dummy) - vtep0 2/2: (dummy) - int-br: - int-br 65534/3: (dummy-internal) - t1 3/4: (gre: remote_ip=1.1.2.92) -]) - -AT_CHECK([ovs-appctl netdev-dummy/ip4addr vtep0 1.1.2.88/24], [0], [OK -]) -AT_CHECK([ovs-appctl ovs/route/add 1.1.2.92/24 vtep0], [0], [OK -]) -AT_CHECK([ovs-ofctl add-flow br0 action=normal]) -AT_CHECK([ovs-ofctl add-flow int-br action=normal]) - -dnl Use arp request and reply to achieve tunnel next hop mac binding -dnl By default, vtep0's MAC address is aa:55:aa:55:00:03 -AT_CHECK([ovs-appctl netdev-dummy/receive vtep0 'recirc_id(0),in_port(2),eth(dst=ff:ff:ff:ff:ff:ff,src=aa:55:aa:55:00:03),eth_type(0x0806),arp(tip=1.1.2.92,sip=1.1.2.88,op=1,sha=aa:55:aa:55:00:03,tha=00:00:00:00:00:00)']) -AT_CHECK([ovs-appctl netdev-dummy/receive p0 'recirc_id(0),in_port(1),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:03),eth_type(0x0806),arp(sip=1.1.2.92,tip=1.1.2.88,op=2,sha=f8:bc:12:44:34:b6,tha=aa:55:aa:55:00:03)']) - -AT_CHECK([ovs-appctl tnl/neigh/show | tail -n+3 | sort], [0], [dnl -1.1.2.92 f8:bc:12:44:34:b6 br0 -]) - -AT_CHECK([ovs-appctl ovs/route/show | tail -n+2 | sort], [0], [dnl 
-User: 1.1.2.0/24 dev vtep0 SRC 1.1.2.88 -]) - -dnl Check GRE tunnel pop -AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:03),eth_type(0x0800),ipv4(src=1.1.2.92,dst=1.1.2.88,proto=47,tos=0,ttl=64,frag=no)'], [0], [stdout]) - -AT_CHECK([tail -1 stdout], [0], - [Datapath actions: tnl_pop(4) -]) - -dnl Check GRE tunnel push -AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(3),eth(dst=f9:bc:12:44:34:b6,src=af:55:aa:55:00:03),eth_type(0x0800),ipv4(src=1.1.3.88,dst=1.1.3.92,proto=1,tos=0,ttl=64,frag=no)'], [0], [stdout]) -AT_CHECK([tail -1 stdout], [0], - [Datapath actions: tnl_push(tnl_port(4),header(size=38,type=3,eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:03,dl_type=0x0800),ipv4(src=1.1.2.88,dst=1.1.2.92,proto=47,tos=0,ttl=64,frag=0x4000),gre((flags=0x0,proto=0x6558))),out_port(2)),1 -]) - -OVS_VSWITCHD_STOP -AT_CLEANUP - AT_SETUP([layer3 - ping over MPLS Bareudp]) -OVS_CHECK_MIN_KERNEL(5, 7) +OVS_CHECK_BAREUDP() OVS_TRAFFIC_VSWITCHD_START([_ADD_BR([br1])]) ADD_NAMESPACES(at_ns0, at_ns1) @@ -191,18 +136,18 @@ AT_CHECK([ovs-vsctl add-port br1 patch1]) AT_CHECK([ovs-ofctl -O OpenFlow13 add-flows br0 flows0.txt]) AT_CHECK([ovs-ofctl -O OpenFlow13 add-flows br1 flows1.txt]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -w 2 10.1.1.1 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -W 2 10.1.1.1 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([layer3 - ping over Bareudp]) -OVS_CHECK_MIN_KERNEL(5, 7) +OVS_CHECK_BAREUDP() OVS_TRAFFIC_VSWITCHD_START([_ADD_BR([br1])]) ADD_NAMESPACES(at_ns0, at_ns1) @@ -239,11 +184,11 @@ AT_CHECK([ovs-vsctl add-port br1 patch1]) 
AT_CHECK([ovs-ofctl -O OpenFlow13 add-flows br0 flows0.txt]) AT_CHECK([ovs-ofctl -O OpenFlow13 add-flows br1 flows1.txt]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -w 2 10.1.1.1 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -W 2 10.1.1.1 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) OVS_TRAFFIC_VSWITCHD_STOP diff --git a/tests/system-offloads-testsuite-macros.at b/tests/system-offloads-testsuite-macros.at new file mode 100644 index 00000000000..e50dc07fbcc --- /dev/null +++ b/tests/system-offloads-testsuite-macros.at @@ -0,0 +1,69 @@ +AT_COPYRIGHT([Copyright (c) 2022 Red Hat, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at: + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.]) + +# The goal is to run as many as possible of the system-traffic tests with +# OVS tc offload enabled. We do this by overriding the +# OVS_TRAFFIC_VSWITCHD_START() with offloading enabled. +m4_define([OVS_TRAFFIC_VSWITCHD_START], + [AT_CHECK([modprobe openvswitch]) + on_exit 'modprobe -r openvswitch' + m4_foreach([mod], [[vport_geneve], [vport_gre], [vport_lisp], [vport_stt], [vport_vxlan]], + [modprobe -q mod || echo "Module mod not loaded." 
+ on_exit 'modprobe -q -r mod' + ]) + on_exit 'ovs-dpctl del-dp ovs-system' + on_exit 'ovs-appctl dpctl/flush-conntrack' + _OVS_VSWITCHD_START([], [-- set Open_vSwitch . other_config:hw-offload=true $3]) + dnl Add bridges, ports, etc. + AT_CHECK([ovs-vsctl -- _ADD_BR([br0]) -- $1 m4_if([$2], [], [], [| uuidfilt])], [0], [$2]) +]) + +# Macro to exclude tests that will fail with TC offload enabled. +# We currently have the below tests disabled in system-traffic.at +# for the following reasons: +# +# TC does not support moving ports to a different namespace than vswitchd's +# namespace, so we need to disable this test. +# - 'conntrack - multiple namespaces, internal ports' +# +# The kernel's tcf_ct_act() function does not seem to take care of any (QinQ) +# VLAN headers causing commits to fail. However, if this is solved, we have to +# make sure conntrack does not break the VLAN boundary, i.e., putting together +# two packets with different CVLAN+SVLAN values. +# - 'conntrack - IPv4 fragmentation + cvlan' +# +# Fragmentation handling in ct zone 9 does not seem to work correctly. +# When moving this test over to the default zone all works fine. +# - 'conntrack - Fragmentation over vxlan' +# +# Occasionally we fail with invalid byte counts. +# - 'datapath - truncate and output to gre tunnel by simulated packets' +# - 'datapath - truncate and output to gre tunnel' +# +m4_define([CHECK_NO_TC_OFFLOAD], +[ + AT_SKIP_IF([:]) +]) + +# Conntrack ALGs are not supported for tc. +m4_define([CHECK_CONNTRACK_ALG], +[ + AT_SKIP_IF([:]) +]) + +# Conntrack timeout not supported for tc. 
+m4_define([CHECK_CONNTRACK_TIMEOUT], +[ + AT_SKIP_IF([:]) +]) diff --git a/tests/system-offloads-testsuite.at b/tests/system-offloads-testsuite.at index eb5d2d4b329..23637d4f522 100644 --- a/tests/system-offloads-testsuite.at +++ b/tests/system-offloads-testsuite.at @@ -23,3 +23,6 @@ m4_include([tests/system-common-macros.at]) m4_include([tests/system-kmod-macros.at]) m4_include([tests/system-offloads-traffic.at]) + +m4_include([tests/system-offloads-testsuite-macros.at]) +m4_include([tests/system-traffic.at]) diff --git a/tests/system-offloads-traffic.at b/tests/system-offloads-traffic.at index 1a60570801e..d1da33d96c6 100644 --- a/tests/system-offloads-traffic.at +++ b/tests/system-offloads-traffic.at @@ -18,6 +18,16 @@ m4_define([OVS_CHECK_ACTIONS], [ [0], [$1]) ]) +m4_define([CHECK_TC_INGRESS_PPS], +[ + OVS_CHECK_TC_QDISC() + AT_CHECK([ip link add ovs_tc_pps0 type veth peer name ovs_tc_pps1 dnl + || exit 77]) + on_exit 'ip link del ovs_tc_pps0' + AT_CHECK([tc qdisc add dev ovs_tc_pps0 handle ffff: ingress || exit 77]) + AT_CHECK([tc filter add dev ovs_tc_pps0 parent ffff: u32 match dnl + u32 0 0 police pkts_rate 100 pkts_burst 10 || exit 77]) +]) AT_SETUP([offloads - ping between two ports - offloads disabled]) OVS_TRAFFIC_VSWITCHD_START() @@ -29,7 +39,7 @@ ADD_NAMESPACES(at_ns0, at_ns1) ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") -NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ]) @@ -62,7 +72,7 @@ ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") AT_CHECK([ovs-appctl dpctl/dump-flows], [0], [ignore]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 
received, 0% packet loss, time 0ms ]) @@ -85,7 +95,7 @@ AT_CLEANUP AT_SETUP([offloads - set ingress_policing_rate and ingress_policing_burst - offloads disabled]) AT_KEYWORDS([ingress_policing]) -AT_SKIP_IF([test $HAVE_TC = "no"]) +OVS_CHECK_TC_QDISC() OVS_TRAFFIC_VSWITCHD_START() AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:hw-offload=false]) AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"]) @@ -108,7 +118,7 @@ AT_CLEANUP AT_SETUP([offloads - set ingress_policing_rate and ingress_policing_burst - offloads enabled]) AT_KEYWORDS([ingress_policing]) -AT_SKIP_IF([test $HAVE_TC = "no"]) +OVS_CHECK_TC_QDISC() OVS_TRAFFIC_VSWITCHD_START([], [], [-- set Open_vSwitch . other_config:hw-offload=true]) AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"]) ADD_NAMESPACES(at_ns0) @@ -132,7 +142,7 @@ AT_CLEANUP AT_SETUP([offloads - set ingress_policing_kpkts_rate and ingress_policing_kpkts_burst - offloads disabled]) AT_KEYWORDS([ingress_policing_kpkts]) -AT_SKIP_IF([test $SUPPORT_TC_INGRESS_PPS = "no"]) +CHECK_TC_INGRESS_PPS() OVS_TRAFFIC_VSWITCHD_START() AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:hw-offload=false]) AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"]) @@ -156,7 +166,7 @@ AT_CLEANUP AT_SETUP([offloads - set ingress_policing_kpkts_rate and ingress_policing_kpkts_burst - offloads enabled]) AT_KEYWORDS([ingress_policing_kpkts]) -AT_SKIP_IF([test $SUPPORT_TC_INGRESS_PPS = "no"]) +CHECK_TC_INGRESS_PPS() OVS_TRAFFIC_VSWITCHD_START([], [], [-- set Open_vSwitch . 
other_config:hw-offload=true]) AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"]) ADD_NAMESPACES(at_ns0) @@ -181,6 +191,8 @@ AT_CLEANUP AT_SETUP([offloads - check interface meter offloading - offloads disabled]) AT_KEYWORDS([dp-meter]) +AT_SKIP_IF([test $HAVE_NC = "no"]) +OVS_CHECK_GITHUB_ACTION() OVS_TRAFFIC_VSWITCHD_START() AT_CHECK([ovs-ofctl -O OpenFlow13 add-meter br0 'meter=1 pktps bands=type=drop rate=1']) @@ -193,7 +205,7 @@ NS_CHECK_EXEC([at_ns0], [ip neigh add 10.1.1.2 lladdr f0:00:00:01:01:02 dev p0]) NS_CHECK_EXEC([at_ns1], [ip neigh add 10.1.1.1 lladdr f0:00:00:01:01:01 dev p1]) AT_CHECK([ovs-ofctl -O OpenFlow13 add-flow br0 "actions=normal"]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ]) @@ -229,7 +241,9 @@ AT_CLEANUP AT_SETUP([offloads - check interface meter offloading - offloads enabled]) AT_KEYWORDS([offload-meter]) -AT_SKIP_IF([test $SUPPORT_TC_INGRESS_PPS = "no"]) +OVS_CHECK_GITHUB_ACTION() +CHECK_TC_INGRESS_PPS() +AT_SKIP_IF([test $HAVE_NC = "no"]) OVS_TRAFFIC_VSWITCHD_START([], [], [-- set Open_vSwitch . 
other_config:hw-offload=true]) AT_CHECK([ovs-ofctl -O OpenFlow13 add-meter br0 'meter=1 pktps bands=type=drop rate=1']) @@ -242,7 +256,7 @@ NS_CHECK_EXEC([at_ns0], [ip neigh add 10.1.1.2 lladdr f0:00:00:01:01:02 dev p0]) NS_CHECK_EXEC([at_ns1], [ip neigh add 10.1.1.1 lladdr f0:00:00:01:01:01 dev p1]) AT_CHECK([ovs-ofctl -O OpenFlow13 add-flow br0 "actions=normal"]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ]) @@ -301,11 +315,11 @@ NETNS_DAEMONIZE([at_ns3], [tcpdump -l -n -U -i p3 dst 10.1.1.2 and icmp > p3.pca NETNS_DAEMONIZE([at_ns4], [tcpdump -l -n -U -i p4 dst 10.1.1.2 and icmp > p4.pcap 2>/dev/null], [tcpdump4.pid]) sleep 1 -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 1024 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 1024 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) @@ -359,11 +373,11 @@ NETNS_DAEMONIZE([at_ns3], [tcpdump -l -n -U -i p3 dst 10.1.1.2 and icmp > p3.pca NETNS_DAEMONIZE([at_ns4], [tcpdump -l -n -U -i p4 dst 10.1.1.2 and icmp > p4.pcap 2>/dev/null], [tcpdump4.pid]) sleep 1 -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) 
-NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 1024 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 1024 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) @@ -395,7 +409,7 @@ AT_CHECK([cat p4.pcap | awk 'NF{print $NF}' | uniq -c | awk '{$1=$1;print}'], [0 # This test verifies the total packet counters work when individual branches # are taken. -AT_CHECK([ovs-appctl revalidator/wait], [0]) +AT_CHECK([ovs-appctl revalidator/purge], [0]) AT_CHECK([ovs-ofctl del-flows br0]) AT_DATA([flows.txt], [dnl table=0,in_port=2 actions=output:1 @@ -407,17 +421,17 @@ table=4,in_port=1,reg0=0x0 actions=output:2 AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) sleep 1 -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 1024 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 1024 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) -AT_CHECK([ovs-appctl dpctl/dump-flows | grep "eth_type(0x0800)" | DUMP_CLEAN_SORTED | sed 's/bytes:11440/bytes:11720/'], [0], [dnl -in_port(2),eth(),eth_type(0x0800),ipv4(frag=no), packets:20, bytes:11720, used:0.001s, actions:check_pkt_len(size=200,gt(3),le(3)) -in_port(3),eth(),eth_type(0x0800),ipv4(frag=no), packets:20, bytes:11720, used:0.001s, actions:output +AT_CHECK([ovs-appctl dpctl/dump-flows | grep "eth_type(0x0800)" | DUMP_CLEAN_SORTED | sed 's/bytes:11348/bytes:11614/'], [0], [dnl 
+in_port(2),eth(),eth_type(0x0800),ipv4(frag=no), packets:19, bytes:11614, used:0.001s, actions:check_pkt_len(size=200,gt(3),le(3)) +in_port(3),eth(),eth_type(0x0800),ipv4(frag=no), packets:19, bytes:11614, used:0.001s, actions:output ]) @@ -435,7 +449,7 @@ table=4,in_port=1,reg0=0x0 actions=output:2 ]) AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) sleep 1 -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) OVS_CHECK_ACTIONS([check_pkt_len(size=200,gt(5),le(3))]) @@ -451,7 +465,7 @@ table=4,in_port=1,reg0=0x0 actions=output:2 ]) AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) sleep 1 -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) OVS_CHECK_ACTIONS([check_pkt_len(size=200,gt(drop),le(3))]) @@ -467,7 +481,7 @@ table=4,in_port=1,reg0=0x1 actions=output:2 ]) AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) sleep 1 -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 1024 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 1024 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) OVS_CHECK_ACTIONS([check_pkt_len(size=200,gt(3),le(drop))]) @@ -484,13 +498,13 @@ table=4,in_port=1,reg0=0x1 actions=output:2,4 ]) AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) sleep 1 -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 1024 
10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 1024 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) OVS_CHECK_ACTIONS([check_pkt_len(size=200,gt(3,5),le(3,4))]) -AT_CHECK([ovs-appctl revalidator/wait], [0]) +AT_CHECK([ovs-appctl revalidator/purge], [0]) AT_CHECK([ovs-ofctl del-flows br0]) AT_DATA([flows.txt], [dnl table=0,in_port=2 actions=output:1 @@ -507,17 +521,17 @@ NETNS_DAEMONIZE([at_ns3], [tcpdump -l -n -U -i p3 dst 10.1.1.2 and icmp > p3_2.p NETNS_DAEMONIZE([at_ns4], [tcpdump -l -n -U -i p4 dst 10.1.1.2 and icmp > p4_2.pcap 2>/dev/null], [tcpdump4_2.pid]) sleep 1 -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 1024 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 1024 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) -AT_CHECK([ovs-appctl dpctl/dump-flows type=tc,offloaded | grep "eth_type(0x0800)" | DUMP_CLEAN_SORTED | sed -e 's/bytes:11348/bytes:11614/' -e 's/bytes:11440/bytes:11720/'], [0], [dnl +AT_CHECK([ovs-appctl dpctl/dump-flows type=tc,offloaded | grep "eth_type(0x0800)" | DUMP_CLEAN_SORTED | sed -e 's/bytes:11348/bytes:11614/'], [0], [dnl in_port(2),eth(),eth_type(0x0800),ipv4(proto=1,tos=0/0xfc,frag=no), packets:19, bytes:11614, used:0.001s, actions:check_pkt_len(size=200,gt(set(ipv4(tos=0x4/0xfc)),4),le(set(ipv4(tos=0x8/0xfc)),5)),3 -in_port(3),eth(),eth_type(0x0800),ipv4(frag=no), packets:20, bytes:11720, used:0.001s, 
actions:output +in_port(3),eth(),eth_type(0x0800),ipv4(frag=no), packets:19, bytes:11614, used:0.001s, actions:output ]) sleep 1 @@ -543,7 +557,7 @@ table=4,in_port=1,reg0=0x0 actions=mod_dl_src:00:11:11:11:11:11,output:4 ]) AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) sleep 1 -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) OVS_CHECK_ACTIONS([check_pkt_len(size=200,gt(set(ipv4(tos=0x4/0xfc)),4),le(set(eth(src=00:11:11:11:11:11)),5)),3]) @@ -561,7 +575,7 @@ table=4,in_port=1,reg0=0x0 actions=mod_nw_tos:8,output:4 ]) AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) sleep 1 -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) OVS_CHECK_ACTIONS([check_pkt_len(size=200,gt(set(eth(src=00:11:11:11:11:11)),4),le(set(ipv4(tos=0x8/0xfc)),5)),3]) @@ -579,7 +593,7 @@ table=4,in_port=1,reg0=0x0 actions=output:br0 ]) AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) sleep 1 -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) OVS_CHECK_ACTIONS([check_pkt_len(size=200,gt(1),le(1)),3]) @@ -596,7 +610,7 @@ table=4,in_port=1,reg0=0x1 actions=output:br0 ]) AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) sleep 1 
-NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) OVS_CHECK_ACTIONS([check_pkt_len(size=200,gt(1),le(drop)),3]) @@ -613,7 +627,7 @@ table=4,in_port=1,reg0=0x0 actions=output:br0 ]) AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) sleep 1 -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 1024 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 1024 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) @@ -630,7 +644,7 @@ table=1,in_port=1,reg1=0x2 actions=output:2 ]) AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) sleep 1 -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) OVS_CHECK_ACTIONS([check_pkt_len(size=200,gt(drop),le(drop)),3]) @@ -650,7 +664,7 @@ table=5,in_port=1,reg0=0x0 actions=output:3 ]) AT_CHECK([ovs-ofctl --protocols=OpenFlow10 add-flows br0 flows.txt]) sleep 1 -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) OVS_CHECK_ACTIONS([check_pkt_len(size=200,gt(check_pkt_len(size=400,gt(5),le(4))),le(5)),3]) @@ -670,7 +684,7 @@ table=5,in_port=1,reg0=0x0 actions=output:3 ]) AT_CHECK([ovs-ofctl 
--protocols=OpenFlow10 add-flows br0 flows.txt]) sleep 1 -NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -w 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 10 -i 0.1 -W 2 -s 64 10.1.1.2 | FORMAT_PING], [0], [dnl 10 packets transmitted, 10 received, 0% packet loss, time 0ms ], [], [ovs-appctl dpctl/dump-flows; ovs-ofctl dump-flows br0]) OVS_CHECK_ACTIONS([check_pkt_len(size=200,gt(5),le(check_pkt_len(size=100,gt(5),le(4)))),3]) @@ -678,3 +692,244 @@ OVS_CHECK_ACTIONS([check_pkt_len(size=200,gt(5),le(check_pkt_len(size=100,gt(5), OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP + + +AT_SETUP([offloads - offload flow to none-offload]) +OVS_TRAFFIC_VSWITCHD_START([], [], [-- set Open_vSwitch . other_config:hw-offload=true]) + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +AT_DATA([flows.txt], [dnl +add in_port=ovs-p0,actions=ovs-p1 +add in_port=ovs-p1,actions=ovs-p0 +]) +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl +10 packets transmitted, 10 received, 0% packet loss, time 0ms +]) + +AT_CHECK([ovs-appctl dpctl/dump-flows type=tc | grep "eth_type(0x0800)" | sort | strip_recirc | strip_used], [0], [dnl +recirc_id(),in_port(2),eth(),eth_type(0x0800),ipv4(frag=no), packets:9, bytes:756, used:0.0s, actions:3 +recirc_id(),in_port(3),eth(),eth_type(0x0800),ipv4(frag=no), packets:9, bytes:756, used:0.0s, actions:2 +]) + +dnl Here we use an output action with truncate, which will force a kernel flow. 
+AT_DATA([flows2.txt], [dnl +modify in_port=ovs-p0,actions=output(port=ovs-p1, max_len=128) +modify in_port=ovs-p1,actions=output(port=ovs-p0, max_len=128) +]) +AT_CHECK([ovs-ofctl add-flows br0 flows2.txt]) +AT_CHECK([ovs-appctl revalidator/wait], [0]) + +NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl +10 packets transmitted, 10 received, 0% packet loss, time 0ms +]) + +AT_CHECK([ovs-appctl dpctl/dump-flows type=ovs | grep "eth_type(0x0800)" | sort | strip_recirc | strip_used], [0], [dnl +recirc_id(),in_port(2),eth(),eth_type(0x0800),ipv4(frag=no), packets:10, bytes:980, used:0.0s, actions:trunc(128),3 +recirc_id(),in_port(3),eth(),eth_type(0x0800),ipv4(frag=no), packets:10, bytes:980, used:0.0s, actions:trunc(128),2 +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) +AT_CHECK([ovs-appctl revalidator/wait], [0]) + +NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl +10 packets transmitted, 10 received, 0% packet loss, time 0ms +]) + +AT_CHECK([ovs-appctl dpctl/dump-flows type=tc | grep "eth_type(0x0800)" | sort | strip_recirc | strip_used], [0], [dnl +recirc_id(),in_port(2),eth(),eth_type(0x0800),ipv4(frag=no), packets:10, bytes:840, used:0.0s, actions:3 +recirc_id(),in_port(3),eth(),eth_type(0x0800),ipv4(frag=no), packets:10, bytes:840, used:0.0s, actions:2 +]) + +AT_CHECK([ovs-appctl coverage/read-counter ukey_invalid_stat_reset], [0], [dnl +0 +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([offloads - delete ufid mapping if device not exist - offloads enabled]) +OVS_TRAFFIC_VSWITCHD_START([], [], [-- set Open_vSwitch . 
other_config:hw-offload=true]) + +AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"]) + +ADD_NAMESPACES(at_ns0, at_ns1, at_ns2) + +dnl Disable IPv6 to skip unexpected flow +AT_CHECK([sysctl -w net.ipv6.conf.br0.disable_ipv6=1], [0], [ignore]) +NS_CHECK_EXEC([at_ns0], [sysctl -w net.ipv6.conf.all.disable_ipv6=1], [0], [ignore]) +NS_CHECK_EXEC([at_ns1], [sysctl -w net.ipv6.conf.all.disable_ipv6=1], [0], [ignore]) +NS_CHECK_EXEC([at_ns2], [sysctl -w net.ipv6.conf.all.disable_ipv6=1], [0], [ignore]) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24", "aa:1a:54:e9:c5:56") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +NS_CHECK_EXEC([at_ns0], [ping -q -c 2 -i 0.2 10.1.1.2 | FORMAT_PING], [0], [dnl +2 packets transmitted, 2 received, 0% packet loss, time 0ms +]) + +dnl Delete and add interface ovs-p0/p0 +AT_CHECK([ip link del dev ovs-p0]) +AT_CHECK([ip link add p0 type veth peer name ovs-p0 || return 77]) +AT_CHECK([ip link set p0 netns at_ns0]) +AT_CHECK([ip link set dev ovs-p0 up]) +NS_CHECK_EXEC([at_ns0], [ip addr add dev p0 "10.1.1.1/24"]) +NS_CHECK_EXEC([at_ns0], [ip link set dev p0 up]) +NS_CHECK_EXEC([at_ns0], [ip link set dev p0 address "aa:1a:54:e9:c5:56"]) + +AT_CHECK([ovs-appctl revalidator/purge], [0]) + +dnl Generate flows to trigger the hmap expand once +ADD_VETH(p2, at_ns2, br0, "10.1.1.3/24") +NS_CHECK_EXEC([at_ns0], [ping -q -c 2 -i 0.2 10.1.1.2 | FORMAT_PING], [0], [dnl +2 packets transmitted, 2 received, 0% packet loss, time 0ms +]) +NS_CHECK_EXEC([at_ns0], [ping -q -c 2 -i 0.2 10.1.1.3 | FORMAT_PING], [0], [dnl +2 packets transmitted, 2 received, 0% packet loss, time 0ms +]) + +AT_CHECK([ovs-appctl revalidator/purge], [0]) +dnl Fix purge fail occasionally +AT_CHECK([ovs-appctl revalidator/purge], [0]) + +AT_CHECK([test $(ovs-appctl dpctl/dump-flows | grep -c "eth_type(0x0800)") -eq 0], [0], [ignore]) + +OVS_TRAFFIC_VSWITCHD_STOP(["/could not open network device ovs-p0/d +/on nonexistent port/d +/No such device/d +/failed to offload flow/d +"]) +AT_CLEANUP + 
+AT_SETUP([offloads - ping over vxlan tunnel with gbp - offloads enabled]) +OVS_CHECK_TUNNEL_TSO() +OVS_CHECK_VXLAN() + +OVS_TRAFFIC_VSWITCHD_START([], [], [-- set Open_vSwitch . other_config:hw-offload=true]) +AT_SKIP_IF([! grep -q "probe tc: vxlan gbp is supported." ovs-vswitchd.log]) +ADD_BR([br-underlay]) + +AT_CHECK([ovs-ofctl add-flow br-underlay "actions=normal"]) + +ADD_NAMESPACES(at_ns0) + +dnl Set up underlay link from host into the namespace using veth pair. +ADD_VETH(p0, at_ns0, br-underlay, "172.31.1.1/24") +AT_CHECK([ip addr add dev br-underlay "172.31.1.100/24"]) +AT_CHECK([ip link set dev br-underlay up]) + +dnl Set up tunnel endpoints on OVS outside the namespace and with a native +dnl linux device inside the namespace. +ADD_OVS_TUNNEL([vxlan], [br0], [at_vxlan0], [172.31.1.1], [10.1.1.100/24], [options:exts=gbp]) +AT_CHECK([ovs-ofctl add-flow br0 "in_port=br0 actions=load:0x200->NXM_NX_TUN_GBP_ID[], output:at_vxlan0]") +AT_CHECK([ovs-ofctl add-flow br0 "in_port=at_vxlan0, tun_gbp_id=512 actions=output:br0"]) +AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"]) + +ADD_NATIVE_TUNNEL([vxlan], [at_vxlan1], [at_ns0], [172.31.1.100], [10.1.1.1/24], + [id 0 dstport 4789 gbp]) +NS_CHECK_EXEC([at_ns0], [iptables -I OUTPUT -p ip -j MARK --set-mark 512 2>/dev/null], [0]) +NS_CHECK_EXEC([at_ns0], [iptables -I INPUT -m mark --mark 512 -j ACCEPT 2>/dev/null], [0], [ignore]) + +dnl First, check the underlay. +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Okay, now check the overlay. 
+NS_CHECK_EXEC([at_ns0], [ping -q -c 1000 -i 0.01 10.1.1.100 | FORMAT_PING], [0], [dnl +1000 packets transmitted, 1000 received, 0% packet loss, time 0ms +]) + +AT_CHECK([ovs-appctl dpctl/dump-flows type=tc,offloaded | grep "eth_type(0x0800)" | grep "tp_dst=4789,vxlan(gbp(id=512))" | wc -l], [0], [dnl +1 +]) +AT_CHECK([ovs-appctl dpctl/dump-flows type=tc,offloaded | grep "eth_type(0x0800)" | grep "tp_dst=4789,vxlan(gbp(id=512,flags=0))" | wc -l], [0], [dnl +1 +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([offloads - IGMP with ip rewrite - offloads enabled]) +OVS_TRAFFIC_VSWITCHD_START([], [], [-- set Open_vSwitch . other_config:hw-offload=true]) + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +dnl Set up the ip field modify flow. +AT_CHECK([ovs-ofctl add-flow br0 "priority=100 in_port=ovs-p0,ip actions=mod_nw_tos:12,output:ovs-p1"]) + +dnl Add and del multicast address to send IGMP packet. +NS_CHECK_EXEC([at_ns0], [ip addr add dev p0 224.10.10.10/24 autojoin 2>/dev/null], [0]) +NS_CHECK_EXEC([at_ns0], [ip addr del dev p0 224.10.10.10/24 2>/dev/null], [0]) + +OVS_WAIT_UNTIL([test `ovs-appctl dpctl/dump-flows type=tc,offloaded | grep "eth_type(0x0800)" | wc -l` -ge 1]) + +dnl Check the offloaded flow. +AT_CHECK([ovs-appctl dpctl/dump-flows type=tc,offloaded | grep "eth_type(0x0800)" | DUMP_CLEAN_SORTED | strip_stats], [0], [dnl +in_port(2),eth(),eth_type(0x0800),ipv4(proto=2,tos=0xc0/0xfc,frag=no), packets:0, bytes:0, used:0.001s, actions:set(ipv4(tos=0xc/0xfc)),3 +]) + +dnl Check the tc rule. +AT_CHECK([tc -d filter show dev ovs-p0 ingress | grep -q "csum (iph)"], [0]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([offloads - IPIP wth ip rewrite - offloads enabled]) +OVS_TRAFFIC_VSWITCHD_START([], [], [-- set Open_vSwitch . 
other_config:hw-offload=true]) + +AT_CHECK([ovs-ofctl add-flow br0 "priority=0 actions=normal"]) + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +dnl Set up the ip field modify flow. +AT_CHECK([ovs-ofctl add-flow br0 "priority=100 in_port=ovs-p0,ip,nw_dst=10.1.1.2 actions=dec_ttl,output:ovs-p1"]) +AT_CHECK([ovs-ofctl add-flow br0 "priority=100 in_port=ovs-p1,ip,nw_dst=10.1.1.1 actions=dec_ttl,output:ovs-p0"]) + +dnl Set up ipip tunnel in NS. +NS_CHECK_EXEC([at_ns0], [ip tunnel add ipip0 remote 10.1.1.2 2>/dev/null], [0]) +NS_CHECK_EXEC([at_ns0], [ip link set dev ipip0 up 2>/dev/null], [0]) +NS_CHECK_EXEC([at_ns0], [ip addr add dev ipip0 192.168.1.1/30 2>/dev/null], [0]) +NS_CHECK_EXEC([at_ns1], [ip tunnel add ipip0 remote 10.1.1.1 2>/dev/null], [0]) +NS_CHECK_EXEC([at_ns1], [ip link set dev ipip0 up 2>/dev/null], [0]) +NS_CHECK_EXEC([at_ns1], [ip addr add dev ipip0 192.168.1.2/30 2>/dev/null], [0]) + +dnl Check the tunnel. +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 192.168.1.2 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Check the offloaded flow. +AT_CHECK([ovs-appctl dpctl/dump-flows type=tc,offloaded | grep "eth_type(0x0800)" | DUMP_CLEAN_SORTED | strip_stats], [0], [dnl +in_port(2),eth(),eth_type(0x0800),ipv4(dst=10.1.1.2,proto=4,ttl=64,frag=no), packets:0, bytes:0, used:0.001s, actions:set(ipv4(ttl=63)),3 +in_port(3),eth(),eth_type(0x0800),ipv4(dst=10.1.1.1,proto=4,ttl=64,frag=no), packets:0, bytes:0, used:0.001s, actions:set(ipv4(ttl=63)),2 +]) + +dnl Check the tc rule. +AT_CHECK([tc -d filter show dev ovs-p0 ingress | grep -q "csum (iph)"], [0]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([offloads - re-probe drop action]) +OVS_TRAFFIC_VSWITCHD_START() +OVS_CHECK_DROP_ACTION() +AT_KEYWORDS(drop_action) + +dnl Trigger a re-probe of the explicit drop action. +AT_CHECK([ovs-vsctl set Open_vSwitch . 
other_config:hw-offload=true]) +OVS_WAIT_UNTIL([grep -q "Datapath does not support explicit drop action" ovs-vswitchd.log]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP diff --git a/tests/system-route.at b/tests/system-route.at index 270956d13f6..c0ecad6cfb4 100644 --- a/tests/system-route.at +++ b/tests/system-route.at @@ -25,3 +25,106 @@ OVS_WAIT_UNTIL([test `ovs-appctl ovs/route/show | grep -c 'p1-route'` -eq 0 ]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([ovs-route - add system route with src - ipv4]) +AT_KEYWORDS([route]) +OVS_TRAFFIC_VSWITCHD_START() +AT_CHECK([ip link set br0 up]) + +AT_CHECK([ip addr add 192.168.9.2/24 dev br0], [0], [stdout]) +AT_CHECK([ip addr add 192.168.9.3/24 dev br0], [0], [stdout]) + +AT_CHECK([ip route add 192.168.10.12/32 dev br0 via 192.168.9.1 src 192.168.9.2], [0], [stdout]) +AT_CHECK([ip route add 192.168.10.13/32 dev br0 via 192.168.9.1 src 192.168.9.3], [0], [stdout]) + +OVS_WAIT_UNTIL_EQUAL([ovs-appctl ovs/route/show | grep -E '192.168.10.1[[23]]/32' | sort], [dnl +Cached: 192.168.10.12/32 dev br0 GW 192.168.9.1 SRC 192.168.9.2 +Cached: 192.168.10.13/32 dev br0 GW 192.168.9.1 SRC 192.168.9.3]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([ovs-route - add system route with src - ipv6]) +AT_KEYWORDS([route]) +OVS_TRAFFIC_VSWITCHD_START() +AT_CHECK([ip link set br0 up]) + +AT_CHECK([ip -6 addr add fc00:db8:cafe::2/64 dev br0], [0], [stdout]) +AT_CHECK([ip -6 addr add fc00:db8:cafe::3/64 dev br0], [0], [stdout]) + +dnl If we try to add a route immediately after assigning ipv6 addresses, +dnl iproute2 would give us "Invalid source address" error, +dnl so wait a while to succeed. 
+OVS_WAIT_UNTIL([ip -6 route add fc00:db8:beef::12/128 via fc00:db8:cafe::1 dev br0 src fc00:db8:cafe::3]) +OVS_WAIT_UNTIL([ip -6 route add fc00:db8:beef::13/128 via fc00:db8:cafe::1 dev br0 src fc00:db8:cafe::2]) + +OVS_WAIT_UNTIL_EQUAL([ovs-appctl ovs/route/show | grep -E 'fc00:db8:beef::1[[23]]/128' | sort], [dnl +Cached: fc00:db8:beef::12/128 dev br0 GW fc00:db8:cafe::1 SRC fc00:db8:cafe::3 +Cached: fc00:db8:beef::13/128 dev br0 GW fc00:db8:cafe::1 SRC fc00:db8:cafe::2]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +dnl Checks that OVS doesn't use routes from non-standard tables. +AT_SETUP([ovs-route - route tables]) +AT_KEYWORDS([route]) +OVS_TRAFFIC_VSWITCHD_START() + +dnl Create tap port. +on_exit 'ip link del p1-route' +AT_CHECK([ip tuntap add name p1-route mode tap]) +AT_CHECK([ip link set p1-route up]) + +dnl Add ip address. +AT_CHECK([ip addr add 10.0.0.17/24 dev p1-route], [0], [stdout]) + +dnl Check that OVS catches route updates. +OVS_WAIT_UNTIL_EQUAL([ovs-appctl ovs/route/show | grep 'p1-route' | sort], [dnl +Cached: 10.0.0.0/24 dev p1-route SRC 10.0.0.17 +Cached: 10.0.0.17/32 dev p1-route SRC 10.0.0.17 local]) + +dnl Add a route to the main routing table and check that OVS caches +dnl this new route. +AT_CHECK([ip route add 10.0.0.18/32 dev p1-route]) +OVS_WAIT_UNTIL_EQUAL([ovs-appctl ovs/route/show | grep 'p1-route' | sort], [dnl +Cached: 10.0.0.0/24 dev p1-route SRC 10.0.0.17 +Cached: 10.0.0.17/32 dev p1-route SRC 10.0.0.17 local +Cached: 10.0.0.18/32 dev p1-route SRC 10.0.0.17]) + +dnl Add a route to a custom routing table and check that OVS doesn't cache it. +AT_CHECK([ip route add 10.0.0.19/32 dev p1-route table 42]) +AT_CHECK([ip route show table 42 | grep 'p1-route' | grep -q '10.0.0.19']) +dnl Give the main thread a chance to act. +AT_CHECK([ovs-appctl revalidator/wait]) +dnl Check that OVS didn't learn this route. 
+AT_CHECK([ovs-appctl ovs/route/show | grep 'p1-route' | sort], [0], [dnl +Cached: 10.0.0.0/24 dev p1-route SRC 10.0.0.17 +Cached: 10.0.0.17/32 dev p1-route SRC 10.0.0.17 local +Cached: 10.0.0.18/32 dev p1-route SRC 10.0.0.17 +]) + +dnl Delete a route from the main table and check that OVS removes the route +dnl from the cache. +AT_CHECK([ip route del 10.0.0.18/32 dev p1-route]) +OVS_WAIT_UNTIL_EQUAL([ovs-appctl ovs/route/show | grep 'p1-route' | sort], [dnl +Cached: 10.0.0.0/24 dev p1-route SRC 10.0.0.17 +Cached: 10.0.0.17/32 dev p1-route SRC 10.0.0.17 local]) + +dnl Delete a route from a custom routing table and check that the cache +dnl doesn't change. +AT_CHECK([ip route del 10.0.0.19/32 dev p1-route table 42]) +dnl Give the main thread a chance to act. +AT_CHECK([ovs-appctl revalidator/wait]) +dnl Check that the cache is still the same. +AT_CHECK([ovs-appctl ovs/route/show | grep 'p1-route' | sort], [0], [dnl +Cached: 10.0.0.0/24 dev p1-route SRC 10.0.0.17 +Cached: 10.0.0.17/32 dev p1-route SRC 10.0.0.17 local +]) + +dnl Delete ip address. +AT_CHECK([ip addr del 10.0.0.17/24 dev p1-route], [0], [stdout]) +dnl Check that routes were removed from OVS. 
+OVS_WAIT_UNTIL([test $(ovs-appctl ovs/route/show | grep -c 'p1-route') -eq 0 ]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP diff --git a/tests/system-tap.at b/tests/system-tap.at index 871a3bda4fc..3d84a53182c 100644 --- a/tests/system-tap.at +++ b/tests/system-tap.at @@ -22,7 +22,7 @@ AT_CHECK([ip netns exec at_ns1 ip link set dev tap1 up]) AT_CHECK([ip netns exec at_ns0 ip addr add 10.1.1.1/24 dev tap0]) AT_CHECK([ip netns exec at_ns1 ip addr add 10.1.1.2/24 dev tap1]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index 731de439c7a..202ff049222 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -10,13 +10,13 @@ ADD_NAMESPACES(at_ns0, at_ns1) ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -33,7 +33,7 @@ ADD_NAMESPACES(at_ns0, at_ns1) ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 
0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -56,13 +56,13 @@ ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") ADD_VLAN(p0, at_ns0, 100, "10.2.2.1/24") ADD_VLAN(p1, at_ns1, 100, "10.2.2.2/24") -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -88,13 +88,13 @@ ADD_CVLAN(p1.4094, at_ns1, 100, "10.2.2.2/24") OVS_WAIT_UNTIL([ip netns exec at_ns0 ping -c 1 10.2.2.2]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -116,13 +116,13 @@ dnl waiting, we get occasional failures due to the following error: dnl "connect: Cannot assign requested address" 
OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::2]) -NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -W 2 fc00::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -W 2 fc00::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -147,13 +147,13 @@ dnl waiting, we get occasional failures due to the following error: dnl "connect: Cannot assign requested address" OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00:1::2]) -NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00:1::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00:1::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -w 2 fc00:1::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -W 2 fc00:1::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00:1::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -W 2 fc00:1::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -179,13 +179,13 @@ ADD_CVLAN(p1.4094, at_ns1, 100, "fc00:1::2/96") OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00:1::2]) -NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00:1::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 
-i 0.3 -W 2 fc00:1::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -w 2 fc00:1::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -W 2 fc00:1::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00:1::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -W 2 fc00:1::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -209,13 +209,13 @@ dnl waiting, we get occasional failures due to the following error: dnl "connect: Cannot assign requested address" OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::2]) -NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -W 2 fc00::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -W 2 fc00::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -253,13 +253,13 @@ priority=0,actions=NORMAL AT_CHECK([ovs-ofctl del-flows br0]) AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) -NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00::3 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00::3 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -w 2 fc00::3 | 
FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -W 2 fc00::3 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00::3 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -W 2 fc00::3 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -278,13 +278,13 @@ ADD_VETH_BOND(p1 p2, at_ns1, br0, bond0, lacp=active bond_mode=balance-tcp, "10. OVS_WAIT_UNTIL([ip netns exec at_ns0 ping -c 1 10.1.1.2]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -292,7 +292,7 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over vxlan tunnel]) -OVS_CHECK_TUNNEL_TSO() +AT_SKIP_IF([test $HAVE_NC = no]) OVS_CHECK_VXLAN() OVS_TRAFFIC_VSWITCHD_START() @@ -315,21 +315,39 @@ ADD_NATIVE_TUNNEL([vxlan], [at_vxlan1], [at_ns0], [172.31.1.100], [10.1.1.1/24], [id 0 dstport 4789]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the overlay with different packet sizes 
-NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) +dnl Start ncat listeners. +OVS_DAEMONIZE([nc -l 10.1.1.100 1234 > tcp_data], [nc.pid]) +NETNS_DAEMONIZE([at_ns0], [nc -l -u 10.1.1.1 4321 > udp_data], [nc2.pid]) + +dnl Verify that ncat is ready. +OVS_WAIT_UNTIL([netstat -ln | grep :1234]) +OVS_WAIT_UNTIL([NS_EXEC([at_ns0], [netstat -ln | grep :4321])]) + +dnl Check large bidirectional TCP. +AT_CHECK([dd if=/dev/urandom of=payload.bin bs=60000 count=1 2> /dev/null]) +NS_CHECK_EXEC([at_ns0], [nc $NC_EOF_OPT 10.1.1.100 1234 < payload.bin]) +OVS_WAIT_UNTIL([diff -q payload.bin tcp_data]) + +dnl Check UDP. 
+AT_CHECK([dd if=/dev/urandom of=payload.bin bs=600 count=1 2> /dev/null]) +AT_CHECK([nc $NC_EOF_OPT -u 10.1.1.1 4321 < payload.bin]) +OVS_WAIT_UNTIL([diff -q payload.bin udp_data]) + OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP @@ -363,17 +381,17 @@ ADD_VLAN(at_vxlan1, at_ns0, 100, "10.1.1.1/24") ADD_VLAN(p0, at_ns0, 42, "172.31.1.1/24") dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the overlay with different packet sizes -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -381,7 +399,7 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over vxlan6 tunnel]) -OVS_CHECK_TUNNEL_TSO() +AT_SKIP_IF([test $HAVE_NC = no]) OVS_CHECK_VXLAN_UDP6ZEROCSUM() OVS_TRAFFIC_VSWITCHD_START() @@ -406,26 +424,43 @@ ADD_NATIVE_TUNNEL6([vxlan], [at_vxlan1], [at_ns0], [fc00::100], [10.1.1.1/24], OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::100]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00::100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00::100 | FORMAT_PING], [0], 
[dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the overlay with different packet sizes -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) +dnl Start ncat listeners. +OVS_DAEMONIZE([nc -l 10.1.1.100 1234 > tcp_data], [nc.pid]) +NETNS_DAEMONIZE([at_ns0], [nc -l -u 10.1.1.1 4321 > udp_data], [nc2.pid]) + +dnl Verify that ncat is ready. +OVS_WAIT_UNTIL([netstat -ln | grep :1234]) +OVS_WAIT_UNTIL([NS_EXEC([at_ns0], [netstat -ln | grep :4321])]) + +dnl Check large bidirectional TCP. +AT_CHECK([dd if=/dev/urandom of=payload.bin bs=60000 count=1 2> /dev/null]) +NS_CHECK_EXEC([at_ns0], [nc $NC_EOF_OPT 10.1.1.100 1234 < payload.bin]) +OVS_WAIT_UNTIL([diff -q payload.bin tcp_data]) + +dnl Check UDP. 
+AT_CHECK([dd if=/dev/urandom of=payload.bin bs=600 count=1 2> /dev/null]) +AT_CHECK([nc $NC_EOF_OPT -u 10.1.1.1 4321 < payload.bin]) +OVS_WAIT_UNTIL([diff -q payload.bin udp_data]) + OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over gre tunnel]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_KERNEL_EXCL(3, 10, 4, 15) OVS_CHECK_GRE() @@ -448,26 +483,43 @@ ADD_OVS_TUNNEL([gre], [br0], [at_gre0], [172.31.1.1], [10.1.1.100/24]) ADD_NATIVE_TUNNEL([gretap], [ns_gre0], [at_ns0], [172.31.1.100], [10.1.1.1/24]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the overlay with different packet sizes -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) +dnl Start ncat listeners. +OVS_DAEMONIZE([nc -l 10.1.1.100 1234 > tcp_data], [nc.pid]) +NETNS_DAEMONIZE([at_ns0], [nc -l -u 10.1.1.1 4321 > udp_data], [nc2.pid]) + +dnl Verify that ncat is ready. +OVS_WAIT_UNTIL([netstat -ln | grep :1234]) +OVS_WAIT_UNTIL([NS_EXEC([at_ns0], [netstat -ln | grep :4321])]) + +dnl Check large bidirectional TCP. 
+AT_CHECK([dd if=/dev/urandom of=payload.bin bs=60000 count=1 2> /dev/null]) +NS_CHECK_EXEC([at_ns0], [nc $NC_EOF_OPT 10.1.1.100 1234 < payload.bin]) +OVS_WAIT_UNTIL([diff -q payload.bin tcp_data]) + +dnl Check UDP. +AT_CHECK([dd if=/dev/urandom of=payload.bin bs=600 count=1 2> /dev/null]) +AT_CHECK([nc $NC_EOF_OPT -u 10.1.1.1 4321 < payload.bin]) +OVS_WAIT_UNTIL([diff -q payload.bin udp_data]) + OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over ip6gre L2 tunnel]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_KERNEL_EXCL(3, 10, 4, 15) OVS_CHECK_GRE() OVS_CHECK_ERSPAN() @@ -495,12 +547,12 @@ ADD_NATIVE_TUNNEL6([ip6gretap], [ns_gretap0], [at_ns0], [fc00:100::100], OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 2 fc00:100::100]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00:100::100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00:100::100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the overlay with different packet sizes -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) OVS_TRAFFIC_VSWITCHD_STOP @@ -508,7 +560,6 @@ AT_CLEANUP AT_SETUP([datapath - ping over erspan v1 tunnel]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_KERNEL_EXCL(3, 10, 4, 15) OVS_CHECK_GRE() OVS_CHECK_ERSPAN() @@ -532,12 +583,12 @@ ADD_OVS_TUNNEL([erspan], [br0], [at_erspan0], [172.31.1.1], [10.1.1.100/24], [op ADD_NATIVE_TUNNEL([erspan], [ns_erspan0], [at_ns0], [172.31.1.100], [10.1.1.1/24], [seq key 1 erspan_ver 1 erspan 7]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet 
loss, time 0ms ]) dnl Okay, now check the overlay with different packet sizes -dnl NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +dnl NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl NS_CHECK_EXEC([at_ns0], [ping -s 1200 -i 0.3 -c 3 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -545,7 +596,6 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over erspan v2 tunnel]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_KERNEL_EXCL(3, 10, 4, 15) OVS_CHECK_GRE() OVS_CHECK_ERSPAN() @@ -569,12 +619,12 @@ ADD_OVS_TUNNEL([erspan], [br0], [at_erspan0], [172.31.1.1], [10.1.1.100/24], [op ADD_NATIVE_TUNNEL([erspan], [ns_erspan0], [at_ns0], [172.31.1.100], [10.1.1.1/24], [seq key 1 erspan_ver 2 erspan_dir egress erspan_hwid 7]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the overlay with different packet sizes -dnl NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +dnl NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl NS_CHECK_EXEC([at_ns0], [ping -s 1200 -i 0.3 -c 3 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -582,7 +632,6 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over ip6erspan v1 tunnel]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_KERNEL_EXCL(3, 10, 4, 15) OVS_CHECK_GRE() OVS_CHECK_ERSPAN() @@ -610,19 +659,18 @@ ADD_NATIVE_TUNNEL6([ip6erspan], [ns_erspan0], [at_ns0], [fc00:100::100], OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 2 fc00:100::100]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00:100::100 | 
FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00:100::100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the overlay with different packet sizes -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over ip6erspan v2 tunnel]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_KERNEL_EXCL(3, 10, 4, 15) OVS_CHECK_GRE() OVS_CHECK_ERSPAN() @@ -651,19 +699,19 @@ ADD_NATIVE_TUNNEL6([ip6erspan], [ns_erspan0], [at_ns0], [fc00:100::100], OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 2 fc00:100::100]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00:100::100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00:100::100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the overlay with different packet sizes -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over geneve tunnel]) -OVS_CHECK_TUNNEL_TSO() +AT_SKIP_IF([test $HAVE_NC = no]) OVS_CHECK_GENEVE() OVS_TRAFFIC_VSWITCHD_START() @@ -686,26 +734,43 @@ ADD_NATIVE_TUNNEL([geneve], [ns_gnv0], [at_ns0], [172.31.1.100], [10.1.1.1/24], [vni 0]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the 
overlay with different packet sizes -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) +dnl Start ncat listeners. +OVS_DAEMONIZE([nc -l 10.1.1.100 1234 > tcp_data], [nc.pid]) +NETNS_DAEMONIZE([at_ns0], [nc -l -u 10.1.1.1 4321 > udp_data], [nc2.pid]) + +dnl Verify that ncat is ready. +OVS_WAIT_UNTIL([netstat -ln | grep :1234]) +OVS_WAIT_UNTIL([NS_EXEC([at_ns0], [netstat -ln | grep :4321])]) + +dnl Check large bidirectional TCP. +AT_CHECK([dd if=/dev/urandom of=payload.bin bs=60000 count=1 2> /dev/null]) +NS_CHECK_EXEC([at_ns0], [nc $NC_EOF_OPT 10.1.1.100 1234 < payload.bin]) +OVS_WAIT_UNTIL([diff -q payload.bin tcp_data]) + +dnl Check UDP. 
+AT_CHECK([dd if=/dev/urandom of=payload.bin bs=600 count=1 2> /dev/null]) +AT_CHECK([nc $NC_EOF_OPT -u 10.1.1.1 4321 < payload.bin]) +OVS_WAIT_UNTIL([diff -q payload.bin udp_data]) + OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over geneve tunnel, delete flow regression]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_GENEVE() OVS_TRAFFIC_VSWITCHD_START() @@ -739,12 +804,12 @@ ADD_NATIVE_TUNNEL([geneve], [ns_gnv0], [at_ns0], [172.31.1.100], [10.1.1.1/24], [vni 0]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl ping over tunnel should work -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -760,7 +825,6 @@ OVS_TRAFFIC_VSWITCHD_STOP(["/|ERR|/d AT_CLEANUP AT_SETUP([datapath - flow resume with geneve tun_metadata]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_GENEVE() OVS_TRAFFIC_VSWITCHD_START() @@ -812,7 +876,7 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over geneve6 tunnel]) -OVS_CHECK_TUNNEL_TSO() +AT_SKIP_IF([test $HAVE_NC = no]) OVS_CHECK_GENEVE_UDP6ZEROCSUM() OVS_TRAFFIC_VSWITCHD_START() @@ -837,26 +901,199 @@ ADD_NATIVE_TUNNEL6([geneve], [ns_gnv0], [at_ns0], [fc00::100], [10.1.1.1/24], OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::100]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00::100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00::100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the overlay with different packet sizes -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | 
FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) +dnl Start ncat listeners. +OVS_DAEMONIZE([nc -l 10.1.1.100 1234 > tcp_data], [nc.pid]) +NETNS_DAEMONIZE([at_ns0], [nc -l -u 10.1.1.1 4321 > udp_data], [nc2.pid]) + +dnl Verify that ncat is ready. +OVS_WAIT_UNTIL([netstat -ln | grep :1234]) +OVS_WAIT_UNTIL([NS_EXEC([at_ns0], [netstat -ln | grep :4321])]) + +dnl Check large bidirectional TCP. +AT_CHECK([dd if=/dev/urandom of=payload.bin bs=60000 count=1 2> /dev/null]) +NS_CHECK_EXEC([at_ns0], [nc $NC_EOF_OPT 10.1.1.100 1234 < payload.bin]) +OVS_WAIT_UNTIL([diff -q payload.bin tcp_data]) + +dnl Check UDP. +AT_CHECK([dd if=/dev/urandom of=payload.bin bs=600 count=1 2> /dev/null]) +AT_CHECK([nc $NC_EOF_OPT -u 10.1.1.1 4321 < payload.bin]) +OVS_WAIT_UNTIL([diff -q payload.bin udp_data]) + OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP -AT_SETUP([datapath - ping over gre tunnel by simulated packets]) +AT_SETUP([datapath - slow_action on geneve6 tunnel]) +AT_SKIP_IF([test $HAVE_TCPDUMP = no]) +OVS_CHECK_GENEVE_UDP6ZEROCSUM() + +OVS_TRAFFIC_VSWITCHD_START() +ADD_BR([br-underlay]) + +AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"]) +AT_CHECK([ovs-ofctl add-flow br-underlay "actions=normal"]) + +ADD_NAMESPACES(at_ns0) + +dnl Set up underlay link from host into the namespace using veth pair. 
+ADD_VETH(p0, at_ns0, br-underlay, "fc00::1/64", [], [], "nodad") +AT_CHECK([ip addr add dev br-underlay "fc00::100/64" nodad]) +AT_CHECK([ip link set dev br-underlay up]) + +dnl Set up tunnel endpoints on OVS outside the namespace and with a native +dnl linux device inside the namespace. +ADD_OVS_TUNNEL6([geneve], [br0], [at_gnv0], [fc00::1], [10.1.1.100/24]) +ADD_NATIVE_TUNNEL6([geneve], [ns_gnv0], [at_ns0], [fc00::100], [10.1.1.1/24], + [vni 0 udp6zerocsumtx udp6zerocsumrx]) +AT_CHECK([ovs-ofctl add-flow br0 "table=37,actions=at_gnv0"]) + +OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::100]) + +dnl First, check the underlay. +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00::100 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Start tcpdump to capture the encapsulated packets. +NETNS_DAEMONIZE([at_ns0], [tcpdump -n -U -i p0 -w p0.pcap], [tcpdump.pid]) +sleep 1 + +dnl Generate a single packet through the controller that needs an ARP modification +AT_CHECK([ovs-ofctl -O OpenFlow15 packet-out br0 "in_port=controller packet=fffffffffffffa163e949d8008060001080006040001fa163e949d80c0a820300000000000000a0000fe actions=set_field:0xa0000f4->reg1,move:NXM_NX_XXREG0[[64..95]]->NXM_OF_ARP_SPA[[]],resubmit(,37)"]) +sleep 1 + +dnl Stop OVS and tcpdump and verify the results. 
+OVS_TRAFFIC_VSWITCHD_STOP + +ovs-pcap p0.pcap + +AT_CHECK([ovs-pcap p0.pcap | grep -Eq "^[[[:xdigit:]]]{24}86dd60000000003a1140fc000000000000000000000000000100fc000000000000000000000000000001[[[:xdigit:]]]{4}17c1003a[[[:xdigit:]]]{4}0000655800000000fffffffffffffa163e949d8008060001080006040001[[[:xdigit:]]]{12}0a0000f40000000000000a0000fe$"]) +AT_CLEANUP + +AT_SETUP([datapath - bridging two geneve tunnels]) OVS_CHECK_TUNNEL_TSO() +OVS_CHECK_GENEVE() + +OVS_TRAFFIC_VSWITCHD_START() +ADD_BR([br-underlay-0]) +ADD_BR([br-underlay-1]) + +ADD_NAMESPACES(at_ns0) +ADD_NAMESPACES(at_ns1) + +dnl Set up underlay link from host into the namespaces using veth pairs. +ADD_VETH(p0, at_ns0, br-underlay-0, "172.31.1.1/24") +AT_CHECK([ip addr add dev br-underlay-0 "172.31.1.100/24"]) +AT_CHECK([ip link set dev br-underlay-0 up]) + +ADD_VETH(p1, at_ns1, br-underlay-1, "172.31.2.1/24") +AT_CHECK([ip addr add dev br-underlay-1 "172.31.2.100/24"]) +AT_CHECK([ip link set dev br-underlay-1 up]) + +dnl Set up two OVS tunnel endpoints in a root namespace and two native +dnl linux devices inside the test namespaces. 
+dnl +dnl ns_gnv0 | ns_gnv1 +dnl ip: 10.1.1.1/24 | ip: 10.1.1.2/24 +dnl remote_ip: 172.31.1.100 | remote_ip: 172.31.2.100 +dnl | | | +dnl | | | +dnl p0 | p1 +dnl ip: 172.31.1.1/24 | ip: 172.31.2.1/24 +dnl | NS0 | NS1 | +dnl ---------|------------------------+------------------|-------------------- +dnl | | +dnl br-underlay-0: br-underlay-1: +dnl ip: 172.31.1.100/24 ip: 172.31.2.100/24 +dnl ovs-p0 ovs-p1 +dnl | | +dnl | br0 | +dnl encap/decap --- ip: 10.1.1.100/24 --------- encap/decap +dnl at_gnv0 +dnl remote_ip: 172.31.1.1 +dnl at_gnv1 +dnl remote_ip: 172.31.2.1 +dnl +ADD_OVS_TUNNEL([geneve], [br0], [at_gnv0], [172.31.1.1], [10.1.1.100/24]) +ADD_NATIVE_TUNNEL([geneve], [ns_gnv0], [at_ns0], [172.31.1.100], [10.1.1.1/24], + [vni 0]) +ADD_OVS_TUNNEL([geneve], [br0], [at_gnv1], [172.31.2.1], [10.1.1.101/24]) +ADD_NATIVE_TUNNEL([geneve], [ns_gnv1], [at_ns1], [172.31.2.100], [10.1.1.2/24], + [vni 0]) + +AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"]) +AT_CHECK([ovs-ofctl add-flow br-underlay-0 "actions=normal"]) +AT_CHECK([ovs-ofctl add-flow br-underlay-1 "actions=normal"]) + +dnl First, check both underlays. +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) +NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -W 2 172.31.2.100 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Now, check the overlay with different packet sizes. 
+NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([datapath - handling of geneve corrupted metadata]) +OVS_CHECK_GENEVE() + +OVS_TRAFFIC_VSWITCHD_START( + [_ADD_BR([br-underlay]) -- \ + set bridge br0 other-config:hwaddr=f2:ff:00:00:00:01 -- \ + set bridge br-underlay other-config:hwaddr=f2:ff:00:00:00:02]) + +AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"]) +AT_CHECK([ovs-ofctl add-flow br-underlay "actions=normal"]) + +ADD_NAMESPACES(at_ns0) + +dnl Set up underlay link from host into the namespace using veth pair. +ADD_VETH(p0, at_ns0, br-underlay, "172.31.1.1/24", f2:ff:00:00:00:03) +AT_CHECK([ip addr add dev br-underlay "172.31.1.100/24"]) +AT_CHECK([ip link set dev br-underlay up]) + +dnl Set up tunnel endpoints on OVS outside the namespace and with a native +dnl linux device inside the namespace. 
+ADD_OVS_TUNNEL([geneve], [br0], [at_gnv0], [172.31.1.1], [10.1.1.100/24]) +ADD_NATIVE_TUNNEL([geneve], [ns_gnv0], [at_ns0], [172.31.1.100], [10.1.1.1/24], + [vni 0], [address f2:ff:00:00:00:04]) + +NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 f2 ff 00 00 00 02 f2 ff 00 00 00 03 08 00 45 00 00 52 00 01 00 00 40 11 1f f7 ac 1f 01 01 ac 1f 01 64 de c1 17 c1 00 3e 59 e9 01 00 65 58 00 00 00 00 00 03 00 02 f2 ff 00 00 00 01 f2 ff 00 00 00 04 08 00 45 00 00 1c 00 01 00 00 40 01 64 7a 0a 01 01 01 0a 01 01 64 08 00 f7 ff 00 00 00 00 > /dev/null]) + +OVS_WAIT_UNTIL([grep -q 'Invalid Geneve tunnel metadata' ovs-vswitchd.log]) + +OVS_TRAFFIC_VSWITCHD_STOP(["/Invalid Geneve tunnel metadata on bridge br0 while processing icmp,in_port=1,vlan_tci=0x0000,dl_src=f2:ff:00:00:00:04,dl_dst=f2:ff:00:00:00:01,nw_src=10.1.1.1,nw_dst=10.1.1.100,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,icmp_type=8,icmp_code=0/d +/Unable to parse geneve options/d"]) +AT_CLEANUP + +AT_SETUP([datapath - ping over gre tunnel by simulated packets]) OVS_CHECK_MIN_KERNEL(3, 10) OVS_TRAFFIC_VSWITCHD_START() @@ -882,7 +1119,7 @@ NETNS_DAEMONIZE([at_ns0], [tcpdump -n -i p0 dst host 172.31.1.1 -l > p0.pcap 2>/ sleep 1 dnl First, check the underlay. 
-NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -903,7 +1140,6 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over erspan v1 tunnel by simulated packets]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_MIN_KERNEL(3, 10) OVS_TRAFFIC_VSWITCHD_START() @@ -930,7 +1166,7 @@ NETNS_DAEMONIZE([at_ns0], [tcpdump -n -x -i p0 dst host 172.31.1.1 -l > p0.pcap sleep 1 dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -952,7 +1188,6 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over erspan v2 tunnel by simulated packets]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_MIN_KERNEL(3, 10) OVS_TRAFFIC_VSWITCHD_START() @@ -983,7 +1218,7 @@ NETNS_DAEMONIZE([at_ns0], [tcpdump -n -x -i p0 dst host 172.31.1.1 -l > p0.pcap sleep 1 dnl First, check the underlay. -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -1006,7 +1241,6 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over ip6erspan v1 tunnel by simulated packets]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_MIN_KERNEL(3, 10) OVS_TRAFFIC_VSWITCHD_START() @@ -1040,7 +1274,7 @@ NETNS_DAEMONIZE([at_ns0], [tcpdump -n -x -i p0 dst host fc00:100::1 -l > p0.pcap sleep 1 dnl First, check the underlay. 
-NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00:100::100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00:100::100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -1062,7 +1296,6 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([datapath - ping over ip6erspan v2 tunnel by simulated packets]) -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_MIN_KERNEL(3, 10) OVS_TRAFFIC_VSWITCHD_START() @@ -1096,7 +1329,7 @@ NETNS_DAEMONIZE([at_ns0], [tcpdump -n -x -i p0 dst host fc00:100::1 -l > p0.pcap sleep 1 dnl First, check the underlay. -NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00:100::100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00:100::100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -1116,6 +1349,128 @@ OVS_WAIT_UNTIL([cat p0.pcap | grep -E "IP6 fc00:100::100 > fc00:100::1: GREv0, . OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([datapath - ping over srv6 tunnel]) +OVS_CHECK_SRV6() + +OVS_TRAFFIC_VSWITCHD_START() + +ADD_NAMESPACES(at_ns0) +ADD_NAMESPACES(at_ns1) +NS_EXEC([at_ns0], [sysctl -w net.ipv6.conf.default.seg6_enabled=1]) +NS_EXEC([at_ns0], [sysctl -w net.ipv4.conf.default.forwarding=1]) +NS_EXEC([at_ns0], [sysctl -w net.ipv6.conf.default.forwarding=1]) +NS_EXEC([at_ns0], [sysctl -w net.ipv6.conf.all.seg6_enabled=1]) +NS_EXEC([at_ns0], [sysctl -w net.ipv4.conf.all.forwarding=1]) +NS_EXEC([at_ns0], [sysctl -w net.ipv6.conf.all.forwarding=1]) + +dnl Set up underlay link from host into the namespace 'at_ns0' +dnl using veth pair. Kernel side tunnel endpoint (SID) is +dnl 'fc00:a::1/128', so add it to the route. +dnl Only IPPROTO_IPIP(4) and IPPROTO_ICMPV6(58) are needed in underlay link. 
+ADD_BR([br-underlay]) +ADD_VETH(p0, at_ns0, br-underlay, "fc00::1/64", [], [], "nodad") +AT_CHECK([ovs-ofctl add-flow br-underlay "priority=1,actions=drop"]) +AT_CHECK([ovs-ofctl add-flow br-underlay "priority=100,ipv6,nw_proto=4,actions=normal"]) +AT_CHECK([ovs-ofctl add-flow br-underlay "priority=100,ipv6,nw_proto=58,actions=normal"]) +AT_CHECK([ip addr add dev br-underlay "fc00::100/64" nodad]) +AT_CHECK([ip link set dev br-underlay up]) +AT_CHECK([ip route add fc00:a::1/128 dev br-underlay via fc00::1]) + +dnl Set up tunnel endpoints on OVS outside the namespace. +ADD_OVS_TUNNEL6([srv6], [br0], [at_srv6], [fc00:a::1], [10.100.100.100/24]) +AT_CHECK([ovs-vsctl set bridge br0 other_config:hwaddr=aa:55:aa:55:00:00]) +AT_CHECK([ip route add 10.1.1.0/24 dev br0 via 10.100.100.1]) +AT_CHECK([arp -s 10.100.100.1 aa:55:aa:55:00:01]) +AT_CHECK([ovs-ofctl add-flow br0 in_port=LOCAL,actions=output:at_srv6]) +AT_CHECK([ovs-ofctl add-flow br0 in_port=at_srv6,actions=mod_dl_dst:aa:55:aa:55:00:00,output:LOCAL]) + +dnl Set up tunnel endpoints on the namespace 'at_ns0', +dnl and overlay port on the namespace 'at_ns1' +ADD_VETH_NS([at_ns0], [ovs-veth0], [10.1.1.2/24], [at_ns1], [ovs-veth1], [10.1.1.1/24]) +NS_CHECK_EXEC([at_ns0], [ip sr tunsrc set fc00:a::1]) +NS_CHECK_EXEC([at_ns0], [ip route add 10.100.100.0/24 encap seg6 mode encap segs fc00::100 dev p0]) +NS_CHECK_EXEC([at_ns0], [ip -6 route add fc00:a::1 encap seg6local action End.DX4 nh4 0.0.0.0 dev ovs-veth0]) +NS_CHECK_EXEC([at_ns1], [ip route add 10.100.100.0/24 via 10.1.1.2 dev ovs-veth1]) + +dnl Linux seems to take a little time to get its IPv6 stack in order. Without +dnl waiting, we get occasional failures due to the following error: +dnl "connect: Cannot assign requested address" +OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::100]) + +dnl First, check the underlay. 
+NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00::100 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Okay, now check the overlay. +NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -W 2 10.100.100.100 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([datapath - ping6 over srv6 tunnel]) +OVS_CHECK_SRV6() + +OVS_TRAFFIC_VSWITCHD_START() + +ADD_NAMESPACES(at_ns0) +ADD_NAMESPACES(at_ns1) +NS_EXEC([at_ns0], [sysctl -w net.ipv6.conf.default.seg6_enabled=1]) +NS_EXEC([at_ns0], [sysctl -w net.ipv6.conf.default.forwarding=1]) +NS_EXEC([at_ns0], [sysctl -w net.ipv6.conf.all.seg6_enabled=1]) +NS_EXEC([at_ns0], [sysctl -w net.ipv6.conf.all.forwarding=1]) + +dnl Set up underlay link from host into the namespace 'at_ns0' +dnl using veth pair. Kernel side tunnel endpoint (SID) is +dnl 'fc00:a::1/128', so add it to the route. +dnl Only IPPROTO_IPV6(41) and IPPROTO_ICMPV6(58) are needed in underlay link. +ADD_BR([br-underlay]) +ADD_VETH(p0, at_ns0, br-underlay, "fc00::1/64", [], [], "nodad") +AT_CHECK([ovs-ofctl add-flow br-underlay "priority=1,actions=drop"]) +AT_CHECK([ovs-ofctl add-flow br-underlay "priority=100,ipv6,nw_proto=41,actions=normal"]) +AT_CHECK([ovs-ofctl add-flow br-underlay "priority=100,ipv6,nw_proto=58,actions=normal"]) +AT_CHECK([ip addr add dev br-underlay "fc00::100/64" nodad]) +AT_CHECK([ip link set dev br-underlay up]) +AT_CHECK([ip -6 route add fc00:a::1/128 dev br-underlay via fc00::1]) + +dnl Set up tunnel endpoints on OVS outside the namespace. 
+ADD_OVS_TUNNEL6([srv6], [br0], [at_srv6], [fc00:a::1], [fc00:100::100/64]) +AT_CHECK([ovs-vsctl set bridge br0 other_config:hwaddr=aa:55:aa:55:00:00]) +AT_CHECK([ip addr add dev br0 fc00:100::100/64]) +AT_CHECK([ip -6 route add fc00:1::1/128 dev br0 via fc00:100::1]) +AT_CHECK([ip -6 neigh add fc00:100::1 lladdr aa:55:aa:55:00:01 dev br0]) +AT_CHECK([ovs-ofctl add-flow br0 in_port=LOCAL,actions=output:at_srv6]) +AT_CHECK([ovs-ofctl add-flow br0 in_port=at_srv6,actions=mod_dl_dst:aa:55:aa:55:00:00,output:LOCAL]) + +dnl Set up tunnel endpoints on the namespace 'at_ns0', +dnl and overlay port on the namespace 'at_ns1' +ADD_VETH_NS([at_ns0], [ovs-veth0], [fc00:1::2/64], [at_ns1], [ovs-veth1], [fc00:1::1/64]) +NS_CHECK_EXEC([at_ns0], [ip sr tunsrc set fc00:a::1]) +NS_CHECK_EXEC([at_ns0], [ip -6 route add fc00:100::0/64 encap seg6 mode encap segs fc00::100 dev p0]) +NS_CHECK_EXEC([at_ns0], [ip -6 route add fc00:a::1 encap seg6local action End.DX6 nh6 :: dev ovs-veth0]) +NS_CHECK_EXEC([at_ns1], [ip -6 route add fc00:100::/64 via fc00:1::2 dev ovs-veth1]) + +dnl Linux seems to take a little time to get its IPv6 stack in order. Without +dnl waiting, we get occasional failures due to the following error: +dnl "connect: Cannot assign requested address" +OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::100]) +OVS_WAIT_UNTIL([ip netns exec at_ns1 ping6 -c 1 fc00:100::100]) + +dnl First, check the underlay. +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00::100 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Okay, now check the overlay. 
+NS_CHECK_EXEC([at_ns1], [ping6 -q -c 3 -i 0.3 -W 2 fc00:100::100 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + AT_SETUP([datapath - clone action]) OVS_TRAFFIC_VSWITCHD_START() @@ -1135,7 +1490,7 @@ priority=10 in_port=2,ip,actions=clone(mod_dl_src(ae:c6:7e:54:8d:4d),mod_dl_dst( AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -1167,8 +1522,8 @@ AT_CHECK([ovs-vsctl add-port br0 patch0]) AT_CHECK([ovs-vsctl add-port br1 patch1]) AT_DATA([flows.txt], [dnl -table=0,priority=100,dl_type=0x0800 actions=push_mpls:0x8847,set_mpls_label:3,resubmit(,1) -table=0,priority=100,dl_type=0x8847,mpls_label=3 actions=pop_mpls:0x0800,resubmit(,1) +table=0,priority=100,dl_type=0x0800 actions=push_mpls:0x8847,set_mpls_label:4,resubmit(,1) +table=0,priority=100,dl_type=0x8847,mpls_label=4 actions=pop_mpls:0x0800,resubmit(,1) table=0,priority=10 actions=resubmit(,1) table=1,priority=10 actions=normal ]) @@ -1176,11 +1531,11 @@ table=1,priority=10 actions=normal AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) AT_CHECK([ovs-ofctl add-flows br1 flows.txt]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -w 2 10.1.1.1 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -W 2 10.1.1.1 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -1204,10 +1559,10 @@ AT_CHECK([ovs-vsctl add-port br0 
patch0]) AT_CHECK([ovs-vsctl add-port br1 patch1]) AT_DATA([flows.txt], [dnl -table=0,priority=100,dl_type=0x0800 actions=push_mpls:0x8847,set_mpls_label:3,push_mpls:0x8847,set_mpls_label:2,push_mpls:0x8847,set_mpls_label:1,resubmit(,3) +table=0,priority=100,dl_type=0x0800 actions=push_mpls:0x8847,set_mpls_label:4,push_mpls:0x8847,set_mpls_label:2,push_mpls:0x8847,set_mpls_label:1,resubmit(,3) table=0,priority=100,dl_type=0x8847,mpls_label=1 actions=pop_mpls:0x8847,resubmit(,1) table=1,priority=100,dl_type=0x8847,mpls_label=2 actions=pop_mpls:0x8847,resubmit(,2) -table=2,priority=100,dl_type=0x8847,mpls_label=3 actions=pop_mpls:0x0800,resubmit(,3) +table=2,priority=100,dl_type=0x8847,mpls_label=4 actions=pop_mpls:0x0800,resubmit(,3) table=0,priority=10 actions=resubmit(,3) table=3,priority=10 actions=normal ]) @@ -1215,11 +1570,11 @@ table=3,priority=10 actions=normal AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) AT_CHECK([ovs-ofctl add-flows br1 flows.txt]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -w 2 10.1.1.1 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -W 2 10.1.1.1 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) OVS_TRAFFIC_VSWITCHD_STOP @@ -1591,6 +1946,7 @@ dnl ns0: connect to br-underlay, with IP: 10.1.1.1 AT_SETUP([datapath - truncate and output to gre tunnel by simulated packets]) OVS_CHECK_MIN_KERNEL(3, 10) AT_SKIP_IF([test $HAVE_NC = no]) +CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() ADD_BR([br-underlay], [set bridge br-underlay other-config:hwaddr=\"02:90:8c:a8:a1:49\"]) @@ -1660,9 +2016,8 @@ AT_CHECK([ovs-ofctl dump-flows br0 | grep "in_port=2" | sed -n 's/.*\(n\_bytes=[ n_bytes=242 ]) dnl After truncation = outer ETH(14) + outer IP(20) + 
GRE(4) + 100 = 138B -AT_CHECK([ovs-ofctl dump-flows br-underlay | grep "in_port=LOCAL" | sed -n 's/.*\(n\_bytes=[[0-9]]*\).*/\1/p'], [0], [dnl -n_bytes=138 -]) +OVS_WAIT_UNTIL_EQUAL([ovs-ofctl dump-flows br-underlay | grep "in_port=LOCAL" | sed -n 's/.*\(n\_bytes=[[0-9]]*\).*/\1/p'], [dnl +n_bytes=138]) dnl check tunnel pop path, from at_ns0 to at_ns1 dnl This 200-byte packet is simulated on behalf of ns_gre0 @@ -1670,9 +2025,9 @@ ovs-ofctl -O OpenFlow13 packet-out br-underlay "in_port=1 packet=02908ca8a149faa dnl After truncation = 100 byte at loopback device p2(4) AT_CHECK([ovs-appctl revalidator/purge], [0]) -AT_CHECK([ovs-ofctl dump-flows br0 | grep "in_port=4" | ofctl_strip], [0], [dnl - n_packets=1, n_bytes=100, priority=1,ip,in_port=4 actions=drop -]) +OVS_WAIT_UNTIL_EQUAL([ovs-ofctl dump-flows br0 | grep "in_port=4" | ofctl_strip], [dnl + n_packets=1, n_bytes=100, priority=1,ip,in_port=4 actions=drop]) + dnl SLOW_ACTION: disable datapath truncate support dnl Repeat the test above, but exercise the SLOW_ACTION code path @@ -1697,9 +2052,8 @@ AT_CHECK([ovs-ofctl dump-flows br0 | grep "in_port=2" | sed -n 's/.*\(n\_bytes=[ n_bytes=242 ]) dnl After truncation = outer ETH(14) + outer IP(20) + GRE(4) + 100 = 138B -AT_CHECK([ovs-ofctl dump-flows br-underlay | grep "in_port=LOCAL" | sed -n 's/.*\(n\_bytes=[[0-9]]*\).*/\1/p'], [0], [dnl -n_bytes=138 -]) +OVS_WAIT_UNTIL_EQUAL([ovs-ofctl dump-flows br-underlay | grep "in_port=LOCAL" | sed -n 's/.*\(n\_bytes=[[0-9]]*\).*/\1/p'], [dnl +n_bytes=138]) dnl check tunnel pop path, from at_ns0 to at_ns1 dnl This 200-byte packet is simulated on behalf of ns_gre0 @@ -1724,6 +2078,7 @@ AT_SETUP([datapath - truncate and output to gre tunnel]) AT_SKIP_IF([test $HAVE_NC = no]) OVS_CHECK_KERNEL_EXCL(3, 10, 4, 15) OVS_CHECK_GRE() +CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() ADD_BR([br-underlay]) @@ -1876,78 +2231,301 @@ masks-cache:size:256 OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP -AT_BANNER([MPLS]) - -AT_SETUP([mpls - encap 
header dp-support]) -AT_SKIP_IF([test $HAVE_TCPDUMP = no]) +AT_SETUP([datapath - drop action]) OVS_TRAFFIC_VSWITCHD_START() - -AT_SKIP_IF([! ovs-appctl dpif/show-dp-features br0 2>&1 | grep "MPLS Label add: Yes" >/dev/null]) +OVS_CHECK_DROP_ACTION() +AT_KEYWORDS(drop_action) ADD_NAMESPACES(at_ns0, at_ns1) -ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24", 36:b1:ee:7c:01:03) -ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24", 36:b1:ee:7c:01:02) - -dnl The flow will encap a mpls header to the ip packet -dnl eth/ip/icmp --> OVS --> eth/mpls/eth/ip/icmp -AT_CHECK([ovs-ofctl -Oopenflow13 add-flow br0 "table=0,priority=100,dl_type=0x0800 actions=encap(mpls),set_mpls_label:2,encap(ethernet),set_field:00:00:00:00:00:02->dl_dst,set_field:00:00:00:00:00:01->dl_src,ovs-p1"]) +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") -rm -rf p1.pcap -NETNS_DAEMONIZE([at_ns1], [tcpdump -l -n -xx -U -i p1 > p1.pcap], [tcpdump.pid]) -sleep 1 +dnl Exceed the max number of resubmits. +(echo "dl_type=0x806, actions=normal" +for i in $(seq 1 64); do + j=$(expr $i + 1) + echo "in_port=$i, actions=resubmit:$j, resubmit:$j, local" + done + echo "in_port=65, actions=local" +) > flows.txt +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) -dnl The hex dump is a icmp packet. pkt=eth/ip/icmp -dnl The packet is sent from p0(at_ns0) interface directed to -dnl p1(at_ns1) interface -NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 36 b1 ee 7c 01 02 36 b1 ee 7c 01 03 08 00 45 00 00 54 03 44 40 00 40 01 21 61 0a 01 01 01 0a 01 01 02 08 00 ef ac 7c e4 00 03 5b 2c 1f 61 00 00 00 00 50 0b 02 00 00 00 00 00 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f 30 31 32 33 34 35 36 37 > /dev/null]) +dnl Generate some traffic. 
+NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -w 2 10.1.1.2], [1], [ignore]) -dnl Check the expected mpls encapsulated packet on the egress interface -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0000: *0000 *0000 *0002 *0000 *0000 *0001 *8847 *0000" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0010: *2140 *36b1 *ee7c *0102 *36b1 *ee7c *0103 *0800" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0020: *4500 *0054 *0344 *4000 *4001 *2161 *0a01 *0101" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0030: *0a01 *0102 *0800 *efac *7ce4 *0003 *5b2c *1f61" 2>&1 1>/dev/null]) +OVS_WAIT_UNTIL_EQUAL([ovs-appctl dpctl/dump-flows | grep "eth_type(0x0800)" | dnl + strip_ptype | strip_eth | strip_recirc | strip_stats | dnl + strip_used | sort], [dnl +recirc_id(),in_port(2),eth_type(0x0800),ipv4(frag=no), packets:0, bytes:0, used:0.0s, actions:drop]) -OVS_TRAFFIC_VSWITCHD_STOP +OVS_TRAFFIC_VSWITCHD_STOP(["/WARN/d"]) AT_CLEANUP -AT_SETUP([mpls - encap header slow-path]) -AT_SKIP_IF([test $HAVE_TCPDUMP = no]) +AT_SETUP([datapath - simulated flow action update]) OVS_TRAFFIC_VSWITCHD_START() -AT_CHECK(ovs-appctl dpif/set-dp-features br0 add_mpls false) ADD_NAMESPACES(at_ns0, at_ns1) -ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24", 36:b1:ee:7c:01:03) -ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24", 36:b1:ee:7c:01:02) +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") -dnl The flow will encap a mpls header to the ip packet -dnl eth/ip/icmp --> OVS --> eth/mpls/eth/ip/icmp -AT_CHECK([ovs-ofctl -Oopenflow13 add-flow br0 "table=0,priority=100,dl_type=0x0800 actions=encap(mpls),set_mpls_label:2,encap(ethernet),set_field:00:00:00:00:00:02->dl_dst,set_field:00:00:00:00:00:01->dl_src,ovs-p1"]) +AT_DATA([flows.txt], [dnl +add in_port=ovs-p0,actions=ovs-p1,br0 +add in_port=ovs-p1,actions=ovs-p0,br0 +]) +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) -rm -rf p1.pcap -NETNS_DAEMONIZE([at_ns1], [tcpdump -l -n -xx -U -i p1 > p1.pcap], 
[tcpdump.pid]) -sleep 1 +NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl +10 packets transmitted, 10 received, 0% packet loss, time 0ms +]) -dnl The hex dump is a icmp packet. pkt=eth/ip/icmp -dnl The packet is sent from p0(at_ns0) interface directed to -dnl p1(at_ns1) interface -NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 36 b1 ee 7c 01 02 36 b1 ee 7c 01 03 08 00 45 00 00 54 03 44 40 00 40 01 21 61 0a 01 01 01 0a 01 01 02 08 00 ef ac 7c e4 00 03 5b 2c 1f 61 00 00 00 00 50 0b 02 00 00 00 00 00 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f 30 31 32 33 34 35 36 37 > /dev/null]) +AT_CHECK([ovs-appctl dpctl/dump-flows | grep "eth_type(0x0800)" | sort | dnl + strip_recirc | strip_used | dnl + sed 's/,packet_type(ns=[[0-9]]*,id=[[0-9]]*),/,/;s/,eth(),/,/;s/bytes:756/bytes:882/'], + [0], [dnl +recirc_id(),in_port(2),eth_type(0x0800),ipv4(frag=no), packets:9, bytes:882, used:0.0s, actions:3,1 +recirc_id(),in_port(3),eth_type(0x0800),ipv4(frag=no), packets:9, bytes:882, used:0.0s, actions:2,1 +]) -dnl Check the expected mpls encapsulated packet on the egress interface -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0000: *0000 *0000 *0002 *0000 *0000 *0001 *8847 *0000" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0010: *2140 *36b1 *ee7c *0102 *36b1 *ee7c *0103 *0800" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0020: *4500 *0054 *0344 *4000 *4001 *2161 *0a01 *0101" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0030: *0a01 *0102 *0800 *efac *7ce4 *0003 *5b2c *1f61" 2>&1 1>/dev/null]) +AT_DATA([flows2.txt], [dnl +modify in_port=ovs-p0,actions=ovs-p1 +modify in_port=ovs-p1,actions=ovs-p0 +]) +AT_CHECK([ovs-ofctl add-flows br0 flows2.txt]) +AT_CHECK([ovs-appctl revalidator/wait], [0]) -OVS_TRAFFIC_VSWITCHD_STOP -AT_CLEANUP +NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl +10 packets transmitted, 10 
received, 0% packet loss, time 0ms +]) -AT_SETUP([mpls_mc - encap header dp-support]) -AT_SKIP_IF([test $HAVE_TCPDUMP = no]) -OVS_TRAFFIC_VSWITCHD_START() +AT_CHECK([ovs-appctl dpctl/dump-flows | grep "eth_type(0x0800)" | sort | dnl + strip_recirc | strip_used | dnl + sed -e 's/,packet_type(ns=[[0-9]]*,id=[[0-9]]*),/,/;s/,eth(),/,/;s/bytes:1596/bytes:1862/'], + [0], [dnl +recirc_id(),in_port(2),eth_type(0x0800),ipv4(frag=no), packets:19, bytes:1862, used:0.0s, actions:3 +recirc_id(),in_port(3),eth_type(0x0800),ipv4(frag=no), packets:19, bytes:1862, used:0.0s, actions:2 +]) -AT_SKIP_IF([! ovs-appctl dpif/show-dp-features br0 2>&1 | grep "MPLS Label add: Yes" >/dev/null]) +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) +AT_CHECK([ovs-appctl revalidator/wait], [0]) + +NS_CHECK_EXEC([at_ns0], [ping -q -c 10 -i 0.1 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl +10 packets transmitted, 10 received, 0% packet loss, time 0ms +]) + +AT_CHECK([ovs-appctl dpctl/dump-flows | grep "eth_type(0x0800)" | sort | dnl + strip_recirc | strip_used | dnl + sed 's/,packet_type(ns=[[0-9]]*,id=[[0-9]]*),/,/;s/,eth(),/,/;s/bytes:2436/bytes:2842/'], + [0], [dnl +recirc_id(),in_port(2),eth_type(0x0800),ipv4(frag=no), packets:29, bytes:2842, used:0.0s, actions:3,1 +recirc_id(),in_port(3),eth_type(0x0800),ipv4(frag=no), packets:29, bytes:2842, used:0.0s, actions:2,1 +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([datapath - netdev offload software fallback]) +AT_SKIP_IF([test $HAVE_NC = no]) +OVS_TRAFFIC_VSWITCHD_START() + +AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"]) + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +dnl Test the case where only one side has all checksum and tso offload disabled. +AT_CHECK([ethtool -K ovs-p1 tso off], [0], [ignore], [ignore]) +AT_CHECK([ethtool -K ovs-p1 sg off], [0], [ignore], [ignore]) + +dnl Reinitialize. 
+AT_CHECK([ovs-vsctl del-port ovs-p1]) +AT_CHECK([ovs-vsctl add-port br0 ovs-p1]) + +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +NS_CHECK_EXEC([at_ns1], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.1 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +NETNS_DAEMONIZE([at_ns0], [nc -l 1234 > data_0], [nc1.pid]) +NETNS_DAEMONIZE([at_ns1], [nc -l 1234 > data_1], [nc2.pid]) + +AT_CHECK([dd if=/dev/urandom of=payload.bin bs=60000 count=1 2> /dev/null]) +on_exit 'rm -f payload.bin' + +NS_CHECK_EXEC([at_ns0], [nc $NC_EOF_OPT 10.1.1.2 1234 < payload.bin]) +NS_CHECK_EXEC([at_ns1], [nc $NC_EOF_OPT 10.1.1.1 1234 < payload.bin]) + +dnl Wait until transfer completes. +OVS_WAIT_WHILE([kill -0 $(cat nc1.pid) $(cat nc2.pid)]) + +AT_CHECK([diff -q payload.bin data_0], [0]) +AT_CHECK([diff -q payload.bin data_1], [0]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([datapath - Neighbor Discovery with loose match]) +OVS_TRAFFIC_VSWITCHD_START() + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "2001::1:0:392/64", 36:b1:ee:7c:01:03) +ADD_VETH(p1, at_ns1, br0, "2001::1:0:9/64", 36:b1:ee:7c:01:02) + +dnl Set up flows for moving icmp ND Solicit around. This should be the +dnl same for the other ND types. 
+AT_DATA([flows.txt], [dnl +table=0 priority=95 icmp6,icmp_type=136,nd_target=2001::1:0:9 actions=resubmit(,10) +table=0 priority=95 icmp6,icmp_type=136,nd_target=2001::1:0:392 actions=resubmit(,10) +table=0 priority=65 actions=resubmit(,20) +table=10 actions=NORMAL +table=20 actions=drop +]) +AT_CHECK([ovs-ofctl del-flows br0]) +AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) + +m4_define([ND_NS_PKT], [m4_join([,], + [eth_src=36:b1:ee:7c:01:03,eth_dst=36:b1:ee:7c:01:02,eth_type=0x86dd], + [ipv6_src=fe80::f816:3eff:fe04:6604,ipv6_dst=fe80::f816:3eff:fea7:dd0e], + [nw_proto=58,nw_ttl=255,nw_frag=no], + [icmpv6_type=136,icmpv6_code=0], + [nd_options_type=2,nd_tll=36:b1:ee:7c:01:03])]) + +dnl Send a mismatching neighbor discovery. +NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + $(ovs-ofctl compose-packet --bare 'ND_NS_PKT,nd_target=3000::1')], + [0], [ignore]) + +dnl Send a matching neighbor discovery. +NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + $(ovs-ofctl compose-packet --bare 'ND_NS_PKT,nd_target=2001::1:0:392')], + [0], [ignore]) + +AT_CHECK([ovs-appctl dpctl/dump-flows | strip_stats | strip_used | dnl + strip_key32 | strip_ptype | strip_eth | strip_recirc | dnl + grep ",nd" | sort], [0], [dnl +recirc_id(),in_port(2),eth(src=36:b1:ee:7c:01:03,dst=36:b1:ee:7c:01:02),eth_type(0x86dd),ipv6(proto=58,frag=no),icmpv6(type=136),nd(target=2001::1:0:392), packets:0, bytes:0, used:never, actions:1,3 +recirc_id(),in_port(2),eth_type(0x86dd),ipv6(proto=58,frag=no),icmpv6(type=136),nd(target=3000::1), packets:0, bytes:0, used:never, actions:drop +]) + +OVS_WAIT_UNTIL([ovs-appctl dpctl/dump-flows | grep ",nd" | wc -l | grep -E ^0]) + +dnl Send a matching neighbor discovery. +NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + $(ovs-ofctl compose-packet --bare 'ND_NS_PKT,nd_target=2001::1:0:392')], + [0], [ignore]) + +dnl Send a mismatching neighbor discovery. 
+NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + $(ovs-ofctl compose-packet --bare 'ND_NS_PKT,nd_target=3000::1')], + [0], [ignore]) + +AT_CHECK([ovs-appctl dpctl/dump-flows | strip_stats | strip_used | dnl + strip_key32 | strip_ptype | strip_eth | strip_recirc | dnl + grep ",nd" | sort], [0], [dnl +recirc_id(),in_port(2),eth(src=36:b1:ee:7c:01:03,dst=36:b1:ee:7c:01:02),eth_type(0x86dd),ipv6(proto=58,frag=no),icmpv6(type=136),nd(target=2001::1:0:392), packets:0, bytes:0, used:never, actions:1,3 +recirc_id(),in_port(2),eth_type(0x86dd),ipv6(proto=58,frag=no),icmpv6(type=136),nd(target=3000::1), packets:0, bytes:0, used:never, actions:drop +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_BANNER([MPLS]) + +AT_SETUP([mpls - encap header dp-support]) +AT_SKIP_IF([test $HAVE_TCPDUMP = no]) +OVS_TRAFFIC_VSWITCHD_START() + +AT_SKIP_IF([! ovs-appctl dpif/show-dp-features br0 2>&1 | grep "MPLS Label add: Yes" >/dev/null]) + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24", 36:b1:ee:7c:01:03) +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24", 36:b1:ee:7c:01:02) + +dnl The flow will encap a mpls header to the ip packet +dnl eth/ip/icmp --> OVS --> eth/mpls/eth/ip/icmp +AT_CHECK([ovs-ofctl -Oopenflow13 add-flow br0 "table=0,priority=100,dl_type=0x0800 actions=encap(mpls),set_mpls_label:2,encap(ethernet),set_field:00:00:00:00:00:02->dl_dst,set_field:00:00:00:00:00:01->dl_src,ovs-p1"]) + +NETNS_DAEMONIZE([at_ns1], + [tcpdump -l -n -xx -U -i p1 -w p1.pcap 2>tcpdump_err], [tcpdump.pid]) +OVS_WAIT_UNTIL([grep "listening" tcpdump_err]) + +m4_define([ICMP_PKT], [m4_join([,], + [eth_src=36:b1:ee:7c:01:03,eth_dst=36:b1:ee:7c:01:02,eth_type=0x0800], + [nw_src=10.1.1.1,nw_dst=10.1.1.2], + [nw_proto=1,nw_ttl=64,nw_frag=no], + [icmp_type=8,icmp_code=0])]) + +dnl The packet is sent from p0(at_ns0) interface directed to +dnl p1(at_ns1) interface. 
+NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + $(ovs-ofctl compose-packet --bare 'ICMP_PKT')], [0], [ignore]) + +dnl Check the expected mpls encapsulated packet on the egress interface. +m4_define([MPLS_HEADER], [m4_join([,], + [eth_src=00:00:00:00:00:01,eth_dst=00:00:00:00:00:02,eth_type=0x8847], + [mpls_label=2,mpls_ttl=64,mpls_bos=1])]) + +OVS_WAIT_UNTIL([ovs-pcap p1.pcap | grep -q "m4_join([], [^], + $(ovs-ofctl compose-packet --bare 'MPLS_HEADER'), + $(ovs-ofctl compose-packet --bare 'ICMP_PKT'), [\$])"]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([mpls - encap header slow-path]) +AT_SKIP_IF([test $HAVE_TCPDUMP = no]) +OVS_TRAFFIC_VSWITCHD_START() + +AT_CHECK(ovs-appctl dpif/set-dp-features br0 add_mpls false) +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24", 36:b1:ee:7c:01:03) +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24", 36:b1:ee:7c:01:02) + +dnl The flow will encap a mpls header to the ip packet +dnl eth/ip/icmp --> OVS --> eth/mpls/eth/ip/icmp +AT_CHECK([ovs-ofctl -Oopenflow13 add-flow br0 "table=0,priority=100,dl_type=0x0800 actions=encap(mpls),set_mpls_label:2,encap(ethernet),set_field:00:00:00:00:00:02->dl_dst,set_field:00:00:00:00:00:01->dl_src,ovs-p1"]) + +NETNS_DAEMONIZE([at_ns1], + [tcpdump -l -n -xx -U -i p1 -w p1.pcap 2>tcpdump_err], [tcpdump.pid]) +OVS_WAIT_UNTIL([grep "listening" tcpdump_err]) + +m4_define([ICMP_PKT], [m4_join([,], + [eth_src=36:b1:ee:7c:01:03,eth_dst=36:b1:ee:7c:01:02,eth_type=0x0800], + [nw_src=10.1.1.1,nw_dst=10.1.1.2], + [nw_proto=1,nw_ttl=64,nw_frag=no], + [icmp_type=8,icmp_code=0])]) + +dnl The packet is sent from p0(at_ns0) interface directed to +dnl p1(at_ns1) interface. +NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + $(ovs-ofctl compose-packet --bare 'ICMP_PKT')], [0], [ignore]) + +dnl Check the expected mpls encapsulated packet on the egress interface. 
+m4_define([MPLS_HEADER], [m4_join([,], + [eth_src=00:00:00:00:00:01,eth_dst=00:00:00:00:00:02,eth_type=0x8847], + [mpls_label=2,mpls_ttl=64,mpls_bos=1])]) + +OVS_WAIT_UNTIL([ovs-pcap p1.pcap | grep -q "m4_join([], [^], + $(ovs-ofctl compose-packet --bare 'MPLS_HEADER'), + $(ovs-ofctl compose-packet --bare 'ICMP_PKT'), [\$])"]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([mpls_mc - encap header dp-support]) +AT_SKIP_IF([test $HAVE_TCPDUMP = no]) +OVS_TRAFFIC_VSWITCHD_START() + +AT_SKIP_IF([! ovs-appctl dpif/show-dp-features br0 2>&1 | grep "MPLS Label add: Yes" >/dev/null]) ADD_NAMESPACES(at_ns0, at_ns1) @@ -1958,20 +2536,29 @@ dnl The flow will encap a mpls header to the ip packet dnl eth/ip/icmp --> OVS --> eth/mpls/eth/ip/icmp AT_CHECK([ovs-ofctl -Oopenflow13 add-flow br0 "table=0,priority=100,dl_type=0x0800 actions=encap(mpls_mc),set_mpls_label:2,encap(ethernet),set_field:00:00:00:00:00:02->dl_dst,set_field:00:00:00:00:00:01->dl_src,ovs-p1"]) -rm -rf p1.pcap -NETNS_DAEMONIZE([at_ns1], [tcpdump -l -n -xx -U -i p1 > p1.pcap], [tcpdump.pid]) -sleep 1 +NETNS_DAEMONIZE([at_ns1], + [tcpdump -l -n -xx -U -i p1 -w p1.pcap 2>tcpdump_err], [tcpdump.pid]) +OVS_WAIT_UNTIL([grep "listening" tcpdump_err]) + +m4_define([ICMP_PKT], [m4_join([,], + [eth_src=36:b1:ee:7c:01:03,eth_dst=36:b1:ee:7c:01:02,eth_type=0x0800], + [nw_src=10.1.1.1,nw_dst=10.1.1.2], + [nw_proto=1,nw_ttl=64,nw_frag=no], + [icmp_type=8,icmp_code=0])]) -dnl The hex dump is a icmp packet. pkt=eth/ip/icmp dnl The packet is sent from p0(at_ns0) interface directed to -dnl p1(at_ns1) interface -NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 36 b1 ee 7c 01 02 36 b1 ee 7c 01 03 08 00 45 00 00 54 03 44 40 00 40 01 21 61 0a 01 01 01 0a 01 01 02 08 00 ef ac 7c e4 00 03 5b 2c 1f 61 00 00 00 00 50 0b 02 00 00 00 00 00 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f 30 31 32 33 34 35 36 37 > /dev/null]) +dnl p1(at_ns1) interface. 
+NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + $(ovs-ofctl compose-packet --bare 'ICMP_PKT')], [0], [ignore]) -dnl Check the expected mpls encapsulated packet on the egress interface -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0000: *0000 *0000 *0002 *0000 *0000 *0001 *8848 *0000" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0010: *2140 *36b1 *ee7c *0102 *36b1 *ee7c *0103 *0800" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0020: *4500 *0054 *0344 *4000 *4001 *2161 *0a01 *0101" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0030: *0a01 *0102 *0800 *efac *7ce4 *0003 *5b2c *1f61" 2>&1 1>/dev/null]) +dnl Check the expected mpls encapsulated packet on the egress interface. +m4_define([MPLS_HEADER], [m4_join([,], + [eth_src=00:00:00:00:00:01,eth_dst=00:00:00:00:00:02,eth_type=0x8848], + [mpls_label=2,mpls_ttl=64,mpls_bos=1])]) + +OVS_WAIT_UNTIL([ovs-pcap p1.pcap | grep -q "m4_join([], [^], + $(ovs-ofctl compose-packet --bare 'MPLS_HEADER'), + $(ovs-ofctl compose-packet --bare 'ICMP_PKT'), [\$])"]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP @@ -1990,20 +2577,29 @@ dnl The flow will encap a mpls header to the ip packet dnl eth/ip/icmp --> OVS --> eth/mpls/eth/ip/icmp AT_CHECK([ovs-ofctl -Oopenflow13 add-flow br0 "table=0,priority=100,dl_type=0x0800 actions=encap(mpls_mc),set_mpls_label:2,encap(ethernet),set_field:00:00:00:00:00:02->dl_dst,set_field:00:00:00:00:00:01->dl_src,ovs-p1"]) -rm -rf p1.pcap -NETNS_DAEMONIZE([at_ns1], [tcpdump -l -n -xx -U -i p1 > p1.pcap], [tcpdump.pid]) -sleep 1 +NETNS_DAEMONIZE([at_ns1], + [tcpdump -l -n -xx -U -i p1 -w p1.pcap 2>tcpdump_err], [tcpdump.pid]) +OVS_WAIT_UNTIL([grep "listening" tcpdump_err]) + +m4_define([ICMP_PKT], [m4_join([,], + [eth_src=36:b1:ee:7c:01:03,eth_dst=36:b1:ee:7c:01:02,eth_type=0x0800], + [nw_src=10.1.1.1,nw_dst=10.1.1.2], + [nw_proto=1,nw_ttl=64,nw_frag=no], + [icmp_type=8,icmp_code=0])]) -dnl The hex dump is a icmp packet. 
pkt=eth/ip/icmp dnl The packet is sent from p0(at_ns0) interface directed to -dnl p1(at_ns1) interface -NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 36 b1 ee 7c 01 02 36 b1 ee 7c 01 03 08 00 45 00 00 54 03 44 40 00 40 01 21 61 0a 01 01 01 0a 01 01 02 08 00 ef ac 7c e4 00 03 5b 2c 1f 61 00 00 00 00 50 0b 02 00 00 00 00 00 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f 30 31 32 33 34 35 36 37 > /dev/null]) +dnl p1(at_ns1) interface. +NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + $(ovs-ofctl compose-packet --bare 'ICMP_PKT')], [0], [ignore]) -dnl Check the expected mpls encapsulated packet on the egress interface -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0000: *0000 *0000 *0002 *0000 *0000 *0001 *8848 *0000" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0010: *2140 *36b1 *ee7c *0102 *36b1 *ee7c *0103 *0800" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0020: *4500 *0054 *0344 *4000 *4001 *2161 *0a01 *0101" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0030: *0a01 *0102 *0800 *efac *7ce4 *0003 *5b2c *1f61" 2>&1 1>/dev/null]) +dnl Check the expected mpls encapsulated packet on the egress interface. 
+m4_define([MPLS_HEADER], [m4_join([,], + [eth_src=00:00:00:00:00:01,eth_dst=00:00:00:00:00:02,eth_type=0x8848], + [mpls_label=2,mpls_ttl=64,mpls_bos=1])]) + +OVS_WAIT_UNTIL([ovs-pcap p1.pcap | grep -q "m4_join([], [^], + $(ovs-ofctl compose-packet --bare 'MPLS_HEADER'), + $(ovs-ofctl compose-packet --bare 'ICMP_PKT'), [\$])"]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP @@ -2024,24 +2620,30 @@ dnl eth/mpls/eth/ip/icmp --> OVS --> eth/ip/icmp AT_CHECK([ovs-ofctl -Oopenflow13 add-flow br0 "table=0,priority=100,dl_type=0x8847,mpls_label=2 actions=decap(),decap(packet_type(ns=0,type=0)),ovs-p1"]) -rm -rf p1.pcap -NETNS_DAEMONIZE([at_ns1], [tcpdump -l -n -xx -U -i p1 > p1.pcap], [tcpdump.pid]) -sleep 1 +NETNS_DAEMONIZE([at_ns1], + [tcpdump -l -n -xx -U -i p1 -w p1.pcap 2>tcpdump_err], [tcpdump.pid]) +OVS_WAIT_UNTIL([grep "listening" tcpdump_err]) -dnl The hex dump is an mpls packet encapsulating ethernet packet. pkt=eth/mpls/eth/ip/icmp -dnl The packet is sent from p0(at_ns0) interface directed to -dnl p1(at_ns1) interface -NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 00 00 00 00 00 02 00 00 00 00 00 01 88 47 00 00 21 40 36 b1 ee 7c 01 02 36 b1 ee 7c 01 03 08 00 45 00 00 54 03 44 40 00 40 01 21 61 0a 01 01 01 0a 01 01 02 08 00 ef ac 7c e4 00 03 5b 2c 1f 61 00 00 00 00 50 0b 02 00 00 00 00 00 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f 30 31 32 33 34 35 36 37 > /dev/null]) +m4_define([MPLS_HEADER], [m4_join([,], + [eth_src=00:00:00:00:00:01,eth_dst=00:00:00:00:00:02,eth_type=0x8847], + [mpls_label=2,mpls_ttl=64,mpls_bos=1])]) + +m4_define([ICMP_PKT], [m4_join([,], + [eth_src=36:b1:ee:7c:01:03,eth_dst=36:b1:ee:7c:01:02,eth_type=0x0800], + [nw_src=10.1.1.1,nw_dst=10.1.1.2], + [nw_proto=1,nw_ttl=64,nw_frag=no], + [icmp_type=8,icmp_code=0])]) -dnl Check the expected decapsulated on the egress interface -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0000: *36b1 *ee7c *0102 *36b1 *ee7c *0103 *0800 *4500" 2>&1 1>/dev/null]) 
-OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0010: *0054 *0344 *4000 *4001 *2161 *0a01 *0101 *0a01" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0020: *0102 *0800 *efac *7ce4 *0003 *5b2c *1f61 *0000" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0030: *0000 *500b *0200 *0000 *0000 *1011 *1213 *1415" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0040: *1617 *1819 *1a1b *1c1d *1e1f *2021 *2223 *2425" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0050: *2627 *2829 *2a2b *2c2d *2e2f *3031 *3233 *3435" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0060: *3637" 2>&1 1>/dev/null]) +dnl The packet is an eth/mpls/eth/ip/icmp sent from p0(at_ns0) interface +dnl directed to p1(at_ns1) interface. +NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + "$(ovs-ofctl compose-packet --bare 'MPLS_HEADER')" \ + "$(ovs-ofctl compose-packet --bare 'ICMP_PKT')"], + [0], [ignore]) +dnl Check the expected decapsulated on the egress interface. +OVS_WAIT_UNTIL([ovs-pcap p1.pcap | grep -q \ + "^$(ovs-ofctl compose-packet --bare 'ICMP_PKT')\$"]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP @@ -2061,28 +2663,138 @@ dnl eth/mpls/eth/ip/icmp --> OVS --> eth/ip/icmp AT_CHECK([ovs-ofctl -Oopenflow13 add-flow br0 "table=0,priority=100,dl_type=0x8847,mpls_label=2 actions=decap(),decap(packet_type(ns=0,type=0)),ovs-p1"]) -rm -rf p1.pcap -NETNS_DAEMONIZE([at_ns1], [tcpdump -l -n -xx -U -i p1 > p1.pcap], [tcpdump.pid]) -sleep 1 +NETNS_DAEMONIZE([at_ns1], + [tcpdump -l -n -xx -U -i p1 -w p1.pcap 2>tcpdump_err], [tcpdump.pid]) +OVS_WAIT_UNTIL([grep "listening" tcpdump_err]) -dnl The hex dump is an mpls packet encapsulating ethernet packet. 
pkt=eth/mpls/eth/ip/icmp -dnl The packet is sent from p0(at_ns0) interface directed to -dnl p1(at_ns1) interface -NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 00 00 00 00 00 02 00 00 00 00 00 01 88 47 00 00 21 40 36 b1 ee 7c 01 02 36 b1 ee 7c 01 03 08 00 45 00 00 54 03 44 40 00 40 01 21 61 0a 01 01 01 0a 01 01 02 08 00 ef ac 7c e4 00 03 5b 2c 1f 61 00 00 00 00 50 0b 02 00 00 00 00 00 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f 30 31 32 33 34 35 36 37 > /dev/null]) +m4_define([MPLS_HEADER], [m4_join([,], + [eth_src=00:00:00:00:00:01,eth_dst=00:00:00:00:00:02,eth_type=0x8847], + [mpls_label=2,mpls_ttl=64,mpls_bos=1])]) + +m4_define([ICMP_PKT], [m4_join([,], + [eth_src=36:b1:ee:7c:01:03,eth_dst=36:b1:ee:7c:01:02,eth_type=0x0800], + [nw_src=10.1.1.1,nw_dst=10.1.1.2], + [nw_proto=1,nw_ttl=64,nw_frag=no], + [icmp_type=8,icmp_code=0])]) + +dnl The packet is an eth/mpls/eth/ip/icmp sent from p0(at_ns0) interface +dnl directed to p1(at_ns1) interface. +NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + "$(ovs-ofctl compose-packet --bare 'MPLS_HEADER')" \ + "$(ovs-ofctl compose-packet --bare 'ICMP_PKT')"], + [0], [ignore]) + +dnl Check the expected decapsulated on the egress interface. 
+OVS_WAIT_UNTIL([ovs-pcap p1.pcap | grep -q \ + "^$(ovs-ofctl compose-packet --bare 'ICMP_PKT')\$"]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_BANNER([QoS]) -dnl Check the expected decapsulated on the egress interface -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0000: *36b1 *ee7c *0102 *36b1 *ee7c *0103 *0800 *4500" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0010: *0054 *0344 *4000 *4001 *2161 *0a01 *0101 *0a01" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0020: *0102 *0800 *efac *7ce4 *0003 *5b2c *1f61 *0000" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0030: *0000 *500b *0200 *0000 *0000 *1011 *1213 *1415" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0040: *1617 *1819 *1a1b *1c1d *1e1f *2021 *2223 *2425" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0050: *2627 *2829 *2a2b *2c2d *2e2f *3031 *3233 *3435" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0060: *3637" 2>&1 1>/dev/null]) +AT_SETUP([QoS - basic configuration]) +OVS_CHECK_TC_QDISC() +AT_SKIP_IF([test $HAVE_ETHTOOL = "no"]) +OVS_TRAFFIC_VSWITCHD_START() + +AT_CHECK([ip tuntap add ovs-tap0 mode tap]) +on_exit 'ip link del ovs-tap0' +AT_CHECK([ip tuntap add ovs-tap1 mode tap]) +on_exit 'ip link del ovs-tap1' + +dnl Set maximum link speed to 5Gb. +AT_CHECK([ethtool -s ovs-tap0 speed 5000 duplex full]) +AT_CHECK([ip link set dev ovs-tap0 up]) +AT_CHECK([ethtool -s ovs-tap1 speed 5000 duplex full]) +AT_CHECK([ip link set dev ovs-tap1 up]) + +AT_CHECK([ovs-vsctl add-port br0 ovs-tap0 -- set int ovs-tap0 type=tap]) +AT_CHECK([ovs-vsctl add-port br0 ovs-tap1 -- set int ovs-tap1 type=tap]) + +dnl Adding a custom qdisc to ovs-tap1, ovs-tap0 will have the default qdisc. +AT_CHECK([tc qdisc add dev ovs-tap1 root noqueue]) +AT_CHECK([tc qdisc show dev ovs-tap1 | grep -q noqueue]) + +dnl Configure the same QoS for both ports: +dnl queue0 uses fixed max-rate. +dnl queue1 relies on underlying link speed. 
+AT_CHECK([ovs-vsctl dnl + -- --id=@queue0 create queue dnl + other_config:min-rate=2000000 other_config:max-rate=3000000 dnl + other_config:burst=3000000 dnl + -- --id=@queue1 create queue dnl + other_config:min-rate=4000000 other_config:burst=4000000 dnl + -- --id=@qos create qos dnl + type=linux-htb queues:0=@queue0 dnl + queues:1=@queue1 -- dnl + -- set port ovs-tap0 qos=@qos -- set port ovs-tap1 qos=@qos], + [ignore], [ignore]) + +dnl Wait for qdiscs to be applied. +OVS_WAIT_UNTIL([tc qdisc show dev ovs-tap0 | grep -q htb]) +OVS_WAIT_UNTIL([tc qdisc show dev ovs-tap1 | grep -q htb]) + +dnl Check the configuration. +m4_define([HTB_CONF0], [rate 2Mbit ceil 3Mbit burst 375000b cburst 375000b]) +m4_define([HTB_CONF1], [rate 4Mbit ceil 5Gbit burst 500000b cburst 500000b]) +AT_CHECK([tc class show dev ovs-tap0 | grep -q 'class htb .* HTB_CONF0']) +AT_CHECK([tc class show dev ovs-tap0 | grep -q 'class htb .* HTB_CONF1']) +AT_CHECK([tc class show dev ovs-tap1 | grep -q 'class htb .* HTB_CONF0']) +AT_CHECK([tc class show dev ovs-tap1 | grep -q 'class htb .* HTB_CONF1']) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([QoS - 64bit]) +OVS_CHECK_TC_QDISC() +AT_SKIP_IF([test $HAVE_TCA_HTB_RATE64 = no]) +OVS_TRAFFIC_VSWITCHD_START() +ADD_NAMESPACES(at_ns0, at_ns1) +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +dnl Configure the QoS with rates that require 64bits, i.e: > 34Gbps. 
+AT_CHECK([ovs-vsctl set port ovs-p0 qos=@qos -- set port ovs-p1 qos=@qos dnl + -- --id=@qos create qos dnl + type=linux-htb other-config:max-rate=50000000000 queues:0=@queue dnl + -- --id=@queue create queue dnl + other_config:min-rate=40000000000 other_config:max-rate=50000000000 dnl + other_config:burst=5000000], + [ignore], [ignore]) + +OVS_WAIT_UNTIL([tc qdisc show dev ovs-p0 | grep -q htb]) +OVS_WAIT_UNTIL([tc qdisc show dev ovs-p1 | grep -q htb]) + +m4_define([HTB_CONF], [rate 40Gbit ceil 50Gbit burst 620000b cburst 618750b]) +AT_CHECK([tc class show dev ovs-p0 | grep -q 'class htb .* HTB_CONF']) +AT_CHECK([tc class show dev ovs-p1 | grep -q 'class htb .* HTB_CONF']) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([Ingress Policing - 64-bit]) +OVS_CHECK_TC_QDISC() +AT_SKIP_IF([test $HAVE_TCA_POLICE_PKTRATE64 = no]) +OVS_TRAFFIC_VSWITCHD_START() +ADD_NAMESPACES(ns0) +ADD_VETH(p0, ns0, br0, "10.1.1.1/24") + +AT_CHECK([ovs-vsctl set interface ovs-p0 ingress_policing_rate=50000000]) +AT_CHECK([ovs-vsctl set interface ovs-p0 ingress_policing_burst=400000]) + +AT_CHECK([tc -o -s -d filter show dev ovs-p0 ingress | + sed -n 's/.*\(rate [[0-9]]*[[a-zA-Z]]* burst [[0-9]]*[[a-zA-Z]]*\).*/\1/; T; p; q'], + [0],[dnl +rate 50Gbit burst 74500000b +]) + +AT_CHECK([tc -s -d filter show dev ovs-p0 ingress | + grep -E "basic|matchall" > /dev/null], [0]) +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + AT_BANNER([conntrack]) AT_SETUP([conntrack - controller]) @@ -2194,8 +2906,9 @@ udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10. 
OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP -AT_SETUP([conntrack - ct flush by 5-tuple]) +AT_SETUP([conntrack - ct flush]) CHECK_CONNTRACK() +CHECK_CONNTRACK_SCTP() OVS_TRAFFIC_VSWITCHD_START() ADD_NAMESPACES(at_ns0, at_ns1) @@ -2206,55 +2919,280 @@ ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") AT_DATA([flows.txt], [dnl priority=1,action=drop priority=10,arp,action=normal -priority=100,in_port=1,udp,action=ct(commit),2 -priority=100,in_port=2,udp,action=ct(zone=5,commit),1 -priority=100,in_port=1,icmp,action=ct(commit),2 -priority=100,in_port=2,icmp,action=ct(zone=5,commit),1 +priority=100,in_port=1,ip,action=ct(commit,exec(set_field:0xaa->ct_mark)),2 +priority=100,in_port=2,ip,action=ct(zone=5,commit,exec(set_field:0xaa00000000->ct_label)),1 ]) AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) +dp=$(ovs-appctl dpctl/dump-dps) + +m4_foreach([FLUSH_CMD], [[ovs-appctl dpctl/flush-conntrack], + [ovs-appctl dpctl/flush-conntrack $dp], + [ovs-ofctl ct-flush br0]], [ +AS_BOX([Testing with FLUSH_CMD]) dnl Test UDP from port 1 AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=10\.1\.1\.1,"], [], [dnl -udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1) +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 ]) -AT_CHECK([ovs-appctl dpctl/flush-conntrack 'ct_nw_src=10.1.1.2,ct_nw_dst=10.1.1.1,ct_nw_proto=17,ct_tp_src=2,ct_tp_dst=1']) +AT_CHECK([FLUSH_CMD 'ct_nw_src=10.1.1.2,ct_nw_dst=10.1.1.1,ct_nw_proto=17,ct_tp_src=2,ct_tp_dst=1']) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=10\.1\.1\.1,"], [1]) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=10\.1\.1\.1,"], [1], [dnl -]) dnl Test UDP from port 2 AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 
packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 actions=resubmit(,0)"]) AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=10\.1\.1\.2,"], [0], [dnl -udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 ]) -AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=5 'ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2,ct_nw_proto=17,ct_tp_src=1,ct_tp_dst=2']) +AT_CHECK([FLUSH_CMD zone=5 'ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2,ct_nw_proto=17,ct_tp_src=1,ct_tp_dst=2']) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2)], [0], [dnl -]) +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2)], [0]) dnl Test ICMP traffic -NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -w 2 10.1.1.1 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -W 2 10.1.1.1 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=10\.1\.1\.2,"], [0], [stdout]) AT_CHECK([cat stdout | FORMAT_CT(10.1.1.1)], [0],[dnl -icmp,orig=(src=10.1.1.2,dst=10.1.1.1,id=,type=8,code=0),reply=(src=10.1.1.1,dst=10.1.1.2,id=,type=0,code=0),zone=5 +icmp,orig=(src=10.1.1.2,dst=10.1.1.1,id=,type=8,code=0),reply=(src=10.1.1.1,dst=10.1.1.2,id=,type=0,code=0),zone=5,labels=0xaa00000000 ]) ICMP_ID=`cat stdout | cut -d ',' -f4 | cut -d '=' -f2` ICMP_TUPLE=ct_nw_src=10.1.1.2,ct_nw_dst=10.1.1.1,ct_nw_proto=1,icmp_id=$ICMP_ID,icmp_type=8,icmp_code=0 -AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=5 $ICMP_TUPLE]) +AT_CHECK([FLUSH_CMD zone=5 $ICMP_TUPLE]) AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=10\.1\.1\.2,"], [1], [dnl ]) +dnl Test UDP from port 1 and 2, partial flush by src port +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 
packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 actions=resubmit(,0)"]) + + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1," | sort], [0], [dnl +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 +]) + +AT_CHECK([FLUSH_CMD 'ct_nw_proto=17,ct_tp_src=1']) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [0], [dnl +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 +]) + +AT_CHECK([FLUSH_CMD 'ct_nw_proto=17,ct_tp_src=2']) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [1]) + +dnl Test UDP from port 1 and 2, partial flush by dst port +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 actions=resubmit(,0)"]) + + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1," | sort], [0], [dnl +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 +]) + +AT_CHECK([FLUSH_CMD 'ct_nw_proto=17,ct_tp_dst=2']) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [0], [dnl 
+udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 +]) + +AT_CHECK([FLUSH_CMD 'ct_nw_proto=17,ct_tp_dst=1']) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [1]) + +dnl Test UDP from port 1 and 2, partial flush by src address +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 actions=resubmit(,0)"]) + + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1," | sort], [0], [dnl +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 +]) + +AT_CHECK([FLUSH_CMD 'ct_nw_src=10.1.1.1']) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [0], [dnl +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 +]) + +AT_CHECK([FLUSH_CMD 'ct_nw_src=10.1.1.2']) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [1]) + +dnl Test UDP from port 1 and 2, partial flush by dst address +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 actions=resubmit(,0)"]) + + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1," | sort], [0], [dnl 
+udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 +]) + +AT_CHECK([FLUSH_CMD 'ct_nw_dst=10.1.1.2']) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [0], [dnl +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 +]) + +AT_CHECK([FLUSH_CMD 'ct_nw_dst=10.1.1.1']) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [1]) + +dnl Test UDP from port 1 and 2, partial flush by src address in reply direction +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 actions=resubmit(,0)"]) + + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1," | sort], [0], [dnl +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 +]) + +AT_CHECK([FLUSH_CMD '' 'ct_nw_src=10.1.1.2']) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [0], [dnl +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 +]) + +AT_CHECK([FLUSH_CMD zone=5 '' 'ct_nw_src=10.1.1.1']) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [1]) + +dnl Test UDP from port 1 and 2, flush without arguments +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 
packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 actions=resubmit(,0)"]) + + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1," | sort], [0], [dnl +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 +]) + +AT_CHECK([FLUSH_CMD]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [1]) + +dnl Test SCTP flush based on port. +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500003400010000408464410a0101010a01010200010002000000009178f7d30100001470e18ccc00000000000a000a00000000 actions=resubmit(,0)"]) +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000950540000000a08004500003400010000408464410a0101020a010101000200010000000098f29e470100001470e18ccc00000000000a000a00000000 actions=resubmit(,0)"]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1," | sed "s/,protoinfo=.*$//" | sort], [0], [dnl +sctp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 +sctp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 +]) + +AT_CHECK([FLUSH_CMD 'ct_nw_src=10.1.1.1,ct_nw_proto=132,ct_tp_src=1,ct_tp_dst=2']) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1," | sed "s/,protoinfo=.*$//" | sort], [0], [dnl +sctp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 +]) + +AT_CHECK([FLUSH_CMD 'ct_nw_src=10.1.1.2,ct_nw_proto=132,ct_tp_src=2,ct_tp_dst=1']) 
+ +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [1]) + +dnl Test UDP from port 1 and 2, partial flush by mark and labels. +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 actions=resubmit(,0)"]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1," | sort], [0], [dnl +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 +]) + +AT_CHECK([FLUSH_CMD mark=0xaa]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [0], [dnl +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 +]) + +AT_CHECK([FLUSH_CMD labels=0xaa00000000]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [1]) + +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 actions=resubmit(,0)"]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1," | sort], [0], [dnl +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 +]) + +AT_CHECK([FLUSH_CMD mark=2/2]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [0], 
[dnl +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 +]) + +AT_CHECK([FLUSH_CMD labels=0x0200000000/0x0200000000]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [1]) + +dnl Test flush with invalid arguments. + +AT_CHECK([FLUSH_CMD zone=invalid 'ct_nw_src=10.1.1.1' 'ct_nw_dst=10.1.1.1'], [ignore], [ignore], [stderr]) +AT_CHECK([grep -q "failed to parse zone" stderr]) + +AT_CHECK([FLUSH_CMD zone=1 'ct_nw_src=10.1.1.1,invalid=invalid' 'ct_nw_dst=10.1.1.1'], [ignore], [ignore], [stderr]) +AT_CHECK([grep -q "invalid conntrack tuple field: invalid" stderr]) + +AT_CHECK([FLUSH_CMD zone=1 'ct_nw_src=invalid' 'ct_nw_dst=10.1.1.1'], [ignore], [ignore], [stderr]) +AT_CHECK([grep -q "failed to parse field ct_nw_src" stderr]) + +AT_CHECK([FLUSH_CMD zone=1 'ct_nw_src=10.1.1.1' 'ct_nw_dst=10.1.1.1' invalid], [ignore], [ignore], [stderr]) +AT_CHECK([grep -q "invalid arguments" stderr]) + +AT_CHECK([FLUSH_CMD zone=1 mark=1 labels=1 'ct_nw_src=10.1.1.1' 'ct_nw_dst=10.1.1.1' invalid invalid], [ignore], [ignore], [stderr]) +AT_CHECK([grep -q "command takes at most 6 arguments" stderr]) + +AT_CHECK([FLUSH_CMD mark=invalid], [ignore], [ignore], [stderr]) +AT_CHECK([grep -q "failed to parse mark" stderr]) + +AT_CHECK([FLUSH_CMD labels=invalid], [ignore], [ignore], [stderr]) +AT_CHECK([grep -q "failed to parse labels" stderr]) + +dnl Test UDP from port 1 and 2, partial flush by zone. 
+AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 actions=resubmit(,0)"]) + + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1," | sort], [0], [dnl +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 +]) + +AT_CHECK([FLUSH_CMD zone=5]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [0], [dnl +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),mark=170 +]) + +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 actions=resubmit(,0)"]) + +AT_CHECK([FLUSH_CMD zone=0]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [0], [dnl +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5,labels=0xaa00000000 +]) + +AT_CHECK([FLUSH_CMD]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1,"], [1]) +]) + OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP @@ -2279,7 +3217,7 @@ priority=100,in_port=2,icmp,ct_state=+trk+est,action=1 AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) dnl Pings from ns0->ns1 should work fine. 
-NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -2287,7 +3225,10 @@ AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2)], [0], [dnl icmp,orig=(src=10.1.1.1,dst=10.1.1.2,id=,type=8,code=0),reply=(src=10.1.1.2,dst=10.1.1.1,id=,type=0,code=0) ]) -AT_CHECK([ovs-appctl dpctl/flush-conntrack]) +AT_CHECK([ovs-appctl dpctl/flush-conntrack 'ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2']) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2)], [0], [dnl +]) dnl Pings from ns1->ns0 should fail. NS_CHECK_EXEC([at_ns1], [ping -q -c 3 -i 0.3 -w 2 10.1.1.1 | FORMAT_PING], [0], [dnl @@ -2320,7 +3261,7 @@ priority=100,in_port=2,icmp,ct_state=+trk+est,action=1 AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) dnl Pings from ns0->ns1 should work fine. -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -2420,7 +3361,7 @@ NS_CHECK_EXEC([at_ns1], [ping6 -q -c 3 -i 0.3 -w 2 fc00::1 | FORMAT_PING], [0], ]) dnl Pings from ns0->ns1 should work fine. 
-NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -2428,6 +3369,11 @@ AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(fc00::2)], [0], [dnl icmpv6,orig=(src=fc00::1,dst=fc00::2,id=,type=128,code=0),reply=(src=fc00::2,dst=fc00::1,id=,type=129,code=0) ]) +AT_CHECK([ovs-appctl dpctl/flush-conntrack 'ct_ipv6_src=fc00::1,ct_ipv6_dst=fc00::2']) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(fc00::2)], [0], [dnl +]) + OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP @@ -2658,6 +3604,9 @@ AT_CHECK([ovs-appctl dpctl/dump-flows --names filter=in_port=ovs-p0 dnl AT_CHECK([ovs-ofctl mod-flows br0 dnl 'priority=100,ct_state=-trk,tcp,in_port="ovs-p0" actions=ct(table=0,zone=15)']) +dnl Wait for a flow flush as some datapaths (read TC) might take time to clear. +AT_CHECK([ovs-appctl revalidator/wait], [0]) + NS_CHECK_EXEC([at_ns0], [wget 10.1.1.2 -t 3 -T 1 --retry-connrefused -v -o wget0.log]) AT_CHECK([ovs-appctl dpctl/dump-flows --names filter=in_port=ovs-p0 dnl @@ -2668,6 +3617,7 @@ AT_CLEANUP AT_SETUP([conntrack - zones from other field, more tests]) CHECK_CONNTRACK() +OVS_CHECK_GITHUB_ACTION() OVS_TRAFFIC_VSWITCHD_START() ADD_NAMESPACES(at_ns0, at_ns1) @@ -2706,6 +3656,9 @@ AT_CHECK([ovs-appctl dpctl/dump-flows --names filter=in_port=ovs-p0 dnl AT_CHECK([ovs-ofctl mod-flows br0 'priority=100,ct_state=-trk,tcp,in_port="ovs-p0" actions=ct(table=0,zone=15,commit,exec(load:0xffff000f->NXM_NX_CT_LABEL[[0..31]]))']) +dnl Wait for a flow flush as some datapaths (read TC) might take time to clear. 
+AT_CHECK([ovs-appctl revalidator/wait], [0]) + NS_CHECK_EXEC([at_ns0], [wget 10.1.1.2 -t 3 -T 1 --retry-connrefused -v -o wget0.log]) AT_CHECK([ovs-appctl dpctl/dump-flows --names filter=in_port=ovs-p0 dnl @@ -2799,6 +3752,7 @@ AT_CLEANUP AT_SETUP([conntrack - multiple namespaces, internal ports]) CHECK_CONNTRACK() CHECK_CONNTRACK_LOCAL_STACK() +CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START( [set-fail-mode br0 secure -- ]) @@ -3196,7 +4150,6 @@ OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP AT_SETUP([conntrack - ICMP related to original direction]) -AT_SKIP_IF([test $HAVE_NC = no]) CHECK_CONNTRACK() OVS_TRAFFIC_VSWITCHD_START() @@ -3304,6 +4257,43 @@ AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(172.16.0.3)], [0], [dnl OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([conntrack - ICMP related NAT with single port]) +AT_SKIP_IF([test $HAVE_TCPDUMP = no]) +CHECK_CONNTRACK() +CHECK_CONNTRACK_NAT() +OVS_TRAFFIC_VSWITCHD_START() + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24", "f0:00:00:01:01:01") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24", "f0:00:00:01:01:02") + +AT_DATA([flows.txt], [dnl +table=0,ip,ct_state=-trk,actions=ct(table=0,nat) +table=0,in_port=ovs-p0,ct_state=+trk+new,udp,actions=ct(commit,nat(dst=10.1.1.2:8080)),ovs-p1 +table=0,in_port=ovs-p1,ct_state=+trk+rel+rpl,icmp,actions=ovs-p0 +]) + +AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) + +rm p0.pcap +NETNS_DAEMONIZE([at_ns0], [tcpdump -n -l -U -i p0 -w p0.pcap 2> tcpdump0_err], [tcpdump0.pid]) +OVS_WAIT_UNTIL([grep "listening" tcpdump0_err]) + +dnl Send UDP packet from 10.1.1.1:1234 to 10.1.1.240:80 +AT_CHECK([ovs-ofctl packet-out br0 "in_port=ovs-p0,packet=f00000010102f0000001010108004500002944c140004011df100a0101010a0101f004d2005000156b24646573745f756e72656163680a,actions=resubmit(,0)"]) +dnl Send "destination unreachable" response +AT_CHECK([ovs-ofctl packet-out br0 
"in_port=ovs-p1,packet=f00000010101f00000010102080045c000456a3700004001f9bc0a0101020a01010103031328000000004500002944c140004011dffe0a0101010a01010204d21f9000154cd2646573745f756e72656163680a,actions=resubmit(,0)"]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=10\.1\.1\.1," | sort], [0], [dnl +udp,orig=(src=10.1.1.1,dst=10.1.1.240,sport=1234,dport=80),reply=(src=10.1.1.2,dst=10.1.1.1,sport=8080,dport=1234) +]) + +OVS_WAIT_UNTIL([ovs-pcap p0.pcap | grep -q "f00000010101f00000010102080045c000456a3700004001f8ce0a0101f00a01010103031416000000004500002944c140004011df100a0101010a0101f004d2005000156b24646573745f756e72656163680a"]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + AT_SETUP([conntrack - IPv4 fragmentation]) CHECK_CONNTRACK() OVS_TRAFFIC_VSWITCHD_START() @@ -3328,12 +4318,12 @@ dnl Modify userspace conntrack fragmentation handling. DPCTL_MODIFY_FRAGMENTATION() dnl Ipv4 fragmentation connectivity check. -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Ipv4 larger fragmentation connectivity check. -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -3405,12 +4395,12 @@ dnl Modify userspace conntrack fragmentation handling. DPCTL_MODIFY_FRAGMENTATION() dnl Ipv4 fragmentation connectivity check. -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Ipv4 larger fragmentation connectivity check. 
-NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -3422,6 +4412,7 @@ AT_CLEANUP AT_SETUP([conntrack - IPv4 fragmentation + cvlan]) CHECK_CONNTRACK() +CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START([set Open_vSwitch . other_config:vlan-limit=0]) OVS_CHECK_8021AD() @@ -3450,22 +4441,22 @@ AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) OVS_WAIT_UNTIL([ip netns exec at_ns0 ping -c 1 10.2.2.2]) dnl Ipv4 fragmentation connectivity check. -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Ipv4 fragmentation connectivity check. (outer svlan) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.255.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.255.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Ipv4 larger fragmentation connectivity check. -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Ipv4 larger fragmentation connectivity check. 
(outer svlan) -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.255.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.255.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -3624,12 +4615,12 @@ dnl "connect: Cannot assign requested address" OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::2]) dnl Ipv6 fragmentation connectivity check. -NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -W 2 fc00::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Ipv6 larger fragmentation connectivity check. -NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -W 2 fc00::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -3706,12 +4697,12 @@ dnl "connect: Cannot assign requested address" OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::2]) dnl Ipv4 fragmentation connectivity check. -NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -w 2 fc00:1::4 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -W 2 fc00:1::4 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Ipv4 larger fragmentation connectivity check. -NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00:1::4 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -W 2 fc00:1::4 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -3749,22 +4740,22 @@ AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00:1::4]) dnl Ipv6 fragmentation connectivity check. 
-NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -w 2 fc00:1::4 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -W 2 fc00:1::4 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Ipv6 fragmentation connectivity check. (outer svlan) -NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -w 2 fc00:ffff::4 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -W 2 fc00:ffff::4 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Ipv6 larger fragmentation connectivity check. -NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00:1::4 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -W 2 fc00:1::4 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Ipv6 larger fragmentation connectivity check. (outer svlan) -NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00:ffff::4 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -W 2 fc00:ffff::4 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -3945,6 +4936,7 @@ AT_SETUP([conntrack - Fragmentation over vxlan]) OVS_CHECK_VXLAN() CHECK_CONNTRACK() CHECK_CONNTRACK_LOCAL_STACK() +CHECK_NO_TC_OFFLOAD() OVS_TRAFFIC_VSWITCHD_START() ADD_BR([br-underlay]) @@ -3975,18 +4967,18 @@ ADD_NATIVE_TUNNEL([vxlan], [at_vxlan1], [at_ns0], [172.31.1.100], [10.1.1.1/24], [id 0 dstport 4789]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the overlay with different packet sizes -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 
-i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -4035,18 +5027,18 @@ dnl "connect: Cannot assign requested address" OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::2]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl Okay, now check the overlay with different packet sizes -NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -W 2 fc00::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -W 2 fc00::2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -4159,7 +5151,7 @@ dnl The default udp_single and icmp_first timeouts are 30 seconds in dnl kernel DP, and 60 seconds in userspace DP. 
dnl Send ICMP and UDP traffic -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) @@ -4185,7 +5177,7 @@ done AT_CHECK([ovs-vsctl --may-exist add-zone-tp $DP_TYPE zone=5 udp_first=1 udp_single=1 icmp_first=1 icmp_reply=1]) dnl Send ICMP and UDP traffic -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) @@ -4203,7 +5195,7 @@ AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2)], [0], [dnl ]) dnl Re-send ICMP and UDP traffic to test conntrack cache -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) @@ -4224,7 +5216,7 @@ dnl Set the timeout policy to default again. 
AT_CHECK([ovs-vsctl del-zone-tp $DP_TYPE zone=5]) dnl Send ICMP and UDP traffic -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 actions=resubmit(,0)"]) @@ -4239,6 +5231,79 @@ udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=),reply=(src= OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([conntrack - SCTP SNAT with port range]) +CHECK_CONNTRACK() +CHECK_CONNTRACK_SCTP() +OVS_TRAFFIC_VSWITCHD_START() + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") +NS_CHECK_EXEC([at_ns0], [ip link set dev p0 address e6:66:c1:11:11:11]) +NS_CHECK_EXEC([at_ns1], [ip link set dev p1 address e6:66:c1:22:22:22]) + +dnl Allow any traffic from ns0->ns1. Only allow return traffic from ns1->ns0. +AT_DATA([flows.txt], [dnl +table=0,priority=100,in_port=1,sctp,action=ct(commit,zone=1,nat(src=10.1.1.240:34567)),controller +table=0,priority=100,in_port=2,ct_state=-trk,sctp,tp_dst=34567,action=ct(table=1,zone=1,nat) +table=0,priority=0,action=drop +table=1,priority=100,in_port=2,ct_state=+trk+rpl,ct_zone=1,sctp,action=controller +table=1,priority=0,action=drop +]) + +AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) + +AT_CAPTURE_FILE([ofctl_monitor.log]) +AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log]) + +dnl Simple SCTP association local and remote single homing +dnl Send INIT. 
+AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=e666c1222222e666c111111108004502004400004000408424300a0101010a010102d6b9303900000000c5cc426b0100002470e18ccc0001a000000affff7ae1c142000c00060005000080000004c0000004 actions=resubmit(,0)"]) +dnl Reply INIT_ACK. +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=e666c1111111e666c122222208004502012400004000408422610a0101020a0101f03039870770e18ccc97abd49a0200010425bb9dfa0001a000000a000abb90fba5000700e827a048cd1474b111490710816ec95cfc501126b200000000000000000000000000000000fa9dbb25cc8ce17000000000000000002b953b0e1d346d160a000a00a5fb90bb020087070a0101f00000000000000000000000000000000000000000393001000000000080020024fbb82eae13af8d70329bc42bb7cd7e6458d60ff1a181e9b41167c2cab54471bf0000000000000000000000000000000000000000000000000000000000000000000000000100002470e18ccc0001a000000affff7ae1c142000c00060005000080000004c00000040000000000000000000000000000000080000004c0000004 actions=resubmit(,0)"]) +dnl Send COOKIE_ECHO. +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=e666c1222222e666c1111111080045020108000040004084236c0a0101010a010102d6b9303925bb9dfaf2c860300a0000e827a048cd1474b111490710816ec95cfc501126b200000000000000000000000000000000fa9dbb25cc8ce17000000000000000002b953b0e1d346d160a000a00a5fb90bb020087070a0101f00000000000000000000000000000000000000000393001000000000080020024fbb82eae13af8d70329bc42bb7cd7e6458d60ff1a181e9b41167c2cab54471bf0000000000000000000000000000000000000000000000000000000000000000000000000100002470e18ccc0001a000000affff7ae1c142000c00060005000080000004c000000400000000000000000000000000000000 actions=resubmit(,0)"]) +dnl Reply COOKIE_ACK. +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=e666c1111111e666c122222208004502002400004000408423610a0101020a0101f03039870770e18ccc0391398b0b000004 actions=resubmit(,0)"]) +dnl Send DATA. 
+AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=e666c1222222e666c1111111080045020034000140004084243f0a0101010a010102d6b9303925bb9dfabc366345000300147ae1c1420000000000000000666f6f0a actions=resubmit(,0)"]) +dnl Reply SACK. +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=e666c1111111e666c122222208004502003042c840004084e08c0a0101020a0101f03039870770e18ccc6a990714030000107ae1c14200019ffc00000000 actions=resubmit(,0)"]) +dnl ABORT the association. The association cannot be gracefully terminated because of +dnl a small timeout in SHUTDOWN_SENT in the kernel datapath that would make the test unreliable +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=e666c1222222e666c111111108004500002400010000408464510a0101010a010102d6b9303925bb9dfae3b82c3806000004 actions=resubmit(,0)"]) + +AT_CHECK([ovs-appctl revalidator/purge], [0]) + +OVS_APP_EXIT_AND_WAIT([ovs-ofctl]) + +AT_CHECK([cat ofctl_monitor.log], [0], [dnl +NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=82 in_port=1 (via action) data_len=82 (unbuffered) +sctp,vlan_tci=0x0000,dl_src=e6:66:c1:11:11:11,dl_dst=e6:66:c1:22:22:22,nw_src=10.1.1.240,nw_dst=10.1.1.2,nw_tos=0,nw_ecn=2,nw_ttl=64,nw_frag=no,tp_src=34567,tp_dst=12345 sctp_csum:9670267b +NXT_PACKET_IN2 (xid=0x0): table_id=1 cookie=0x0 total_len=306 ct_state=est|rpl|trk|dnat,ct_zone=1,ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2,ct_nw_proto=132,ct_tp_src=54969,ct_tp_dst=12345,ip,in_port=2 (via action) data_len=306 (unbuffered) +sctp,vlan_tci=0x0000,dl_src=e6:66:c1:22:22:22,dl_dst=e6:66:c1:11:11:11,nw_src=10.1.1.2,nw_dst=10.1.1.1,nw_tos=0,nw_ecn=2,nw_ttl=64,nw_frag=no,tp_src=12345,tp_dst=54969 sctp_csum:49864886 +NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=278 in_port=1 (via action) data_len=278 (unbuffered) +sctp,vlan_tci=0x0000,dl_src=e6:66:c1:11:11:11,dl_dst=e6:66:c1:22:22:22,nw_src=10.1.1.240,nw_dst=10.1.1.2,nw_tos=0,nw_ecn=2,nw_ttl=64,nw_frag=no,tp_src=34567,tp_dst=12345 sctp_csum:8c816918 +NXT_PACKET_IN2
(xid=0x0): table_id=1 cookie=0x0 total_len=50 ct_state=est|rpl|trk|dnat,ct_zone=1,ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2,ct_nw_proto=132,ct_tp_src=54969,ct_tp_dst=12345,ip,in_port=2 (via action) data_len=50 (unbuffered) +sctp,vlan_tci=0x0000,dl_src=e6:66:c1:22:22:22,dl_dst=e6:66:c1:11:11:11,nw_src=10.1.1.2,nw_dst=10.1.1.1,nw_tos=0,nw_ecn=2,nw_ttl=64,nw_frag=no,tp_src=12345,tp_dst=54969 sctp_csum:ef4749fc +NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=66 in_port=1 (via action) data_len=66 (unbuffered) +sctp,vlan_tci=0x0000,dl_src=e6:66:c1:11:11:11,dl_dst=e6:66:c1:22:22:22,nw_src=10.1.1.240,nw_dst=10.1.1.2,nw_tos=0,nw_ecn=2,nw_ttl=64,nw_frag=no,tp_src=34567,tp_dst=12345 sctp_csum:eb2b2c17 +NXT_PACKET_IN2 (xid=0x0): table_id=1 cookie=0x0 total_len=62 ct_state=est|rpl|trk|dnat,ct_zone=1,ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2,ct_nw_proto=132,ct_tp_src=54969,ct_tp_dst=12345,ip,in_port=2 (via action) data_len=62 (unbuffered) +sctp,vlan_tci=0x0000,dl_src=e6:66:c1:22:22:22,dl_dst=e6:66:c1:11:11:11,nw_src=10.1.1.2,nw_dst=10.1.1.1,nw_tos=0,nw_ecn=2,nw_ttl=64,nw_frag=no,tp_src=12345,tp_dst=54969 sctp_csum:9b67e853 +NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=50 in_port=1 (via action) data_len=50 (unbuffered) +sctp,vlan_tci=0x0000,dl_src=e6:66:c1:11:11:11,dl_dst=e6:66:c1:22:22:22,nw_src=10.1.1.240,nw_dst=10.1.1.2,nw_tos=0,nw_ecn=0,nw_ttl=64,nw_frag=no,tp_src=34567,tp_dst=12345 sctp_csum:4bb49f65 +]) + +dnl Check the ct entry +dnl protoinfo has to be removed in order to normalize the current difference between user and kernel output +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2) | sed 's/,protoinfo=.*$//' ], [], [dnl +sctp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=),reply=(src=10.1.1.2,dst=10.1.1.240,sport=,dport=),zone=1 +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + dnl Check kernel datapath to make sure conntrack fills in L3 and L4 dnl protocol information AT_SETUP([conntrack - fragment reassembly with L3 L4 protocol information]) @@ -4417,7 +5482,7 @@ 
table=2,in_port=1,ip,ct_state=+trk+est,ct_zone=2,action=LOCAL AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) -AT_CHECK([ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +AT_CHECK([ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -4488,7 +5553,7 @@ table=4,priority=100,ip,action=output:NXM_NX_REG0[[]] AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) -AT_CHECK([ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +AT_CHECK([ping -q -c 3 -i 0.3 -W 2 10.1.1.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -4512,6 +5577,7 @@ AT_CLEANUP AT_SETUP([conntrack - limit by zone]) CHECK_CONNTRACK() +OVS_CHECK_GITHUB_ACTION() OVS_TRAFFIC_VSWITCHD_START() ADD_NAMESPACES(at_ns0, at_ns1) @@ -4522,20 +5588,20 @@ ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") AT_DATA([flows.txt], [dnl priority=1,action=drop priority=10,arp,action=normal -priority=100,in_port=1,udp,action=ct(commit),2 +priority=100,in_port=1,udp,action=ct(zone=1,commit),2 priority=100,in_port=2,udp,action=ct(zone=3,commit),1 ]) AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) -AT_CHECK([ovs-appctl dpctl/ct-set-limits default=10 zone=0,limit=5 zone=1,limit=15 zone=2,limit=3 zone=3,limit=3]) -AT_CHECK([ovs-appctl dpctl/ct-del-limits zone=1,2,4]) -AT_CHECK([ovs-appctl dpctl/ct-get-limits zone=0,1,2,3], [],[dnl +AT_CHECK([ovs-appctl dpctl/ct-set-limits default=10 zone=1,limit=5 zone=2,limit=3 zone=3,limit=3 zone=4,limit=15]) +AT_CHECK([ovs-appctl dpctl/ct-del-limits zone=2,4,5]) +AT_CHECK([ovs-appctl dpctl/ct-get-limits zone=1,2,3,4], [],[dnl default limit=10 -zone=0,limit=5,count=0 -zone=1,limit=10,count=0 +zone=1,limit=5,count=0 zone=2,limit=10,count=0 zone=3,limit=3,count=0 +zone=4,limit=10,count=0 ]) dnl Test UDP from port 1 @@ -4549,48 +5615,176 @@ AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a5 AT_CHECK([ovs-ofctl -O OpenFlow13 
packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000900080000 actions=resubmit(,0)"]) AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000a00080000 actions=resubmit(,0)"]) -AT_CHECK([ovs-appctl dpctl/ct-get-limits zone=0,1,2,3,4,5], [0], [dnl +AT_CHECK([ovs-appctl dpctl/ct-get-limits zone=1,2,3,4,5], [0], [dnl +default limit=10 +zone=1,limit=5,count=5 +zone=2,limit=10,count=0 +zone=3,limit=3,count=0 +zone=4,limit=10,count=0 +zone=5,limit=10,count=0 +]) + +dnl Test ct-get-limits for all zones +AT_CHECK([ovs-appctl dpctl/ct-get-limits], [0], [dnl +default limit=10 +zone=1,limit=5,count=5 +zone=3,limit=3,count=0 +]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=10\.1\.1\.1," | sort ], [0], [dnl +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),zone=1 +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=3),reply=(src=10.1.1.2,dst=10.1.1.1,sport=3,dport=1),zone=1 +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=4),reply=(src=10.1.1.2,dst=10.1.1.1,sport=4,dport=1),zone=1 +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=5),reply=(src=10.1.1.2,dst=10.1.1.1,sport=5,dport=1),zone=1 +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=6),reply=(src=10.1.1.2,dst=10.1.1.1,sport=6,dport=1),zone=1 +]) + +dnl Test UDP from port 2 +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4c90a0101030a0101040001000200080000 actions=resubmit(,0)"]) +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4c90a0101030a0101040001000300080000 actions=resubmit(,0)"]) +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4c90a0101030a0101040001000400080000 actions=resubmit(,0)"]) 
+AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4c90a0101030a0101040001000500080000 actions=resubmit(,0)"]) +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4c90a0101030a0101040001000600080000 actions=resubmit(,0)"]) + +AT_CHECK([ovs-appctl dpctl/ct-get-limits zone=1,3], [0], [dnl +default limit=10 +zone=1,limit=5,count=5 +zone=3,limit=3,count=3 +]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=10\.1\.1\.3," | sort ], [0], [dnl +udp,orig=(src=10.1.1.3,dst=10.1.1.4,sport=1,dport=2),reply=(src=10.1.1.4,dst=10.1.1.3,sport=2,dport=1),zone=3 +udp,orig=(src=10.1.1.3,dst=10.1.1.4,sport=1,dport=3),reply=(src=10.1.1.4,dst=10.1.1.3,sport=3,dport=1),zone=3 +udp,orig=(src=10.1.1.3,dst=10.1.1.4,sport=1,dport=4),reply=(src=10.1.1.4,dst=10.1.1.3,sport=4,dport=1),zone=3 +]) + +dnl Test ct-del-limits for default zone. + +AT_CHECK([ovs-appctl dpctl/ct-set-limits default=15 zone=4,limit=4]) +AT_CHECK([ovs-appctl dpctl/ct-get-limits zone=4], [0], [dnl +default limit=15 +zone=4,limit=4,count=0 +]) + +AT_CHECK([ovs-appctl dpctl/ct-del-limits default]) +AT_CHECK([ovs-appctl dpctl/ct-get-limits zone=4], [0], [dnl +default limit=0 +zone=4,limit=4,count=0 +]) + +AT_CHECK([ovs-appctl dpctl/ct-set-limits default=15]) +AT_CHECK([ovs-appctl dpctl/ct-get-limits zone=4], [0], [dnl +default limit=15 +zone=4,limit=4,count=0 +]) + +AT_CHECK([ovs-appctl dpctl/ct-del-limits default zone=4]) +AT_CHECK([ovs-appctl dpctl/ct-get-limits zone=4], [0], [dnl +default limit=0 +zone=4,limit=0,count=0 +]) + +dnl Test limit set via database. 
+VSCTL_ADD_DATAPATH_TABLE() + +AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=1]) +AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=3]) + +AT_CHECK([ovs-appctl dpctl/ct-set-limits default=10]) +AT_CHECK([ovs-appctl dpctl/ct-del-limits zone=3]) +AT_CHECK([ovs-appctl dpctl/ct-get-limits], [0], [dnl +default limit=10 +zone=1,limit=5,count=0 +]) + +AT_CHECK([ovs-vsctl set-zone-limit $DP_TYPE 1 3]) +AT_CHECK([ovs-vsctl set-zone-limit $DP_TYPE 3 3]) + +OVS_WAIT_UNTIL_EQUAL([ovs-appctl dpctl/ct-get-limits], [dnl +default limit=10 +zone=1,limit=3,count=0 +zone=3,limit=3,count=0]) + +for i in 2 3 4 5 6; do + packet="50540000000a50540000000908004500001c000000000011a4c90a0101030a0101040001000${i}00080000" + AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 \ + "in_port=2 packet=${packet} actions=resubmit(,0)"]) +done + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=10\.1\.1\.3," | sort ], [0], [dnl +udp,orig=(src=10.1.1.3,dst=10.1.1.4,sport=1,dport=2),reply=(src=10.1.1.4,dst=10.1.1.3,sport=2,dport=1),zone=3 +udp,orig=(src=10.1.1.3,dst=10.1.1.4,sport=1,dport=3),reply=(src=10.1.1.4,dst=10.1.1.3,sport=3,dport=1),zone=3 +udp,orig=(src=10.1.1.3,dst=10.1.1.4,sport=1,dport=4),reply=(src=10.1.1.4,dst=10.1.1.3,sport=4,dport=1),zone=3 +]) + +AT_CHECK([ovs-appctl dpctl/ct-get-limits], [0], [dnl +default limit=10 +zone=1,limit=3,count=0 +zone=3,limit=3,count=3 +]) + +AT_CHECK([ovs-vsctl del-zone-limit $DP_TYPE 3]) +OVS_WAIT_UNTIL_EQUAL([ovs-appctl dpctl/ct-get-limits], [dnl default limit=10 -zone=0,limit=5,count=5 -zone=1,limit=10,count=0 -zone=2,limit=10,count=0 -zone=3,limit=3,count=0 -zone=4,limit=10,count=0 -zone=5,limit=10,count=0 +zone=1,limit=3,count=0]) + +AT_CHECK([ovs-vsctl set-zone-limit $DP_TYPE default 5]) +OVS_WAIT_UNTIL_EQUAL([ovs-appctl dpctl/ct-get-limits], [dnl +default limit=5 +zone=1,limit=3,count=0]) + +AT_CHECK([ovs-vsctl del-zone-limit $DP_TYPE default]) +OVS_WAIT_UNTIL_EQUAL([ovs-appctl dpctl/ct-get-limits], [dnl +default limit=0 
+zone=1,limit=3,count=0]) + +dnl Try to overwrite the zone limit via dpctl command. +AT_CHECK([ovs-appctl dpctl/ct-set-limits default=15 zone=3,limit=5 zone=1,limit=5], [2], [ignore], [dnl +ovs-vswitchd: the zone limits are set via database, dnl +use 'ovs-vsctl set-zone-limit <...>' instead. (Operation not permitted) +ovs-appctl: ovs-vswitchd: server returned an error ]) -dnl Test ct-get-limits for all zones AT_CHECK([ovs-appctl dpctl/ct-get-limits], [0], [dnl -default limit=10 -zone=0,limit=5,count=5 -zone=3,limit=3,count=0 +default limit=0 +zone=1,limit=3,count=0 ]) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=10\.1\.1\.1," | sort ], [0], [dnl -udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1) -udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=3),reply=(src=10.1.1.2,dst=10.1.1.1,sport=3,dport=1) -udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=4),reply=(src=10.1.1.2,dst=10.1.1.1,sport=4,dport=1) -udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=5),reply=(src=10.1.1.2,dst=10.1.1.1,sport=5,dport=1) -udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=6),reply=(src=10.1.1.2,dst=10.1.1.1,sport=6,dport=1) +AT_CHECK([ovs-appctl dpctl/ct-del-limits zone=1], [2], [ignore], [dnl +ovs-vswitchd: the zone limits are set via database, dnl +use 'ovs-vsctl del-zone-limit <...>' instead. 
(Operation not permitted) +ovs-appctl: ovs-vswitchd: server returned an error ]) -dnl Test UDP from port 2 -AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4c90a0101030a0101040001000200080000 actions=resubmit(,0)"]) -AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4c90a0101030a0101040001000300080000 actions=resubmit(,0)"]) -AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4c90a0101030a0101040001000400080000 actions=resubmit(,0)"]) -AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4c90a0101030a0101040001000500080000 actions=resubmit(,0)"]) -AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 packet=50540000000a50540000000908004500001c000000000011a4c90a0101030a0101040001000600080000 actions=resubmit(,0)"]) +AT_CHECK([ovs-appctl dpctl/ct-get-limits], [0], [dnl +default limit=0 +zone=1,limit=3,count=0 +]) -AT_CHECK([ovs-appctl dpctl/ct-get-limits zone=0,3], [0], [dnl +AT_CHECK([ovs-vsctl del-zone-limit $DP_TYPE 1]) +AT_CHECK([ovs-vsctl set-zone-limit $DP_TYPE default 10]) +AT_CHECK([ovs-appctl dpctl/ct-get-limits], [0], [dnl default limit=10 -zone=0,limit=5,count=5 -zone=3,limit=3,count=3 ]) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=10\.1\.1\.3," | sort ], [0], [dnl -udp,orig=(src=10.1.1.3,dst=10.1.1.4,sport=1,dport=2),reply=(src=10.1.1.4,dst=10.1.1.3,sport=2,dport=1),zone=3 -udp,orig=(src=10.1.1.3,dst=10.1.1.4,sport=1,dport=3),reply=(src=10.1.1.4,dst=10.1.1.3,sport=3,dport=1),zone=3 -udp,orig=(src=10.1.1.3,dst=10.1.1.4,sport=1,dport=4),reply=(src=10.1.1.4,dst=10.1.1.3,sport=4,dport=1),zone=3 +AT_CHECK([ovs-appctl dpctl/ct-set-limits default=15 zone=1,limit=5], [2], [ignore], [dnl +ovs-vswitchd: the zone limits are set via database, dnl +use 'ovs-vsctl set-zone-limit <...>' 
instead. (Operation not permitted) +ovs-appctl: ovs-vswitchd: server returned an error +]) + +dnl Delete all zones from DB, that should remove the protection. +AT_CHECK([ovs-vsctl del-zone-limit $DP_TYPE default]) + +AT_CHECK([ovs-appctl dpctl/ct-set-limits default=15 zone=1,limit=5]) +AT_CHECK([ovs-appctl dpctl/ct-get-limits], [0], [dnl +default limit=15 +zone=1,limit=5,count=0 +]) + +AT_CHECK([ovs-appctl dpctl/ct-del-limits zone=1]) +AT_CHECK([ovs-appctl dpctl/ct-get-limits], [0], [dnl +default limit=15 ]) OVS_TRAFFIC_VSWITCHD_STOP(["dnl @@ -4733,6 +5927,156 @@ tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=),reply=(src= OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([conntrack - FTP non-standard port]) +AT_SKIP_IF([test $HAVE_FTP = no]) +CHECK_CONNTRACK() +CHECK_CONNTRACK_ALG() +OVS_TRAFFIC_VSWITCHD_START() + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +dnl Allow any traffic from ns0->ns1. Only allow nd, return traffic from ns1->ns0. +AT_DATA([flows1.txt], [dnl +table=0,priority=1,action=drop +table=0,priority=10,arp,action=normal +table=0,priority=10,icmp,action=normal +table=0,priority=100,in_port=1,tcp,action=ct(alg=ftp,commit),2 +table=0,priority=100,in_port=2,tcp,action=ct(table=1) +table=1,in_port=2,tcp,ct_state=+trk+est,action=1 +table=1,in_port=2,tcp,ct_state=+trk+rel,action=1 +]) + +dnl Similar policy but without allowing all traffic from ns0->ns1. 
+AT_DATA([flows2.txt], [dnl +table=0,priority=1,action=drop +table=0,priority=10,arp,action=normal +table=0,priority=10,icmp,action=normal + +dnl Allow outgoing TCP connections, and treat them as FTP +table=0,priority=100,in_port=1,tcp,action=ct(table=1) +table=1,in_port=1,tcp,ct_state=+trk+new,action=ct(commit,alg=ftp),2 +table=1,in_port=1,tcp,ct_state=+trk+est,action=2 + +dnl Allow incoming FTP data connections and responses to existing connections +table=0,priority=100,in_port=2,tcp,action=ct(table=1) +table=1,in_port=2,tcp,ct_state=+trk+new+rel,action=ct(commit),1 +table=1,in_port=2,tcp,ct_state=+trk+est,action=1 +table=1,in_port=2,tcp,ct_state=+trk-new+rel,action=1 +]) + +dnl flows3 is same as flows1, except no ALG is specified. +AT_DATA([flows3.txt], [dnl +table=0,priority=1,action=drop +table=0,priority=10,arp,action=normal +table=0,priority=10,icmp,action=normal +table=0,priority=100,in_port=1,tcp,action=ct(commit),2 +table=0,priority=100,in_port=2,tcp,action=ct(table=1) +table=1,in_port=2,tcp,ct_state=+trk+est,action=1 +table=1,in_port=2,tcp,ct_state=+trk+rel,action=1 +]) + +AT_CHECK([ovs-ofctl --bundle replace-flows br0 flows1.txt]) + +OVS_START_L7([at_ns0], [ftp], [11111]) +OVS_START_L7([at_ns1], [ftp], [11111]) + +dnl FTP requests from p1->p0 should fail due to network failure. +dnl Try 3 times, in 1 second intervals. +NS_CHECK_EXEC([at_ns1], [wget ftp://10.1.1.1:11111 --no-passive-ftp -t 3 -T 1 -v -o wget1.log], [4]) +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.1)], [0], [dnl +]) + +dnl FTP requests from p0->p1 should work fine. +NS_CHECK_EXEC([at_ns0], [wget ftp://10.1.1.2:11111 --no-passive-ftp -t 3 -T 1 --retry-connrefused -v -o wget0.log]) +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2)], [0], [dnl +tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=),reply=(src=10.1.1.2,dst=10.1.1.1,sport=,dport=),protoinfo=(state=),helper=ftp +]) + +dnl Try the second set of flows. 
+AT_CHECK([ovs-ofctl --bundle replace-flows br0 flows2.txt]) +AT_CHECK([ovs-appctl dpctl/flush-conntrack]) + +dnl FTP requests from p1->p0 should fail due to network failure. +dnl Try 3 times, in 1 second intervals. +NS_CHECK_EXEC([at_ns1], [wget ftp://10.1.1.1:11111 --no-passive-ftp -t 3 -T 1 -v -o wget1.log], [4]) +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.1)], [0], [dnl +]) + +dnl Active FTP requests from p0->p1 should work fine. +NS_CHECK_EXEC([at_ns0], [wget ftp://10.1.1.2:11111 --no-passive-ftp -t 3 -T 1 --retry-connrefused -v -o wget0-1.log]) +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2)], [0], [dnl +tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=),reply=(src=10.1.1.2,dst=10.1.1.1,sport=,dport=),protoinfo=(state=),helper=ftp +tcp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=,dport=),reply=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=),protoinfo=(state=) +]) + +AT_CHECK([ovs-appctl dpctl/flush-conntrack]) + +dnl Passive FTP requests from p0->p1 should work fine. +NS_CHECK_EXEC([at_ns0], [wget ftp://10.1.1.2:11111 -t 3 -T 1 --retry-connrefused -v -o wget0-2.log]) +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2)], [0], [dnl +tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=),reply=(src=10.1.1.2,dst=10.1.1.1,sport=,dport=),protoinfo=(state=),helper=ftp +]) + +dnl Try the third set of flows, without alg specifier. +AT_CHECK([ovs-ofctl --bundle replace-flows br0 flows3.txt]) +AT_CHECK([ovs-appctl dpctl/flush-conntrack]) + +dnl FTP control requests from p0->p1 should work fine, but helper will not be assigned. 
+NS_CHECK_EXEC([at_ns0], [wget ftp://10.1.1.2:11111 --no-passive-ftp -t 3 -T 1 --retry-connrefused -v -o wget0-3.log], [4]) +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2)], [0], [dnl +tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=),reply=(src=10.1.1.2,dst=10.1.1.1,sport=,dport=),protoinfo=(state=) +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([conntrack - FTP with expectation dump]) +AT_SKIP_IF([test $HAVE_FTP = no]) +CHECK_CONNTRACK() +CHECK_CONNTRACK_ALG() +CHECK_CONNTRACK_DUMP_EXPECTATIONS() +OVS_TRAFFIC_VSWITCHD_START() + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +AT_DATA([flows.txt], [dnl +table=0,priority=1,action=drop +table=0,priority=10,arp,action=normal +table=0,priority=10,icmp,action=normal +table=0,priority=100,in_port=1,tcp,action=ct(alg=ftp,commit),2 +table=0,priority=100,in_port=2,tcp,action=ct(table=1) +table=1,in_port=2,tcp,ct_state=+trk+est,action=1 +table=1,in_port=2,tcp,ct_state=+trk+rel,action=1 +]) + +AT_CHECK([ovs-ofctl --bundle replace-flows br0 flows.txt]) + +OVS_START_L7([at_ns1], [ftp]) + +dnl FTP requests from p0->p1 should work fine. +NS_CHECK_EXEC([at_ns0], [wget ftp://10.1.1.2 --no-passive-ftp -t 3 -T 1 --retry-connrefused -v -o wget0.log]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2)], [0], [dnl +tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=),reply=(src=10.1.1.2,dst=10.1.1.1,sport=,dport=),protoinfo=(state=),helper=ftp +]) + +dnl Verify that a dump with zero entries in a zone doesn't return any entry. 
+AT_CHECK([ovs-appctl dpctl/dump-conntrack-exp zone=42], [0], [dnl +]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack-exp | FORMAT_CT(10.1.1.2)], [0], [dnl +tcp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=,dport=),parent=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=) +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + AT_SETUP([conntrack - FTP over IPv6]) AT_SKIP_IF([test $HAVE_FTP = no]) CHECK_CONNTRACK() @@ -5183,11 +6527,11 @@ ADD_NAMESPACES(at_ns0, at_ns1) ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") NS_CHECK_EXEC([at_ns0], [ip link set dev p0 address 80:88:88:88:88:88]) ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") +NS_CHECK_EXEC([at_ns1], [ip link set dev p1 address 80:89:89:89:89:89]) dnl Allow any traffic from ns0->ns1. Only allow nd, return traffic from ns1->ns0. AT_DATA([flows.txt], [dnl -in_port=1,tcp,action=ct(commit,zone=1,nat(src=10.1.1.240:34568,random)),2 -in_port=2,ct_state=-trk,tcp,tp_dst=34567,action=ct(table=0,zone=1,nat) +in_port=1,tcp,action=ct(commit,zone=1,nat(src=10.1.1.240:34568)),2 in_port=2,ct_state=-trk,tcp,tp_dst=34568,action=ct(table=0,zone=1,nat) in_port=2,ct_state=+trk,ct_zone=1,tcp,action=1 dnl @@ -5211,17 +6555,28 @@ AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) dnl HTTP requests from p0->p1 should work fine. OVS_START_L7([at_ns1], [http]) -NS_CHECK_EXEC([at_ns0], [wget 10.1.1.2 -t 1 -T 1 --retry-connrefused -v -o wget0.log]) + +dnl Send a valid SYN to make conntrack pick it up. +dnl The source port used is 123 to prevent unwanted reuse in the next HTTP request. 
+syn_pkt=$(ovs-ofctl compose-packet --bare "eth_src=80:88:88:88:88:88,eth_dst=80:89:89:89:89:89,\ + dl_type=0x0800,nw_src=10.1.1.1,nw_dst=10.1.1.2,nw_proto=6,nw_ttl=64,nw_frag=no,tcp_flags=syn,\ + tcp_src=123,tcp_dst=80") +AT_CHECK([ovs-ofctl packet-out br0 "packet=${syn_pkt} actions=ct(commit,zone=1,nat(src=10.1.1.240:34568))"]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2) | uniq], [0], [dnl +tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=),reply=(src=10.1.1.2,dst=10.1.1.240,sport=,dport=),zone=1,protoinfo=(state=) +]) NS_CHECK_EXEC([at_ns0], [wget 10.1.1.2 -t 1 -T 1 --retry-connrefused -v -o wget0.log], [4]) -AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2) | sed -e 's/dst=10.1.1.2[[45]][[0-9]]/dst=10.1.1.2XX/' | uniq], [0], [dnl -tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=),reply=(src=10.1.1.2,dst=10.1.1.2XX,sport=,dport=),zone=1,protoinfo=(state=) +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2) | uniq], [0], [dnl +tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=),reply=(src=10.1.1.2,dst=10.1.1.240,sport=,dport=),zone=1,protoinfo=(state=) ]) OVS_TRAFFIC_VSWITCHD_STOP(["dnl /Unable to NAT due to tuple space exhaustion - if DoS attack, use firewalling and\/or zone partitioning./d -/Dropped .* log messages in last .* seconds \(most recently, .* seconds ago\) due to excessive rate/d"]) +/Dropped .* log messages in last .* seconds \(most recently, .* seconds ago\) due to excessive rate/d +/|WARN|.* execute ct.* failed/d"]) AT_CLEANUP AT_SETUP([conntrack - more complex SNAT]) @@ -5512,7 +6867,7 @@ table=10 priority=0 action=drop AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) rm p0.pcap -OVS_DAEMONIZE([tcpdump -U -i ovs-p0 -w p0.pcap], [tcpdump.pid]) +OVS_DAEMONIZE([tcpdump -n -U -i ovs-p0 -w p0.pcap], [tcpdump.pid]) sleep 1 dnl UDP packets from ns0->ns1 should solicit "destination unreachable" response. 
@@ -5536,7 +6891,96 @@ AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2) | sed -e 's/dst= udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=),reply=(src=10.1.1.2,dst=10.1.1.2XX,sport=,dport=),mark=1 ]) -AT_CHECK([tcpdump -v "icmp" -r p0.pcap 2>/dev/null | grep -E 'wrong|bad'], [1], [ignore-nolog]) +AT_CHECK([tcpdump -n -v "icmp" -r p0.pcap 2>/dev/null | grep -E 'wrong|bad'], [1], [ignore-nolog]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([conntrack - ICMP related with SNAT]) +AT_SKIP_IF([test $HAVE_NC = no]) +AT_SKIP_IF([test $HAVE_TCPDUMP = no]) +OVS_CHECK_MIN_KERNEL(6, 7) +CHECK_CONNTRACK() +CHECK_CONNTRACK_NAT() +OVS_TRAFFIC_VSWITCHD_START() + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +NS_CHECK_EXEC([at_ns0], [ip link set dev p0 address 80:88:88:88:88:88]) +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +dnl Allow IP traffic from ns0->ns1, rewrite source IP with SNAT to 10.1.1.254. +dnl Only allow related ICMP responses back and undo NAT to restore original IP. +AT_DATA([flows.txt], [dnl +ct_state=-trk,ip actions=ct(table=0) +ct_state=+trk,ip,in_port=1 actions=ct(commit,nat(src=10.1.1.254)),2 +ct_state=+rel+trk,icmp,in_port=2,nw_dst=10.1.1.254 actions=ct(commit,table=1,nat) +dnl +dnl Handle ICMP related packets. +dnl These should match first rule with original IPs before SNAT. +dnl The second rule, which matches on the SNAT IP, shouldn't match any packets. +table=1,in_port=2,ct_state=+rel+trk,icmp,nw_src=10.1.1.2,nw_dst=10.1.1.1 action=1 +table=1,in_port=2,ct_state=+rel+trk,icmp,nw_dst=10.1.1.254 action=goto_table:2 +table=1,priority=0,action=drop +dnl +dnl Drop any ICMP related packets that incorrectly reach this table. 
+table=2,priority=0,action=drop +dnl +dnl ARP +priority=100 arp arp_op=1 action=move:OXM_OF_ARP_TPA[[]]->NXM_NX_REG2[[]],resubmit(,8),goto_table:10 +priority=10 arp action=normal +priority=0,action=drop +dnl +dnl MAC resolution table for IP in reg2, stores mac in OXM_OF_PKT_REG0 +table=8,reg2=0x0a0101f0/0xfffffff0,action=load:0x808888888888->OXM_OF_PKT_REG0[[]] +table=8,priority=0,action=load:0->OXM_OF_PKT_REG0[[]] +dnl ARP responder mac filled in at OXM_OF_PKT_REG0, or 0 for normal action. +dnl TPA IP in reg2. +dnl Swaps the fields of the ARP message to turn a query to a response. +table=10 priority=100 arp xreg0=0 action=normal +table=10 priority=10,arp,arp_op=1,action=load:2->OXM_OF_ARP_OP[[]],move:OXM_OF_ARP_SHA[[]]->OXM_OF_ARP_THA[[]],move:OXM_OF_PKT_REG0[[0..47]]->OXM_OF_ARP_SHA[[]],move:OXM_OF_ARP_SPA[[]]->OXM_OF_ARP_TPA[[]],move:NXM_NX_REG2[[]]->OXM_OF_ARP_SPA[[]],move:NXM_OF_ETH_SRC[[]]->NXM_OF_ETH_DST[[]],move:OXM_OF_PKT_REG0[[0..47]]->NXM_OF_ETH_SRC[[]],move:NXM_OF_IN_PORT[[]]->NXM_NX_REG3[[0..15]],load:0->NXM_OF_IN_PORT[[]],output:NXM_NX_REG3[[0..15]] +table=10 priority=0 action=drop +]) + +AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) + +rm p0.pcap +OVS_DAEMONIZE([tcpdump -n -U -i ovs-p0 -w p0.pcap], [tcpdump.pid]) +sleep 1 + +dnl UDP packets from ns0->ns1 should solicit "destination unreachable" response. +NS_CHECK_EXEC([at_ns0], [bash -c "echo a | nc $NC_EOF_OPT -u 10.1.1.2 10000"]) + +dnl Flush conntrack state. +dnl To verify related packets are handled exactly the same as before flushing. +AT_CHECK([ovs-appctl dpctl/flush-conntrack], [0]) + +dnl Solicit another "destination unreachable" response. +dnl To verify that after flushing, the same openflow rules are matched. 
+NS_CHECK_EXEC([at_ns0], [bash -c "echo a | nc $NC_EOF_OPT -u 10.1.1.2 10000"]) + +AT_CHECK([ovs-appctl revalidator/purge], [0]) +AT_CHECK([ovs-ofctl -O OpenFlow15 dump-flows br0 | ofctl_strip | ofctl_strip_bytes | sort | grep -v drop], [0], [dnl + n_packets=1, priority=10,arp actions=NORMAL + n_packets=2, ct_state=+rel+trk,icmp,in_port=2,nw_dst=10.1.1.254 actions=ct(commit,table=1,nat) + n_packets=2, ct_state=+trk,ip,in_port=1 actions=ct(commit,nat(src=10.1.1.254)),output:2 + n_packets=2, priority=100,arp,arp_op=1 actions=move:NXM_OF_ARP_TPA[[]]->NXM_NX_REG2[[]],resubmit(,8),goto_table:10 + n_packets=4, ct_state=-trk,ip actions=ct(table=0) + table=1, ct_state=+rel+trk,icmp,in_port=2,nw_dst=10.1.1.254 actions=goto_table:2 + table=1, n_packets=2, ct_state=+rel+trk,icmp,in_port=2,nw_src=10.1.1.2,nw_dst=10.1.1.1 actions=output:1 + table=10, n_packets=1, priority=10,arp,arp_op=1 actions=set_field:2->arp_op,move:NXM_NX_ARP_SHA[[]]->NXM_NX_ARP_THA[[]],move:OXM_OF_PKT_REG0[[0..47]]->NXM_NX_ARP_SHA[[]],move:NXM_OF_ARP_SPA[[]]->NXM_OF_ARP_TPA[[]],move:NXM_NX_REG2[[]]->NXM_OF_ARP_SPA[[]],move:NXM_OF_ETH_SRC[[]]->NXM_OF_ETH_DST[[]],move:OXM_OF_PKT_REG0[[0..47]]->NXM_OF_ETH_SRC[[]],move:NXM_OF_IN_PORT[[]]->NXM_NX_REG3[[0..15]],set_field:0->in_port,output:NXM_NX_REG3[[0..15]] + table=10, n_packets=1, priority=100,arp,reg0=0,reg1=0 actions=NORMAL + table=8, n_packets=1, priority=0 actions=set_field:0->xreg0 + table=8, n_packets=1, reg2=0xa0101f0/0xfffffff0 actions=set_field:0x808888888888->xreg0 +OFPST_FLOW reply (OF1.5): +]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2) | sed -e 's/dst=10.1.1.2[[45]][[0-9]]/dst=10.1.1.2XX/'], [0], [dnl +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=,dport=),reply=(src=10.1.1.2,dst=10.1.1.2XX,sport=,dport=) +]) + +AT_CHECK([tcpdump -n -v "icmp" -r p0.pcap 2>/dev/null | grep -E 'wrong|bad'], [1], [ignore-nolog]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP @@ -6226,7 +7670,7 @@ dnl waiting, we get occasional failures due to the following 
error: dnl "connect: Cannot assign requested address" OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::240]) -NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00::240 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -W 2 fc00::240 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -6244,6 +7688,7 @@ AT_CLEANUP AT_SETUP([conntrack - IPv6 ICMP6 Related with SNAT]) AT_SKIP_IF([test $HAVE_TCPDUMP = no]) +AT_SKIP_IF([test $HAVE_NC = no]) CHECK_CONNTRACK() CHECK_CONNTRACK_NAT() OVS_TRAFFIC_VSWITCHD_START() @@ -6277,21 +7722,58 @@ dnl waiting, we get occasional failures due to the following error: dnl "connect: Cannot assign requested address" OVS_WAIT_UNTIL([ip netns exec at_ns0 ping6 -c 1 fc00::2]) -AT_CHECK([ovs-appctl dpctl/flush-conntrack]) +AT_CHECK([ovs-appctl dpctl/flush-conntrack]) + +rm p0.pcap +OVS_DAEMONIZE([tcpdump -n -U -i ovs-p0 -w p0.pcap], [tcpdump.pid]) +sleep 1 + +dnl UDP packets from ns0->ns1 should solicit "destination unreachable" response. 
+NS_CHECK_EXEC([at_ns0], [bash -c "echo a | nc -6 $NC_EOF_OPT -u fc00::2 1"]) + +AT_CHECK([tcpdump -n -v "icmp6" -r p0.pcap 2>/dev/null | grep -E 'wrong|bad'], [1], [ignore-nolog]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(fc00::2)], [0], [dnl +udp,orig=(src=fc00::1,dst=fc00::2,sport=,dport=),reply=(src=fc00::2,dst=fc00::240,sport=,dport=) +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([conntrack - ICMPv6 related NAT with single port]) +AT_SKIP_IF([test $HAVE_TCPDUMP = no]) +CHECK_CONNTRACK() +CHECK_CONNTRACK_NAT() +OVS_TRAFFIC_VSWITCHD_START() + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "fc00::1/96", "f0:00:00:01:01:01", [], "nodad") +ADD_VETH(p1, at_ns1, br0, "fc00::2/96", "f0:00:00:01:01:02", [], "nodad") + +AT_DATA([flows.txt], [dnl +table=0,ipv6,ct_state=-trk,actions=ct(table=0,nat) +table=0,in_port=ovs-p0,ct_state=+trk+new,udp6,actions=ct(commit,nat(dst=[[fc00::2]]:8080)),ovs-p1 +table=0,in_port=ovs-p1,ct_state=+trk+rel+rpl,icmp6,actions=ovs-p0 +]) + +AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) rm p0.pcap -OVS_DAEMONIZE([tcpdump -U -i ovs-p0 -w p0.pcap], [tcpdump.pid]) -sleep 1 +NETNS_DAEMONIZE([at_ns0], [tcpdump -n -l -U -i p0 -w p0.pcap 2> tcpdump0_err], [tcpdump0.pid]) +OVS_WAIT_UNTIL([grep "listening" tcpdump0_err]) -dnl UDP packets from ns0->ns1 should solicit "destination unreachable" response. 
-NS_CHECK_EXEC([at_ns0], [bash -c "echo a | nc -6 $NC_EOF_OPT -u fc00::2 1"]) +dnl Send UDP packet from [[fc00::1]]:1234 to [[fc00::240]]:80 +AT_CHECK([ovs-ofctl packet-out br0 "in_port=ovs-p0,packet=f00000010102f0000001010186dd60066ced00151140fc000000000000000000000000000001fc00000000000000000000000000024004d20050001587d4646573745f756e72656163680a,actions=resubmit(,0)"]) +dnl Send "destination unreachable" response +AT_CHECK([ovs-ofctl packet-out br0 "in_port=ovs-p1,packet=f00000010101f0000001010286dd600733ed00453a40fc000000000000000000000000000002fc000000000000000000000000000001010428550000000060066ced00151140fc000000000000000000000000000001fc00000000000000000000000000000204d21f9000156ad2646573745f756e72656163680a,actions=resubmit(,0)"]) -AT_CHECK([tcpdump -v "icmp6" -r p0.pcap 2>/dev/null | grep -E 'wrong|bad'], [1], [ignore-nolog]) - -AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(fc00::2)], [0], [dnl -udp,orig=(src=fc00::1,dst=fc00::2,sport=,dport=),reply=(src=fc00::2,dst=fc00::240,sport=,dport=) +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=fc00::1," | sort], [0], [dnl +udp,orig=(src=fc00::1,dst=fc00::240,sport=1234,dport=80),reply=(src=fc00::2,dst=fc00::1,sport=8080,dport=1234) ]) +OVS_WAIT_UNTIL([ovs-pcap p0.pcap | grep -q "f00000010101f0000001010286dd600733ed00453a40fc000000000000000000000000000240fc000000000000000000000000000001010426170000000060066ced00151140fc000000000000000000000000000001fc00000000000000000000000000024004d20050001587d4646573745f756e72656163680a"]) + OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP @@ -6874,19 +8356,27 @@ table=2,priority=10 ct_state=+trk+est action=drop AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) -# sending icmp pkts, first and second -NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 f0 00 00 01 01 02 f0 00 00 01 01 01 08 00 45 00 00 1c 00 01 00 00 40 01 64 dc 0a 01 01 01 0a 01 01 02 08 00 f7 ff ff ff ff ff > /dev/null]) +m4_define([ICMP_PKT], [m4_join([,], + 
[eth_src=f0:00:00:01:01:01,eth_dst=f0:00:00:01:01:02,eth_type=0x0800], + [nw_src=10.1.1.1,nw_dst=10.1.1.2], + [nw_proto=1,nw_ttl=64,nw_frag=no], + [icmp_type=8,icmp_code=0])]) -NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 f0 00 00 01 01 02 f0 00 00 01 01 01 08 00 45 00 00 1c 00 01 00 00 40 01 64 dc 0a 01 01 01 0a 01 01 02 08 00 f7 ff ff ff ff ff > /dev/null]) +# Sending ICMP packets, first and second. +NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + $(ovs-ofctl compose-packet --bare 'ICMP_PKT' '')], [0], [ignore]) + +NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + $(ovs-ofctl compose-packet --bare 'ICMP_PKT' '')], [0], [ignore]) sleep 1 dnl ensure CT picked up the packet -AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1)], [0], [dnl +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2)], [0], [dnl icmp,orig=(src=10.1.1.1,dst=10.1.1.2,id=,type=8,code=0),reply=(src=10.1.1.2,dst=10.1.1.1,id=,type=0,code=0) ]) -AT_CHECK([ovs-ofctl dump-flows br0 | grep table=2, | OFPROTO_CLEAR_DURATION_IDLE], +AT_CHECK([ovs-ofctl dump-flows br0 | grep table=2, | OFPROTO_CLEAR_DURATION_IDLE | sed 's/n_bytes=70,/n_bytes=84,/'], [0], [dnl cookie=0x0, duration=, table=2, n_packets=2, n_bytes=84, idle_age=, priority=10,ct_state=+new+trk,in_port=1 actions=drop cookie=0x0, duration=, table=2, n_packets=0, n_bytes=0, idle_age=, priority=10,ct_state=+est+trk actions=drop @@ -6897,7 +8387,6 @@ AT_CLEANUP AT_SETUP([conntrack - can match and clear ct_state from outside OVS]) CHECK_CONNTRACK_LOCAL_STACK() -OVS_CHECK_TUNNEL_TSO() OVS_CHECK_GENEVE() OVS_TRAFFIC_VSWITCHD_START() @@ -6921,12 +8410,12 @@ ADD_NATIVE_TUNNEL([geneve], [ns_gnv0], [at_ns0], [172.31.1.100], [10.1.1.1/24], [vni 0]) dnl First, check the underlay -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 172.31.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet 
loss, time 0ms ]) dnl Okay, now check the overlay -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.1.1.100 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -6942,6 +8431,160 @@ recirc_id(0),in_port(br-underlay),ct_state(+trk),eth(src=f0:00:00:01:01:02,dst=f OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([conntrack - ICMP from different source related with NAT]) +AT_SKIP_IF([test $HAVE_NC = no]) +AT_SKIP_IF([test $HAVE_TCPDUMP = no]) +CHECK_CONNTRACK() +CHECK_CONNTRACK_NAT() +OVS_TRAFFIC_VSWITCHD_START() + +ADD_NAMESPACES(client, server) + +ADD_VETH(client, client, br0, "192.168.20.10/24", "00:00:00:00:20:10") +ADD_VETH(server, server, br0, "192.168.10.20/24", "00:00:00:00:10:20") + +dnl Send traffic from client to CT, do DNAT if the traffic is new, otherwise send it to server +AT_DATA([flows.txt], [dnl +table=0,ip,actions=ct(table=1,zone=42,nat) +table=1,in_port=ovs-client,ip,ct_state=+trk+new,actions=ct(commit,table=2,zone=42,nat(dst=192.168.10.20)) +table=1,icmp,ct_state=+trk+rel-rpl,actions=ct(commit,table=2,zone=42,nat) +table=1,ip,actions=resubmit(,2) +table=2,in_port=ovs-client,ip,ct_state=+trk+new,actions=output:ovs-server +table=2,in_port=ovs-client,icmp,ct_state=+trk+rel,actions=output:ovs-server +table=2,in_port=ovs-server,icmp,ct_state=+trk+rel,actions=output:ovs-client +table=2,in_port=ovs-server,ip,ct_state=+trk+rpl,actions=output:ovs-client +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +rm server.pcap +NETNS_DAEMONIZE([server], [tcpdump -n -l -U -i server -w server.pcap 2>tcpdump0_err], [tcpdump0.pid]) +OVS_WAIT_UNTIL([grep "listening" tcpdump0_err]) + +dnl Send UDP client->server +AT_CHECK([ovs-ofctl packet-out br0 "in_port=ovs-client,\ +packet=00000000102000000000201008004500001C000040000A11C762C0A8140AC0A814140001000200080000,actions=resubmit(,0)"]) +dnl Send UDP response server->client +AT_CHECK([ovs-ofctl
packet-out br0 "in_port=ovs-server,\ +packet=00000000201000000000102008004500001C000040000A11D162C0A80A14C0A8140A0002000100080000,actions=resubmit(,0)"]) +dnl Fake router sending ICMP need frag router->server +AT_CHECK([ovs-ofctl packet-out br0 "in_port=ovs-client,\ +packet=000000001020000000002000080045000038011F0000FF011140C0A81401C0A814140304F778000005784500001C000040000A11C762C0A81414C0A8140A0002000100080000,\ +actions=resubmit(,0)" +]) + +AT_CHECK([ovs-appctl revalidator/purge], [0]) +AT_CHECK([ovs-ofctl -O OpenFlow15 dump-flows br0 | ofctl_strip | sort ], [0], [dnl + n_packets=3, n_bytes=154, reset_counts ip actions=ct(table=1,zone=42,nat) + table=1, n_packets=1, n_bytes=42, reset_counts ct_state=+new+trk,ip,in_port=1 actions=ct(commit,table=2,zone=42,nat(dst=192.168.10.20)) + table=1, n_packets=1, n_bytes=42, reset_counts ip actions=resubmit(,2) + table=1, n_packets=1, n_bytes=70, reset_counts ct_state=+rel-rpl+trk,icmp actions=ct(commit,table=2,zone=42,nat) + table=2, n_packets=1, n_bytes=42, reset_counts ct_state=+new+trk,ip,in_port=1 actions=output:2 + table=2, n_packets=1, n_bytes=42, reset_counts ct_state=+rpl+trk,ip,in_port=2 actions=output:1 + table=2, n_packets=1, n_bytes=70, reset_counts ct_state=+rel+trk,icmp,in_port=1 actions=output:2 + table=2, reset_counts ct_state=+rel+trk,icmp,in_port=2 actions=output:1 +OFPST_FLOW reply (OF1.5): +]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "192.168.20.10"], [0], [dnl +udp,orig=(src=192.168.20.10,dst=192.168.20.20,sport=1,dport=2),reply=(src=192.168.10.20,dst=192.168.20.10,sport=2,dport=1),zone=42 +]) + +OVS_WAIT_UNTIL([ovs-pcap server.pcap | grep 000000001020000000002000]) + +AT_CHECK([ovs-pcap server.pcap | grep 000000001020000000002000], [0], [dnl +000000001020000000002000080045000038011f0000ff011b40c0a81401c0a80a140304f778000005784500001c000040000a11d162c0a80a14c0a8140a0002000100080000 +]) + +dnl Check the ICMP error in reply direction +AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=42]) + +rm 
client.pcap +NETNS_DAEMONIZE([client], [tcpdump -n -l -U -i client -w client.pcap 2>tcpdump1_err], [tcpdump1.pid]) +OVS_WAIT_UNTIL([grep "listening" tcpdump1_err]) + +dnl Send UDP client->server +AT_CHECK([ovs-ofctl packet-out br0 "in_port=ovs-client,\ +packet=00000000102000000000201008004500001C000040000A11C762C0A8140AC0A814140001000200080000,actions=resubmit(,0)"]) +dnl Fake router sending ICMP need frag router->client +AT_CHECK([ovs-ofctl packet-out br0 "in_port=ovs-server,\ +packet=000000002010000000002000080045000038011F0000FF01114AC0A81401C0A8140A0304F778000005784500001C000040000A11D162C0A8140AC0A80A140001000200080000,\ +actions=resubmit(,0)" +]) + +AT_CHECK([ovs-appctl revalidator/purge], [0]) +AT_CHECK([ovs-ofctl -O OpenFlow15 dump-flows br0 | ofctl_strip | sort ], [0], [dnl + n_packets=5, n_bytes=266, reset_counts ip actions=ct(table=1,zone=42,nat) + table=1, n_packets=1, n_bytes=70, reset_counts ct_state=+rel-rpl+trk,icmp actions=ct(commit,table=2,zone=42,nat) + table=1, n_packets=2, n_bytes=112, reset_counts ip actions=resubmit(,2) + table=1, n_packets=2, n_bytes=84, reset_counts ct_state=+new+trk,ip,in_port=1 actions=ct(commit,table=2,zone=42,nat(dst=192.168.10.20)) + table=2, n_packets=1, n_bytes=42, reset_counts ct_state=+rpl+trk,ip,in_port=2 actions=output:1 + table=2, n_packets=1, n_bytes=70, reset_counts ct_state=+rel+trk,icmp,in_port=1 actions=output:2 + table=2, n_packets=1, n_bytes=70, reset_counts ct_state=+rel+trk,icmp,in_port=2 actions=output:1 + table=2, n_packets=2, n_bytes=84, reset_counts ct_state=+new+trk,ip,in_port=1 actions=output:2 +OFPST_FLOW reply (OF1.5): +]) + +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "192.168.20.10"], [0], [dnl +udp,orig=(src=192.168.20.10,dst=192.168.20.20,sport=1,dport=2),reply=(src=192.168.10.20,dst=192.168.20.10,sport=2,dport=1),zone=42 +]) + +OVS_WAIT_UNTIL([ovs-pcap client.pcap | grep 000000002010000000002000]) + +AT_CHECK([ovs-pcap client.pcap | grep 000000002010000000002000], [0], [dnl 
+000000002010000000002000080045000038011f0000ff011137c0a81414c0a8140a0304f778000005784500001c000040000a11c762c0a8140ac0a814140001000200080000 +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([conntrack - Flush many conntrack entries by port]) +CHECK_CONNTRACK() +OVS_TRAFFIC_VSWITCHD_START() + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +AT_DATA([flows.txt], [dnl +priority=100,in_port=1,udp,action=ct(zone=1,commit),2 +]) + +AT_CHECK([ovs-ofctl --bundle add-flows br0 flows.txt]) + +dnl 20 packets from port 1 and 1 packet from port 2. +flow_l3="\ + eth_src=50:54:00:00:00:09,eth_dst=50:54:00:00:00:0a,dl_type=0x0800,\ + nw_src=10.1.1.1,nw_dst=10.1.1.2,nw_proto=17,nw_ttl=64,nw_frag=no" + +for i in $(seq 1 20); do + frame=$(ovs-ofctl compose-packet --bare "$flow_l3, udp_src=1,udp_dst=$i") + AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=$frame actions=resubmit(,0)"]) +done +frame=$(ovs-ofctl compose-packet --bare "$flow_l3, udp_src=2,udp_dst=1") +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 packet=$frame actions=resubmit(,0)"]) + +: > conntrack + +for i in $(seq 1 20); do + echo "udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=${i}),reply=(src=10.1.1.2,dst=10.1.1.1,sport=${i},dport=1),zone=1" >> conntrack +done +echo "udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=2,dport=1),reply=(src=10.1.1.2,dst=10.1.1.1,sport=1,dport=2),zone=1" >> conntrack + +sort conntrack > expout + +AT_CHECK([ovs-appctl dpctl/dump-conntrack zone=1 | grep -F "src=10.1.1.1," | sort ], [0], [expout]) + +dnl Check that flushing conntrack by port 1 flushes all ct for port 1 but keeps ct for port 2.
+AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=1 'ct_nw_proto=17,ct_tp_src=1']) +AT_CHECK([ovs-appctl dpctl/dump-conntrack zone=1 | grep -F "src=10.1.1.1," | sort ], [0], [dnl +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=2,dport=1),reply=(src=10.1.1.2,dst=10.1.1.1,sport=1,dport=2),zone=1 +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + AT_BANNER([IGMP]) AT_SETUP([IGMP - flood under normal action]) @@ -7024,6 +8667,7 @@ AT_CLEANUP AT_BANNER([802.1ad]) AT_SETUP([802.1ad - vlan_limit]) +OVS_CHECK_GITHUB_ACTION() OVS_TRAFFIC_VSWITCHD_START([set Open_vSwitch . other_config:vlan-limit=0]) OVS_CHECK_8021AD() @@ -7046,7 +8690,7 @@ dnl CVLAN traffic should match the flow and drop AT_CHECK([ovs-appctl revalidator/purge]) AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:vlan-limit=1]) AT_CHECK([ovs-ofctl add-flow br0 "priority=100 dl_type=0x8100 action=drop"]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 1 -w 3 10.2.2.2], [1], [ignore]) +NS_CHECK_EXEC([at_ns0], [ping -q -c 1 -W 3 10.2.2.2], [1], [ignore]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP @@ -7096,11 +8740,11 @@ AT_CHECK([ovs-ofctl --bundle add-flows br2 flows-customer-br.txt]) OVS_WAIT_UNTIL([ip netns exec at_ns0 ping -c 1 10.2.2.2]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -7152,11 +8796,11 @@ AT_CHECK([ovs-ofctl --bundle add-flows br2 flows-customer-br.txt]) OVS_WAIT_UNTIL([ip netns exec at_ns0 ping -c 1 10.2.2.2]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 
packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -7204,24 +8848,24 @@ AT_CHECK([ovs-vsctl set port ovs-p2 vlan_mode=dot1q-tunnel tag=4094 cvlans=100,2 OVS_WAIT_UNTIL([ip netns exec at_ns0 ping -c 1 10.2.2.2]) OVS_WAIT_UNTIL([ip netns exec at_ns0 ping -c 1 10.3.2.2]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.3.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.3.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.3.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.3.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) dnl CVLAN 300 is not permitted by dot1q-tunnel -NS_CHECK_EXEC([at_ns0], [ping -q -c 1 -w 3 10.4.2.2], [1], [ignore]) +NS_CHECK_EXEC([at_ns0], [ping -q -c 1 -W 3 10.4.2.2], [1], [ignore]) OVS_TRAFFIC_VSWITCHD_STOP(["/dropping VLAN \(0\|300\) packet received on dot1q-tunnel port/d"]) AT_CLEANUP @@ -7250,11 +8894,11 @@ AT_CHECK([ovs-ofctl --bundle add-flows br0 flows-br0.txt]) OVS_WAIT_UNTIL([ip netns exec at_ns0 ping -c 1 10.2.2.2]) -NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.2.2.2 
| FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) -NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -W 2 10.2.2.2 | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -7276,21 +8920,29 @@ dnl The flow will encap a nsh header to the TCP syn packet dnl eth/ip/tcp --> OVS --> eth/nsh/eth/ip/tcp AT_CHECK([ovs-ofctl -Oopenflow13 add-flow br0 "table=0,priority=100,in_port=ovs-p0,ip,actions=encap(nsh(md_type=1)),set_field:0x1234->nsh_spi,set_field:0x11223344->nsh_c1,encap(ethernet),set_field:f2:ff:00:00:00:02->dl_dst,set_field:f2:ff:00:00:00:01->dl_src,ovs-p1"]) -NETNS_DAEMONIZE([at_ns1], [tcpdump -l -n -xx -U -i p1 > p1.pcap], [tcpdump.pid]) -sleep 1 +NETNS_DAEMONIZE([at_ns1], + [tcpdump -l -n -xx -U -i p1 -w p1.pcap 2>tcpdump_err], [tcpdump.pid]) +OVS_WAIT_UNTIL([grep "listening" tcpdump_err]) -dnl The hex dump is a TCP syn packet. pkt=eth/ip/tcp -dnl The packet is sent from p0(at_ns0) interface directed to -dnl p1(at_ns1) interface -NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 f2 00 00 00 00 02 f2 00 00 00 00 01 08 00 45 00 00 28 00 01 00 00 40 06 b0 13 c0 a8 00 0a 0a 00 00 0a 04 00 08 00 00 00 00 c8 00 00 00 00 50 02 20 00 b8 5e 00 00 > /dev/null]) +m4_define([TCP_SYN_PKT], [m4_join([,], + [eth_src=f2:00:00:00:00:01,eth_dst=f2:00:00:00:00:02,eth_type=0x0800], + [nw_src=192.168.0.10,nw_dst=10.0.0.10], + [nw_proto=6,nw_ttl=64,nw_frag=no], + [tcp_src=1024,tcp_dst=2048,tcp_flags=syn])]) + +dnl Send the TCP SYN packet from p0(at_ns0) interface directed to +dnl p1(at_ns1) interface. 
+NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + $(ovs-ofctl compose-packet --bare 'TCP_SYN_PKT')], [0], [ignore]) -dnl Check the expected nsh encapsulated packet on the egress interface -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0000: *f2ff *0000 *0002 *f2ff *0000 *0001 *894f *0fc6" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0010: *0103 *0012 *34ff *1122 *3344 *0000 *0000 *0000" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0020: *0000 *0000 *0000 *f200 *0000 *0002 *f200 *0000" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0030: *0001 *0800 *4500 *0028 *0001 *0000 *4006 *b013" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0040: *c0a8 *000a *0a00 *000a *0400 *0800 *0000 *00c8" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0050: *0000 *0000 *5002 *2000 *b85e *0000" 2>&1 1>/dev/null]) +m4_define([NSH_HEADER], [m4_join([,], + [eth_src=f2:ff:00:00:00:01,eth_dst=f2:ff:00:00:00:02,eth_type=0x894f], + [nsh_ttl=63,nsh_np=3,nsh_spi=0x1234,nsh_si=255], + [nsh_mdtype=1,nsh_c1=0x11223344])]) + +OVS_WAIT_UNTIL([ovs-pcap p1.pcap | grep -q "m4_join([], [^], + $(ovs-ofctl compose-packet --bare 'NSH_HEADER'), + $(ovs-ofctl compose-packet --bare 'TCP_SYN_PKT'), [\$])"]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP @@ -7308,19 +8960,31 @@ dnl The flow will decap a nsh header which in turn carries a TCP syn packet dnl eth/nsh/eth/ip/tcp --> OVS --> eth/ip/tcp AT_CHECK([ovs-ofctl -Oopenflow13 add-flow br0 "table=0,priority=100,in_port=ovs-p0,dl_type=0x894f, actions=decap(),decap(), ovs-p1"]) -NETNS_DAEMONIZE([at_ns1], [tcpdump -l -n -xx -U -i p1 > p1.pcap], [tcpdump.pid]) -sleep 1 +NETNS_DAEMONIZE([at_ns1], + [tcpdump -l -n -xx -U -i p1 -w p1.pcap 2>tcpdump_err], [tcpdump.pid]) +OVS_WAIT_UNTIL([grep "listening" tcpdump_err]) -dnl The hex dump is NSH packet with TCP syn payload. 
pkt=eth/nsh/eth/ip/tcp -dnl The packet is sent from p0(at_ns0) interface directed to -dnl p1(at_ns1) interface -NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 f2 ff 00 00 00 02 f2 ff 00 00 00 01 89 4f 02 06 01 03 00 00 64 03 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 10 f2 00 00 00 00 02 f2 00 00 00 00 01 08 00 45 00 00 28 00 01 00 00 40 06 b0 13 c0 a8 00 0a 0a 00 00 0a 04 00 08 00 00 00 00 c8 00 00 00 00 50 02 20 00 b8 5e 00 00 > /dev/null]) +m4_define([TCP_SYN_PKT], [m4_join([,], + [eth_src=f2:00:00:00:00:01,eth_dst=f2:00:00:00:00:02,eth_type=0x0800], + [nw_src=192.168.0.10,nw_dst=10.0.0.10], + [nw_proto=6,nw_ttl=64,nw_frag=no], + [tcp_src=1024,tcp_dst=2048,tcp_flags=syn])]) + +m4_define([NSH_HEADER], [m4_join([,], + [eth_src=f2:ff:00:00:00:01,eth_dst=f2:ff:00:00:00:02,eth_type=0x894f], + [nsh_ttl=63,nsh_np=3,nsh_spi=0x1234,nsh_si=255], + [nsh_mdtype=1,nsh_c1=0x11223344])]) + +dnl Send the NSH packet with TCP SYN payload from p0(at_ns0) interface directed +dnl to p1(at_ns1) interface. 
+NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + "$(ovs-ofctl compose-packet --bare 'NSH_HEADER')" \ + "$(ovs-ofctl compose-packet --bare 'TCP_SYN_PKT')"], + [0], [ignore]) dnl Check the expected de-capsulated TCP packet on the egress interface -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0000: *f200 *0000 *0002 *f200 *0000 *0001 *0800 *4500" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0010: *0028 *0001 *0000 *4006 *b013 *c0a8 *000a *0a00" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0020: *000a *0400 *0800 *0000 *00c8 *0000 *0000 *5002" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0030: *2000 *b85e *0000" 2>&1 1>/dev/null]) +OVS_WAIT_UNTIL([ovs-pcap p1.pcap | grep -q \ + "^$(ovs-ofctl compose-packet --bare 'TCP_SYN_PKT')\$"]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP @@ -7340,22 +9004,38 @@ dnl The flow will add another NSH header with nsh_spi=0x101, nsh_si=4, dnl nsh_ttl=7 and change the md1 context AT_CHECK([ovs-ofctl -Oopenflow13 add-flow br0 "table=0,priority=100,in_port=ovs-p0,dl_type=0x894f,nsh_spi=0x100,nsh_si=0x03,actions=decap(),decap(),encap(nsh(md_type=1)),set_field:0x07->nsh_ttl,set_field:0x0101->nsh_spi,set_field:0x04->nsh_si,set_field:0x100f0e0d->nsh_c1,set_field:0x0c0b0a09->nsh_c2,set_field:0x08070605->nsh_c3,set_field:0x04030201->nsh_c4,encap(ethernet),set_field:f2:ff:00:00:00:02->dl_dst,set_field:f2:ff:00:00:00:01->dl_src,ovs-p1"]) -NETNS_DAEMONIZE([at_ns1], [tcpdump -l -n -xx -U -i p1 > p1.pcap], [tcpdump.pid]) -sleep 1 +NETNS_DAEMONIZE([at_ns1], + [tcpdump -l -n -xx -U -i p1 -w p1.pcap 2>tcpdump_err], [tcpdump.pid]) +OVS_WAIT_UNTIL([grep "listening" tcpdump_err]) -dnl The hex dump is NSH packet with TCP syn payload. 
pkt=eth/nsh/eth/ip/tcp -dnl The nsh_ttl is 8, nsh_spi is 0x100 and nsh_si is 3 -dnl The packet is sent from p0(at_ns0) interface directed to -dnl p1(at_ns1) interface -NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 f2 ff 00 00 00 02 f2 ff 00 00 00 01 89 4f 02 06 01 03 00 01 00 03 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 10 f2 00 00 00 00 02 f2 00 00 00 00 01 08 00 45 00 00 28 00 01 00 00 40 06 b0 13 c0 a8 00 0a 0a 00 00 0a 04 00 08 00 00 00 00 c8 00 00 00 00 50 02 20 00 b8 5e 00 00 > /dev/null]) +m4_define([TCP_SYN_PKT], [m4_join([,], + [eth_src=f2:00:00:00:00:01,eth_dst=f2:00:00:00:00:02,eth_type=0x0800], + [nw_src=192.168.0.10,nw_dst=10.0.0.10], + [nw_proto=6,nw_ttl=64,nw_frag=no], + [tcp_src=1024,tcp_dst=2048,tcp_flags=syn])]) + +m4_define([NSH_HEADER_1], [m4_join([,], + [eth_src=f2:ff:00:00:00:01,eth_dst=f2:ff:00:00:00:02,eth_type=0x894f], + [nsh_ttl=8,nsh_np=3,nsh_spi=0x100,nsh_si=3,nsh_mdtype=1], + [nsh_c1=0x01020304,nsh_c2=0x05060708,nsh_c3=0x090a0b0c,nsh_c4=0x0d0e0f10])]) -dnl Check the expected NSH packet with new fields in the header -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0000: *f2ff *0000 *0002 *f2ff *0000* 0001 *894f *01c6" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0010: *0103 *0001 *0104 *100f *0e0d *0c0b *0a09 *0807" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0020: *0605 *0403 *0201 *f200 *0000 *0002 *f200 *0000" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0030: *0001 *0800 *4500 *0028 *0001 *0000 *4006 *b013" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0040: *c0a8 *000a *0a00 *000a *0400 *0800 *0000 *00c8" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0050: *0000 *0000 *5002 *2000 *b85e *0000" 2>&1 1>/dev/null]) +dnl Send the NSH packet with TCP SYN payload from p0(at_ns0) interface directed +dnl to p1(at_ns1) interface. +dnl The nsh_ttl is 8, nsh_spi is 0x100 and nsh_si is 3. 
+NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + "$(ovs-ofctl compose-packet --bare 'NSH_HEADER_1')" \ + "$(ovs-ofctl compose-packet --bare 'TCP_SYN_PKT')"], + [0], [ignore]) + +m4_define([NSH_HEADER_2], [m4_join([,], + [eth_src=f2:ff:00:00:00:01,eth_dst=f2:ff:00:00:00:02,eth_type=0x894f], + [nsh_ttl=7,nsh_np=3,nsh_spi=0x101,nsh_si=4,nsh_mdtype=1], + [nsh_c1=0x100f0e0d,nsh_c2=0x0c0b0a09,nsh_c3=0x08070605,nsh_c4=0x04030201])]) + +dnl Check the expected NSH packet with new fields in the header. +OVS_WAIT_UNTIL([ovs-pcap p1.pcap | grep -q "m4_join([], [^], + $(ovs-ofctl compose-packet --bare 'NSH_HEADER_2'), + $(ovs-ofctl compose-packet --bare 'TCP_SYN_PKT'), [\$])"]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP @@ -7376,31 +9056,435 @@ dnl packet to to at_ns2. AT_CHECK([ovs-ofctl -Oopenflow13 add-flow br0 "table=0,priority=100,dl_type=0x894f,nsh_spi=0x100,nsh_si=0x02,actions=ovs-p1"]) AT_CHECK([ovs-ofctl -Oopenflow13 add-flow br0 "table=0,priority=100,dl_type=0x894f,nsh_spi=0x100,nsh_si=0x01,actions=ovs-p2"]) -NETNS_DAEMONIZE([at_ns1], [tcpdump -l -n -xx -U -i p1 > p1.pcap], [tcpdump.pid]) -NETNS_DAEMONIZE([at_ns2], [tcpdump -l -n -xx -U -i p2 > p2.pcap], [tcpdump2.pid]) -sleep 1 +NETNS_DAEMONIZE([at_ns1], + [tcpdump -l -n -xx -U -i p1 -w p1.pcap 2>tcpdump_err], [tcpdump.pid]) +OVS_WAIT_UNTIL([grep "listening" tcpdump_err]) +NETNS_DAEMONIZE([at_ns2], + [tcpdump -l -n -xx -U -i p2 -w p2.pcap 2>tcpdump2_err], [tcpdump2.pid]) +OVS_WAIT_UNTIL([grep "listening" tcpdump2_err]) + +m4_define([TCP_SYN_PKT], [m4_join([,], + [eth_src=f2:00:00:00:00:01,eth_dst=f2:00:00:00:00:02,eth_type=0x0800], + [nw_src=192.168.0.10,nw_dst=10.0.0.10], + [nw_proto=6,nw_ttl=64,nw_frag=no], + [tcp_src=1024,tcp_dst=2048,tcp_flags=syn])]) + +dnl First send packet from at_ns0 --> OVS with SPI=0x100 and SI=2. 
+m4_define([NSH_HEADER_1], [m4_join([,], + [eth_src=f2:ff:00:00:00:01,eth_dst=f2:ff:00:00:00:02,eth_type=0x894f], + [nsh_ttl=8,nsh_np=3,nsh_spi=0x100,nsh_si=2,nsh_mdtype=1], + [nsh_c1=0x01020304,nsh_c2=0x05060708,nsh_c3=0x090a0b0c,nsh_c4=0x0d0e0f10])]) + +NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 \ + "$(ovs-ofctl compose-packet --bare 'NSH_HEADER_1')" \ + "$(ovs-ofctl compose-packet --bare 'TCP_SYN_PKT')"], + [0], [ignore]) + +dnl Check for the above packet on p1 interface. +OVS_WAIT_UNTIL([ovs-pcap p1.pcap | grep -q "m4_join([], [^], + $(ovs-ofctl compose-packet --bare 'NSH_HEADER_1'), + $(ovs-ofctl compose-packet --bare 'TCP_SYN_PKT'), [\$])"]) + +dnl Send the second packet from at_ns1 --> OVS with SPI=0x100 and SI=1. +m4_define([NSH_HEADER_2], [m4_join([,], + [eth_src=f2:ff:00:00:00:01,eth_dst=f2:ff:00:00:00:02,eth_type=0x894f], + [nsh_ttl=8,nsh_np=3,nsh_spi=0x100,nsh_si=1,nsh_mdtype=1], + [nsh_c1=0x01020304,nsh_c2=0x05060708,nsh_c3=0x090a0b0c,nsh_c4=0x0d0e0f10])]) + +NS_CHECK_EXEC([at_ns1], [$PYTHON3 $srcdir/sendpkt.py p1 \ + "$(ovs-ofctl compose-packet --bare 'NSH_HEADER_2')" \ + "$(ovs-ofctl compose-packet --bare 'TCP_SYN_PKT')"], + [0], [ignore]) + +dnl Check for the above packet on p2 interface. 
+OVS_WAIT_UNTIL([ovs-pcap p2.pcap | grep -q "m4_join([], [^], + $(ovs-ofctl compose-packet --bare 'NSH_HEADER_2'), + $(ovs-ofctl compose-packet --bare 'TCP_SYN_PKT'), [\$])"]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_BANNER([local-sampling]) + +m4_define([SAMPLE_ACTION], + [sample(probability=65535,collector_set_id=$1,obs_domain_id=$2,obs_point_id=$3)]dnl +) + +AT_SETUP([psample - sanity check]) +OVS_TRAFFIC_VSWITCHD_START() +OVS_CHECK_PSAMPLE() + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +AT_CHECK([ovs-vsctl -- --id=@br0 get Bridge br0 \ + -- create Flow_Sample_Collector_Set id=1 bridge=@br0 \ + local-group-id=10 \ + -- create Flow_Sample_Collector_Set id=2 bridge=@br0 \ + local-group-id=12], + [0], [ignore]) + +AT_DATA([flows.txt], [dnl +arp actions=NORMAL +in_port=ovs-p0,ip actions=SAMPLE_ACTION(1, 2853183536, 2856341600),ovs-p1 +in_port=ovs-p1,ip actions=SAMPLE_ACTION(2, 3138396208, 3141554272),ovs-p0 +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +OVS_DAEMONIZE([ovstest test-psample > psample.out], [psample.pid]) +OVS_WAIT_UNTIL([grep -q "Listening for psample events" psample.out]) + +NS_CHECK_EXEC([at_ns0], [ping -q -c 1 10.1.1.2 | FORMAT_PING], [0], [dnl +1 packets transmitted, 1 received, 0% packet loss, time 0ms +]) + +AT_CHECK([ovs-appctl dpctl/dump-flows -m --names], [0], [stdout]) +AT_CHECK([grep -q 'actions:psample(group=10,cookie=0xaa102030aa405060),ovs-p1' stdout]) +AT_CHECK([grep -q 'actions:psample(group=12,cookie=0xbb102030bb405060),ovs-p0' stdout]) + +m4_define([SAMPLE1], [m4_join([ ], + [group_id=0xa,prob=4294967295], + [obs_domain=0xaa102030,obs_point=0xaa405060], + [.*icmp.*nw_src=10.1.1.1,nw_dst=10.1.1.2])]) + +m4_define([SAMPLE2], [m4_join([ ], + [group_id=0xc,prob=4294967295], + [obs_domain=0xbb102030,obs_point=0xbb405060], + [.*icmp.*nw_src=10.1.1.2,nw_dst=10.1.1.1])]) -dnl First send packet from at_ns0 --> OVS with SPI=0x100 and SI=2 
-NS_CHECK_EXEC([at_ns0], [$PYTHON3 $srcdir/sendpkt.py p0 f2 ff 00 00 00 02 f2 ff 00 00 00 01 89 4f 02 06 01 03 00 01 00 02 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 10 f2 00 00 00 00 02 f2 00 00 00 00 01 08 00 45 00 00 28 00 01 00 00 40 06 b0 13 c0 a8 00 0a 0a 00 00 0a 04 00 08 00 00 00 00 c8 00 00 00 00 50 02 20 00 b8 5e 00 00 > /dev/null]) +OVS_WAIT_UNTIL([grep -qE 'SAMPLE1' psample.out]) +OVS_WAIT_UNTIL([grep -qE 'SAMPLE2' psample.out]) -dnl Check for the above packet on p1 interface -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0000: *f2ff *0000 *0002 *f2ff *0000 *0001 *894f *0206" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0010: *0103 *0001 *0002 *0102 *0304 *0506 *0708 *090a" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0020: *0b0c *0d0e *0f10 *f200 *0000 *0002 *f200 *0000" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0030: *0001 *0800 *4500 *0028 *0001 *0000 *4006 *b013" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0040: *c0a8 *000a *0a00 *000a *0400 *0800 *0000 *00c8" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p1.pcap | grep -E "0x0050: *0000 *0000 *5002 *2000 *b85e *0000" 2>&1 1>/dev/null]) +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([psample - sanity check IPv6]) +OVS_TRAFFIC_VSWITCHD_START() +OVS_CHECK_PSAMPLE() + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "fc00::1/96") +ADD_VETH(p1, at_ns1, br0, "fc00::2/96") + +AT_CHECK([ovs-vsctl -- --id=@br0 get Bridge br0 \ + -- create Flow_Sample_Collector_Set id=1 bridge=@br0 \ + local-group-id=10 \ + -- create Flow_Sample_Collector_Set id=2 bridge=@br0 \ + local-group-id=12], + [0], [ignore]) + +AT_DATA([flows.txt], [dnl +priority=100,in_port=ovs-p0,ip6,icmp6,icmpv6_type=128 actions=SAMPLE_ACTION(1, 2853183536, 2856341600),ovs-p1 +priority=100,in_port=ovs-p1,ip6,icmp6,icmpv6_type=129 actions=SAMPLE_ACTION(2, 3138396208, 3141554272),ovs-p0 +priority=0 actions=NORMAL +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + 
+OVS_DAEMONIZE([ovstest test-psample > psample.out], [psample.pid]) +OVS_WAIT_UNTIL([grep -q "Listening for psample events" psample.out]) + +OVS_WAIT_UNTIL_EQUAL([ip netns exec at_ns0 ping6 -I fc00::1 -q -W 2 -c 1 fc00::2 | FORMAT_PING], [dnl +1 packets transmitted, 1 received, 0% packet loss, time 0ms]) + +AT_CHECK([ovs-appctl dpctl/dump-flows -m --names], [0], [stdout]) +AT_CHECK([grep -q 'actions:psample(group=10,cookie=0xaa102030aa405060),ovs-p1' stdout]) +AT_CHECK([grep -q 'actions:psample(group=12,cookie=0xbb102030bb405060),ovs-p0' stdout]) + +m4_define([SAMPLE1], [m4_join([ ], + [group_id=0xa,prob=4294967295], + [obs_domain=0xaa102030,obs_point=0xaa405060], + [.*icmp6.*ipv6_src=fc00::1,ipv6_dst=fc00::2])]) +m4_define([SAMPLE2], [m4_join([ ], + [group_id=0xc,prob=4294967295], + [obs_domain=0xbb102030,obs_point=0xbb405060], + [.*icmp6.*ipv6_src=fc00::2,ipv6_dst=fc00::1])]) + +OVS_WAIT_UNTIL([grep -qE 'SAMPLE1' psample.out]) +OVS_WAIT_UNTIL([grep -qE 'SAMPLE2' psample.out]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([psample - slow]) +OVS_TRAFFIC_VSWITCHD_START() +OVS_CHECK_PSAMPLE() + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +AT_CHECK([ovs-vsctl -- --id=@br0 get Bridge br0 \ + -- create Flow_Sample_Collector_Set id=1 bridge=@br0 \ + local-group-id=10 \ + -- create Flow_Sample_Collector_Set id=2 bridge=@br0 \ + local-group-id=12], + [0], [ignore]) + +AT_DATA([flows.txt], [dnl +arp actions=NORMAL +in_port=ovs-p0,ip actions=SAMPLE_ACTION(1, 2853183536, 2856341600),output(port=ovs-p1,max_len=200) +in_port=ovs-p1,ip actions=SAMPLE_ACTION(2, 3138396208, 3141554272),output(port=ovs-p0,max_len=200) +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +dnl Disable datapath truncate support to force actions to run in slow path. 
+AT_CHECK([ovs-appctl dpif/set-dp-features br0 trunc false], [0]) + +AT_CHECK([ovs-appctl ofproto/trace br0 \ + 'in_port=ovs-p0,dl_src=e4:11:22:33:44:55,dl_dst=e4:11:22:33:44:66,dl_type=0x0800,nw_src=10.1.1.1,nw_dst=10.1.1.12'], + [0], [stdout]) + +AT_CHECK([tail -3 stdout], [0], [dnl +Datapath actions: psample(group=10,cookie=0xaa102030aa405060),trunc(200),3 +This flow is handled by the userspace slow path because it: + - Uses action(s) not supported by datapath. +]) + +OVS_DAEMONIZE([ovstest test-psample > psample.out], [psample.pid]) +OVS_WAIT_UNTIL([grep -q "Listening for psample events" psample.out]) + +NS_CHECK_EXEC([at_ns0], [ping -q -c 1 10.1.1.2 | FORMAT_PING], [0], [dnl +1 packets transmitted, 1 received, 0% packet loss, time 0ms +]) + +m4_define([SAMPLE1], [m4_join([ ], + [group_id=0xa,prob=4294967295], + [obs_domain=0xaa102030,obs_point=0xaa405060], + [.*icmp.*nw_src=10.1.1.1,nw_dst=10.1.1.2])]) + +m4_define([SAMPLE2], [m4_join([ ], + [group_id=0xc,prob=4294967295], + [obs_domain=0xbb102030,obs_point=0xbb405060], + [.*icmp.*nw_src=10.1.1.2,nw_dst=10.1.1.1])]) + +AT_CHECK([grep -qE 'SAMPLE1' psample.out]) +AT_CHECK([grep -qE 'SAMPLE2' psample.out]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([psample - slow with probability]) +OVS_TRAFFIC_VSWITCHD_START() +OVS_CHECK_PSAMPLE() + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +AT_CHECK([ovs-vsctl -- --id=@br0 get Bridge br0 \ + -- create Flow_Sample_Collector_Set id=1 bridge=@br0 \ + local-group-id=10], + [0], [ignore]) + +dnl A probability != 100% but still pretty high (99.99847%). This ensures that +dnl the outer sample action is not optimized out. 
+m4_define([PROBABLE_SAMPLE_ACTION], + [sample(probability=65534,collector_set_id=$1,obs_domain_id=$2,obs_point_id=$3)]dnl +) -dnl Send the second packet from at_ns1 --> OVS with SPI=0x100 and SI=1 -NS_CHECK_EXEC([at_ns1], [$PYTHON3 $srcdir/sendpkt.py p1 f2 ff 00 00 00 02 f2 ff 00 00 00 01 89 4f 01 c6 01 03 00 01 00 01 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 10 f2 00 00 00 00 02 f2 00 00 00 00 01 08 00 45 00 00 28 00 01 00 00 40 06 b0 13 c0 a8 00 0a 0a 00 00 0a 04 00 08 00 00 00 00 c8 00 00 00 00 50 02 20 00 b8 5e 00 00 > /dev/null]) +AT_DATA([flows.txt], [dnl +arp actions=NORMAL +in_port=ovs-p0,ip actions=PROBABLE_SAMPLE_ACTION(1, 2853183536, 2856341600),output(port=ovs-p1,max_len=200) +in_port=ovs-p1,ip actions=PROBABLE_SAMPLE_ACTION(1, 2853183536, 2856341600),output(port=ovs-p0,max_len=200) +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +dnl Disable datapath truncate support to force actions to run in slow path. +AT_CHECK([ovs-appctl dpif/set-dp-features br0 trunc false], [0]) + +AT_CHECK([ovs-appctl ofproto/trace br0 \ + 'in_port=ovs-p0,dl_src=e4:11:22:33:44:55,dl_dst=e4:11:22:33:44:66,dl_type=0x0800,nw_src=10.1.1.1,nw_dst=10.1.1.12'], + [0], [stdout]) + +AT_CHECK([tail -3 stdout], [0], [dnl +Datapath actions: sample(sample=100.0%,actions(psample(group=10,cookie=0xaa102030aa405060))),trunc(200),3 +This flow is handled by the userspace slow path because it: + - Uses action(s) not supported by datapath. 
+]) + +OVS_DAEMONIZE([ovstest test-psample > psample.out], [psample.pid]) +OVS_WAIT_UNTIL([grep -q "Listening for psample events" psample.out]) -dnl Check for the above packet on p2 interface -OVS_WAIT_UNTIL([cat p2.pcap | grep -E "0x0000: *f2ff *0000 *0002 *f2ff *0000 *0001 *894f *01c6" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p2.pcap | grep -E "0x0010: *0103 *0001 *0001 *0102 *0304 *0506 *0708 *090a" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p2.pcap | grep -E "0x0020: *0b0c *0d0e *0f10 *f200 *0000 *0002 *f200 *0000" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p2.pcap | grep -E "0x0030: *0001 *0800 *4500 *0028 *0001 *0000 *4006 *b013" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p2.pcap | grep -E "0x0040: *c0a8 *000a *0a00 *000a *0400 *0800 *0000 *00c8" 2>&1 1>/dev/null]) -OVS_WAIT_UNTIL([cat p2.pcap | grep -E "0x0050: *0000 *0000 *5002 *2000 *b85e *0000" 2>&1 1>/dev/null]) +dnl Sending 10 packets to decrease even more the odds of not sampling a packet. +NS_CHECK_EXEC([at_ns0], [ping -q -i 0.1 -c 10 10.1.1.2 | FORMAT_PING], [0], [dnl +10 packets transmitted, 10 received, 0% packet loss, time 0ms +]) + +m4_define([SAMPLE], [m4_join([ ], + [group_id=0xa,prob=4294901758], + [obs_domain=0xaa102030,obs_point=0xaa405060], + [.*icmp.*nw_src=10.1.1.1,nw_dst=10.1.1.2])]) + +AT_CHECK([grep -qE 'SAMPLE' psample.out]) OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([psample - with IPFIX]) +OVS_TRAFFIC_VSWITCHD_START() +OVS_CHECK_PSAMPLE() + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +AT_CHECK([ovs-vsctl -- --id=@br0 get Bridge br0 \ + -- --id=@i create IPFIX targets=\"127.0.0.1:4739\" \ + -- create Flow_Sample_Collector_Set id=1 ipfix=@i \ + bridge=@br0 local-group-id=10 \ + -- create Flow_Sample_Collector_Set id=2 ipfix=@i \ + bridge=@br0 local-group-id=12], + [0], [ignore]) + +AT_DATA([flows.txt], [dnl +arp actions=NORMAL +in_port=ovs-p0,ip actions=SAMPLE_ACTION(1, 2853183536, 2856341600),ovs-p1 
+in_port=ovs-p1,ip actions=SAMPLE_ACTION(2, 3138396208, 3141554272),ovs-p0 +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +AT_CHECK([ovs-appctl ofproto/trace br0 \ + 'in_port=ovs-p0,dl_src=e4:11:22:33:44:55,dl_dst=e4:11:22:33:44:66,dl_type=0x0800,nw_src=10.1.1.1,nw_dst=10.1.1.12'], + [0], [stdout]) + +m4_define([ACTIONS], [m4_join([], + [psample(group=10,cookie=0xaa102030aa405060),], + [userspace(pid=4294967295,], + [flow_sample(probability=65535,], + [collector_set_id=1,], + [obs_domain_id=2853183536,], + [obs_point_id=2856341600,], + [output_port=4294967295)),], + [3])]) + +AT_CHECK([tail -1 stdout], [0], [dnl +Datapath actions: ACTIONS +]) + +OVS_DAEMONIZE([ovstest test-psample > psample.out], [psample.pid]) +OVS_WAIT_UNTIL([grep -q "Listening for psample events" psample.out]) + +NS_CHECK_EXEC([at_ns0], [ping -q -c 1 10.1.1.2 | FORMAT_PING], [0], [dnl +1 packets transmitted, 1 received, 0% packet loss, time 0ms +]) + +m4_define([SAMPLE1], [m4_join([ ], + [group_id=0xa,prob=4294967295], + [obs_domain=0xaa102030,obs_point=0xaa405060], + [.*icmp.*nw_src=10.1.1.1,nw_dst=10.1.1.2])]) + +m4_define([SAMPLE2], [m4_join([ ], + [group_id=0xc,prob=4294967295], + [obs_domain=0xbb102030,obs_point=0xbb405060], + [.*icmp.*nw_src=10.1.1.2,nw_dst=10.1.1.1])]) + +OVS_WAIT_UNTIL([grep -qE 'SAMPLE1' psample.out]) +OVS_WAIT_UNTIL([grep -qE 'SAMPLE2' psample.out]) + +dnl Check IPFIX samples have been received. +dnl Entries can be unsorted and IFPIX packets might not have been sent (or +dnl at least tried to be sent) yet. 
+OVS_WAIT_UNTIL_EQUAL([ovs-ofctl dump-ipfix-flow br0 | \ + sed 's/tx pkts=[[0-9]]*/tx pkts=24/' | \ + sed 's/tx errs=[[0-9]]*/tx errs=0/' | \ + sed 's/id [[1-2]]:/id ?:/'], [dnl +NXST_IPFIX_FLOW reply (xid=0x2): 2 ids + id ?: flows=1, current flows=0, sampled pkts=1, ipv4 ok=1, ipv6 ok=0, tx pkts=24 + pkts errs=0, ipv4 errs=0, ipv6 errs=0, tx errs=0 + id ?: flows=1, current flows=0, sampled pkts=1, ipv4 ok=1, ipv6 ok=0, tx pkts=24 + pkts errs=0, ipv4 errs=0, ipv6 errs=0, tx errs=0]) + +dnl OVS will fail to send IPFIX packets because the target is localhost +dnl and the port is closed. Ignore the message it generates. +OVS_TRAFFIC_VSWITCHD_STOP(["/sending to collector failed/d"]) +AT_CLEANUP + +AT_SETUP([psample - from ct label]) +CHECK_CONNTRACK() +OVS_TRAFFIC_VSWITCHD_START() +OVS_CHECK_PSAMPLE() + +ADD_NAMESPACES(at_ns0, at_ns1) +NS_CHECK_EXEC([at_ns0], [sysctl -w net.ipv6.conf.all.disable_ipv6=1], [0], [ignore]) +NS_CHECK_EXEC([at_ns1], [sysctl -w net.ipv6.conf.all.disable_ipv6=1], [0], [ignore]) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24", "e4:11:22:33:44:55") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24", "e4:11:22:33:44:66") + +AT_CHECK([ovs-vsctl -- --id=@br0 get Bridge br0 \ + -- --id=@ipfix create IPFIX targets=\"127.0.0.1:4739\" \ + -- create Flow_Sample_Collector_Set id=1 bridge=@br0 \ + ipfix=@ipfix, local-group-id=10 \ + -- create Flow_Sample_Collector_Set id=2 bridge=@br0 \ + ipfix=@ipfix, local-group-id=12], + [0], [ignore]) + +m4_define([CT_STORE_ACT], + [ct(zone=5,commit,exec(load:0x0bb102030->NXM_NX_CT_LABEL[[0..31]],load:0xbb405060->NXM_NX_CT_LABEL[[32..63]]))]) + +AT_DATA([flows.txt], [dnl +priority=100,ip actions=ct(zone=5, table=10) +priority=0 actions=NORMAL +table=10,priority=100,ip,ct_state=+trk+new action=SAMPLE_ACTION(1, 2853183536, 2856341600),CT_STORE_ACT,NORMAL +table=10,priority=100,ip,ct_state=+trk-new action=SAMPLE_ACTION(2, NXM_NX_CT_LABEL[[[0..31]]], NXM_NX_CT_LABEL[[[32..63]]]),NORMAL +table=10, priority=50, ip, actions=DROP +]) + 
+AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +OVS_DAEMONIZE([ovstest test-psample > psample.out], [psample1.pid]) +OVS_WAIT_UNTIL([grep -q "Listening for psample events" psample.out]) + +NS_CHECK_EXEC([at_ns0], [ping -q -c 1 10.1.1.2 | FORMAT_PING], [0], [dnl +1 packets transmitted, 1 received, 0% packet loss, time 0ms +]) + +m4_define([SAMPLE1], [m4_join([ ], + [group_id=0xa,prob=4294967295], + [obs_domain=0xaa102030,obs_point=0xaa405060], + [.*icmp.*nw_src=10.1.1.1,nw_dst=10.1.1.2])]) + +m4_define([SAMPLE2], [m4_join([ ], + [group_id=0xc,prob=4294967295], + [obs_domain=0xbb102030,obs_point=0xbb405060], + [.*icmp.*nw_src=10.1.1.2,nw_dst=10.1.1.1])]) +AT_CHECK([grep -qE 'SAMPLE1' psample.out]) +AT_CHECK([grep -qE 'SAMPLE2' psample.out]) + +m4_define([FLOW_MATCH], [m4_join([], + [ct_label(0xbb405060bb102030/0xffffffffffffffff).*actions:], + [actions:psample(group=12,cookie=0xbb102030bb405060),], + [userspace(pid=[[0-9]]+,flow_sample(.*obs_domain_id=3138396208,obs_point_id=3141554272.*))] +)]) + +AT_CHECK([ovs-appctl dpctl/dump-flows --names filter=in_port=ovs-p1 \ + | grep -qE 'FLOW_MATCH' ], [0], []) + +dnl Check IPFIX samples have been received. +dnl Entries can be unsorted and IFPIX packets might not have been sent (or +dnl at least tried to be sent) yet. +OVS_WAIT_UNTIL_EQUAL([ovs-ofctl dump-ipfix-flow br0 | \ + sed 's/tx pkts=[[0-9]]*/tx pkts=24/' | \ + sed 's/tx errs=[[0-9]]*/tx errs=0/' | \ + sed 's/id [[1-2]]:/id ?:/'], [dnl +NXST_IPFIX_FLOW reply (xid=0x2): 2 ids + id ?: flows=1, current flows=0, sampled pkts=1, ipv4 ok=1, ipv6 ok=0, tx pkts=24 + pkts errs=0, ipv4 errs=0, ipv6 errs=0, tx errs=0 + id ?: flows=1, current flows=0, sampled pkts=1, ipv4 ok=1, ipv6 ok=0, tx pkts=24 + pkts errs=0, ipv4 errs=0, ipv6 errs=0, tx errs=0]) + +dnl OVS will fail to send IPFIX packets because the target is localhost +dnl and the port is closed. Ignore the message it generates. 
+OVS_TRAFFIC_VSWITCHD_STOP(["/sending to collector failed/d"]) +AT_CLEANUP diff --git a/tests/system-userspace-macros.at b/tests/system-userspace-macros.at index b34a84775bf..d9b5b7e4c4d 100644 --- a/tests/system-userspace-macros.at +++ b/tests/system-userspace-macros.at @@ -106,12 +106,25 @@ m4_define([CHECK_CONNTRACK_NAT]) # m4_define([CHECK_CONNTRACK_ZEROIP_SNAT]) +# CHECK_CONNTRACK_SCTP() +# +# Perform requirements checks for running conntrack SCTP. The userspace +# datapath has no dependency, so no check is required. +# +m4_define([CHECK_CONNTRACK_SCTP]) + # CHECK_CONNTRACK_TIMEOUT() # # Perform requirements checks for running conntrack customized timeout tests. # m4_define([CHECK_CONNTRACK_TIMEOUT]) +# CHECK_CONNTRACK_DUMP_EXPECTATIONS() +# +# Perform requirements checks for dumping conntrack expectations. +# +m4_define([CHECK_CONNTRACK_DUMP_EXPECTATIONS]) + # CHECK_CT_DPIF_SET_GET_MAXCONNS() # # Perform requirements checks for running ovs-dpctl ct-set-maxconns or @@ -301,6 +314,12 @@ m4_define([OVS_CHECK_KERNEL_EXCL], AT_SKIP_IF([:]) ]) +# OVS_CHECK_SRV6() +m4_define([OVS_CHECK_SRV6], + [AT_SKIP_IF([! ip -6 route add fc00::1/96 encap seg6 mode encap dev lo 2>&1 >/dev/null]) + AT_CHECK([ip -6 route del fc00::1/96 2>&1 >/dev/null]) + OVS_CHECK_FIREWALL()]) + # CHECK_LATER_IPV6_FRAGMENTS() # # Userspace is parsing later IPv6 fragments correctly. @@ -325,3 +344,16 @@ m4_define([CHECK_L3L4_CONNTRACK_REASM], [ AT_SKIP_IF([:]) ]) + +# CHECK_NO_TC_OFFLOAD +# +# Userspace tests do not use TC offload. +m4_define([CHECK_NO_TC_OFFLOAD]) + +# OVS_CHECK_BAREUDP() +# +# The userspace datapath does not support bareudp tunnels. 
+m4_define([OVS_CHECK_BAREUDP], +[ + AT_SKIP_IF([:]) +]) diff --git a/tests/system-userspace-packet-type-aware.at b/tests/system-userspace-packet-type-aware.at index 974304758f8..aac178edaf9 100644 --- a/tests/system-userspace-packet-type-aware.at +++ b/tests/system-userspace-packet-type-aware.at @@ -335,7 +335,7 @@ AT_CHECK([ # Ping between N1 and N3, via the L2 GRE tunnel between br-in1 and br-in3 -NS_CHECK_EXEC([ns1], [ping -q -c 3 -i 0.3 -w 2 $N3_IP | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([ns1], [ping -q -c 3 -i 0.3 -W 2 $N3_IP | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -363,7 +363,7 @@ AT_CHECK([ # Ping between N1 and N2, via the L2 GRE tunnel between br-in1 and br-in2 -NS_CHECK_EXEC([ns1], [ping -q -c 3 -i 0.3 -w 2 $N2_IP | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([ns1], [ping -q -c 3 -i 0.3 -W 2 $N2_IP | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) @@ -394,7 +394,7 @@ AT_CHECK([ # Ping between N3 and N2, via the L3 GRE tunnel between br-in3 and br-in2 -NS_CHECK_EXEC([ns3], [ping -q -c 3 -i 0.3 -w 2 $N1_IP | FORMAT_PING], [0], [dnl +NS_CHECK_EXEC([ns3], [ping -q -c 3 -i 0.3 -W 2 $N1_IP | FORMAT_PING], [0], [dnl 3 packets transmitted, 3 received, 0% packet loss, time 0ms ]) diff --git a/tests/test-atomic.c b/tests/test-atomic.c index 4b1374b70b2..7853c3e59f2 100644 --- a/tests/test-atomic.c +++ b/tests/test-atomic.c @@ -28,7 +28,7 @@ VLOG_DEFINE_THIS_MODULE(test_atomic); #define TEST_ATOMIC_TYPE(ATOMIC_TYPE, BASE_TYPE) \ { \ - ATOMIC_TYPE x = ATOMIC_VAR_INIT(1); \ + ATOMIC_TYPE x = 1; \ BASE_TYPE value, orig; \ \ atomic_read(&x, &value); \ @@ -71,7 +71,7 @@ VLOG_DEFINE_THIS_MODULE(test_atomic); #define TEST_ATOMIC_TYPE_EXPLICIT(ATOMIC_TYPE, BASE_TYPE, \ ORDER_READ, ORDER_STORE, ORDER_RMW) \ { \ - ATOMIC_TYPE x = ATOMIC_VAR_INIT(1); \ + ATOMIC_TYPE x = 1; \ BASE_TYPE value, orig; \ \ atomic_read_explicit(&x, &value, ORDER_READ); \ @@ -181,7 +181,7 @@ struct atomic_aux { 
ATOMIC(uint64_t) data64; }; -static ATOMIC(struct atomic_aux *) paux = ATOMIC_VAR_INIT(NULL); +static ATOMIC(struct atomic_aux *) paux = NULL; static struct atomic_aux *auxes = NULL; #define ATOMIC_ITEM_COUNT 1000000 @@ -229,7 +229,7 @@ atomic_producer(void * arg1 OVS_UNUSED) for (i = 0; i < ATOMIC_ITEM_COUNT; i++) { struct atomic_aux *aux = &auxes[i]; - aux->count = ATOMIC_VAR_INIT(i); + aux->count = i; aux->b = i + 42; /* Publish the new item. */ @@ -337,9 +337,9 @@ test_acq_rel(void) a = 0; aux->b = 0; - aux->count = ATOMIC_VAR_INIT(0); + aux->count = 0; atomic_init(&aux->data, NULL); - aux->data64 = ATOMIC_VAR_INIT(0); + aux->data64 = 0; reader = ovs_thread_create("reader", atomic_reader, aux); writer = ovs_thread_create("writer", atomic_writer, aux); diff --git a/tests/test-barrier.c b/tests/test-barrier.c index 3bc5291cc17..fb0ab0e695c 100644 --- a/tests/test-barrier.c +++ b/tests/test-barrier.c @@ -14,13 +14,13 @@ * limitations under the License. */ -#include - #include +#undef NDEBUG +#include -#include "ovs-thread.h" -#include "ovs-rcu.h" #include "ovstest.h" +#include "ovs-rcu.h" +#include "ovs-thread.h" #include "random.h" #include "util.h" diff --git a/tests/test-byteq.c b/tests/test-byteq.c new file mode 100644 index 00000000000..ed2afd1fef8 --- /dev/null +++ b/tests/test-byteq.c @@ -0,0 +1,159 @@ +/* + * Copyright (C) 2023 Hewlett Packard Enterprise Development LP + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#undef NDEBUG +#include +#include +#include +#include +#include +#include "byteq.h" +#include "ovstest.h" +#include "util.h" + +static void test_byteq_main(int argc, char *argv[]); +static void test_byteq_put_get(void); +static void test_byteq_putn_get(void); +static void test_byteq_put_string(void); +static void test_byteq_write_read(void); + +#define SIZE 256 + +static void +test_byteq_put_get(void) +{ + struct byteq bq; + uint8_t buffer[SIZE]; + const char *input = "Open vSwitch"; + const int input_len = strlen(input); + + byteq_init(&bq, buffer, SIZE); + for (int i = 0; i < input_len; i++) { + byteq_put(&bq, input[i]); + } + for (int i = 0; i < input_len; i++) { + ovs_assert(byteq_get(&bq) == input[i]); + } +} + +static void +test_byteq_putn_get(void) +{ + struct byteq bq; + uint8_t buffer[SIZE]; + const char *input = "Open vSwitch"; + const int input_len = strlen(input); + + byteq_init(&bq, buffer, SIZE); + byteq_putn(&bq, input, input_len); + for (int i = 0; i < input_len; i++) { + ovs_assert(byteq_get(&bq) == input[i]); + } +} + +static void +test_byteq_put_string(void) +{ + struct byteq bq; + uint8_t buffer[SIZE]; + const char *input = "Open vSwitch"; + const int input_len = strlen(input); + + byteq_init(&bq, buffer, SIZE); + byteq_put_string(&bq, input); + for (int i = 0; i < input_len; i++) { + ovs_assert(byteq_get(&bq) == input[i]); + } +} + +static void +test_byteq_write_read(void) +{ +#ifndef _WIN32 + int fd[2]; + pid_t childpid; + int rc; + struct byteq bq; + uint8_t buffer[SIZE]; + const char *input = "Open vSwitch"; + const int input_len = strlen(input); + + byteq_init(&bq, buffer, SIZE); + byteq_put_string(&bq, input); + + rc = pipe(fd); + ovs_assert(rc == 0); + + /* Flush stdout */ + fflush(stdout); + + childpid = fork(); + ovs_assert(childpid != -1); + if (childpid == 0) { + /* Child process closes stdout */ + close(STDOUT_FILENO); + /* Child process closes up input side of pipe */ + close(fd[0]); + rc = byteq_write(&bq, fd[1]); 
+ ovs_assert(rc == 0); + exit(0); + } else { + /* Parent process closes up output side of pipe */ + close(fd[1]); + rc = byteq_read(&bq, fd[0]); + ovs_assert(rc == EOF); + for (int i = 0; i < input_len; i++) { + ovs_assert(byteq_get(&bq) == input[i]); + } + } +#endif /* _WIN32 */ +} + +static void +run_test(void (*function)(void)) +{ + function(); + printf("."); +} + +static void +test_byteq_main(int argc, char *argv[]) +{ + if (argc != 2) { + ovs_fatal(0, "exactly one argument required\n" + "the argument must be one of the following:\n" + "\tbasic\n" + "\twrite_read\n"); + } + + if (strcmp(argv[1], "write_read") == 0) { + run_test(test_byteq_write_read); + printf("\n"); + } else if (strcmp(argv[1], "basic") == 0) { + run_test(test_byteq_put_get); + run_test(test_byteq_putn_get); + run_test(test_byteq_put_string); + printf("\n"); + } else { + ovs_fatal(0, "invalid argument\n" + "the argument must be one of the following:\n" + "\tbasic\n" + "\twrite_read\n"); + } +} + +OVSTEST_REGISTER("test-byteq", test_byteq_main); diff --git a/tests/test-classifier.c b/tests/test-classifier.c index cff00c8fa35..2c1604a01e2 100644 --- a/tests/test-classifier.c +++ b/tests/test-classifier.c @@ -441,7 +441,7 @@ compare_classifiers(struct classifier *cls, size_t n_invisible_rules, /* This assertion is here to suppress a GCC 4.9 array-bounds warning */ ovs_assert(cls->n_tries <= CLS_MAX_TRIES); - cr0 = classifier_lookup(cls, version, &flow, &wc); + cr0 = classifier_lookup(cls, version, &flow, &wc, NULL); cr1 = tcls_lookup(tcls, &flow); assert((cr0 == NULL) == (cr1 == NULL)); if (cr0 != NULL) { @@ -454,7 +454,7 @@ compare_classifiers(struct classifier *cls, size_t n_invisible_rules, /* Make sure the rule should have been visible. 
*/ assert(cls_rule_visible_in_version(cr0, version)); } - cr2 = classifier_lookup(cls, version, &flow, NULL); + cr2 = classifier_lookup(cls, version, &flow, NULL, NULL); assert(cr2 == cr0); } } @@ -1370,10 +1370,10 @@ lookup_classifier(void *aux_) if (aux->use_wc) { flow_wildcards_init_catchall(&wc); cr = classifier_lookup(aux->cls, version, &aux->lookup_flows[x], - &wc); + &wc, NULL); } else { cr = classifier_lookup(aux->cls, version, &aux->lookup_flows[x], - NULL); + NULL, NULL); } if (cr) { hits++; diff --git a/tests/test-conntrack.c b/tests/test-conntrack.c index 24c93e4a488..dc8d6cff941 100644 --- a/tests/test-conntrack.c +++ b/tests/test-conntrack.c @@ -25,36 +25,48 @@ #include "ovstest.h" #include "pcap-file.h" #include "timeval.h" +#include "stopwatch.h" + +#define STOPWATCH_CT_EXECUTE_COMMIT "ct-execute-commit" +#define STOPWATCH_CT_EXECUTE_NO_COMMIT "ct-execute-no-commit" +#define STOPWATCH_FLUSH_FULL_ZONE "full-zone" +#define STOPWATCH_FLUSH_EMPTY_ZONE "empty-zone" static const char payload[] = "50540000000a50540000000908004500001c0000000000" "11a4cd0a0101010a0101020001000200080000"; +static struct dp_packet * +build_packet(uint16_t udp_src, uint16_t udp_dst, ovs_be16 *dl_type) +{ + struct udp_header *udp; + struct flow flow; + struct dp_packet *pkt = dp_packet_new(sizeof payload / 2); + + dp_packet_put_hex(pkt, payload, NULL); + flow_extract(pkt, &flow); + + udp = dp_packet_l4(pkt); + udp->udp_src = htons(udp_src); + udp->udp_dst = htons(udp_dst); + + *dl_type = flow.dl_type; + + return pkt; +} + static struct dp_packet_batch * prepare_packets(size_t n, bool change, unsigned tid, ovs_be16 *dl_type) { struct dp_packet_batch *pkt_batch = xzalloc(sizeof *pkt_batch); - struct flow flow; size_t i; ovs_assert(n <= ARRAY_SIZE(pkt_batch->packets)); dp_packet_batch_init(pkt_batch); for (i = 0; i < n; i++) { - struct udp_header *udp; - struct dp_packet *pkt = dp_packet_new(sizeof payload/2); - - dp_packet_put_hex(pkt, payload, NULL); - flow_extract(pkt, &flow); - 
- udp = dp_packet_l4(pkt); - udp->udp_src = htons(ntohs(udp->udp_src) + tid); - - if (change) { - udp->udp_dst = htons(ntohs(udp->udp_dst) + i); - } - + uint16_t udp_dst = change ? 2+1 : 2; + struct dp_packet *pkt = build_packet(1 + tid, udp_dst, dl_type); dp_packet_batch_add(pkt_batch, pkt); - *dl_type = flow.dl_type; } return pkt_batch; @@ -91,7 +103,7 @@ ct_thread_main(void *aux_) ovs_barrier_block(&barrier); for (i = 0; i < n_pkts; i += batch_size) { conntrack_execute(ct, pkt_batch, dl_type, false, true, 0, NULL, NULL, - 0, 0, NULL, NULL, now, 0); + NULL, NULL, now, 0); DP_PACKET_BATCH_FOR_EACH (j, pkt, pkt_batch) { pkt_metadata_init_conn(&pkt->md); } @@ -153,6 +165,140 @@ test_benchmark(struct ovs_cmdl_context *ctx) free(threads); } +static void +test_benchmark_zones(struct ovs_cmdl_context *ctx) +{ + unsigned long n_conns, n_zones, iterations; + long long start; + unsigned i, j; + ovs_be16 dl_type; + long long now = time_msec(); + + fatal_signal_init(); + + /* Parse arguments */ + n_conns = strtoul(ctx->argv[1], NULL, 0); + if (n_conns == 0 || n_conns >= UINT32_MAX) { + ovs_fatal(0, "n_conns must be between 1 and 2^32"); + } + n_zones = strtoul(ctx->argv[2], NULL, 0); + if (n_zones == 0 || n_zones >= UINT16_MAX) { + ovs_fatal(0, "n_zones must be between 1 and 2^16"); + } + iterations = strtoul(ctx->argv[3], NULL, 0); + if (iterations == 0) { + ovs_fatal(0, "iterations must be greater than 0"); + } + + ct = conntrack_init(); + + /* Create initial connection entries */ + start = time_msec(); + struct dp_packet_batch **pkt_batch = xzalloc(n_conns * sizeof *pkt_batch); + for (i = 0; i < n_conns; i++) { + pkt_batch[i] = xzalloc(sizeof(struct dp_packet_batch)); + dp_packet_batch_init(pkt_batch[i]); + uint16_t udp_src = (i & 0xFFFF0000) >> 16; + if (udp_src == 0) { + udp_src = UINT16_MAX; + } + uint16_t udp_dst = i & 0xFFFF; + if (udp_dst == 0) { + udp_dst = UINT16_MAX; + } + struct dp_packet *pkt = build_packet(udp_src, udp_dst, &dl_type); + 
dp_packet_batch_add(pkt_batch[i], pkt); + } + printf("initial packet generation time: %lld ms\n", time_msec() - start); + + /* Put initial entries to each zone */ + start = time_msec(); + for (i = 0; i < n_zones; i++) { + for (j = 0; j < n_conns; j++) { + conntrack_execute(ct, pkt_batch[j], dl_type, false, true, i, + NULL, NULL, NULL, NULL, now, 0); + pkt_metadata_init_conn(&pkt_batch[j]->packets[0]->md); + } + } + printf("initial insert time: %lld ms\n", time_msec() - start); + + /* Actually run the tests */ + stopwatch_create(STOPWATCH_CT_EXECUTE_COMMIT, SW_US); + stopwatch_create(STOPWATCH_CT_EXECUTE_NO_COMMIT, SW_US); + stopwatch_create(STOPWATCH_FLUSH_FULL_ZONE, SW_US); + stopwatch_create(STOPWATCH_FLUSH_EMPTY_ZONE, SW_US); + start = time_msec(); + for (i = 0; i < iterations; i++) { + /* Testing flushing a full zone */ + stopwatch_start(STOPWATCH_FLUSH_FULL_ZONE, time_usec()); + uint16_t zone = 1; + conntrack_flush(ct, &zone); + stopwatch_stop(STOPWATCH_FLUSH_FULL_ZONE, time_usec()); + + /* Now fill the zone again */ + stopwatch_start(STOPWATCH_CT_EXECUTE_COMMIT, time_usec()); + for (j = 0; j < n_conns; j++) { + conntrack_execute(ct, pkt_batch[j], dl_type, false, true, zone, + NULL, NULL, NULL, NULL, now, 0); + pkt_metadata_init_conn(&pkt_batch[j]->packets[0]->md); + } + stopwatch_stop(STOPWATCH_CT_EXECUTE_COMMIT, time_usec()); + + /* Running conntrack_execute on the now existing connections */ + stopwatch_start(STOPWATCH_CT_EXECUTE_NO_COMMIT, time_usec()); + for (j = 0; j < n_conns; j++) { + conntrack_execute(ct, pkt_batch[j], dl_type, false, false, zone, + NULL, NULL, NULL, NULL, now, 0); + pkt_metadata_init_conn(&pkt_batch[j]->packets[0]->md); + } + stopwatch_stop(STOPWATCH_CT_EXECUTE_NO_COMMIT, time_usec()); + + /* Testing flushing an empty zone */ + stopwatch_start(STOPWATCH_FLUSH_EMPTY_ZONE, time_usec()); + zone = UINT16_MAX; + conntrack_flush(ct, &zone); + stopwatch_stop(STOPWATCH_FLUSH_EMPTY_ZONE, time_usec()); + } + + printf("flush run time: %lld 
ms\n", time_msec() - start); + + stopwatch_sync(); + struct stopwatch_stats stats_ct_execute_commit = { .unit = SW_US }; + stopwatch_get_stats(STOPWATCH_CT_EXECUTE_COMMIT, &stats_ct_execute_commit); + struct stopwatch_stats stats_ct_execute_nocommit = { .unit = SW_US }; + stopwatch_get_stats(STOPWATCH_CT_EXECUTE_NO_COMMIT, + &stats_ct_execute_nocommit); + struct stopwatch_stats stats_flush_full = { .unit = SW_US }; + stopwatch_get_stats(STOPWATCH_FLUSH_FULL_ZONE, &stats_flush_full); + struct stopwatch_stats stats_flush_empty = { .unit = SW_US }; + stopwatch_get_stats(STOPWATCH_FLUSH_EMPTY_ZONE, &stats_flush_empty); + + printf("results:\n"); + printf(" | ct execute (commit) | ct execute (no commit) |" + " flush full zone | flush empty zone |\n"); + printf("+--------+---------------------+------------------------+" + "-----------------+------------------+\n"); + printf("| Min | %16llu us | %19llu us | %12llu us | %13llu us |\n", + stats_ct_execute_commit.min, stats_ct_execute_nocommit.min, + stats_flush_full.min, stats_flush_empty.min); + printf("| Max | %16llu us | %19llu us | %12llu us | %13llu us |\n", + stats_ct_execute_commit.max, stats_ct_execute_nocommit.max, + stats_flush_full.max, stats_flush_empty.max); + printf("| 95%%ile | %16.2f us | %19.2f us | %12.2f us | %13.2f us |\n", + stats_ct_execute_commit.pctl_95, stats_ct_execute_nocommit.pctl_95, + stats_flush_full.pctl_95, stats_flush_empty.pctl_95); + printf("| Avg | %16.2f us | %19.2f us | %12.2f us | %13.2f us |\n", + stats_ct_execute_commit.ewma_1, stats_ct_execute_nocommit.ewma_1, + stats_flush_full.ewma_1, stats_flush_empty.ewma_1); + + conntrack_destroy(ct); + for (i = 0; i < n_conns; i++) { + dp_packet_delete_batch(pkt_batch[i], true); + free(pkt_batch[i]); + } + free(pkt_batch); +} + static void pcap_batch_execute_conntrack(struct conntrack *ct_, struct dp_packet_batch *pkt_batch) @@ -178,7 +324,7 @@ pcap_batch_execute_conntrack(struct conntrack *ct_, if (flow.dl_type != dl_type) { 
conntrack_execute(ct_, &new_batch, dl_type, false, true, 0, - NULL, NULL, 0, 0, NULL, NULL, now, 0); + NULL, NULL, NULL, NULL, now, 0); dp_packet_batch_init(&new_batch); } dp_packet_batch_add(&new_batch, packet); @@ -186,7 +332,7 @@ pcap_batch_execute_conntrack(struct conntrack *ct_, if (!dp_packet_batch_is_empty(&new_batch)) { conntrack_execute(ct_, &new_batch, dl_type, false, true, 0, NULL, NULL, - 0, 0, NULL, NULL, now, 0); + NULL, NULL, now, 0); } } @@ -264,6 +410,11 @@ static const struct ovs_cmdl_command commands[] = { * 'batch_size' (1 by default) per call, with the commit flag set. * Prints the ct_state of each packet. */ {"pcap", "file [batch_size]", 1, 2, test_pcap, OVS_RO}, + /* Creates 'n_conns' connections in 'n_zones' zones each. + * Afterwards triggers flush requests repeatedly for the last filled zone + * and an empty zone. */ + {"benchmark-zones", "n_conns n_zones iterations", 3, 3, + test_benchmark_zones, OVS_RO}, {NULL, NULL, 0, 0, NULL, OVS_RO}, }; diff --git a/tests/test-cooperative-multitasking.c b/tests/test-cooperative-multitasking.c new file mode 100644 index 00000000000..f7407bb0305 --- /dev/null +++ b/tests/test-cooperative-multitasking.c @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2023 Canonical Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#undef NDEBUG +#include "cooperative-multitasking.h" +#include "cooperative-multitasking-private.h" +#include "openvswitch/hmap.h" +#include "ovstest.h" +#include "timeval.h" +#include "util.h" +#include "openvswitch/vlog.h" + +struct fixture_arg { + bool called; +}; + +static void fixture_run_wrap(void *arg); + +#define FIXTURE_RUN_NAME "fixture_run" + +static void +fixture_run(struct fixture_arg *arg) +{ + cooperative_multitasking_set(&fixture_run_wrap, (void *) arg, + time_msec(), 0, FIXTURE_RUN_NAME); + if (arg) { + arg->called = true; + } +} + +static void +fixture_run_wrap(void *arg) +{ + struct fixture_arg *fixture_arg = (struct fixture_arg *) arg; + + fixture_run(fixture_arg); +} + + +static void fixture_other_run_wrap(void *arg); + +#define FIXTURE_OTHER_RUN_NAME "fixture_other_run" + +static void +fixture_other_run(struct fixture_arg *arg) +{ + cooperative_multitasking_set(&fixture_other_run_wrap, (void *) arg, + time_msec(), 0, FIXTURE_OTHER_RUN_NAME); + if (arg) { + arg->called = true; + } +} + +static void +fixture_other_run_wrap(void *arg) +{ + struct fixture_arg *fixture_arg = (struct fixture_arg *) arg; + + fixture_other_run(fixture_arg); +} + +static void +test_cm_set_registration(void) +{ + struct cm_entry *cm_entry; + struct fixture_arg arg1 = { + .called = false, + }; + struct fixture_arg arg2 = { + .called = false, + }; + + timeval_stop(); + long long int now = time_msec(); + + cooperative_multitasking_set(&fixture_run_wrap, (void *) &arg1, 0, 1000, + FIXTURE_RUN_NAME); + cooperative_multitasking_set(&fixture_run_wrap, (void *) &arg2, 0, 2000, + FIXTURE_RUN_NAME); + cooperative_multitasking_set(&fixture_other_run_wrap, NULL, 0, 3000, + FIXTURE_OTHER_RUN_NAME); + + ovs_assert(hmap_count(&cooperative_multitasking_callbacks) == 3); + + HMAP_FOR_EACH (cm_entry, node, &cooperative_multitasking_callbacks) { + if (cm_entry->arg == (void *) &arg1) { + ovs_assert(cm_entry->cb == &fixture_run_wrap); + ovs_assert(cm_entry->threshold == 
1000); + ovs_assert(cm_entry->last_run == now); + } else if (cm_entry->arg == (void *) &arg2) { + ovs_assert(cm_entry->cb == &fixture_run_wrap); + ovs_assert(cm_entry->threshold == 2000); + ovs_assert(cm_entry->last_run == now); + } else if (cm_entry->cb == &fixture_other_run_wrap) { + ovs_assert(cm_entry->arg == NULL); + ovs_assert(cm_entry->threshold == 3000); + ovs_assert(cm_entry->last_run == now); + } else { + OVS_NOT_REACHED(); + } + } + + cooperative_multitasking_remove(&fixture_other_run_wrap, NULL); + ovs_assert(hmap_count(&cooperative_multitasking_callbacks) == 2); + cooperative_multitasking_remove(&fixture_run_wrap, (void *) &arg2); + ovs_assert(hmap_count(&cooperative_multitasking_callbacks) == 1); + + cooperative_multitasking_destroy(); +} + +static void +test_cm_set_update(void) +{ + struct cm_entry *cm_entry; + struct fixture_arg arg1 = { + .called = false, + }; + struct fixture_arg arg2 = { + .called = false, + }; + + timeval_stop(); + long long int now = time_msec(); + + /* First register a couple of callbacks. */ + cooperative_multitasking_set(&fixture_run_wrap, (void *) &arg1, 0, 0, + FIXTURE_RUN_NAME); + cooperative_multitasking_set(&fixture_run_wrap, (void *) &arg2, 0, 0, + FIXTURE_RUN_NAME); + + ovs_assert(hmap_count(&cooperative_multitasking_callbacks) == 2); + + HMAP_FOR_EACH (cm_entry, node, &cooperative_multitasking_callbacks) { + if (cm_entry->arg == (void *) &arg1) { + ovs_assert(cm_entry->threshold == 0); + ovs_assert(cm_entry->last_run == now); + } else if (cm_entry->arg == (void *) &arg2) { + ovs_assert(cm_entry->threshold == 0); + ovs_assert(cm_entry->last_run == now); + } else { + OVS_NOT_REACHED(); + } + } + + /* Update 'last_run' and 'threshold' for each callback and validate + * that the correct entry was actually updated. 
*/ + cooperative_multitasking_set(&fixture_run_wrap, (void *) &arg1, 1, 2, + FIXTURE_RUN_NAME); + cooperative_multitasking_set(&fixture_run_wrap, (void *) &arg2, 3, 4, + FIXTURE_RUN_NAME); + + HMAP_FOR_EACH (cm_entry, node, &cooperative_multitasking_callbacks) { + if (cm_entry->arg == (void *) &arg1) { + ovs_assert(cm_entry->threshold == 2); + ovs_assert(cm_entry->last_run == 1); + } else if (cm_entry->arg == (void *) &arg2) { + ovs_assert(cm_entry->threshold == 4); + ovs_assert(cm_entry->last_run == 3); + } else { + OVS_NOT_REACHED(); + } + } + + /* Confirm that providing 0 for 'last_run' or 'threshold' leaves the + * existing value untouched. */ + cooperative_multitasking_set(&fixture_run_wrap, (void *) &arg1, 0, 5, + FIXTURE_RUN_NAME); + cooperative_multitasking_set(&fixture_run_wrap, (void *) &arg2, 6, 0, + FIXTURE_RUN_NAME); + + HMAP_FOR_EACH (cm_entry, node, &cooperative_multitasking_callbacks) { + if (cm_entry->arg == (void *) &arg1) { + ovs_assert(cm_entry->threshold == 5); + ovs_assert(cm_entry->last_run == 1); + } else if (cm_entry->arg == (void *) &arg2) { + ovs_assert(cm_entry->threshold == 4); + ovs_assert(cm_entry->last_run == 6); + } else { + OVS_NOT_REACHED(); + } + } + + cooperative_multitasking_destroy(); +} + +static void +test_cm_yield(void) +{ + struct cm_entry *cm_entry; + struct fixture_arg arg1 = { + .called = false, + }; + struct fixture_arg arg2 = { + .called = false, + }; + + timeval_stop(); + long long int now = time_msec(); + + /* First register a couple of callbacks. */ + cooperative_multitasking_set(&fixture_run_wrap, (void *) &arg1, 0, 1000, + FIXTURE_RUN_NAME); + cooperative_multitasking_set(&fixture_run_wrap, (void *) &arg2, 0, 2000, + FIXTURE_RUN_NAME); + + ovs_assert(hmap_count(&cooperative_multitasking_callbacks) == 2); + + /* Call to yield should not execute callbacks until time threshold. 
*/ + cooperative_multitasking_yield(); + ovs_assert(arg1.called == false); + ovs_assert(arg2.called == false); + + HMAP_FOR_EACH (cm_entry, node, &cooperative_multitasking_callbacks) { + ovs_assert(cm_entry->last_run == now); + } + + /* Move clock forward and confirm the expected callbacks to be executed. */ + timeval_warp(1000); + timeval_stop(); + cooperative_multitasking_yield(); + ovs_assert(arg1.called == true); + ovs_assert(arg2.called == false); + + /* Move clock forward and confirm the expected callbacks to be executed. */ + arg1.called = arg2.called = false; + timeval_warp(1000); + timeval_stop(); + cooperative_multitasking_yield(); + ovs_assert(arg1.called == true); + ovs_assert(arg2.called == true); + + cooperative_multitasking_destroy(); +} + +static void fixture_buggy_run_wrap(void *arg); + +#define FIXTURE_BUGGY_RUN_NAME "fixture_buggy_run" + +static void +fixture_buggy_run(struct fixture_arg *arg) +{ + cooperative_multitasking_set(&fixture_buggy_run_wrap, (void *) arg, + time_msec(), 0, FIXTURE_BUGGY_RUN_NAME); + if (arg) { + arg->called = true; + } + /* A real run function MUST NOT directly or indirectly call yield, this is + * here to test the detection of such a programming error. */ + cooperative_multitasking_yield(); +} + +static void +fixture_buggy_run_wrap(void *arg) +{ + struct fixture_arg *fixture_arg = (struct fixture_arg *) arg; + + fixture_buggy_run(fixture_arg); +} + +static void +test_cooperative_multitasking_nested_yield(int argc OVS_UNUSED, char *argv[]) +{ + struct fixture_arg arg1 = { + .called = false, + }; + + set_program_name(argv[0]); + vlog_set_pattern(VLF_CONSOLE, "%c|%p|%m"); + vlog_set_levels(NULL, VLF_SYSLOG, VLL_OFF); + + time_msec(); /* Ensure timeval is initialized. 
*/ + + cooperative_multitasking_set(&fixture_buggy_run_wrap, (void *) &arg1, + 0, 1000, FIXTURE_BUGGY_RUN_NAME); + timeval_warp(1000); + cooperative_multitasking_yield(); + cooperative_multitasking_destroy(); +} + +static void +test_cooperative_multitasking(int argc OVS_UNUSED, char *argv[] OVS_UNUSED) +{ + time_msec(); /* Ensure timeval is initialized. */ + + test_cm_set_registration(); + test_cm_set_update(); + test_cm_yield(); +} + +OVSTEST_REGISTER("test-cooperative-multitasking", + test_cooperative_multitasking); +OVSTEST_REGISTER("test-cooperative-multitasking-nested-yield", + test_cooperative_multitasking_nested_yield); diff --git a/tests/test-dpparse.py b/tests/test-dpparse.py new file mode 100755 index 00000000000..7762e5e8a90 --- /dev/null +++ b/tests/test-dpparse.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python3 +# Copyright (c) 2022 Red Hat, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""test-dpparse reads flows from stdin and tries to parse them using +the python flow parsing library. 
+""" + +import fileinput +import sys + +try: + from ovs.flow.odp import ODPFlow +except ImportError: + sys.exit(0) + + +def main(): + for flow in fileinput.input(): + try: + result_flow = ODPFlow(flow) + if flow != str(result_flow): + print("in: {}".format(flow)) + print("out: {}".format(str(result_flow))) + raise ValueError("Flow conversion back to string failed") + except Exception as e: + print("Error parsing flow {}: {}".format(flow, e)) + return 1 + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/test-id-fpool.c b/tests/test-id-fpool.c index 25275d9aefa..7bdb8154d3c 100644 --- a/tests/test-id-fpool.c +++ b/tests/test-id-fpool.c @@ -14,12 +14,12 @@ * limitations under the License. */ +#include #undef NDEBUG #include #include #include - -#include +#include #include "command-line.h" #include "id-fpool.h" @@ -237,7 +237,7 @@ print_result(const char *prefix) for (i = 0; i < n_threads; i++) { avg += thread_working_ms[i]; } - avg /= n_threads; + avg /= n_threads ? n_threads : 1; printf("%s: ", prefix); for (i = 0; i < n_threads; i++) { if (thread_working_ms[i] >= TIMEOUT_MS) { diff --git a/tests/test-json.c b/tests/test-json.c index a2f4332e77b..6cf5eb75def 100644 --- a/tests/test-json.c +++ b/tests/test-json.c @@ -34,8 +34,123 @@ static int pretty = 0; * instead of exactly one object or array. 
*/ static int multiple = 0; +static void test_json_equal(const struct json *a, const struct json *b, + bool allow_the_same); + +static void +test_json_equal_object(const struct shash *a, const struct shash *b, + bool allow_the_same) +{ + struct shash_node *a_node; + + ovs_assert(allow_the_same || a != b); + + if (a == b) { + return; + } + + ovs_assert(shash_count(a) == shash_count(b)); + + SHASH_FOR_EACH (a_node, a) { + struct shash_node *b_node = shash_find(b, a_node->name); + + ovs_assert(b_node); + test_json_equal(a_node->data, b_node->data, allow_the_same); + } +} + +static void +test_json_equal_array(const struct json_array *a, const struct json_array *b, + bool allow_the_same) +{ + ovs_assert(allow_the_same || a != b); + + if (a == b) { + return; + } + + ovs_assert(a->n == b->n); + + for (size_t i = 0; i < a->n; i++) { + test_json_equal(a->elems[i], b->elems[i], allow_the_same); + } +} + +static void +test_json_equal(const struct json *a, const struct json *b, + bool allow_the_same) +{ + ovs_assert(allow_the_same || a != b); + ovs_assert(a && b); + + if (a == b) { + ovs_assert(a->count > 1); + return; + } + + ovs_assert(a->type == b->type); + + switch (a->type) { + case JSON_OBJECT: + test_json_equal_object(a->object, b->object, allow_the_same); + return; + + case JSON_ARRAY: + test_json_equal_array(&a->array, &b->array, allow_the_same); + return; + + case JSON_STRING: + case JSON_SERIALIZED_OBJECT: + ovs_assert(a->string != b->string); + ovs_assert(!strcmp(a->string, b->string)); + return; + + case JSON_NULL: + case JSON_FALSE: + case JSON_TRUE: + return; + + case JSON_INTEGER: + ovs_assert(a->integer == b->integer); + return; + + case JSON_REAL: + ovs_assert(a->real == b->real); + return; + + case JSON_N_TYPES: + default: + OVS_NOT_REACHED(); + } +} + +static void +test_json_clone(struct json *json) +{ + struct json *copy, *deep_copy; + + copy = json_clone(json); + + ovs_assert(json_equal(json, copy)); + test_json_equal(json, copy, true); + 
ovs_assert(json->count == 2); + + json_destroy(copy); + ovs_assert(json->count == 1); + + deep_copy = json_deep_clone(json); + + ovs_assert(json_equal(json, deep_copy)); + test_json_equal(json, deep_copy, false); + ovs_assert(json->count == 1); + ovs_assert(deep_copy->count == 1); + + json_destroy(deep_copy); + ovs_assert(json->count == 1); +} + static bool -print_and_free_json(struct json *json) +print_test_and_free_json(struct json *json) { bool ok; if (json->type == JSON_STRING) { @@ -47,6 +162,7 @@ print_and_free_json(struct json *json) free(s); ok = true; } + test_json_clone(json); json_destroy(json); return ok; } @@ -89,7 +205,7 @@ parse_multiple(FILE *stream) used += json_parser_feed(parser, &buffer[used], n - used); if (used < n) { - if (!print_and_free_json(json_parser_finish(parser))) { + if (!print_test_and_free_json(json_parser_finish(parser))) { ok = false; } parser = NULL; @@ -97,7 +213,7 @@ parse_multiple(FILE *stream) } } if (parser) { - if (!print_and_free_json(json_parser_finish(parser))) { + if (!print_test_and_free_json(json_parser_finish(parser))) { ok = false; } } @@ -150,7 +266,7 @@ test_json_main(int argc, char *argv[]) if (multiple) { ok = parse_multiple(stream); } else { - ok = print_and_free_json(json_from_stream(stream)); + ok = print_test_and_free_json(json_from_stream(stream)); } fclose(stream); diff --git a/tests/test-jsonrpc.py b/tests/test-jsonrpc.py index 1df5afa221f..8a4a1759380 100644 --- a/tests/test-jsonrpc.py +++ b/tests/test-jsonrpc.py @@ -199,13 +199,13 @@ def main(argv): sys.exit(1) func, n_args = commands[command_name] - if type(n_args) == tuple: + if type(n_args) is tuple: if len(args) < n_args[0]: sys.stderr.write("%s: \"%s\" requires at least %d arguments but " "only %d provided\n" % (argv[0], command_name, n_args, len(args))) sys.exit(1) - elif type(n_args) == int: + elif type(n_args) is int: if len(args) != n_args: sys.stderr.write("%s: \"%s\" requires %d arguments but %d " "provided\n" diff --git a/tests/test-l7.py 
b/tests/test-l7.py index 32a77392c64..97cd4f29a60 100755 --- a/tests/test-l7.py +++ b/tests/test-l7.py @@ -86,6 +86,8 @@ def main(): description='Run basic application servers.') parser.add_argument('proto', default='http', nargs='?', help='protocol to serve (%s)' % protocols) + parser.add_argument('port', default=0, nargs='?', type=int, + help='server port number') args = parser.parse_args() if args.proto not in protocols: @@ -95,6 +97,8 @@ def main(): constructor = SERVERS[args.proto][0] handler = SERVERS[args.proto][1] port = SERVERS[args.proto][2] + if args.port != 0: + port = args.port srv = constructor(('', port), handler) srv.serve_forever() diff --git a/tests/test-mpsc-queue.c b/tests/test-mpsc-queue.c index a38bf9e6dfa..86a223caffa 100644 --- a/tests/test-mpsc-queue.c +++ b/tests/test-mpsc-queue.c @@ -14,12 +14,12 @@ * limitations under the License. */ +#include #undef NDEBUG #include #include #include - -#include +#include #include "command-line.h" #include "guarded-list.h" @@ -315,7 +315,7 @@ print_result(const char *prefix, int reader_elapsed) for (i = 0; i < n_threads; i++) { avg += thread_working_ms[i]; } - avg /= n_threads; + avg /= n_threads ? n_threads : 1; printf("%s: %6d", prefix, reader_elapsed); for (i = 0; i < n_threads; i++) { printf(" %6" PRIu64, thread_working_ms[i]); diff --git a/tests/test-netflow.c b/tests/test-netflow.c index d2322d4509a..7f89cfcae0d 100644 --- a/tests/test-netflow.c +++ b/tests/test-netflow.c @@ -195,7 +195,7 @@ test_netflow_main(int argc, char *argv[]) } daemon_save_fd(STDOUT_FILENO); - daemonize_start(false); + daemonize_start(false, false); error = unixctl_server_create(NULL, &server); if (error) { diff --git a/tests/test-ofparse.py b/tests/test-ofparse.py new file mode 100755 index 00000000000..ba96e8344c2 --- /dev/null +++ b/tests/test-ofparse.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python3 +# Copyright (c) 2022 Red Hat, Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""test-ofparse reads flows from stdin and tries to parse them using +the python flow parsing library. +""" + +import fileinput +import sys + +try: + from ovs.flow.ofp import OFPFlow +except ImportError: + sys.exit(0) + + +def main(): + for flow in fileinput.input(): + try: + result_flow = OFPFlow(flow) + if flow != str(result_flow): + print("in: {}".format(flow)) + print("out: {}".format(str(result_flow))) + raise ValueError("Flow conversion back to string failed") + except Exception as e: + print("Error parsing flow {}: {}".format(flow, e)) + return 1 + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/test-ovsdb.c b/tests/test-ovsdb.c index 5f7110f415f..41c1525f451 100644 --- a/tests/test-ovsdb.c +++ b/tests/test-ovsdb.c @@ -870,7 +870,8 @@ do_parse_rows(struct ovs_cmdl_context *ctx) row = ovsdb_row_create(table); json = unbox_json(parse_json(ctx->argv[i])); - check_ovsdb_error(ovsdb_row_from_json(row, json, NULL, &columns)); + check_ovsdb_error(ovsdb_row_from_json(row, json, NULL, + &columns, false)); json_destroy(json); print_and_free_json(ovsdb_row_to_json(row, &all_columns)); @@ -937,7 +938,7 @@ do_compare_rows(struct ovs_cmdl_context *ctx) } names[i] = xstrdup(json->array.elems[0]->string); check_ovsdb_error(ovsdb_row_from_json(rows[i], json->array.elems[1], - NULL, NULL)); + NULL, NULL, false)); json_destroy(json); } for (i = 0; i < n_rows; i++) { @@ -1050,7 +1051,7 @@ 
do_evaluate_condition__(struct ovs_cmdl_context *ctx, int mode) for (i = 0; i < n_rows; i++) { rows[i] = ovsdb_row_create(table); check_ovsdb_error(ovsdb_row_from_json(rows[i], json->array.elems[i], - NULL, NULL)); + NULL, NULL, false)); } json_destroy(json); @@ -1224,7 +1225,7 @@ do_execute_mutations(struct ovs_cmdl_context *ctx) for (i = 0; i < n_rows; i++) { rows[i] = ovsdb_row_create(table); check_ovsdb_error(ovsdb_row_from_json(rows[i], json->array.elems[i], - NULL, NULL)); + NULL, NULL, false)); } json_destroy(json); @@ -1338,7 +1339,7 @@ do_query(struct ovs_cmdl_context *ctx) struct ovsdb_row *row = ovsdb_row_create(table); uuid_generate(ovsdb_row_get_uuid_rw(row)); check_ovsdb_error(ovsdb_row_from_json(row, json->array.elems[i], - NULL, NULL)); + NULL, NULL, false)); if (ovsdb_table_get_row(table, ovsdb_row_get_uuid(row))) { ovs_fatal(0, "duplicate UUID "UUID_FMT" in table", UUID_ARGS(ovsdb_row_get_uuid(row))); @@ -1445,7 +1446,7 @@ do_query_distinct(struct ovs_cmdl_context *ctx) row = ovsdb_row_create(table); uuid_generate(ovsdb_row_get_uuid_rw(row)); check_ovsdb_error(ovsdb_row_from_json(row, json->array.elems[i], - NULL, NULL)); + NULL, NULL, false)); /* Initialize row and find equivalence class. 
*/ rows[i].uuid = *ovsdb_row_get_uuid(row); @@ -1797,7 +1798,7 @@ do_transact_modify(struct ovs_cmdl_context *ctx) struct ovsdb_row *row_rw; row_ro = do_transact_find_row(ctx->argv[1]); - row_rw = ovsdb_txn_row_modify(do_transact_txn, row_ro); + ovsdb_txn_row_modify(do_transact_txn, row_ro, &row_rw, NULL); do_transact_set_i_j(row_rw, ctx->argv[2], ctx->argv[3]); } @@ -2022,6 +2023,24 @@ print_idl_row_updated_link2(const struct idltest_link2 *l2, int step) } } +static void +print_idl_row_updated_indexed(const struct idltest_indexed *ind, int step) +{ + struct ds updates = DS_EMPTY_INITIALIZER; + + for (size_t i = 0; i < IDLTEST_INDEXED_N_COLUMNS; i++) { + if (idltest_indexed_is_updated(ind, i)) { + ds_put_format(&updates, " %s", idltest_indexed_columns[i].name); + } + } + if (updates.length) { + print_and_log("%03d: table %s: updated columns:%s", + step, ind->header_.table->class_->name, + ds_cstr(&updates)); + ds_destroy(&updates); + } +} + static void print_idl_row_updated_simple3(const struct idltest_simple3 *s3, int step) { @@ -2171,6 +2190,21 @@ print_idl_row_link2(const struct idltest_link2 *l2, int step, bool terse) print_idl_row_updated_link2(l2, step); } +static void +print_idl_row_indexed(const struct idltest_indexed *ind, int step, bool terse) +{ + struct ds msg = DS_EMPTY_INITIALIZER; + + ds_put_format(&msg, "i=%"PRId64, ind->i); + + char *row_msg = format_idl_row(&ind->header_, step, ds_cstr(&msg), terse); + print_and_log("%s", row_msg); + ds_destroy(&msg); + free(row_msg); + + print_idl_row_updated_indexed(ind, step); +} + static void print_idl_row_simple3(const struct idltest_simple3 *s3, int step, bool terse) { @@ -2251,6 +2285,7 @@ print_idl_row_singleton(const struct idltest_singleton *sng, int step, static void print_idl(struct ovsdb_idl *idl, int step, bool terse) { + const struct idltest_indexed *ind; const struct idltest_simple3 *s3; const struct idltest_simple4 *s4; const struct idltest_simple6 *s6; @@ -2284,6 +2319,10 @@ print_idl(struct 
ovsdb_idl *idl, int step, bool terse) print_idl_row_simple6(s6, step, terse); n++; } + IDLTEST_INDEXED_FOR_EACH (ind, idl) { + print_idl_row_indexed(ind, step, terse); + n++; + } IDLTEST_SINGLETON_FOR_EACH (sng, idl) { print_idl_row_singleton(sng, step, terse); n++; @@ -2296,6 +2335,7 @@ print_idl(struct ovsdb_idl *idl, int step, bool terse) static void print_idl_track(struct ovsdb_idl *idl, int step, bool terse) { + const struct idltest_indexed *ind; const struct idltest_simple3 *s3; const struct idltest_simple4 *s4; const struct idltest_simple6 *s6; @@ -2328,6 +2368,10 @@ print_idl_track(struct ovsdb_idl *idl, int step, bool terse) print_idl_row_simple6(s6, step, terse); n++; } + IDLTEST_INDEXED_FOR_EACH (ind, idl) { + print_idl_row_indexed(ind, step, terse); + n++; + } if (!n) { print_and_log("%03d: empty", step); @@ -2400,7 +2444,7 @@ idltest_find_simple(struct ovsdb_idl *idl, int i) return NULL; } -static void +static bool idl_set(struct ovsdb_idl *idl, char *commands, int step) { char *cmd, *save_ptr1 = NULL; @@ -2458,6 +2502,19 @@ idl_set(struct ovsdb_idl *idl, char *commands, int step) s = idltest_simple_insert(txn); idltest_simple_set_i(s, atoi(arg1)); + } else if (!strcmp(name, "insert_uuid")) { + struct idltest_simple *s; + + if (!arg1 || !arg2) { + ovs_fatal(0, "\"insert_uuid\" command requires 2 arguments"); + } + + struct uuid s_uuid; + if (!uuid_from_string(&s_uuid, arg1)) { + ovs_fatal(0, "\"insert_uuid\" command requires valid uuid"); + } + s = idltest_simple_insert_persist_uuid(txn, &s_uuid); + idltest_simple_set_i(s, atoi(arg2)); } else if (!strcmp(name, "delete")) { const struct idltest_simple *s; @@ -2522,7 +2579,7 @@ idl_set(struct ovsdb_idl *idl, char *commands, int step) print_and_log("%03d: destroy", step); ovsdb_idl_txn_destroy(txn); ovsdb_idl_check_consistency(idl); - return; + return true; } else { ovs_fatal(0, "unknown command %s", name); } @@ -2543,6 +2600,8 @@ idl_set(struct ovsdb_idl *idl, char *commands, int step) 
ovsdb_idl_txn_destroy(txn); ovsdb_idl_check_consistency(idl); + + return (status != TXN_ERROR); } static const struct ovsdb_idl_table_class * @@ -2612,11 +2671,12 @@ parse_link2_json_clause(struct ovsdb_idl_condition *cond, } } -static void -update_conditions(struct ovsdb_idl *idl, char *commands) +static unsigned int +update_conditions(struct ovsdb_idl *idl, char *commands, int step) { - char *cmd, *save_ptr1 = NULL; const struct ovsdb_idl_table_class *tc; + unsigned int next_cond_seqno = 0; + char *cmd, *save_ptr1 = NULL; for (cmd = strtok_r(commands, ";", &save_ptr1); cmd; cmd = strtok_r(NULL, ";", &save_ptr1)) { @@ -2667,15 +2727,20 @@ update_conditions(struct ovsdb_idl *idl, char *commands) unsigned int seqno = ovsdb_idl_get_condition_seqno(idl); unsigned int next_seqno = ovsdb_idl_set_condition(idl, tc, &cond); if (seqno == next_seqno ) { - ovs_fatal(0, "condition unchanged"); + print_and_log("%03d: %s: conditions unchanged", + step, table_name); + } else { + print_and_log("%03d: %s: change conditions", step, table_name); } unsigned int new_next_seqno = ovsdb_idl_set_condition(idl, tc, &cond); if (next_seqno != new_next_seqno) { ovs_fatal(0, "condition expected seqno changed"); } + next_cond_seqno = MAX(next_cond_seqno, next_seqno); ovsdb_idl_condition_destroy(&cond); json_destroy(json); } + return next_cond_seqno; } static void @@ -2684,6 +2749,7 @@ do_idl(struct ovs_cmdl_context *ctx) struct test_ovsdb_pvt_context *pvt = ctx->pvt; struct jsonrpc *rpc; struct ovsdb_idl *idl; + unsigned int next_cond_seqno = 0; unsigned int seqno = 0; struct ovsdb_symbol_table *symtab; size_t n_uuids = 0; @@ -2720,8 +2786,8 @@ do_idl(struct ovs_cmdl_context *ctx) const char remote_s[] = "set-remote "; const char cond_s[] = "condition "; if (ctx->argc > 2 && strstr(ctx->argv[2], cond_s)) { - update_conditions(idl, ctx->argv[2] + strlen(cond_s)); - print_and_log("%03d: change conditions", step++); + next_cond_seqno = + update_conditions(idl, ctx->argv[2] + strlen(cond_s), 
step++); i = 3; } else { i = 2; @@ -2740,6 +2806,21 @@ do_idl(struct ovs_cmdl_context *ctx) if (*arg == '+') { /* The previous transaction didn't change anything. */ arg++; + } else if (*arg == '^') { + /* Wait for condition change to be acked by the server. */ + arg++; + for (;;) { + ovsdb_idl_run(idl); + ovsdb_idl_check_consistency(idl); + if (ovsdb_idl_get_condition_seqno(idl) == next_cond_seqno) { + break; + } + jsonrpc_run(rpc); + + ovsdb_idl_wait(idl); + jsonrpc_wait(rpc); + poll_block(); + } } else { /* Wait for update. */ for (;;) { @@ -2762,6 +2843,13 @@ do_idl(struct ovs_cmdl_context *ctx) } else { print_idl(idl, step++, terse); } + + /* Just run IDL forever for a simple monitoring. */ + if (!strcmp(arg, "monitor")) { + seqno = ovsdb_idl_get_seqno(idl); + i--; + continue; + } } seqno = ovsdb_idl_get_seqno(idl); @@ -2774,10 +2862,17 @@ do_idl(struct ovs_cmdl_context *ctx) arg + strlen(remote_s), ovsdb_idl_is_connected(idl) ? "true" : "false"); } else if (!strncmp(arg, cond_s, strlen(cond_s))) { - update_conditions(idl, arg + strlen(cond_s)); - print_and_log("%03d: change conditions", step++); + next_cond_seqno = update_conditions(idl, arg + strlen(cond_s), + step++); } else if (arg[0] != '[') { - idl_set(idl, arg, step++); + if (!idl_set(idl, arg, step++)) { + /* If idl_set() returns false, then no transaction + * was sent to the server and most likely 'seqno' + * would remain the same. And the above 'Wait for update' + * for loop poll_block() would never return. + * So set seqno to 0. 
*/ + seqno = 0; + } } else { struct json *json = parse_json(arg); substitute_uuids(json, symtab); diff --git a/tests/test-ovsdb.py b/tests/test-ovsdb.py index 402cacbe9d7..60752ef4ae2 100644 --- a/tests/test-ovsdb.py +++ b/tests/test-ovsdb.py @@ -36,8 +36,68 @@ vlog.init(None) +def substitute_object_text(data, quotechar='"', obj_chars=("{}", "[]"), + tag_format="_OBJECT_{}_"): + """Replace objects in strings with tags that can later be retrieved + + Given data like: + 'cmd1 1, cmd2 {"a": {"a": "b"}}, cmd3 1 2, cmd4 ["a", "b"]' + + Return an output string: + 'cmd1 1, cmd2 _OBJECT_0_, cmd3 1 2, cmd4 _OBJECT_1_' + + and a dictionary of replaced text: + {'_OBJECT_0_': '{"a": {"a": "b"}}', '_OBJECT_1_': '["a", "b"]'} + """ + + obj_chars = dict(obj_chars) + in_quote = False + in_object = [] # Stack of nested outer object opening characters. + replaced_text = {} + output = "" + start = end = 0 + for i, c in enumerate(data): + if not in_object: + if not in_quote and c in obj_chars: + # This is the start of a non-quoted outer object that will + # be replaced by a tag. + in_object.append(c) + start = i + else: + # Regular output. + output += c + if c == quotechar: + in_quote = not in_quote + elif not in_quote: # Unquoted object. + if c == in_object[0]: + # Record on the stack that we are in a nested object of the + # same type as the outer object, this object will not be + # substituted with a tag. + in_object.append(c) + elif c == obj_chars[in_object[0]]: + # This is the closing character to this potentially nested + # object's opening character, so pop it off the stack. + in_object.pop() + if not in_object: + # This is the outer object's closing character, so record + # the substituted text and generate the tagged text. 
+ end = i + 1 + tag = tag_format.format(len(replaced_text)) + replaced_text[tag] = data[start:end] + output += tag + return output, replaced_text + + +def recover_object_text_from_list(words, json): + if not json: + return words + # NOTE(twilson) This does not handle the case of having multiple replaced + # objects in the same word, e.g. two adjacent JSON strings. + return [json.get(word, word) for word in words] + + def unbox_json(json): - if type(json) == list and len(json) == 1: + if type(json) is list and len(json) == 1: return json[0] else: return json @@ -228,6 +288,10 @@ def get_link2_table_printable_row(row): return s +def get_indexed_table_printable_row(row): + return "i=%s" % row.i + + def get_singleton_table_printable_row(row): return "name=%s" % row.name @@ -307,6 +371,14 @@ def print_idl(idl, step, terse=False): terse) n += 1 + if "indexed" in idl.tables: + ind = idl.tables["indexed"].rows + for row in ind.values(): + print_row("indexed", row, step, + get_indexed_table_printable_row(row), + terse) + n += 1 + if "singleton" in idl.tables: sng = idl.tables["singleton"].rows for row in sng.values(): @@ -325,9 +397,9 @@ def substitute_uuids(json, symtab): symbol = symtab.get(json) if symbol: return str(symbol) - elif type(json) == list: + elif type(json) is list: return [substitute_uuids(element, symtab) for element in json] - elif type(json) == dict: + elif type(json) is dict: d = {} for key, value in json.items(): d[key] = substitute_uuids(value, symtab) @@ -341,10 +413,10 @@ def parse_uuids(json, symtab): name = "#%d#" % len(symtab) sys.stderr.write("%s = %s\n" % (name, json)) symtab[name] = json - elif type(json) == list: + elif type(json) is list: for element in json: parse_uuids(element, symtab) - elif type(json) == dict: + elif type(json) is dict: for value in json.values(): parse_uuids(value, symtab) @@ -377,8 +449,15 @@ def idl_set(idl, commands, step): increment = False fetch_cmds = [] events = [] + # `commands` is a comma-separated list of 
space-separated arguments. To + # handle commands that take arguments that may contain spaces or commas, + # e.g. JSON, it is necessary to process `commands` to extract those + # arguments before splitting by ',' or ' ' below, and then re-insert them + # after the arguments are split. + commands, data = substitute_object_text(commands) for command in commands.split(','): words = command.split() + words = recover_object_text_from_list(words, data) name = words[0] args = words[1:] @@ -429,6 +508,20 @@ def notify(event, row, updates=None): s = txn.insert(idl.tables["simple"]) s.i = int(args[0]) + elif name == "insert_uuid": + if len(args) != 2: + sys.stderr.write('"set" command requires 2 argument\n') + sys.exit(1) + + s = txn.insert(idl.tables["simple"], new_uuid=uuid.UUID(args[0]), + persist_uuid=True) + s.i = int(args[1]) + elif name == "add_op": + if len(args) != 1: + sys.stderr.write('"add_op" command requires 1 argument\n') + sys.stderr.write(f"args={args}\n") + sys.exit(1) + txn.add_op(ovs.json.from_string(args[0])) elif name == "delete": if len(args) != 1: sys.stderr.write('"delete" command requires 1 argument\n') @@ -491,7 +584,7 @@ def notify(event, row, updates=None): print("%03d: destroy" % step) sys.stdout.flush() txn.abort() - return + return True elif name == "linktest": l1_0 = txn.insert(idl.tables["link1"]) l1_0.i = 1 @@ -615,8 +708,11 @@ def notify(event, row, updates=None): sys.stdout.write("\n") sys.stdout.flush() + return status != ovs.db.idl.Transaction.ERROR + -def update_condition(idl, commands): +def update_condition(idl, commands, step): + next_cond_seqno = 0 commands = commands[len("condition "):].split(";") for command in commands: command = command.split(" ") @@ -627,7 +723,20 @@ def update_condition(idl, commands): table = command[0] cond = ovs.json.from_string(command[1]) - idl.cond_change(table, cond) + next_seqno = idl.cond_change(table, cond) + if idl.cond_seqno == next_seqno: + sys.stdout.write("%03d: %s: conditions unchanged\n" % + 
(step, table)) + else: + sys.stdout.write("%03d: %s: change conditions\n" % + (step, table)) + sys.stdout.flush() + + assert next_seqno == idl.cond_change(table, cond), \ + "condition expected seqno changed" + next_cond_seqno = max(next_cond_seqno, next_seqno) + + return next_cond_seqno def do_idl(schema_file, remote, *commands): @@ -666,6 +775,9 @@ def do_idl(schema_file, remote, *commands): idl = ovs.db.idl.Idl(remote, schema_helper, leader_only=False) if "simple3" in idl.tables: idl.index_create("simple3", "simple3_by_name") + if "indexed" in idl.tables: + idx = idl.index_create("indexed", "indexed_by_i") + idx.add_column("i") if commands: remotes = remote.split(',') @@ -684,6 +796,7 @@ def do_idl(schema_file, remote, *commands): else: rpc = None + next_cond_seqno = 0 symtab = {} seqno = 0 step = 0 @@ -707,9 +820,7 @@ def mock_notify(event, row, updates=None): commands = list(commands) if len(commands) >= 1 and "condition" in commands[0]: - update_condition(idl, commands.pop(0)) - sys.stdout.write("%03d: change conditions\n" % step) - sys.stdout.flush() + next_cond_seqno = update_condition(idl, commands.pop(0), step) step += 1 for command in commands: @@ -722,18 +833,35 @@ def mock_notify(event, row, updates=None): if command.startswith("+"): # The previous transaction didn't change anything. command = command[1:] - else: - # Wait for update. - while idl.change_seqno == seqno and not idl.run(): + elif command.startswith("^"): + # Wait for condition change to be acked by the server. + command = command[1:] + while idl.cond_seqno != next_cond_seqno and not idl.run(): rpc.run() poller = ovs.poller.Poller() idl.wait(poller) rpc.wait(poller) poller.block() + else: + # Wait for update. 
+ while True: + while idl.change_seqno == seqno and not idl.run(): + rpc.run() - print_idl(idl, step, terse) - step += 1 + poller = ovs.poller.Poller() + idl.wait(poller) + rpc.wait(poller) + poller.block() + + print_idl(idl, step, terse) + step += 1 + + # Run IDL forever in case of a simple monitor, otherwise + # break and execute the command. + seqno = idl.change_seqno + if command != "monitor": + break seqno = idl.change_seqno @@ -743,12 +871,16 @@ def mock_notify(event, row, updates=None): step += 1 idl.force_reconnect() elif "condition" in command: - update_condition(idl, command) - sys.stdout.write("%03d: change conditions\n" % step) - sys.stdout.flush() + next_cond_seqno = update_condition(idl, command, step) step += 1 elif not command.startswith("["): - idl_set(idl, command, step) + if not idl_set(idl, command, step): + # If idl_set() returns false, then no transaction + # was sent to the server and most likely seqno + # would remain the same. And the above 'Wait for update' + # for loop poller.block() would never return. + # So set seqno to 0. + seqno = 0 step += 1 else: json = ovs.json.from_string(command) @@ -1012,14 +1144,14 @@ def main(argv): sys.exit(1) func, n_args = commands[command_name] - if type(n_args) == tuple: + if type(n_args) is tuple: if len(args) < n_args[0]: sys.stderr.write("%s: \"%s\" requires at least %d arguments but " "only %d provided\n" % (ovs.util.PROGRAM_NAME, command_name, n_args[0], len(args))) sys.exit(1) - elif type(n_args) == int: + elif type(n_args) is int: if len(args) != n_args: sys.stderr.write("%s: \"%s\" requires %d arguments but %d " "provided\n" diff --git a/tests/test-psample.c b/tests/test-psample.c new file mode 100644 index 00000000000..1494dcc8d25 --- /dev/null +++ b/tests/test-psample.c @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2024 Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#undef NDEBUG +#include +#include +#include +#include +#include + +#include + +#include "command-line.h" +#include "dp-packet.h" +#include "util.h" +#include "netlink.h" +#include "netlink-socket.h" +#include "openvswitch/ofp-actions.h" +#include "openvswitch/ofp-print.h" +#include "openvswitch/types.h" +#include "openvswitch/uuid.h" +#include "openvswitch/vlog.h" +#include "ovstest.h" + +VLOG_DEFINE_THIS_MODULE(test_psample); + +static int psample_family = 0; +static uint32_t group_id = 0; +static bool has_filter; + +static void usage(void) +{ + printf("%s: psample collector test utility\n" + "usage: %s [OPTIONS] [GROUP]\n" + "where GROUP is the psample group_id to listen on. 
" + "If none is provided all events are printed.\n", + program_name, program_name); + vlog_usage(); + printf("\nOther Options:\n" + " -h, --help display this help message\n"); +} + +static void parse_options(int argc, char *argv[]) +{ + enum { + VLOG_OPTION_ENUMS + }; + static const struct option long_options[] = { + {"group", required_argument, NULL, 'g'}, + {"help", no_argument, NULL, 'h'}, + VLOG_LONG_OPTIONS, + {NULL, 0, NULL, 0}, + }; + char *tmp_short_options, *short_options; + int ret = EXIT_SUCCESS; + bool do_exit = false; + + tmp_short_options = ovs_cmdl_long_options_to_short_options(long_options); + short_options = xasprintf("+%s", tmp_short_options); + + while (!do_exit) { + int option; + + option = getopt_long(argc, argv, short_options, long_options, NULL); + if (option == -1) { + break; + } + + switch (option) { + + VLOG_OPTION_HANDLERS + + case 'h': + usage(); + do_exit = true; + ret = EXIT_SUCCESS; + break; + + case '?': + do_exit = true; + ret = EXIT_FAILURE; + break; + + default: + OVS_NOT_REACHED(); + } + } + + free(tmp_short_options); + free(short_options); + if (do_exit) { + exit(ret); + } +} + +static int connect_psample_socket(struct nl_sock **sock) +{ + unsigned int psample_packet_mcgroup; + int error; + + error = nl_lookup_genl_family(PSAMPLE_GENL_NAME , &psample_family); + if (error) { + VLOG_ERR("PSAMPLE_GENL_NAME not found: %s", ovs_strerror(error)); + return error; + } + + error = nl_lookup_genl_mcgroup(PSAMPLE_GENL_NAME, + PSAMPLE_NL_MCGRP_SAMPLE_NAME, + &psample_packet_mcgroup); + if (error) { + VLOG_ERR("psample packet multicast group not found: %s", + ovs_strerror(error)); + return error; + } + + error = nl_sock_create(NETLINK_GENERIC, sock); + if (error) { + VLOG_ERR("cannot create netlink socket: %s ", ovs_strerror(error)); + return error; + } + + nl_sock_listen_all_nsid(*sock, true); + + error = nl_sock_join_mcgroup(*sock, psample_packet_mcgroup); + if (error) { + nl_sock_destroy(*sock); + *sock = NULL; + VLOG_ERR("cannot join 
psample multicast group: %s", + ovs_strerror(error)); + return error; + } + return 0; +} + +/* Internal representation of a sample. */ +struct sample { + struct dp_packet packet; + uint32_t group_id; + uint32_t rate; + uint32_t obs_domain_id; + uint32_t obs_point_id; + bool has_cookie; +}; + +static inline void +sample_clear(struct sample *sample) +{ + sample->group_id = 0; + sample->obs_domain_id = 0; + sample->obs_point_id = 0; + sample->has_cookie = false; + dp_packet_clear(&sample->packet); +} + +static int +parse_psample(struct ofpbuf *buf, struct sample *sample) +{ + static const struct nl_policy psample_packet_policy[] = { + [PSAMPLE_ATTR_SAMPLE_GROUP] = { .type = NL_A_U32 }, + [PSAMPLE_ATTR_SAMPLE_RATE] = { .type = NL_A_U32 }, + [PSAMPLE_ATTR_DATA] = { .type = NL_A_UNSPEC, + .optional = true }, + [PSAMPLE_ATTR_USER_COOKIE] = { .type = NL_A_UNSPEC, + .optional = true }, + }; + + struct ofpbuf b = ofpbuf_const_initializer(buf->data, buf->size); + struct nlmsghdr *nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg); + struct genlmsghdr *genl = ofpbuf_try_pull(&b, sizeof *genl); + struct nlattr *attr; + + struct nlattr *a[ARRAY_SIZE(psample_packet_policy)]; + if (!nlmsg || !genl + || !nl_policy_parse(&b, 0, psample_packet_policy, a, + ARRAY_SIZE(psample_packet_policy))) { + return EINVAL; + } + + attr = a[PSAMPLE_ATTR_DATA]; + if (attr) { + dp_packet_push(&sample->packet, nl_attr_get(attr), + nl_attr_get_size(attr)); + } + + sample->group_id = nl_attr_get_u32(a[PSAMPLE_ATTR_SAMPLE_GROUP]); + sample->rate = nl_attr_get_u32(a[PSAMPLE_ATTR_SAMPLE_RATE]); + + attr = a[PSAMPLE_ATTR_USER_COOKIE]; + if (attr && nl_attr_get_size(attr) == + sizeof sample->obs_domain_id + sizeof sample->obs_point_id) { + const ovs_be32 *data = nl_attr_get(attr); + + sample->has_cookie = true; + sample->obs_domain_id = ntohl(*data++); + sample->obs_point_id = ntohl(*data); + } + return 0; +} + +static void run(struct nl_sock *sock) +{ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 
10); + struct sample sample = {}; + int error; + + dp_packet_init(&sample.packet, 1500); + + fprintf(stdout, "Listening for psample events\n"); + fflush(stdout); + + for (;;) { + uint64_t buf_stub[4096 / 8]; + struct ofpbuf buf; + + sample_clear(&sample); + + ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub); + error = nl_sock_recv(sock, &buf, NULL, true); + + if (error == ENOBUFS) { + fprintf(stderr, "[missed events]\n"); + continue; + } else if (error == EAGAIN) { + continue; + } else if (error) { + VLOG_ERR_RL(&rl, "error reading samples: %i", error); + continue; + } + + error = parse_psample(&buf, &sample); + if (error) { + VLOG_ERR_RL(&rl, "error parsing samples: %i", error); + continue; + } + + if (!has_filter || sample.group_id == group_id) { + fprintf(stdout, "group_id=0x%"PRIx32",prob=%"PRIu32" ", + sample.group_id, sample.rate); + if (sample.has_cookie) { + fprintf(stdout, + "obs_domain=0x%"PRIx32",obs_point=0x%"PRIx32" ", + sample.obs_domain_id, sample.obs_point_id); + } + ofp_print_dp_packet(stdout, &sample.packet); + } + fflush(stdout); + } +} + +static void +test_psample_main(int argc, char *argv[]) +{ + struct nl_sock *sock; + int error; + + parse_options(argc, argv); + + if (argc - optind > 1) { + ovs_fatal(0, "at most one positional argument supported " + "(use --help for help)"); + } else if (argc - optind == 1) { + if (!str_to_uint(argv[optind], 10, &group_id)) { + ovs_fatal(0, "invalid group id"); + } + has_filter = true; + } + + error = connect_psample_socket(&sock); + if (error) { + ovs_fatal(error, "failed to connect to psample socket"); + } + + run(sock); +} + +OVSTEST_REGISTER("test-psample", test_psample_main); diff --git a/tests/test-rculist.c b/tests/test-rculist.c new file mode 100644 index 00000000000..07a6338b862 --- /dev/null +++ b/tests/test-rculist.c @@ -0,0 +1,235 @@ +/* + * Copyright (c) 2023 Red Hat, Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#undef NDEBUG +#include + +#include "openvswitch/list.h" +#include "ovstest.h" +#include "ovs-thread.h" +#include "random.h" +#include "rculist.h" +#include "util.h" + +enum { MAX_ELEMS = 10, MAX_CHECKS = 200 }; + +/* Sample list element. */ +struct element { + int value; + struct rculist node; +}; + +static void +do_usleep(unsigned int usecs) +{ +#ifdef _WIN32 + Sleep(MAX(usecs / 1000, 1)); +#else + usleep(usecs); +#endif +} + +/* Continuously check the integrity of the list until it's empty. */ +static void * +checker_main(void *aux) +{ + struct rculist *list = (struct rculist *) aux; + struct element *elem; + bool checked = false; + + for (int i = 0; i < MAX_CHECKS; i++) { + int value = -1; + + RCULIST_FOR_EACH (elem, node, list) { + ovs_assert(value <= elem->value); + ovs_assert(elem->value < MAX_ELEMS); + value = elem->value; + if (!checked) { + checked = true; + } + do_usleep(10); + } + + ovsrcu_quiesce(); + + if (checked && rculist_is_empty(list)) { + break; + } + } + return NULL; +} + +/* Run test while a thread checks the integrity of the list. + * Tests must end up emptying the list. 
*/ +static void +run_test_while_checking(void (*function)(struct rculist *list)) +{ + struct rculist list; + pthread_t checker; + + rculist_init(&list); + + checker = ovs_thread_create("checker", checker_main, &list); + function(&list); + + ovs_assert(rculist_is_empty(&list)); + ovsrcu_quiesce(); + xpthread_join(checker, NULL); + printf("."); +} + +static void +test_rculist_insert_delete__(struct rculist *list, bool long_version) +{ + struct element *elem; + int value; + + for (int i = 1; i < MAX_ELEMS; i++) { + elem = xmalloc(sizeof *elem); + elem->value = i; + rculist_insert(list, &elem->node); + /* Leave some time for checkers to iterate through. */ + do_usleep(random_range(1000)); + } + + ovsrcu_quiesce(); + + value = MAX_ELEMS; + RCULIST_FOR_EACH_REVERSE_PROTECTED (elem, node, list) { + ovs_assert (elem->value <= value); + value = elem->value; + } + + if (long_version) { + struct element *next; + RCULIST_FOR_EACH_SAFE_PROTECTED (elem, next, node, list) { + rculist_remove(&elem->node); + ovsrcu_postpone(free, elem); + /* Leave some time for checkers to iterate through. */ + do_usleep(random_range(1000)); + } + } else { + RCULIST_FOR_EACH_SAFE_PROTECTED (elem, node, list) { + rculist_remove(&elem->node); + ovsrcu_postpone(free, elem); + /* Leave some time for checkers to iterate through. */ + do_usleep(random_range(1000)); + } + } +} + +static void +test_rculist_insert_delete(struct rculist *list) +{ + test_rculist_insert_delete__(list, false); +} + +static void +test_rculist_insert_delete_long(struct rculist *list) +{ + test_rculist_insert_delete__(list, true); +} + +static void +test_rculist_push_front_pop_back(struct rculist *list) +{ + struct element *elem; + + for (int i = MAX_ELEMS - 1; i > 0; i--) { + elem = xmalloc(sizeof *elem); + elem->value = i; + rculist_push_front(list, &elem->node); + /* Leave some time for checkers to iterate through. 
*/ + do_usleep(random_range(1000)); + } + + ovsrcu_quiesce(); + + while (!rculist_is_empty(list)) { + elem = CONTAINER_OF(rculist_pop_back(list), struct element, node); + ovsrcu_postpone(free, elem); + /* Leave some time for checkers to iterate through. */ + do_usleep(random_range(1000)); + } +} + +static void +test_rculist_push_back_pop_front(struct rculist *list) +{ + struct element *elem; + + for (int i = 0; i < MAX_ELEMS; i++) { + elem = xmalloc(sizeof *elem); + elem->value = i; + rculist_push_back(list, &elem->node); + /* Leave some time for checkers to iterate through. */ + do_usleep(random_range(1000)); + } + + ovsrcu_quiesce(); + + while (!rculist_is_empty(list)) { + elem = CONTAINER_OF(rculist_pop_front(list), struct element, node); + ovsrcu_postpone(free, elem); + /* Leave some time for checkers to iterate through. */ + do_usleep(random_range(1000)); + } +} + +static void +test_rculist_splice(struct rculist *list) +{ + struct element *elem; + struct rculist other; + + rculist_init(&other); + + /* Insert elements in list by splicing an intermediate rculist. */ + for (int i = 0; i < MAX_ELEMS; i++) { + elem = xmalloc(sizeof *elem); + elem->value = i; + rculist_insert(&other, &elem->node); + rculist_splice_hidden(list, rculist_next_protected(&other), &other); + rculist_init(&other); + /* Leave some time for checkers to iterate through. */ + do_usleep(random_range(1000)); + } + + ovsrcu_quiesce(); + + ovs_assert(rculist_size(list) == MAX_ELEMS); + ovs_assert(rculist_is_empty(&other)); + while (!rculist_is_empty(list)) { + elem = CONTAINER_OF(rculist_pop_front(list), struct element, node); + ovsrcu_postpone(free, elem); + /* Leave some time for checkers to iterate through. 
*/ + do_usleep(random_range(1000)); + } +} + +static void +test_rculist_main(int argc OVS_UNUSED, char *argv[] OVS_UNUSED) +{ + run_test_while_checking(test_rculist_insert_delete); + run_test_while_checking(test_rculist_insert_delete_long); + run_test_while_checking(test_rculist_push_back_pop_front); + run_test_while_checking(test_rculist_push_front_pop_back); + run_test_while_checking(test_rculist_splice); + printf("\n"); +} + +OVSTEST_REGISTER("test-rculist", test_rculist_main); diff --git a/tests/test-rstp.c b/tests/test-rstp.c index 01aeaf84783..707ee3a6c8a 100644 --- a/tests/test-rstp.c +++ b/tests/test-rstp.c @@ -107,6 +107,8 @@ send_bpdu(struct dp_packet *pkt, void *port_, void *b_) dp_packet_delete(pkt); } +#define RSTP_PORT_PATH_COST_100M 200000 + static struct bridge * new_bridge(struct test_case *tc, int id) { @@ -122,6 +124,7 @@ new_bridge(struct test_case *tc, int id) for (i = 1; i < MAX_PORTS; i++) { p = rstp_add_port(b->rstp); rstp_port_set_aux(p, p); + rstp_port_set_path_cost(p, RSTP_PORT_PATH_COST_100M); rstp_port_set_state(p, RSTP_DISABLED); rstp_port_set_mac_operational(p, true); } @@ -466,6 +469,8 @@ test_rstp_main(int argc, char *argv[]) vlog_set_pattern(VLF_CONSOLE, "%c|%p|%m"); vlog_set_levels(NULL, VLF_SYSLOG, VLL_OFF); + rstp_init(); + if (argc != 2) { ovs_fatal(0, "usage: test-rstp INPUT.RSTP"); } @@ -544,8 +549,8 @@ test_rstp_main(int argc, char *argv[]) } get_token(); - path_cost = match(":") ? must_get_int() : - RSTP_DEFAULT_PORT_PATH_COST; + path_cost = match(":") ? must_get_int() + : RSTP_PORT_PATH_COST_100M; if (port_no < bridge->n_ports) { /* Enable port. 
*/ reinitialize_port(p); diff --git a/tests/test-sflow.c b/tests/test-sflow.c index 460d4d6c54d..3c617bdd168 100644 --- a/tests/test-sflow.c +++ b/tests/test-sflow.c @@ -709,7 +709,7 @@ test_sflow_main(int argc, char *argv[]) } daemon_save_fd(STDOUT_FILENO); - daemonize_start(false); + daemonize_start(false, false); error = unixctl_server_create(NULL, &server); if (error) { diff --git a/tests/test-stream.c b/tests/test-stream.c index 68ce2c5442f..14e3bfe381d 100644 --- a/tests/test-stream.c +++ b/tests/test-stream.c @@ -19,6 +19,7 @@ #include "fatal-signal.h" #include "openvswitch/vlog.h" #include "stream.h" +#include "stream-ssl.h" #include "util.h" VLOG_DEFINE_THIS_MODULE(test_stream); @@ -33,7 +34,16 @@ main(int argc, char *argv[]) set_program_name(argv[0]); if (argc < 2) { - ovs_fatal(0, "usage: %s REMOTE", argv[0]); + ovs_fatal(0, "usage: %s REMOTE [SSL_KEY] [SSL_CERT] [SSL_CA]", + argv[0]); + } + if (strncmp("ssl:", argv[1], 4) == 0) { + if (argc < 5) { + ovs_fatal(0, "usage with ssl: %s REMOTE SSL_KEY SSL_CERT SSL_CA", + argv[0]); + } + stream_ssl_set_ca_cert_file(argv[4], false); + stream_ssl_set_key_and_cert(argv[2], argv[3]); } error = stream_open_block(stream_open(argv[1], &stream, DSCP_DEFAULT), diff --git a/tests/test-stream.py b/tests/test-stream.py index 93d63c019b3..a6a9c18b24b 100644 --- a/tests/test-stream.py +++ b/tests/test-stream.py @@ -15,10 +15,28 @@ import sys import ovs.stream +import ovs.util def main(argv): + if len(argv) < 2: + ovs.util.ovs_fatal(0, + "usage: %s REMOTE [SSL_KEY] [SSL_CERT] [SSL_CA]", + argv[0], + ) remote = argv[1] + + if remote.startswith("ssl:"): + if len(argv) < 5: + ovs.util.ovs_fatal( + 0, + "usage with ssl: %s REMOTE [SSL_KEY] [SSL_CERT] [SSL_CA]", + argv[0], + ) + ovs.stream.SSLStream.ssl_set_ca_cert_file(argv[4]) + ovs.stream.SSLStream.ssl_set_certificate_file(argv[3]) + ovs.stream.SSLStream.ssl_set_private_key_file(argv[2]) + err, stream = ovs.stream.Stream.open_block( ovs.stream.Stream.open(remote), 10000) diff 
--git a/tests/test-unixctl.c b/tests/test-unixctl.c index 3eadf54cd90..9e89827895a 100644 --- a/tests/test-unixctl.c +++ b/tests/test-unixctl.c @@ -83,7 +83,7 @@ test_unixctl_main(int argc, char *argv[]) fatal_ignore_sigpipe(); parse_options(&argc, &argv, &unixctl_path); - daemonize_start(false); + daemonize_start(false, false); int retval = unixctl_server_create(unixctl_path, &unixctl); if (retval) { exit(EXIT_FAILURE); diff --git a/tests/test-util.c b/tests/test-util.c index 7d899fbbfd9..5d88d38f26a 100644 --- a/tests/test-util.c +++ b/tests/test-util.c @@ -1116,12 +1116,16 @@ test_snprintf(struct ovs_cmdl_context *ctx OVS_UNUSED) { char s[16]; + /* GCC 7+ and Clang 18+ warn about the following calls that truncate + * a string using snprintf(). We're testing that truncation works + * properly, so temporarily disable the warning. */ #if __GNUC__ >= 7 - /* GCC 7+ warns about the following calls that truncate a string using - * snprintf(). We're testing that truncation works properly, so - * temporarily disable the warning. 
*/ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wformat-truncation" +#endif +#if __clang_major__ >= 18 +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wformat-truncation" #endif ovs_assert(snprintf(s, 4, "abcde") == 5); ovs_assert(!strcmp(s, "abc")); @@ -1130,6 +1134,9 @@ test_snprintf(struct ovs_cmdl_context *ctx OVS_UNUSED) ovs_assert(!strcmp(s, "abcd")); #if __GNUC__ >= 7 #pragma GCC diagnostic pop +#endif +#if __clang_major__ >= 18 +#pragma clang diagnostic pop #endif ovs_assert(snprintf(s, 6, "abcde") == 5); diff --git a/tests/test-vconn.c b/tests/test-vconn.c index fc8ce4a2c0e..96c89bd4e68 100644 --- a/tests/test-vconn.c +++ b/tests/test-vconn.c @@ -157,6 +157,7 @@ test_refuse_connection(struct ovs_cmdl_context *ctx) error = vconn_connect_block(vconn, (TIMEOUT - 2) * 1000); if (!strcmp(type, "tcp")) { if (error != ECONNRESET && error != EPIPE && error != ETIMEDOUT + && error != ECONNREFUSED #ifdef _WIN32 && error != WSAECONNRESET #endif diff --git a/tests/testsuite.at b/tests/testsuite.at index cf4e3eadfb5..9d77a9f512e 100644 --- a/tests/testsuite.at +++ b/tests/testsuite.at @@ -77,3 +77,4 @@ m4_include([tests/packet-type-aware.at]) m4_include([tests/nsh.at]) m4_include([tests/drop-stats.at]) m4_include([tests/pytest.at]) +m4_include([tests/learning-switch.at]) diff --git a/tests/tunnel-push-pop-ipv6.at b/tests/tunnel-push-pop-ipv6.at index c96b77cd15f..3edec5fbca1 100644 --- a/tests/tunnel-push-pop-ipv6.at +++ b/tests/tunnel-push-pop-ipv6.at @@ -1,5 +1,92 @@ AT_BANNER([tunnel_push_pop_ipv6]) +AT_SETUP([tunnel_push_pop_ipv6 - srv6]) + +OVS_VSWITCHD_START([add-port br0 p0 -- set Interface p0 type=dummy ofport_request=1 other-config:hwaddr=aa:55:aa:55:00:00 options:pcap=p0.pcap]) +AT_CHECK([ovs-vsctl add-br int-br1 -- set bridge int-br1 datapath_type=dummy], [0]) +AT_CHECK([ovs-vsctl add-br int-br2 -- set bridge int-br2 datapath_type=dummy], [0]) +AT_CHECK([ovs-vsctl add-br int-br3 -- set bridge int-br3 datapath_type=dummy], 
[0]) +AT_CHECK([ovs-vsctl add-port int-br1 t1 -- set Interface t1 type=srv6 \ + options:remote_ip=2001:cafe::91 ofport_request=2 \ + options:srv6_flowlabel=copy \ + ], [0]) +AT_CHECK([ovs-vsctl add-port int-br2 t2 -- set Interface t2 type=srv6 \ + options:remote_ip=2001:cafe::92 ofport_request=3 \ + options:srv6_flowlabel=zero \ + ], [0]) +AT_CHECK([ovs-vsctl add-port int-br3 t3 -- set Interface t3 type=srv6 \ + options:remote_ip=2001:cafe::93 ofport_request=4 \ + options:srv6_flowlabel=compute \ + ], [0]) + +dnl Setup dummy interface IP address. +AT_CHECK([ovs-appctl netdev-dummy/ip6addr br0 2001:cafe::88/24], [0], [OK +]) +dnl Checking that local routes for added IPs were successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached], [0], [dnl +Cached: 2001:ca00::/24 dev br0 SRC 2001:cafe::88 local +]) +AT_CHECK([ovs-appctl tnl/neigh/set br0 2001:cafe::91 aa:55:aa:55:00:01], [0], [OK +]) +AT_CHECK([ovs-appctl tnl/neigh/set br0 2001:cafe::92 aa:55:aa:55:00:02], [0], [OK +]) +AT_CHECK([ovs-appctl tnl/neigh/set br0 2001:cafe::93 aa:55:aa:55:00:03], [0], [OK +]) +AT_CHECK([ovs-ofctl add-flow br0 action=1]) +AT_CHECK([ovs-ofctl add-flow int-br1 action=2]) +AT_CHECK([ovs-ofctl add-flow int-br2 action=3]) +AT_CHECK([ovs-ofctl add-flow int-br3 action=4]) + +dnl Check "srv6_flowlabel=copy". 
+AT_CHECK([ovs-appctl netdev-dummy/receive int-br1 'in_port(2),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x0800),ipv4(src=10.0.0.1,dst=10.0.0.2,proto=6,tos=0,ttl=64,frag=no),tcp(src=100,dst=200),tcp_flags(0x001)']) +AT_CHECK([ovs-appctl netdev-dummy/receive int-br1 'in_port(2),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x0800),ipv4(src=10.0.0.1,dst=10.0.0.3,proto=6,tos=0,ttl=64,frag=no),tcp(src=100,dst=200),tcp_flags(0x001)']) +AT_CHECK([ovs-appctl netdev-dummy/receive int-br1 'in_port(2),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x86dd),ipv6(src=2001:beef::1,dst=2001:beef::2,label=2,proto=6,tclass=0x0,hlimit=64),tcp(src=100,dst=200),tcp_flags(0x001)']) +AT_CHECK([ovs-appctl netdev-dummy/receive int-br1 'in_port(2),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x86dd),ipv6(src=2001:beef::1,dst=2001:beef::3,label=3,proto=6,tclass=0x0,hlimit=64),tcp(src=100,dst=200),tcp_flags(0x001)']) +AT_CHECK([ovs-ofctl parse-pcap p0.pcap | tail -n 4 | grep -o 'ipv6_label=0x[[0-9a-f]]*' | sort], [0], [dnl +ipv6_label=0x00000 +ipv6_label=0x00000 +ipv6_label=0x00002 +ipv6_label=0x00003 +]) + +dnl Check "srv6_flowlabel=zero". 
+AT_CHECK([ovs-appctl netdev-dummy/receive int-br2 'in_port(3),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x0800),ipv4(src=10.0.0.1,dst=10.0.0.2,proto=6,tos=0,ttl=64,frag=no),tcp(src=100,dst=200),tcp_flags(0x001)']) +AT_CHECK([ovs-appctl netdev-dummy/receive int-br2 'in_port(3),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x0800),ipv4(src=10.0.0.1,dst=10.0.0.3,proto=6,tos=0,ttl=64,frag=no),tcp(src=100,dst=200),tcp_flags(0x001)']) +AT_CHECK([ovs-appctl netdev-dummy/receive int-br2 'in_port(3),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x86dd),ipv6(src=2001:beef::1,dst=2001:beef::2,label=2,proto=6,tclass=0x0,hlimit=64),tcp(src=100,dst=200),tcp_flags(0x001)']) +AT_CHECK([ovs-appctl netdev-dummy/receive int-br2 'in_port(3),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x86dd),ipv6(src=2001:beef::1,dst=2001:beef::3,label=3,proto=6,tclass=0x0,hlimit=64),tcp(src=100,dst=200),tcp_flags(0x001)']) +AT_CHECK([ovs-ofctl parse-pcap p0.pcap | tail -n 4 | grep -o 'ipv6_label=0x[[0-9a-f]]*'], [0], [dnl +ipv6_label=0x00000 +ipv6_label=0x00000 +ipv6_label=0x00000 +ipv6_label=0x00000 +]) + +dnl Check "srv6_flowlabel=compute" for different flows. 
+AT_CHECK([ovs-appctl netdev-dummy/receive int-br3 'in_port(4),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x0800),ipv4(src=10.0.0.1,dst=10.0.0.2,proto=6,tos=0,ttl=64,frag=no),tcp(src=100,dst=200),tcp_flags(0x001)']) +AT_CHECK([ovs-appctl netdev-dummy/receive int-br3 'in_port(4),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x0800),ipv4(src=10.0.0.1,dst=10.0.0.3,proto=6,tos=0,ttl=64,frag=no),tcp(src=100,dst=200),tcp_flags(0x001)']) +AT_CHECK([ovs-appctl netdev-dummy/receive int-br3 'in_port(4),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x86dd),ipv6(src=2001:beef::1,dst=2001:beef::2,label=2,proto=6,tclass=0x0,hlimit=64),tcp(src=100,dst=200),tcp_flags(0x001)']) +AT_CHECK([ovs-appctl netdev-dummy/receive int-br3 'in_port(4),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x86dd),ipv6(src=2001:beef::1,dst=2001:beef::3,label=3,proto=6,tclass=0x0,hlimit=64),tcp(src=100,dst=200),tcp_flags(0x001)']) +AT_CHECK([ovs-ofctl parse-pcap p0.pcap | tail -n 4 | grep -o 'ipv6_label=0x[[0-9a-f]]*'| sort | uniq -c | wc -l], [0], [dnl +4 +]) + +dnl Check "srv6_flowlabel=compute" for same IPv4/TCP flow. +AT_CHECK([ovs-appctl netdev-dummy/receive int-br3 'in_port(4),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x0800),ipv4(src=10.0.0.1,dst=10.0.0.2,proto=6,tos=0,ttl=64,frag=no),tcp(src=100,dst=200),tcp_flags(0x001)']) +AT_CHECK([ovs-appctl netdev-dummy/receive int-br3 'in_port(4),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x0800),ipv4(src=10.0.0.1,dst=10.0.0.2,proto=6,tos=0,ttl=64,frag=no),tcp(src=100,dst=200),tcp_flags(0x002)']) +AT_CHECK([ovs-ofctl parse-pcap p0.pcap | tail -n 2 | grep -o 'ipv6_label=0x[[0-9a-f]]*' | sort | uniq -c | wc -l], [0], [dnl +1 +]) + +dnl Check "srv6_flowlabel=compute" for same IPv6/TCP flow. 
+AT_CHECK([ovs-appctl netdev-dummy/receive int-br3 'in_port(4),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x86dd),ipv6(src=2001:beef::1,dst=2001:beef::2,label=2,proto=6,tclass=0x0,hlimit=64),tcp(src=100,dst=200),tcp_flags(0x001)']) +AT_CHECK([ovs-appctl netdev-dummy/receive int-br3 'in_port(4),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x86dd),ipv6(src=2001:beef::1,dst=2001:beef::2,label=3,proto=6,tclass=0x0,hlimit=64),tcp(src=100,dst=200),tcp_flags(0x002)']) +AT_CHECK([ovs-ofctl parse-pcap p0.pcap | tail -n 2 | grep -o 'ipv6_label=0x[[0-9a-f]]*' | sort | uniq -c | wc -l], [0], [dnl +1 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + AT_SETUP([tunnel_push_pop_ipv6 - ip6gre]) OVS_VSWITCHD_START([add-port br0 p0 -- set Interface p0 type=dummy ofport_request=1 other-config:hwaddr=aa:55:aa:55:00:00]) @@ -19,13 +106,15 @@ dummy@ovs-dummy: hit:0 missed:0 t2 2/6: (ip6gre: remote_ip=2001:cafe::92) ]) -dnl First setup dummy interface IP address, then add the route -dnl so that tnl-port table can get valid IP address for the device. +dnl Setup dummy interface IP addresses. AT_CHECK([ovs-appctl netdev-dummy/ip6addr br0 2001:cafe::88/24], [0], [OK ]) AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 1.1.2.88/24], [0], [OK ]) -AT_CHECK([ovs-appctl ovs/route/add 2001:cafe::92/24 br0], [0], [OK +dnl Checking that local routes for added IPs were successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached | sort], [0], [dnl +Cached: 1.1.2.0/24 dev br0 SRC 1.1.2.88 local +Cached: 2001:ca00::/24 dev br0 SRC 2001:cafe::88 local ]) AT_CHECK([ovs-ofctl add-flow br0 action=normal]) @@ -93,13 +182,15 @@ dummy@ovs-dummy: hit:0 missed:0 t3 3/6: (ip6erspan: erspan_dir=1, erspan_hwid=0x7, erspan_ver=2, key=567, remote_ip=2001:cafe::93) ]) -dnl First setup dummy interface IP address, then add the route -dnl so that tnl-port table can get valid IP address for the device. +dnl Setup dummy interface IP addresses. 
AT_CHECK([ovs-appctl netdev-dummy/ip6addr br0 2001:cafe::88/24], [0], [OK ]) AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 1.1.2.88/24], [0], [OK ]) -AT_CHECK([ovs-appctl ovs/route/add 2001:cafe::92/24 br0], [0], [OK +dnl Checking that local routes for added IPs were successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached | sort], [0], [dnl +Cached: 1.1.2.0/24 dev br0 SRC 1.1.2.88 local +Cached: 2001:ca00::/24 dev br0 SRC 2001:cafe::88 local ]) AT_CHECK([ovs-ofctl add-flow br0 action=normal]) @@ -202,6 +293,8 @@ AT_CHECK([ovs-vsctl add-port int-br t2 -- set Interface t2 type=vxlan \ options:remote_ip=flow options:key=123 ofport_request=5\ -- add-port int-br t5 -- set Interface t5 type=gre \ options:remote_ip=2001:cafe::92 options:key=455 options:packet_type=legacy_l3 ofport_request=6\ + -- add-port int-br t6 -- set Interface t6 type=srv6 \ + options:remote_ip=2001:cafe::92 ofport_request=7\ ], [0]) AT_CHECK([ovs-appctl dpif/show], [0], [dnl @@ -216,23 +309,27 @@ dummy@ovs-dummy: hit:0 missed:0 t3 4/4789: (vxlan: csum=true, out_key=flow, remote_ip=2001:cafe::93) t4 5/6081: (geneve: key=123, remote_ip=flow) t5 6/3: (gre: key=455, packet_type=legacy_l3, remote_ip=2001:cafe::92) + t6 7/6: (srv6: remote_ip=2001:cafe::92) ]) AT_CHECK([ovs-appctl tnl/ports/show |sort], [0], [dnl Listening ports: genev_sys_6081 (6081) ref_cnt=1 gre_sys (3) ref_cnt=2 +srv6_sys (6) ref_cnt=1 +srv6_sys (6) ref_cnt=1 vxlan_sys_4789 (4789) ref_cnt=2 ]) - -dnl First setup dummy interface IP address, then add the route -dnl so that tnl-port table can get valid IP address for the device. +dnl Setup dummy interface IP addresses. AT_CHECK([ovs-appctl netdev-dummy/ip6addr br0 2001:cafe::88/24], [0], [OK ]) AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 1.1.2.88/24], [0], [OK ]) -AT_CHECK([ovs-appctl ovs/route/add 2001:cafe::92/24 br0], [0], [OK +dnl Checking that local routes for added IPs were successfully installed. 
+AT_CHECK([ovs-appctl ovs/route/show | grep Cached | sort], [0], [dnl +Cached: 1.1.2.0/24 dev br0 SRC 1.1.2.88 local +Cached: 2001:ca00::/24 dev br0 SRC 2001:cafe::88 local ]) AT_CHECK([ovs-ofctl add-flow br0 action=normal]) @@ -363,6 +460,8 @@ AT_CHECK([ovs-appctl tnl/ports/show |sort], [0], [dnl Listening ports: genev_sys_6081 (6081) ref_cnt=1 gre_sys (3) ref_cnt=2 +srv6_sys (6) ref_cnt=1 +srv6_sys (6) ref_cnt=1 vxlan_sys_4789 (4789) ref_cnt=2 ]) @@ -384,6 +483,12 @@ AT_CHECK([tail -1 stdout], [0], [Datapath actions: tnl_pop(6081) ]) +dnl Check SRv6 tunnel pop +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x86dd),ipv6(src=2001:cafe::92,dst=2001:cafe::88,label=0,proto=4,tclass=0x0,hlimit=64)'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], + [Datapath actions: tnl_pop(6) +]) + dnl Check VXLAN tunnel push AT_CHECK([ovs-ofctl add-flow int-br action=2]) AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(2),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:01),eth_type(0x0800),ipv4(src=1.1.3.88,dst=1.1.3.112,proto=47,tos=0,ttl=64,frag=no)'], [0], [stdout]) @@ -405,6 +510,13 @@ AT_CHECK([tail -1 stdout], [0], [Datapath actions: tnl_push(tnl_port(3),header(size=62,type=109,eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x86dd),ipv6(src=2001:cafe::88,dst=2001:cafe::92,label=0,proto=47,tclass=0x0,hlimit=64),gre((flags=0x2000,proto=0x6558),key=0x1c8)),out_port(100)),1 ]) +dnl Check SRv6 tunnel push +AT_CHECK([ovs-ofctl add-flow int-br action=7]) +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(2),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:01),eth_type(0x0800),ipv4(src=1.1.3.88,dst=1.1.3.112,proto=47,tos=0,ttl=64,frag=no)'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], + [Datapath actions: 
pop_eth,tnl_push(tnl_port(6),header(size=78,type=112,eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x86dd),ipv6(src=2001:cafe::88,dst=2001:cafe::92,label=0,proto=43,tclass=0x0,hlimit=64),srv6(segments_left=0,segs(2001:cafe::92))),out_port(100)),1 +]) + dnl Check Geneve tunnel push AT_CHECK([ovs-ofctl add-flow int-br "actions=set_field:2001:cafe::92->tun_ipv6_dst,5"]) AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(2),eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:01),eth_type(0x0800),ipv4(src=1.1.3.88,dst=1.1.3.112,proto=47,tos=0,ttl=64,frag=no)'], [0], [stdout]) @@ -459,7 +571,7 @@ AT_CHECK([ovs-ofctl dump-ports int-br | grep 'port 5'], [0], [dnl port 5: rx pkts=1, bytes=98, drop=?, errs=?, frame=?, over=?, crc=? ]) AT_CHECK([ovs-appctl dpif/dump-flows int-br | grep 'in_port(6081)'], [0], [dnl -tunnel(tun_id=0x7b,ipv6_src=2001:cafe::92,ipv6_dst=2001:cafe::88,geneve({class=0xffff,type=0x80,len=4,0xa/0xf}{class=0xffff,type=0,len=4}),flags(-df-csum+key)),recirc_id(0),in_port(6081),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), packets:0, bytes:0, used:never, actions:userspace(pid=0,controller(reason=1,dont_send=0,continuation=0,recirc_id=3,rule_cookie=0,controller_id=0,max_len=65535)) +recirc_id(0),tunnel(tun_id=0x7b,ipv6_src=2001:cafe::92,ipv6_dst=2001:cafe::88,geneve({class=0xffff,type=0x80,len=4,0xa/0xf}{class=0xffff,type=0,len=4}),flags(-df-csum+key)),in_port(6081),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), packets:0, bytes:0, used:never, actions:userspace(pid=0,controller(reason=1,dont_send=0,continuation=0,recirc_id=3,rule_cookie=0,controller_id=0,max_len=65535)) ]) dnl Receive VXLAN with different MAC and verify that the neigh cache gets updated @@ -498,6 +610,15 @@ AT_CHECK([ovs-appctl tnl/arp/show | tail -n+3 | sort], [0], [dnl 2001:cafe::93 f8:bc:12:44:34:b7 br0 ]) +dnl Disable checksum from VXLAN port. 
+AT_CHECK([ovs-vsctl set Interface t3 options:csum=false]) +AT_CHECK([ovs-ofctl del-flows int-br]) +AT_CHECK([ovs-ofctl add-flow int-br action=4]) +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(2),eth(src=36:b1:ee:7c:01:01,dst=36:b1:ee:7c:01:02),eth_type(0x0800),ipv4(src=1.1.3.88,dst=1.1.3.112,proto=47,tos=0,ttl=64,frag=no)'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], + [Datapath actions: tnl_push(tnl_port(4789),header(size=70,type=4,eth(dst=f8:bc:12:44:34:b7,src=aa:55:aa:55:00:00,dl_type=0x86dd),ipv6(src=2001:cafe::88,dst=2001:cafe::93,label=0,proto=17,tclass=0x0,hlimit=64),udp(src=0,dst=4789,csum=0x0),vxlan(flags=0x8000000,vni=0x0)),out_port(100)),1 +]) + ovs-appctl time/warp 10000 AT_CHECK([ovs-vsctl del-port int-br t3 \ @@ -510,6 +631,8 @@ AT_CHECK([ovs-appctl tnl/ports/show |sort], [0], [dnl Listening ports: genev_sys_6081 (6081) ref_cnt=1 gre_sys (3) ref_cnt=1 +srv6_sys (6) ref_cnt=1 +srv6_sys (6) ref_cnt=1 vxlan_sys_4789 (4789) ref_cnt=1 vxlan_sys_4790 (4790) ref_cnt=1 ]) @@ -518,6 +641,7 @@ AT_CHECK([ovs-vsctl del-port int-br t1 \ -- del-port int-br t2 \ -- del-port int-br t4 \ -- del-port int-br t5 \ + -- del-port int-br t6 \ ], [0]) dnl Check tunnel lookup entries after deleting all remaining tunnel ports @@ -527,3 +651,177 @@ Listening ports: OVS_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([tunnel_push_pop_ipv6 - local_ip configuration]) + +OVS_VSWITCHD_START( + [add-port br0 p0 \ + -- set Interface p0 type=dummy ofport_request=1 \ + other-config:hwaddr=aa:55:aa:55:00:00]) +AT_CHECK([ovs-appctl vlog/set dpif_netdev:dbg]) +AT_CHECK([ovs-vsctl add-br int-br -- set bridge int-br datapath_type=dummy]) +AT_CHECK([ovs-vsctl add-port int-br t2 \ + -- set Interface t2 type=geneve \ + options:local_ip=2001:beef::88 \ + options:remote_ip=2001:cafe::92 \ + options:key=123 ofport_request=2]) + +dnl Setup multiple IP addresses. 
+AT_CHECK([ovs-appctl netdev-dummy/ip6addr br0 2001:cafe::88/64], [0], [OK +]) +AT_CHECK([ovs-appctl netdev-dummy/ip6addr br0 2001:beef::88/64], [0], [OK +]) +dnl Checking that a local route for added IP was successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached | sort], [0], [dnl +Cached: 2001:beef::/64 dev br0 SRC 2001:beef::88 local +Cached: 2001:cafe::/64 dev br0 SRC 2001:cafe::88 local +]) +AT_CHECK([ovs-ofctl add-flow br0 action=normal]) +AT_CHECK([ovs-ofctl add-flow int-br action=normal]) + +dnl This Neighbor Advertisement from p0 has two effects: +dnl 1. The neighbor cache will learn that 2001:cafe::92 is at f8:bc:12:44:34:b6. +dnl 2. The br0 mac learning will learn that f8:bc:12:44:34:b6 is on p0. +AT_CHECK([ovs-appctl netdev-dummy/receive p0 dnl + 'recirc_id(0),in_port(1),dnl + eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x86dd),dnl + ipv6(src=2001:cafe::92,dst=2001:cafe::88,label=0,proto=58,tclass=0,hlimit=255,frag=no),dnl + icmpv6(type=136,code=0),dnl + nd(target=2001:cafe::92,sll=00:00:00:00:00:00,tll=f8:bc:12:44:34:b6)' +]) + +dnl Check that local_ip is used for encapsulation in the trace. +AT_CHECK([ovs-appctl ofproto/trace int-br in_port=LOCAL \ + | grep -E 'tunnel|actions'], [0], [dnl + -> output to native tunnel + -> tunneling to 2001:cafe::92 via br0 + -> tunneling from aa:55:aa:55:00:00 2001:beef::88 to f8:bc:12:44:34:b6 2001:cafe::92 +Datapath actions: tnl_push(tnl_port(6081),header(size=70,type=5,dnl +eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x86dd),dnl +ipv6(src=2001:beef::88,dst=2001:cafe::92,label=0,proto=17,tclass=0x0,hlimit=64),dnl +udp(src=0,dst=6081,csum=0xffff),geneve(vni=0x7b)),out_port(100)),1 +]) + +dnl Now check that the packet actually has the local_ip in the header. 
+AT_CHECK([ovs-vsctl -- set Interface p0 options:tx_pcap=p0.pcap]) + +packet=50540000000a5054000000091234 +eth=f8bc124434b6aa55aa55000086dd +ip6=60000000001e11402001beef0000000000000000000000882001cafe000000000000000000000092 +dnl Source port is based on a packet hash, so it may differ depending on the +dnl compiler flags and CPU type. Same for UDP checksum. Masked with '....'. +udp=....17c1001e.... +geneve=0000655800007b00 +encap=${eth}${ip6}${udp}${geneve} +dnl Output to tunnel from a int-br internal port. +dnl Checking that the packet arrived and it was correctly encapsulated. +AT_CHECK([ovs-appctl netdev-dummy/receive int-br "${packet}"]) +OVS_WAIT_UNTIL([test $(ovs-pcap p0.pcap | grep -c "${encap}${packet}") -eq 1]) +dnl Sending again to exercise the non-miss upcall path. +AT_CHECK([ovs-appctl netdev-dummy/receive int-br "${packet}"]) +OVS_WAIT_UNTIL([test $(ovs-pcap p0.pcap | grep -c "${encap}${packet}") -eq 2]) + +dnl Finally, checking that the datapath flow also has a local_ip. +AT_CHECK([ovs-appctl dpctl/dump-flows | grep tnl_push \ + | strip_ufid | strip_used], [0], [dnl +recirc_id(0),in_port(2),packet_type(ns=0,id=0),dnl +eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x1234), dnl +packets:1, bytes:14, used:0.0s, dnl +actions:tnl_push(tnl_port(6081),header(size=70,type=5,dnl +eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x86dd),dnl +ipv6(src=2001:beef::88,dst=2001:cafe::92,label=0,proto=17,tclass=0x0,hlimit=64),dnl +udp(src=0,dst=6081,csum=0xffff),geneve(vni=0x7b)),out_port(100)),1 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + +dnl This is a regression test for outer header checksum offloading +dnl with recirculation. 
+AT_SETUP([tunnel_push_pop_ipv6 - recirculation after encapsulation]) + +OVS_VSWITCHD_START( + [add-port br0 p0 \ + -- set Interface p0 type=dummy ofport_request=1 \ + other-config:hwaddr=aa:55:aa:55:00:00]) +AT_CHECK([ovs-appctl vlog/set dpif_netdev:dbg]) +AT_CHECK([ovs-vsctl add-br int-br -- set bridge int-br datapath_type=dummy]) +AT_CHECK([ovs-vsctl add-port int-br t2 \ + -- set Interface t2 type=geneve \ + options:remote_ip=2001:cafe::92 \ + options:key=123 ofport_request=2]) + +dnl Setup an IP address. +AT_CHECK([ovs-appctl netdev-dummy/ip6addr br0 2001:cafe::88/64], [0], [OK +]) +dnl Checking that a local route for added IP was successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached | sort], [0], [dnl +Cached: 2001:cafe::/64 dev br0 SRC 2001:cafe::88 local +]) + +dnl Add a dp-hash selection group. +AT_CHECK([ovs-ofctl add-group br0 \ + 'group_id=1234,type=select,selection_method=dp_hash,bucket=weight=1,output:p0']) +AT_CHECK([ovs-ofctl add-flow br0 in_port=br0,action=group:1234]) +AT_CHECK([ovs-ofctl add-flow br0 in_port=p0,action=normal]) + +AT_CHECK([ovs-ofctl add-flow int-br action=normal]) + +dnl This Neighbor Advertisement from p0 has two effects: +dnl 1. The neighbor cache will learn that 2001:cafe::92 is at f8:bc:12:44:34:b6. +dnl 2. The br0 mac learning will learn that f8:bc:12:44:34:b6 is on p0. +AT_CHECK([ovs-appctl netdev-dummy/receive p0 dnl + 'recirc_id(0),in_port(1),dnl + eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:00),eth_type(0x86dd),dnl + ipv6(src=2001:cafe::92,dst=2001:cafe::88,label=0,proto=58,tclass=0,hlimit=255,frag=no),dnl + icmpv6(type=136,code=0),dnl + nd(target=2001:cafe::92,sll=00:00:00:00:00:00,tll=f8:bc:12:44:34:b6)' +]) + +dnl Check that selection group is used in the trace. 
+AT_CHECK([ovs-appctl ofproto/trace int-br in_port=LOCAL \ + | grep -E 'tunnel|actions'], [0], [dnl + -> output to native tunnel + -> tunneling to 2001:cafe::92 via br0 + -> tunneling from aa:55:aa:55:00:00 2001:cafe::88 to f8:bc:12:44:34:b6 2001:cafe::92 +Datapath actions: tnl_push(tnl_port(6081),header(size=70,type=5,dnl +eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x86dd),dnl +ipv6(src=2001:cafe::88,dst=2001:cafe::92,label=0,proto=17,tclass=0x0,hlimit=64),dnl +udp(src=0,dst=6081,csum=0xffff),geneve(vni=0x7b)),out_port(100)),dnl +hash(l4(0)),recirc(0x1) +]) + +dnl Now check that the packet is actually encapsulated and delivered. +AT_CHECK([ovs-vsctl -- set Interface p0 options:tx_pcap=p0.pcap]) + +packet=50540000000a5054000000091234 +eth=f8bc124434b6aa55aa55000086dd +ip6=60000000001e11402001cafe0000000000000000000000882001cafe000000000000000000000092 +dnl Source port is based on a packet hash, so it may differ depending on the +dnl compiler flags and CPU type. Same for UDP checksum. Masked with '....'. +udp=....17c1001e.... +geneve=0000655800007b00 +encap=${eth}${ip6}${udp}${geneve} +dnl Output to tunnel from a int-br internal port. +dnl Checking that the packet arrived and it was correctly encapsulated. +AT_CHECK([ovs-appctl netdev-dummy/receive int-br "${packet}"]) +OVS_WAIT_UNTIL([test $(ovs-pcap p0.pcap | grep -c "${encap}${packet}") -eq 1]) +dnl Sending again to exercise the non-miss upcall path. +AT_CHECK([ovs-appctl netdev-dummy/receive int-br "${packet}"]) +OVS_WAIT_UNTIL([test $(ovs-pcap p0.pcap | grep -c "${encap}${packet}") -eq 2]) + +dnl Finally, checking that the datapath flow is also correct. 
+AT_CHECK([ovs-appctl dpctl/dump-flows | grep tnl_push \ + | strip_ufid | strip_used], [0], [dnl +recirc_id(0),in_port(2),packet_type(ns=0,id=0),dnl +eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x1234), dnl +packets:1, bytes:14, used:0.0s, dnl +actions:tnl_push(tnl_port(6081),header(size=70,type=5,dnl +eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x86dd),dnl +ipv6(src=2001:cafe::88,dst=2001:cafe::92,label=0,proto=17,tclass=0x0,hlimit=64),dnl +udp(src=0,dst=6081,csum=0xffff),geneve(vni=0x7b)),out_port(100)),dnl +hash(l4(0)),recirc(0x2) +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP diff --git a/tests/tunnel-push-pop.at b/tests/tunnel-push-pop.at index 92eebba2eaa..7ec4c31ab2d 100644 --- a/tests/tunnel-push-pop.at +++ b/tests/tunnel-push-pop.at @@ -30,17 +30,15 @@ dummy@ovs-dummy: hit:0 missed:0 t4 5/3: (erspan: erspan_dir=flow, erspan_hwid=flow, erspan_idx=flow, erspan_ver=flow, key=56, remote_ip=flow) ]) -dnl First setup dummy interface IP address, then add the route -dnl so that tnl-port table can get valid IP address for the device. +dnl Setup dummy interface IP addresses. AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 1.1.2.88/24], [0], [OK ]) AT_CHECK([ovs-appctl netdev-dummy/ip6addr br0 2001:cafe::88/24], [0], [OK ]) - -AT_CHECK([ovs-appctl ovs/route/add 1.1.2.92/24 br0], [0], [OK -]) - -AT_CHECK([ovs-appctl ovs/route/add 1.1.2.92/24 br0 pkt_mark=1234], [0], [OK +dnl Checking that local routes for added IPs were successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached | sort], [0], [dnl +Cached: 1.1.2.0/24 dev br0 SRC 1.1.2.88 local +Cached: 2001:ca00::/24 dev br0 SRC 2001:cafe::88 local ]) AT_CHECK([ovs-ofctl add-flow br0 action=normal]) @@ -237,18 +235,21 @@ dummy@ovs-dummy: hit:0 missed:0 t8 9/2152: (gtpu: key=123, remote_ip=1.1.2.92) ]) -dnl First setup dummy interface IP address, then add the route -dnl so that tnl-port table can get valid IP address for the device. +dnl Setup dummy interface IP addresses. 
AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 1.1.2.88/24], [0], [OK ]) AT_CHECK([ovs-appctl netdev-dummy/ip6addr br0 2001:cafe::88/24], [0], [OK ]) - -AT_CHECK([ovs-appctl ovs/route/add 1.1.2.92/24 br0], [0], [OK -]) - +dnl Add a static route with a mark. AT_CHECK([ovs-appctl ovs/route/add 1.1.2.92/24 br0 pkt_mark=1234], [0], [OK ]) +dnl Checking that local routes for added IPs and the static route with a mark +dnl were successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep br0 | sort], [0], [dnl +Cached: 1.1.2.0/24 dev br0 SRC 1.1.2.88 local +Cached: 2001:ca00::/24 dev br0 SRC 2001:cafe::88 local +User: 1.1.2.0/24 MARK 1234 dev br0 SRC 1.1.2.88 +]) AT_CHECK([ovs-ofctl add-flow br0 action=normal]) @@ -369,6 +370,26 @@ AT_CHECK([ovs-appctl tnl/neigh/show | grep br | sort], [0], [dnl 1.1.2.92 f8:bc:12:44:34:b6 br0 ]) +dnl Receiving Gratuitous ARP request with correct VLAN id should alter tunnel neighbor cache +AT_CHECK([ovs-appctl netdev-dummy/receive p0 'recirc_id(0),in_port(1),eth(src=f8:bc:12:44:34:c8,dst=ff:ff:ff:ff:ff:ff),eth_type(0x8100),vlan(vid=10,pcp=7),encap(eth_type(0x0806),arp(sip=1.1.2.92,tip=1.1.2.92,op=1,sha=f8:bc:12:44:34:c8,tha=00:00:00:00:00:00))']) + +ovs-appctl time/warp 1000 +ovs-appctl time/warp 1000 + +AT_CHECK([ovs-appctl tnl/neigh/show | grep br | sort], [0], [dnl +1.1.2.92 f8:bc:12:44:34:c8 br0 +]) + +dnl Receiving Gratuitous ARP reply with correct VLAN id should alter tunnel neighbor cache +AT_CHECK([ovs-appctl netdev-dummy/receive p0 'recirc_id(0),in_port(1),eth(src=f8:bc:12:44:34:b2,dst=ff:ff:ff:ff:ff:ff),eth_type(0x8100),vlan(vid=10,pcp=7),encap(eth_type(0x0806),arp(sip=1.1.2.92,tip=1.1.2.92,op=2,sha=f8:bc:12:44:34:b2,tha=f8:bc:12:44:34:b2))']) + +ovs-appctl time/warp 1000 +ovs-appctl time/warp 1000 + +AT_CHECK([ovs-appctl tnl/neigh/show | grep br | sort], [0], [dnl +1.1.2.92 f8:bc:12:44:34:b2 br0 +]) + dnl Receive ARP reply without VLAN header AT_CHECK([ovs-vsctl set port br0 tag=0]) AT_CHECK([ovs-appctl tnl/neigh/flush], 
[0], [OK @@ -588,7 +609,7 @@ AT_CHECK([ovs-ofctl dump-ports int-br | grep 'port 5'], [0], [dnl port 5: rx pkts=1, bytes=98, drop=?, errs=?, frame=?, over=?, crc=? ]) AT_CHECK([ovs-appctl dpif/dump-flows int-br | grep 'in_port(6081)' | sed -e 's/recirc_id=[[0-9]]*/recirc_id=/g'], [0], [dnl -tunnel(tun_id=0x7b,src=1.1.2.92,dst=1.1.2.88,geneve({class=0xffff,type=0x80,len=4,0xa/0xf}{class=0xffff,type=0,len=4}),flags(-df-csum+key)),recirc_id(0),in_port(6081),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), packets:0, bytes:0, used:never, actions:userspace(pid=0,controller(reason=1,dont_send=0,continuation=0,recirc_id=,rule_cookie=0,controller_id=0,max_len=65535)) +recirc_id(0),tunnel(tun_id=0x7b,src=1.1.2.92,dst=1.1.2.88,geneve({class=0xffff,type=0x80,len=4,0xa/0xf}{class=0xffff,type=0,len=4}),flags(-df-csum+key)),in_port(6081),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), packets:0, bytes:0, used:never, actions:userspace(pid=0,controller(reason=1,dont_send=0,continuation=0,recirc_id=,rule_cookie=0,controller_id=0,max_len=65535)) ]) dnl Receive VXLAN with different MAC and verify that the neigh cache gets updated @@ -621,6 +642,13 @@ AT_CHECK([tail -1 stdout], [0], [Datapath actions: tnl_push(tnl_port(4789),header(size=50,type=4,eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x0800),ipv4(src=1.1.2.88,dst=1.1.2.92,proto=17,tos=0,ttl=64,frag=0x4000),udp(src=0,dst=4789,csum=0x0),vxlan(flags=0x8000000,vni=0x7b)),out_port(100)),1 ]) +dnl Check VXLAN tunnel push with checksum. 
+AT_CHECK([ovs-vsctl set Interface t2 options:csum=true]) +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(2),eth(src=36:b1:ee:7c:01:01,dst=36:b1:ee:7c:01:02),eth_type(0x0800),ipv4(src=1.1.3.88,dst=1.1.3.112,proto=47,tos=0,ttl=64,frag=no)'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], + [Datapath actions: tnl_push(tnl_port(4789),header(size=50,type=4,eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x0800),ipv4(src=1.1.2.88,dst=1.1.2.92,proto=17,tos=0,ttl=64,frag=0x4000),udp(src=0,dst=4789,csum=0xffff),vxlan(flags=0x8000000,vni=0x7b)),out_port(100)),1 +]) + AT_CHECK([ovs-appctl tnl/neigh/show | tail -n+3 | sort], [0], [dnl 1.1.2.92 f8:bc:12:44:34:b6 br0 1.1.2.93 f8:bc:12:44:34:b7 br0 @@ -670,12 +698,12 @@ AT_CHECK([ovs-vsctl add-port int-br t2 -- set Interface t2 type=geneve \ options:remote_ip=1.1.2.92 options:key=123 ofport_request=2 \ ]) -dnl First setup dummy interface IP address, then add the route -dnl so that tnl-port table can get valid IP address for the device. +dnl Setup dummy interface IP address. AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 1.1.2.88/24], [0], [OK ]) - -AT_CHECK([ovs-appctl ovs/route/add 1.1.2.92/24 br0], [0], [OK +dnl Checking that a local route for added IP was successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached], [0], [dnl +Cached: 1.1.2.0/24 dev br0 SRC 1.1.2.88 local ]) AT_CHECK([ovs-ofctl add-flow br0 action=normal]) @@ -711,11 +739,12 @@ AT_CHECK([ovs-vsctl add-port int-br t2 dnl -- set Interface t2 type=geneve options:remote_ip=1.1.2.92 dnl options:key=123 ofport_request=2]) -dnl First setup dummy interface IP address, then add the route -dnl so that tnl-port table can get valid IP address for the device. +dnl Setup dummy interface IP address. AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 1.1.2.88/24], [0], [OK ]) -AT_CHECK([ovs-appctl ovs/route/add 1.1.2.92/24 br0], [0], [OK +dnl Checking that a local route for added IP was successfully installed. 
+AT_CHECK([ovs-appctl ovs/route/show | grep Cached], [0], [dnl +Cached: 1.1.2.0/24 dev br0 SRC 1.1.2.88 local ]) AT_CHECK([ovs-ofctl add-flow br0 action=normal]) @@ -757,6 +786,88 @@ AT_CHECK([ovs-appctl dpctl/dump-flows | grep -q 'slow_path(action)'], [0]) OVS_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([tunnel_push_pop - local_ip configuration]) + +OVS_VSWITCHD_START( + [add-port br0 p0 \ + -- set Interface p0 type=dummy ofport_request=1 \ + other-config:hwaddr=aa:55:aa:55:00:00]) +AT_CHECK([ovs-appctl vlog/set dpif_netdev:dbg]) +AT_CHECK([ovs-vsctl add-br int-br -- set bridge int-br datapath_type=dummy]) +AT_CHECK([ovs-vsctl add-port int-br t2 \ + -- set Interface t2 type=geneve \ + options:local_ip=2.2.2.88 \ + options:remote_ip=1.1.2.92 \ + options:key=123 ofport_request=2]) + +dnl Setup multiple IP addresses. +AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 1.1.2.88/24], [0], [OK +]) +AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 2.2.2.88/24], [0], [OK +]) +dnl Checking that a local route for added IP was successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached | sort], [0], [dnl +Cached: 1.1.2.0/24 dev br0 SRC 1.1.2.88 local +Cached: 2.2.2.0/24 dev br0 SRC 2.2.2.88 local +]) +AT_CHECK([ovs-ofctl add-flow br0 action=normal]) +AT_CHECK([ovs-ofctl add-flow int-br action=normal]) + +dnl This ARP reply from p0 has two effects: +dnl 1. The ARP cache will learn that 1.1.2.92 is at f8:bc:12:44:34:b6. +dnl 2. The br0 mac learning will learn that f8:bc:12:44:34:b6 is on p0. +AT_CHECK([ovs-appctl netdev-dummy/receive p0 dnl + 'recirc_id(0),in_port(1),dnl + eth(src=f8:bc:12:44:34:b6,dst=ff:ff:ff:ff:ff:ff),eth_type(0x0806),dnl + arp(sip=1.1.2.92,tip=1.1.2.88,op=2,sha=f8:bc:12:44:34:b6,tha=00:00:00:00:00:00)' +]) + +dnl Check that local_ip is used for encapsulation in the trace. 
+AT_CHECK([ovs-appctl ofproto/trace int-br in_port=LOCAL \ + | grep -E 'tunnel|actions'], [0], [dnl + -> output to native tunnel + -> tunneling to 1.1.2.92 via br0 + -> tunneling from aa:55:aa:55:00:00 2.2.2.88 to f8:bc:12:44:34:b6 1.1.2.92 +Datapath actions: tnl_push(tnl_port(6081),header(size=50,type=5,dnl +eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x0800),dnl +ipv4(src=2.2.2.88,dst=1.1.2.92,proto=17,tos=0,ttl=64,frag=0x4000),dnl +udp(src=0,dst=6081,csum=0x0),geneve(vni=0x7b)),out_port(100)),1 +]) + +dnl Now check that the packet actually has the local_ip in the header. +AT_CHECK([ovs-vsctl -- set Interface p0 options:tx_pcap=p0.pcap]) + +packet=50540000000a5054000000091234 +eth=f8bc124434b6aa55aa5500000800 +ip4=450000320000400040113305020202580101025c +dnl Source port is based on a packet hash, so it may differ depending on the +dnl compiler flags and CPU type. Masked with '....'. +udp=....17c1001e0000 +geneve=0000655800007b00 +encap=${eth}${ip4}${udp}${geneve} +dnl Output to tunnel from a int-br internal port. +dnl Checking that the packet arrived and it was correctly encapsulated. +AT_CHECK([ovs-appctl netdev-dummy/receive int-br "${packet}"]) +OVS_WAIT_UNTIL([test $(ovs-pcap p0.pcap | grep -c "${encap}${packet}") -eq 1]) +dnl Sending again to exercise the non-miss upcall path. +AT_CHECK([ovs-appctl netdev-dummy/receive int-br "${packet}"]) +OVS_WAIT_UNTIL([test $(ovs-pcap p0.pcap | grep -c "${encap}${packet}") -eq 2]) + +dnl Finally, checking that the datapath flow also has a local_ip. 
+AT_CHECK([ovs-appctl dpctl/dump-flows | grep tnl_push \ + | strip_ufid | strip_used], [0], [dnl +recirc_id(0),in_port(2),packet_type(ns=0,id=0),dnl +eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x1234), dnl +packets:1, bytes:14, used:0.0s, dnl +actions:tnl_push(tnl_port(6081),header(size=50,type=5,dnl +eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x0800),dnl +ipv4(src=2.2.2.88,dst=1.1.2.92,proto=17,tos=0,ttl=64,frag=0x4000),dnl +udp(src=0,dst=6081,csum=0x0),geneve(vni=0x7b)),out_port(100)),1 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + AT_SETUP([tunnel_push_pop - underlay bridge match]) OVS_VSWITCHD_START([add-port br0 p0 -- set Interface p0 type=dummy ofport_request=1 other-config:hwaddr=aa:55:aa:55:00:00]) @@ -776,8 +887,11 @@ dummy@ovs-dummy: hit:0 missed:0 AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 1.1.2.88/24], [0], [OK ]) -AT_CHECK([ovs-appctl ovs/route/add 1.1.2.92/24 br0], [0], [OK +dnl Checking that a local route for added IP was successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached], [0], [dnl +Cached: 1.1.2.0/24 dev br0 SRC 1.1.2.88 local ]) + AT_CHECK([ovs-ofctl add-flow br0 'arp,priority=1,action=normal']) dnl Use arp reply to achieve tunnel next hop mac binding @@ -820,11 +934,12 @@ AT_CHECK([ovs-vsctl add-port int-br t2 dnl -- set Interface t2 type=geneve options:remote_ip=1.1.2.92 dnl options:key=123 ofport_request=2]) -dnl First setup dummy interface IP address, then add the route -dnl so that tnl-port table can get valid IP address for the device. +dnl Setup dummy interface IP address. AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 1.1.2.88/24], [0], [OK ]) -AT_CHECK([ovs-appctl ovs/route/add 1.1.2.92/24 br0], [0], [OK +dnl Checking that a local route for added IP was successfully installed. 
+AT_CHECK([ovs-appctl ovs/route/show | grep Cached], [0], [dnl +Cached: 1.1.2.0/24 dev br0 SRC 1.1.2.88 local ]) AT_CHECK([ovs-ofctl add-flow br0 action=normal]) @@ -888,10 +1003,12 @@ AT_CHECK([ovs-vsctl set port p8 tag=42 dnl -- set port br0 tag=42 dnl -- set port p7 tag=200]) -dnl Set IP address and route for br0. +dnl Set an IP address for br0. AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 10.0.0.2/24], [0], [OK ]) -AT_CHECK([ovs-appctl ovs/route/add 10.0.0.11/24 br0], [0], [OK +dnl Checking that a local route for added IP was successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached], [0], [dnl +Cached: 10.0.0.0/24 dev br0 SRC 10.0.0.2 local ]) dnl Send an ARP reply to port b8 on br0, so that packets will be forwarded @@ -933,10 +1050,12 @@ AT_CHECK([ovs-vsctl add-port ovs-tun0 tun0 dnl -- add-port ovs-tun0 p7 dnl -- set interface p7 type=dummy ofport_request=7]) -dnl Set IP address and route for br0. +dnl Set an IP address for br0. AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 10.0.0.2/24], [0], [OK ]) -AT_CHECK([ovs-appctl ovs/route/add 10.0.0.11/24 br0], [0], [OK +dnl Checking that a local route for added IP was successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached], [0], [dnl +Cached: 10.0.0.0/24 dev br0 SRC 10.0.0.2 local ]) dnl Send an ARP reply to port b8 on br0, so that packets will be forwarded @@ -973,3 +1092,170 @@ udp(src=0,dst=4789,csum=0x0),vxlan(flags=0x8000000,vni=0x0)),out_port(100)),8),7 OVS_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([tunnel_push_pop - use non-local port as tunnel endpoint]) + +OVS_VSWITCHD_START([add-port br0 p0 \ + -- set Interface p0 type=dummy ofport_request=1]) + +dnl Adding another port separately to ensure that it gets an +dnl aa:55:aa:55:00:03 MAC address (dummy port number 3). 
+AT_CHECK([ovs-vsctl add-port br0 vtep0 \ + -- set interface vtep0 type=dummy ofport_request=2]) +AT_CHECK([ovs-vsctl \ + -- add-br int-br \ + -- set bridge int-br datapath_type=dummy \ + -- set Interface int-br ofport_request=3]) +AT_CHECK([ovs-vsctl \ + -- add-port int-br t1 \ + -- set Interface t1 type=gre ofport_request=4 \ + options:remote_ip=1.1.2.92 +]) + +AT_CHECK([ovs-appctl dpif/show], [0], [dnl +dummy@ovs-dummy: hit:0 missed:0 + br0: + br0 65534/100: (dummy-internal) + p0 1/1: (dummy) + vtep0 2/2: (dummy) + int-br: + int-br 65534/3: (dummy-internal) + t1 4/4: (gre: remote_ip=1.1.2.92) +]) + +AT_CHECK([ovs-appctl netdev-dummy/ip4addr vtep0 1.1.2.88/24], [0], [OK +]) +dnl Checking that a local route for added IP was successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached], [0], [dnl +Cached: 1.1.2.0/24 dev vtep0 SRC 1.1.2.88 local +]) + +AT_CHECK([ovs-ofctl add-flow br0 action=normal]) +AT_CHECK([ovs-ofctl add-flow int-br action=normal]) + +dnl Use arp request and reply to achieve tunnel next hop mac binding. +dnl By default, vtep0's MAC address is aa:55:aa:55:00:03. +AT_CHECK([ovs-appctl netdev-dummy/receive vtep0 'recirc_id(0),in_port(2),dnl + eth(dst=ff:ff:ff:ff:ff:ff,src=aa:55:aa:55:00:03),eth_type(0x0806),dnl + arp(tip=1.1.2.92,sip=1.1.2.88,op=1,sha=aa:55:aa:55:00:03,tha=00:00:00:00:00:00)']) +AT_CHECK([ovs-appctl netdev-dummy/receive p0 'recirc_id(0),in_port(1),dnl + eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:03),eth_type(0x0806),dnl + arp(sip=1.1.2.92,tip=1.1.2.88,op=2,sha=f8:bc:12:44:34:b6,tha=aa:55:aa:55:00:03)']) + +AT_CHECK([ovs-appctl tnl/neigh/show | tail -n+3 | sort], [0], [dnl +1.1.2.92 f8:bc:12:44:34:b6 br0 +]) + +dnl Check GRE tunnel pop. 
+AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1),dnl + eth(src=f8:bc:12:44:34:b6,dst=aa:55:aa:55:00:03),eth_type(0x0800),dnl + ipv4(src=1.1.2.92,dst=1.1.2.88,proto=47,tos=0,ttl=64,frag=no)'], +[0], [stdout]) + +AT_CHECK([tail -1 stdout], [0], + [Datapath actions: tnl_pop(4) +]) + +dnl Check GRE tunnel push. +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(3),dnl + eth(dst=f9:bc:12:44:34:b6,src=af:55:aa:55:00:03),eth_type(0x0800),dnl + ipv4(src=1.1.3.88,dst=1.1.3.92,proto=1,tos=0,ttl=64,frag=no)'], +[0], [stdout]) +AT_CHECK([tail -1 stdout], [0], + [Datapath actions: tnl_push(tnl_port(4),header(size=38,type=3,dnl +eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:03,dl_type=0x0800),dnl +ipv4(src=1.1.2.88,dst=1.1.2.92,proto=47,tos=0,ttl=64,frag=0x4000),dnl +gre((flags=0x0,proto=0x6558))),out_port(2)),1 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + +dnl This is a regression test for outer header checksum offloading +dnl with recirculation. +AT_SETUP([tunnel_push_pop - recirculation after encapsulation]) + +OVS_VSWITCHD_START( + [add-port br0 p0 \ + -- set Interface p0 type=dummy ofport_request=1 \ + other-config:hwaddr=aa:55:aa:55:00:00]) +AT_CHECK([ovs-appctl vlog/set dpif_netdev:dbg]) +AT_CHECK([ovs-vsctl add-br int-br -- set bridge int-br datapath_type=dummy]) +AT_CHECK([ovs-vsctl add-port int-br t2 \ + -- set Interface t2 type=geneve \ + options:remote_ip=1.1.2.92 \ + options:key=123 ofport_request=2]) + +dnl Setup an IP address. +AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 1.1.2.88/24], [0], [OK +]) +dnl Checking that a local route for added IP was successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached | sort], [0], [dnl +Cached: 1.1.2.0/24 dev br0 SRC 1.1.2.88 local +]) + +dnl Add a dp-hash selection group. 
+AT_CHECK([ovs-ofctl add-group br0 \ + 'group_id=1234,type=select,selection_method=dp_hash,bucket=weight=1,output:p0']) +AT_CHECK([ovs-ofctl add-flow br0 in_port=br0,action=group:1234]) +AT_CHECK([ovs-ofctl add-flow br0 in_port=p0,action=normal]) + +AT_CHECK([ovs-ofctl add-flow int-br action=normal]) + +dnl This ARP reply from p0 has two effects: +dnl 1. The ARP cache will learn that 1.1.2.92 is at f8:bc:12:44:34:b6. +dnl 2. The br0 mac learning will learn that f8:bc:12:44:34:b6 is on p0. +AT_CHECK([ovs-appctl netdev-dummy/receive p0 dnl + 'recirc_id(0),in_port(1),dnl + eth(src=f8:bc:12:44:34:b6,dst=ff:ff:ff:ff:ff:ff),eth_type(0x0806),dnl + arp(sip=1.1.2.92,tip=1.1.2.88,op=2,sha=f8:bc:12:44:34:b6,tha=00:00:00:00:00:00)' +]) + +dnl Check that selection group is used in the trace. +AT_CHECK([ovs-appctl ofproto/trace int-br in_port=LOCAL \ + | grep -E 'tunnel|actions'], [0], [dnl + -> output to native tunnel + -> tunneling to 1.1.2.92 via br0 + -> tunneling from aa:55:aa:55:00:00 1.1.2.88 to f8:bc:12:44:34:b6 1.1.2.92 +Datapath actions: tnl_push(tnl_port(6081),header(size=50,type=5,dnl +eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x0800),dnl +ipv4(src=1.1.2.88,dst=1.1.2.92,proto=17,tos=0,ttl=64,frag=0x4000),dnl +udp(src=0,dst=6081,csum=0x0),geneve(vni=0x7b)),out_port(100)),dnl +hash(l4(0)),recirc(0x1) +]) + +dnl Now check that the packet is actually encapsulated and delivered. +AT_CHECK([ovs-vsctl -- set Interface p0 options:tx_pcap=p0.pcap]) + +packet=50540000000a5054000000091234 +eth=f8bc124434b6aa55aa5500000800 +ip4=450000320000400040113406010102580101025c +dnl Source port is based on a packet hash, so it may differ depending on the +dnl compiler flags and CPU type. Masked with '....'. +udp=....17c1001e0000 +geneve=0000655800007b00 +encap=${eth}${ip4}${udp}${geneve} +dnl Output to tunnel from a int-br internal port. +dnl Checking that the packet arrived and it was correctly encapsulated. 
+AT_CHECK([ovs-appctl netdev-dummy/receive int-br "${packet}"]) +OVS_WAIT_UNTIL([test $(ovs-pcap p0.pcap | grep -c "${encap}${packet}") -eq 1]) + +dnl Sending again to exercise the non-miss upcall path. +AT_CHECK([ovs-appctl netdev-dummy/receive int-br "${packet}"]) +OVS_WAIT_UNTIL([test $(ovs-pcap p0.pcap | grep -c "${encap}${packet}") -eq 2]) + +dnl Finally, checking that the datapath flow is also correct. +AT_CHECK([ovs-appctl dpctl/dump-flows | grep tnl_push \ + | strip_ufid | strip_used], [0], [dnl +recirc_id(0),in_port(2),packet_type(ns=0,id=0),dnl +eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x1234), dnl +packets:1, bytes:14, used:0.0s, dnl +actions:tnl_push(tnl_port(6081),header(size=50,type=5,dnl +eth(dst=f8:bc:12:44:34:b6,src=aa:55:aa:55:00:00,dl_type=0x0800),dnl +ipv4(src=1.1.2.88,dst=1.1.2.92,proto=17,tos=0,ttl=64,frag=0x4000),dnl +udp(src=0,dst=6081,csum=0x0),geneve(vni=0x7b)),out_port(100)),dnl +hash(l4(0)),recirc(0x2) +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP diff --git a/tests/tunnel.at b/tests/tunnel.at index 037b4c39081..31e935901d3 100644 --- a/tests/tunnel.at +++ b/tests/tunnel.at @@ -126,7 +126,7 @@ AT_CHECK([ovs-appctl dpif/show | tail -n +3], [0], [dnl AT_CHECK([ovs-appctl dpctl/add-flow "tunnel(dst=1.1.1.1,src=3.3.3.200/255.255.255.0,tp_dst=123,tp_src=1,ttl=64),recirc_id(0),in_port(1),eth(),eth_type(0x0800),ipv4()" "2"]) AT_CHECK([ovs-appctl dpctl/dump-flows | tail -1], [0], [dnl -tunnel(src=3.3.3.200/255.255.255.0,dst=1.1.1.1,ttl=64,tp_src=1,tp_dst=123),recirc_id(0),in_port(1),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:2 +recirc_id(0),tunnel(src=3.3.3.200/255.255.255.0,dst=1.1.1.1,ttl=64,tp_src=1,tp_dst=123),in_port(1),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:2 ]) OVS_VSWITCHD_STOP @@ -333,6 +333,50 @@ set(tunnel(tun_id=0x5,dst=4.4.4.4,ttl=64,flags(df|key))),1 OVS_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([tunnel - set_tunnel VXLAN]) +OVS_VSWITCHD_START([dnl + add-port br0 p1 -- set Interface p1 
type=vxlan options:key=flow \ + options:remote_ip=1.1.1.1 ofport_request=1 \ + -- add-port br0 p2 -- set Interface p2 type=vxlan options:key=flow \ + options:remote_ip=2.2.2.2 ofport_request=2 \ + -- add-port br0 p3 -- set Interface p3 type=vxlan options:key=flow \ + options:remote_ip=3.3.3.3 ofport_request=3 \ + -- add-port br0 p4 -- set Interface p4 type=vxlan options:key=flow \ + options:remote_ip=4.4.4.4 ofport_request=4]) +AT_DATA([flows.txt], [dnl +actions=set_tunnel:1,output:1,set_tunnel:2,output:2,set_tunnel:3,output:3,set_tunnel:5,output:4 +]) + +OVS_VSWITCHD_DISABLE_TUNNEL_PUSH_POP +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +AT_CHECK([ovs-appctl dpif/show | tail -n +3], [0], [dnl + br0 65534/100: (dummy-internal) + p1 1/4789: (vxlan: key=flow, remote_ip=1.1.1.1) + p2 2/4789: (vxlan: key=flow, remote_ip=2.2.2.2) + p3 3/4789: (vxlan: key=flow, remote_ip=3.3.3.3) + p4 4/4789: (vxlan: key=flow, remote_ip=4.4.4.4) +]) + +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(100),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=1,tos=0,ttl=128,frag=no),icmp(type=8,code=0)'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], [Datapath actions: dnl +set(tunnel(tun_id=0x1,dst=1.1.1.1,ttl=64,tp_dst=4789,flags(df|key))),4789,dnl +set(tunnel(tun_id=0x2,dst=2.2.2.2,ttl=64,tp_dst=4789,flags(df|key))),4789,dnl +set(tunnel(tun_id=0x3,dst=3.3.3.3,ttl=64,tp_dst=4789,flags(df|key))),4789,dnl +set(tunnel(tun_id=0x5,dst=4.4.4.4,ttl=64,tp_dst=4789,flags(df|key))),4789 +]) + +dnl With pre-existing tunnel metadata. 
+AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'tunnel(tun_id=0x1,src=1.1.1.1,dst=5.5.5.5,tp_src=12345,tp_dst=4789,ttl=64,flags(key)),in_port(4789),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=1,tos=0,ttl=128,frag=no),icmp(type=8,code=0)'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], [Datapath actions: dnl +set(tunnel(tun_id=0x2,dst=2.2.2.2,ttl=64,tp_dst=4789,flags(df|key))),4789,dnl +set(tunnel(tun_id=0x3,dst=3.3.3.3,ttl=64,tp_dst=4789,flags(df|key))),4789,dnl +set(tunnel(tun_id=0x5,dst=4.4.4.4,ttl=64,tp_dst=4789,flags(df|key))),4789 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + AT_SETUP([tunnel - key]) OVS_VSWITCHD_START([dnl add-port br0 p1 -- set Interface p1 type=gre options:key=1 \ @@ -480,11 +524,12 @@ dummy@ovs-dummy: hit:0 missed:0 v2 3/3: (dummy-internal) ]) -dnl First setup dummy interface IP address, then add the route -dnl so that tnl-port table can get valid IP address for the device. +dnl Setup dummy interface IP address. AT_CHECK([ovs-appctl netdev-dummy/ip4addr br0 172.31.1.1/24], [0], [OK ]) -AT_CHECK([ovs-appctl ovs/route/add 172.31.1.0/24 br0], [0], [OK +dnl Checking that a local route for added IP was successfully installed. 
+AT_CHECK([ovs-appctl ovs/route/show | grep Cached], [0], [dnl +Cached: 172.31.1.0/24 dev br0 SRC 172.31.1.1 local ]) dnl change the flow table to bump the internal table version @@ -603,7 +648,7 @@ AT_CHECK([ovs-appctl dpif/show | tail -n +3], [0], [dnl OVS_VSWITCHD_STOP AT_CLEANUP -AT_SETUP([ofproto-dpif - set_field - tun_src/tun_dst/tun_id]) +AT_SETUP([tunnel - set_field - tun_src/tun_dst/tun_id]) OVS_VSWITCHD_START([dnl add-port br0 p1 -- set Interface p1 type=gre options:key=flow \ options:remote_ip=1.1.1.1 ofport_request=1 \ @@ -993,7 +1038,7 @@ AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'tunnel(tun_id=0,src=1.1.1.1,dst=1.1.1.2,ttl=64),in_port(4789)'], [0], [stdout]) AT_CHECK([tail -1 stdout], [0], - [Datapath actions: set(tunnel(ipv6_dst=2001:cafe::1,ttl=64,tp_dst=4789,flags(df))),4789 + [Datapath actions: set(tunnel(ipv6_dst=2001:cafe::1,ttl=64,tp_dst=4789,flags(df|csum))),4789 ]) AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'tunnel(tun_id=0x0,ipv6_src=2001:cafe::1,ipv6_dst=2001:cafe::2,ttl=64),in_port(4789)'], [0], [stdout]) @@ -1223,3 +1268,97 @@ AT_CHECK([ovs-vsctl add-port br0 p1 -- set int p1 type=dummy]) OVS_APP_EXIT_AND_WAIT([ovs-vswitchd]) OVS_APP_EXIT_AND_WAIT([ovsdb-server])] AT_CLEANUP + +AT_SETUP([tunnel - re-create port with different name]) +OVS_VSWITCHD_START( + [add-port br0 p0 -- set int p0 type=vxlan options:remote_ip=10.10.10.1]) + +AT_CHECK([ovs-vsctl --if-exists del-port p0 -- \ + add-port br0 p1 -- \ + set int p1 type=vxlan options:remote_ip=10.10.10.1]) + +OVS_APP_EXIT_AND_WAIT([ovs-vswitchd]) +OVS_APP_EXIT_AND_WAIT([ovsdb-server])] +AT_CLEANUP + +AT_SETUP([tunnel - SRV6 basic]) +OVS_VSWITCHD_START([add-port br0 p1 -- set Interface p1 type=dummy \ + ofport_request=1 \ + -- add-port br0 p2 -- set Interface p2 type=srv6 \ + options:remote_ip=flow \ + ofport_request=2]) +OVS_VSWITCHD_DISABLE_TUNNEL_PUSH_POP + +dnl Setup dummy interface IP address. 
+AT_CHECK([ovs-appctl netdev-dummy/ip6addr br0 fc00::1/64], [0], [OK +]) +dnl Checking that a local route for added IP was successfully installed. +AT_CHECK([ovs-appctl ovs/route/show | grep Cached], [0], [dnl +Cached: fc00::/64 dev br0 SRC fc00::1 local +]) + +AT_DATA([flows.txt], [dnl +in_port=1,actions=set_field:fc00::2->tun_ipv6_dst,output:2 +in_port=2,actions=1 +]) +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +AT_CHECK([ovs-appctl dpif/show | tail -n +3], [0], [dnl + br0 65534/100: (dummy-internal) + p1 1/1: (dummy) + p2 2/6: (srv6: remote_ip=flow) +]) + +AT_CHECK([ovs-appctl tnl/ports/show |sort], [0], [dnl +Listening ports: +srv6_sys (6) ref_cnt=1 +srv6_sys (6) ref_cnt=1 +]) + +AT_CHECK([ovs-appctl ofproto/list-tunnels], [0], [dnl +port 6: p2 (srv6: ::->flow, key=0, legacy_l3, dp port=6, ttl=64) +]) + +dnl Encap: ipv4 inner packet +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=6,tos=4,ttl=128,frag=no),tcp(src=8,dst=9)'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], + [Datapath actions: set(tunnel(ipv6_dst=fc00::2,ttl=64,flags(df))),pop_eth,6 +]) + +dnl Encap: ipv6 inner packet +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy 'in_port(1),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x86dd),ipv6(src=2001:cafe::92,dst=2001:cafe::88,label=0,proto=47,tclass=0x0,hlimit=64)'], [0], [stdout]) +AT_CHECK([tail -1 stdout], [0], + [Datapath actions: set(tunnel(ipv6_dst=fc00::2,ttl=64,flags(df))),pop_eth,6 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([tunnel - Geneve metadata mirror]) +OVS_VSWITCHD_START([add-port br0 p1 -- set Interface p1 type=geneve \ + options:remote_ip=1.1.1.1 ofport_request=1 \ + -- add-port br0 p2 -- set Interface p2 type=dummy \ + ofport_request=2 ofport_request=2]) +OVS_VSWITCHD_DISABLE_TUNNEL_PUSH_POP +add_of_ports br0 90 +AT_CHECK([ovs-vsctl \ + set Bridge br0 mirrors=@m --\ + --id=@p90 get Port p90 --\ + 
--id=@m create Mirror name=mymirror select_all=true output_port=@p90], [0], [stdout]) + +AT_CHECK([ovs-ofctl add-tlv-map br0 "{class=0xffff,type=0,len=4}->tun_metadata0,{class=0xffff,type=1,len=8}->tun_metadata1"]) + +AT_DATA([flows.txt], [dnl +in_port=2,actions=set_field:0xa->tun_metadata0,set_field:0x1234567890abcdef->tun_metadata1,1 +tun_metadata0=0xb/0xf,actions=2 +]) +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +flow="in_port(2),eth(src=50:54:00:00:00:05,dst=50:54:00:00:00:07),eth_type(0x0800),ipv4(src=192.168.0.1,dst=192.168.0.2,proto=1,tos=0,ttl=128,frag=no),icmp(type=8,code=0)" +AT_CHECK([ovs-appctl ofproto/trace ovs-dummy "$flow"], [0], [stdout]) +AT_CHECK_UNQUOTED([tail -1 stdout], [0], + [Datapath actions: 90,set(tunnel(dst=1.1.1.1,ttl=64,tp_dst=6081,geneve({class=0xffff,type=0,len=4,0xa}{class=0xffff,type=0x1,len=8,0x1234567890abcdef}),flags(df))),6081 +]) + +OVS_VSWITCHD_STOP +AT_CLEANUP diff --git a/tests/unixctl-py.at b/tests/unixctl-py.at index 72400611822..ae8bd5ad189 100644 --- a/tests/unixctl-py.at +++ b/tests/unixctl-py.at @@ -100,6 +100,7 @@ The available commands are: exit help log [[arg ...]] + set-options [[--format text|json]] version vlog/close vlog/list @@ -112,6 +113,18 @@ AT_CHECK([PYAPPCTL_PY -t test-unixctl.py help], [0], [expout]) AT_CHECK([ovs-vsctl --version | sed 's/ovs-vsctl/test-unixctl.py/' | head -1 > expout]) AT_CHECK([APPCTL -t test-unixctl.py version], [0], [expout]) AT_CHECK([PYAPPCTL_PY -t test-unixctl.py version], [0], [expout]) +AT_CHECK_UNQUOTED([PYAPPCTL_PY -t test-unixctl.py --format json version], [0], [dnl +{"reply":"$(cat expout)","reply-format":"plain"} +]) +AT_CHECK_UNQUOTED([PYAPPCTL_PY -t test-unixctl.py --format JSON version], [0], [dnl +{"reply":"$(cat expout)","reply-format":"plain"} +]) +AT_CHECK_UNQUOTED([PYAPPCTL_PY -t test-unixctl.py --format json --pretty version], [0], [dnl +{ + "reply":"$(cat expout)", + "reply-format":"plain" +} +]) AT_CHECK([APPCTL -t test-unixctl.py echo robot ninja], [0], 
[stdout]) AT_CHECK([cat stdout | sed -e "s/u'/'/g"], [0], [dnl diff --git a/tests/vlog.at b/tests/vlog.at index 3e92e70a93c..2768c074009 100644 --- a/tests/vlog.at +++ b/tests/vlog.at @@ -8,6 +8,7 @@ AT_CHECK([$PYTHON3 $srcdir/test-vlog.py --log-file log_file \ AT_CHECK([sed -e 's/.*-.*-.*T..:..:..Z |//' \ -e 's/File ".*", line [[0-9]][[0-9]]*,/File , line ,/' \ +-e '/\^/d' \ stderr_log], [0], [dnl 0 | module_0 | EMER | emergency 1 | module_0 | ERR | error @@ -385,6 +386,7 @@ AT_CHECK([APPCTL -t test-unixctl.py vlog/list], [0], [dnl console syslog file ------- ------ ------ daemon info info info +dns_resolve info info info fatal-signal info info info jsonrpc info info info poller info info info @@ -404,6 +406,7 @@ unixctl_server info info info console syslog file ------- ------ ------ daemon info err dbg +dns_resolve info info dbg fatal-signal info info dbg jsonrpc info info dbg poller info info dbg diff --git a/tests/vtep-ctl.at b/tests/vtep-ctl.at index 98067658446..e4ddfe5df03 100644 --- a/tests/vtep-ctl.at +++ b/tests/vtep-ctl.at @@ -19,7 +19,7 @@ dnl Creates an empty database in the current directory and then starts dnl an ovsdb-server on it for vtep-ctl to connect to. 
m4_define([VTEP_CTL_SETUP], [VTEP_OVSDB_INIT([db]) - AT_CHECK([ovsdb-server --detach --no-chdir --pidfile --remote=punix:socket db >/dev/null 2>&1], [0], [ignore], [ignore])]) + AT_CHECK([ovsdb-server --detach --no-chdir --log-file --pidfile --remote=punix:socket db], [0], [ignore], [ignore])]) dnl VTEP_CTL_CLEANUP dnl diff --git a/utilities/automake.mk b/utilities/automake.mk index c3e190fe52d..c4566938c44 100644 --- a/utilities/automake.mk +++ b/utilities/automake.mk @@ -27,6 +27,15 @@ scripts_SCRIPTS += \ utilities/ovs-kmod-ctl \ utilities/ovs-save scripts_DATA += utilities/ovs-lib +usdt_SCRIPTS += \ + utilities/usdt-scripts/bridge_loop.bt \ + utilities/usdt-scripts/dpif_nl_exec_monitor.py \ + utilities/usdt-scripts/flow_reval_monitor.py \ + utilities/usdt-scripts/kernel_delay.py \ + utilities/usdt-scripts/kernel_delay.rst \ + utilities/usdt-scripts/reval_monitor.py \ + utilities/usdt-scripts/upcall_cost.py \ + utilities/usdt-scripts/upcall_monitor.py completion_SCRIPTS += \ utilities/ovs-appctl-bashcomp.bash \ @@ -70,6 +79,11 @@ EXTRA_DIST += \ utilities/docker/debian/Dockerfile \ utilities/docker/debian/build-kernel-modules.sh \ utilities/usdt-scripts/bridge_loop.bt \ + utilities/usdt-scripts/dpif_nl_exec_monitor.py \ + utilities/usdt-scripts/flow_reval_monitor.py \ + utilities/usdt-scripts/kernel_delay.py \ + utilities/usdt-scripts/kernel_delay.rst \ + utilities/usdt-scripts/reval_monitor.py \ utilities/usdt-scripts/upcall_cost.py \ utilities/usdt-scripts/upcall_monitor.py MAN_ROOTS += \ @@ -150,6 +164,8 @@ FLAKE8_PYFILES += utilities/ovs-pcap.in \ utilities/ovs-check-dead-ifs.in \ utilities/ovs-tcpdump.in \ utilities/ovs-pipegen.py \ + utilities/usdt-scripts/dpif_nl_exec_monitor.py \ + utilities/usdt-scripts/flow_reval_monitor.py \ utilities/usdt-scripts/upcall_monitor.py \ utilities/usdt-scripts/upcall_cost.py diff --git a/utilities/checkpatch.py b/utilities/checkpatch.py index 0d30b71b5b7..742a0bc470f 100755 --- a/utilities/checkpatch.py +++ 
b/utilities/checkpatch.py @@ -39,6 +39,15 @@ def open_spell_check_dict(): import enchant + try: + import codespell_lib + codespell_dir = os.path.dirname(codespell_lib.__file__) + codespell_file = os.path.join(codespell_dir, 'data', 'dictionary.txt') + if not os.path.exists(codespell_file): + codespell_file = '' + except: + codespell_file = '' + try: extra_keywords = ['ovs', 'vswitch', 'vswitchd', 'ovs-vswitchd', 'netdev', 'selinux', 'ovs-ctl', 'dpctl', 'ofctl', @@ -88,12 +97,36 @@ def open_spell_check_dict(): 'debian', 'travis', 'cirrus', 'appveyor', 'faq', 'erspan', 'const', 'hotplug', 'addresssanitizer', 'ovsdb', 'dpif', 'veth', 'rhel', 'jsonrpc', 'json', - 'syscall', 'lacp', 'ipf', 'skb', 'valgrind'] + 'syscall', 'lacp', 'ipf', 'skb', 'valgrind', + 'appctl', 'arp', 'asan', 'backport', 'backtrace', + 'chmod', 'ci', 'cpu', 'cpus', 'dnat', 'dns', 'dpcls', + 'eol', 'ethtool', 'fdb', 'freebsd', 'gcc', 'github', + 'glibc', 'gre', 'inlined', 'ip', 'ipfix', 'ipsec', + 'ixgbe', 'libbpf', 'libcrypto', 'libgcc', + 'libopenvswitch', 'libreswan', 'libssl', 'libxdp', + 'lldp', 'llvm', 'lockless', 'mcast', 'megaflows', + 'mfex', 'ncat', 'networkmanager', 'pcap', 'pedit', + 'pidfile', 'pps', 'rculist', 'rebalance', 'rebased', + 'recirculations', 'revalidators', 'rst', 'sed', + 'shrinked', 'snat', 'stderr', 'stdout', 'testpmd', + 'tftp', 'timeval', 'trie', 'tso', 'ubsan', 'ukey', + 'umask', 'unassociated', 'unixctl', 'uuid', + 'virtqueue', 'vms', 'vnet', 'vport', 'vports', + 'vtep', 'wc', 'wget', 'xenserver'] global spell_check_dict + spell_check_dict = enchant.Dict("en_US") + + if codespell_file: + with open(codespell_file) as f: + for line in f.readlines(): + words = line.strip().split('>')[1].strip(', ').split(',') + for word in words: + spell_check_dict.add_to_session(word.strip()) + for kw in extra_keywords: - spell_check_dict.add(kw) + spell_check_dict.add_to_session(kw) return True except: @@ -189,13 +222,14 @@ def reset_counters(): skip_gerrit_change_id_check = False
skip_block_whitespace_check = False skip_signoff_check = False +skip_committer_signoff_check = False # Don't enforce character limit on files that include these characters in their # name, as they may have legitimate reasons to have longer lines. # # Python isn't checked as flake8 performs these checks during build. line_length_ignore_list = re.compile( - r'\.(am|at|etc|in|m4|mk|patch|py)$|^debian/.*$') + r'\.(am|at|etc|in|m4|mk|patch|py|yml)$|^debian/.*$') # Don't enforce a requirement that leading whitespace be all spaces on # files that include these characters in their name, since these kinds @@ -408,9 +442,16 @@ def check_spelling(line, comment): if not spell_check_dict or not spellcheck: return False + is_name_tag = re.compile(r'^\s*([a-z-]+-by): (.*@.*)$', re.I | re.M | re.S) + if line.startswith('Fixes: ') or is_name_tag.match(line): + return False + words = filter_comments(line, True) if comment else line words = words.replace(':', ' ').split(' ') + flagged_words = [] + num_suggestions = 3 + for word in words: skip = False strword = re.subn(r'\W+', '', word)[0].replace(',', '') @@ -435,9 +476,15 @@ def check_spelling(line, comment): skip = True if not skip: - print_warning("Check for spelling mistakes (e.g. 
\"%s\")" - % strword) - return True + flagged_words.append(strword) + + if len(flagged_words) > 0: + for mistake in flagged_words: + print_warning("Possible misspelled word: \"%s\"" % mistake) + print("Did you mean: ", + spell_check_dict.suggest(mistake)[:num_suggestions]) + + return True return False @@ -448,7 +495,7 @@ def __check_doc_is_listed(text, doctype, docdir, docfile): docre = re.compile(r'\n\+.*{}'.format(docfile.replace('.rst', ''))) elif doctype == 'automake': beginre = re.compile(r'\+\+\+.*Documentation/automake.mk') - docre = re.compile(r'\n\+\t{}/{}'.format(docdir, docfile)) + docre = re.compile(r'\n\+\t(?:{}/)?{}'.format(docdir, docfile)) else: raise NotImplementedError("Invalid doctype: {}".format(doctype)) @@ -662,18 +709,23 @@ def regex_warn_factory(description): easy_to_misuse_api = [ ('ovsrcu_barrier', - 'lib/ovs-rcu.c', + ['lib/ovs-rcu.c'], 'Are you sure you need to use ovsrcu_barrier(), ' 'in most cases ovsrcu_synchronize() will be fine?'), + ('netdev_features_to_bps', + ['lib/netdev.c', 'lib/netdev-bsd.c', 'lib/netdev-linux.c'], + 'Are you sure you need to use netdev_features_to_bps()? 
' + 'If you want to retrieve the current and/or maximum link speed, ' + 'consider using netdev_get_speed() instead.'), ] checks += [ {'regex': r'(\.c)(\.in)?$', - 'match_name': lambda x: x != location, + 'match_name': lambda x, loc=locations: x not in loc, 'prereq': lambda x: not is_comment_line(x), 'check': regex_function_factory(function_name), 'print': regex_warn_factory(description)} - for (function_name, location, description) in easy_to_misuse_api] + for (function_name, locations, description) in easy_to_misuse_api] def regex_operator_factory(operator): @@ -687,7 +739,7 @@ def regex_operator_factory(operator): '&=', '^=', '|=', '<<=', '>>=']] \ + [r'[^<" ]<[^=" ]', r'[^\->" ]>[^=" ]', - r'[^ !()/"]\*[^/]', + r'[^ !()/"\*]\*+[^/]', r'[^ !&()"]&', r'[^" +(]\+[^"+;]', r'[^" \-(]\-[^"\->;]', @@ -778,6 +830,36 @@ def run_file_checks(text): check['check'](text) +def run_subject_checks(subject, spellcheck=False): + warnings = False + + if spellcheck and check_spelling(subject, False): + warnings = True + + summary = subject[subject.rindex(': ') + 2:] + area_summary = subject[subject.index(': ') + 2:] + area_summary_len = len(area_summary) + if area_summary_len > 70: + print_warning("The subject, '<area>: <summary>', is over 70 " + "characters, i.e., %u."
% area_summary_len) + warnings = True + + if summary[0].isalpha() and summary[0].islower(): + print_warning( + "The subject summary should start with a capital.") + warnings = True + + if subject[-1] not in [".", "?", "!"]: + print_warning( + "The subject summary should end with a dot.") + warnings = True + + if warnings: + print(subject) + + return warnings + + def ovs_checkpatch_parse(text, filename, author=None, committer=None): global print_file_name, total_line, checking_file, \ empty_return_check_state @@ -798,6 +880,7 @@ def ovs_checkpatch_parse(text, filename, author=None, committer=None): r'^@@ ([0-9-+]+),([0-9-+]+) ([0-9-+]+),([0-9-+]+) @@') is_author = re.compile(r'^(Author|From): (.*)$', re.I | re.M | re.S) is_committer = re.compile(r'^(Commit: )(.*)$', re.I | re.M | re.S) + is_subject = re.compile(r'^(Subject: )(.*)$', re.I | re.M | re.S) is_signature = re.compile(r'^(Signed-off-by: )(.*)$', re.I | re.M | re.S) is_co_author = re.compile(r'^(Co-authored-by: )(.*)$', @@ -872,7 +955,8 @@ def ovs_checkpatch_parse(text, filename, author=None, committer=None): break if (committer and author != committer - and committer not in signatures): + and committer not in signatures + and not skip_committer_signoff_check): print_error("Committer %s needs to sign off." 
% committer) @@ -897,6 +981,8 @@ def ovs_checkpatch_parse(text, filename, author=None, committer=None): committer = is_committer.match(line).group(2) elif is_author.match(line): author = is_author.match(line).group(2) + elif is_subject.match(line): + run_subject_checks(line, spellcheck) elif is_signature.match(line): m = is_signature.match(line) signatures.append(m.group(2)) @@ -988,7 +1074,8 @@ def usage(): -S|--spellcheck Check C comments and commit-message for possible spelling mistakes -t|--skip-trailing-whitespace Skips the trailing whitespace test - --skip-gerrit-change-id Skips the gerrit change id test""" + --skip-gerrit-change-id Skips the gerrit change id test + --skip-committer-signoff Skips the committer sign-off test""" % sys.argv[0]) @@ -1015,6 +1102,19 @@ def ovs_checkpatch_file(filename): result = ovs_checkpatch_parse(part.get_payload(decode=False), filename, mail.get('Author', mail['From']), mail['Commit']) + + if not mail['Subject'] or not mail['Subject'].strip(): + if mail['Subject']: + mail.replace_header('Subject', sys.argv[-1]) + else: + mail.add_header('Subject', sys.argv[-1]) + + print("Subject missing! 
Your provisional subject is", + mail['Subject']) + + if run_subject_checks('Subject: ' + mail['Subject'], spellcheck): + result = True + ovs_checkpatch_print_result() return result @@ -1046,6 +1146,7 @@ def partition(pred, iterable): "skip-signoff-lines", "skip-trailing-whitespace", "skip-gerrit-change-id", + "skip-committer-signoff", "spellcheck", "quiet"]) except: @@ -1066,6 +1167,8 @@ def partition(pred, iterable): skip_trailing_whitespace_check = True elif o in ("--skip-gerrit-change-id"): skip_gerrit_change_id_check = True + elif o in ("--skip-committer-signoff"): + skip_committer_signoff_check = True elif o in ("-f", "--check-file"): checking_file = True elif o in ("-S", "--spellcheck"): diff --git a/utilities/gdb/ovs_gdb.py b/utilities/gdb/ovs_gdb.py index 763ece2a78d..982395dd1d2 100644 --- a/utilities/gdb/ovs_gdb.py +++ b/utilities/gdb/ovs_gdb.py @@ -30,6 +30,8 @@ # - ovs_dump_netdev_provider # - ovs_dump_ovs_list {[] [] {dump}]} # - ovs_dump_packets [tcpdump options] +# - ovs_dump_cmap <struct cmap *> {[<structure>] [<member>] {dump}]} +# - ovs_dump_hmap <struct hmap *> <structure> <member> {dump} # - ovs_dump_simap # - ovs_dump_smap # - ovs_dump_udpif_keys {|} {short} @@ -849,6 +851,119 @@ def invoke(self, arg, from_tty): member).dereference())) +# +# Implements the GDB "ovs_dump_cmap" command +# +class CmdDumpCmap(gdb.Command): + """Dump all nodes of a given cmap + Usage: + ovs_dump_cmap <struct cmap *> {[<structure>] [<member>] {dump}]} + + For example dump all the rules in a dpcls_subtable: + + (gdb) ovs_dump_cmap &subtable->rules + (struct cmap *) 0x3e02758 + + This is not very useful, so please use this with the container_of mode: + + (gdb) ovs_dump_cmap &subtable->rules "struct dpcls_rule" cmap_node + (struct dpcls_rule *) 0x3e02758 + + Now you can manually use the print command to show the content, or use the + dump option to dump the structure for all nodes: + + (gdb) ovs_dump_cmap &subtable->rules "struct dpcls_rule" cmap_node dump + (struct dpcls_rule *) 0x3e02758 = + {cmap_node = {next = {p = 0x0}}, mask = 0x3dfe100, flow = {hash = ...
+ """ + def __init__(self): + super(CmdDumpCmap, self).__init__("ovs_dump_cmap", + gdb.COMMAND_DATA) + + def invoke(self, arg, from_tty): + arg_list = gdb.string_to_argv(arg) + typeobj = None + member = None + dump = False + + if len(arg_list) != 1 and len(arg_list) != 3 and len(arg_list) != 4: + print("usage: ovs_dump_cmap <struct cmap *> " + "{[<structure>] [<member>] {dump}]}") + return + + cmap = gdb.parse_and_eval(arg_list[0]).cast( + gdb.lookup_type('struct cmap').pointer()) + + if len(arg_list) >= 3: + typeobj = arg_list[1] + member = arg_list[2] + if len(arg_list) == 4 and arg_list[3] == "dump": + dump = True + + for node in ForEachCMAP(cmap.dereference()): + if typeobj is None or member is None: + print("(struct cmap *) {}".format(node)) + else: + print("({} *) {} {}".format( + typeobj, + container_of(node, + gdb.lookup_type(typeobj).pointer(), member), + "=" if dump else "")) + if dump: + print(" {}\n".format(container_of( + node, + gdb.lookup_type(typeobj).pointer(), + member).dereference())) + + +# +# Implements the GDB "ovs_dump_hmap" command +# +class CmdDumpHmap(gdb.Command): + """Dump all nodes of a given hmap + Usage: + ovs_dump_hmap <struct hmap *> <structure> <member> {dump} + + For example dump all the bridges when the all_bridges variable is + optimized out due to LTO: + + (gdb) ovs_dump_hmap "&'all_bridges.lto_priv.0'" "struct bridge" "node" + (struct bridge *) 0x55ec43069c70 + (struct bridge *) 0x55ec430428a0 + (struct bridge *) 0x55ec430a55f0 + + The 'dump' option will also include the full structure content in the + output.
+ """ + def __init__(self): + super(CmdDumpHmap, self).__init__("ovs_dump_hmap", + gdb.COMMAND_DATA) + + def invoke(self, arg, from_tty): + arg_list = gdb.string_to_argv(arg) + typeobj = None + member = None + dump = False + + if len(arg_list) != 3 and len(arg_list) != 4: + print("usage: ovs_dump_hmap " + " {dump}") + return + + hmap = gdb.parse_and_eval(arg_list[0]).cast( + gdb.lookup_type('struct hmap').pointer()) + + typeobj = arg_list[1] + member = arg_list[2] + if len(arg_list) == 4 and arg_list[3] == "dump": + dump = True + + for node in ForEachHMAP(hmap.dereference(), typeobj, member): + print("({} *) {} {}".format(typeobj, node, "=" if dump else "")) + if dump: + print(" {}\n".format(node.dereference())) + + # # Implements the GDB "ovs_dump_simap" command # @@ -1449,6 +1564,8 @@ def extract_pkt(self, pkt): CmdDumpOfpacts() CmdDumpOvsList() CmdDumpPackets() +CmdDumpCmap() +CmdDumpHmap() CmdDumpSimap() CmdDumpSmap() CmdDumpUdpifKeys() diff --git a/utilities/ovs-appctl-bashcomp.bash b/utilities/ovs-appctl-bashcomp.bash index 4384be8ae10..0a9af1a18f0 100644 --- a/utilities/ovs-appctl-bashcomp.bash +++ b/utilities/ovs-appctl-bashcomp.bash @@ -223,6 +223,13 @@ printf_stderr() { # The code below is taken from Peter Amidon. His change makes it more # robust. 
extract_bash_prompt() { + # On Bash 4.4+ just use the @P expansion + if ((BASH_VERSINFO[0] > 4 || + (BASH_VERSINFO[0] == 4 && BASH_VERSINFO[1] >= 4))); then + _BASH_PROMPT="${PS1@P}" + return + fi + local myPS1 v myPS1="$(sed 's/Begin prompt/\\Begin prompt/; s/End prompt/\\End prompt/' <<< "$PS1")" diff --git a/utilities/ovs-appctl.c b/utilities/ovs-appctl.c index ba0c172e6da..682ee100ce0 100644 --- a/utilities/ovs-appctl.c +++ b/utilities/ovs-appctl.c @@ -26,57 +26,107 @@ #include "daemon.h" #include "dirs.h" #include "openvswitch/dynamic-string.h" +#include "openvswitch/json.h" #include "jsonrpc.h" #include "process.h" #include "timeval.h" +#include "svec.h" #include "unixctl.h" #include "util.h" #include "openvswitch/vlog.h" static void usage(void); -static const char *parse_command_line(int argc, char *argv[]); + +/* Parsed command line args. */ +struct cmdl_args { + enum unixctl_output_fmt format; + unsigned int format_flags; + char *target; +}; + +static struct cmdl_args *cmdl_args_create(void); +static struct cmdl_args *parse_command_line(int argc, char *argv[]); static struct jsonrpc *connect_to_target(const char *target); +static char *reply_to_string(struct json *reply, enum unixctl_output_fmt fmt, + unsigned int fmt_flags); int main(int argc, char *argv[]) { - char *cmd_result, *cmd_error; + struct svec opt_argv = SVEC_EMPTY_INITIALIZER; + struct json *cmd_result, *cmd_error; struct jsonrpc *client; + struct cmdl_args *args; char *cmd, **cmd_argv; - const char *target; + char *msg = NULL; int cmd_argc; int error; set_program_name(argv[0]); /* Parse command line and connect to target. */ - target = parse_command_line(argc, argv); - client = connect_to_target(target); + args = parse_command_line(argc, argv); + client = connect_to_target(args->target); + + /* Transact options request (if required) and process reply. 
*/ + if (args->format != UNIXCTL_OUTPUT_FMT_TEXT) { + svec_add(&opt_argv, "--format"); + svec_add(&opt_argv, unixctl_output_fmt_to_string(args->format)); + } + svec_terminate(&opt_argv); + + if (!svec_is_empty(&opt_argv)) { + error = unixctl_client_transact(client, "set-options", + opt_argv.n, opt_argv.names, + &cmd_result, &cmd_error); + + if (error) { + ovs_fatal(error, "%s: transaction error", args->target); + } + + if (cmd_error) { + jsonrpc_close(client); + msg = reply_to_string(cmd_error, UNIXCTL_OUTPUT_FMT_TEXT, 0); + fputs(msg, stderr); + free(msg); + ovs_error(0, "%s: server returned an error", args->target); + exit(2); + } + + json_destroy(cmd_result); + json_destroy(cmd_error); + } + svec_destroy(&opt_argv); - /* Transact request and process reply. */ + /* Transact command request and process reply. */ cmd = argv[optind++]; cmd_argc = argc - optind; cmd_argv = cmd_argc ? argv + optind : NULL; error = unixctl_client_transact(client, cmd, cmd_argc, cmd_argv, &cmd_result, &cmd_error); if (error) { - ovs_fatal(error, "%s: transaction error", target); + ovs_fatal(error, "%s: transaction error", args->target); } if (cmd_error) { jsonrpc_close(client); - fputs(cmd_error, stderr); - ovs_error(0, "%s: server returned an error", target); + msg = reply_to_string(cmd_error, UNIXCTL_OUTPUT_FMT_TEXT, 0); + fputs(msg, stderr); + free(msg); + ovs_error(0, "%s: server returned an error", args->target); exit(2); } else if (cmd_result) { - fputs(cmd_result, stdout); + msg = reply_to_string(cmd_result, args->format, args->format_flags); + fputs(msg, stdout); + free(msg); } else { OVS_NOT_REACHED(); } jsonrpc_close(client); - free(cmd_result); - free(cmd_error); + json_destroy(cmd_result); + json_destroy(cmd_error); + free(args); return 0; } @@ -101,24 +151,43 @@ Common commands:\n\ vlog/reopen Make the program reopen its log file\n\ Other options:\n\ --timeout=SECS wait at most SECS seconds for a response\n\ + -f, --format=FMT Output format. 
One of: 'json', or 'text'\n\ + (default: text)\n\ + --pretty Format the output in a more readable fashion.\n\ + Requires: --format=json.\n\ -h, --help Print this helpful information\n\ -V, --version Display ovs-appctl version information\n", program_name, program_name); exit(EXIT_SUCCESS); } -static const char * +static struct cmdl_args * +cmdl_args_create(void) +{ + struct cmdl_args *args = xmalloc(sizeof *args); + + args->format = UNIXCTL_OUTPUT_FMT_TEXT; + args->format_flags = 0; + args->target = NULL; + + return args; +} + +static struct cmdl_args * parse_command_line(int argc, char *argv[]) { enum { OPT_START = UCHAR_MAX + 1, - VLOG_OPTION_ENUMS + OPT_PRETTY, + VLOG_OPTION_ENUMS, }; static const struct option long_options[] = { {"target", required_argument, NULL, 't'}, {"execute", no_argument, NULL, 'e'}, + {"format", required_argument, NULL, 'f'}, {"help", no_argument, NULL, 'h'}, {"option", no_argument, NULL, 'o'}, + {"pretty", no_argument, NULL, OPT_PRETTY}, {"version", no_argument, NULL, 'V'}, {"timeout", required_argument, NULL, 'T'}, VLOG_LONG_OPTIONS, @@ -126,11 +195,11 @@ parse_command_line(int argc, char *argv[]) }; char *short_options_ = ovs_cmdl_long_options_to_short_options(long_options); char *short_options = xasprintf("+%s", short_options_); - const char *target; - int e_options; + struct cmdl_args *args = cmdl_args_create(); unsigned int timeout = 0; + bool pretty = false; + int e_options; - target = NULL; e_options = 0; for (;;) { int option; @@ -141,10 +210,10 @@ parse_command_line(int argc, char *argv[]) } switch (option) { case 't': - if (target) { + if (args->target) { ovs_fatal(0, "-t or --target may be specified only once"); } - target = optarg; + args->target = optarg; break; case 'e': @@ -157,6 +226,12 @@ parse_command_line(int argc, char *argv[]) } break; + case 'f': + if (!unixctl_output_fmt_from_string(optarg, &args->format)) { + ovs_fatal(0, "value %s on -f or --format is invalid", optarg); + } + break; + case 'h': usage(); break; 
@@ -165,6 +240,10 @@ parse_command_line(int argc, char *argv[]) ovs_cmdl_print_options(long_options); exit(EXIT_SUCCESS); + case OPT_PRETTY: + pretty = true; + break; + case 'T': if (!str_to_uint(optarg, 10, &timeout) || !timeout) { ovs_fatal(0, "value %s on -T or --timeout is invalid", optarg); @@ -194,7 +273,17 @@ parse_command_line(int argc, char *argv[]) "(use --help for help)"); } - return target ? target : "ovs-vswitchd"; + if (pretty) { + if (args->format != UNIXCTL_OUTPUT_FMT_JSON) { + ovs_fatal(0, "--pretty is supported with --format json only"); + } + args->format_flags |= JSSF_PRETTY; + } + + if (!args->target) { + args->target = "ovs-vswitchd"; + } + return args; } static struct jsonrpc * @@ -236,3 +325,31 @@ connect_to_target(const char *target) return client; } +/* The caller is responsible for freeing the returned string, with free(), when + * it is no longer needed. */ +static char * +reply_to_string(struct json *reply, enum unixctl_output_fmt fmt, + unsigned int fmt_flags) +{ + ovs_assert(reply); + + if (fmt == UNIXCTL_OUTPUT_FMT_TEXT && reply->type != JSON_STRING) { + ovs_error(0, "Unexpected reply type in JSON rpc reply: %s", + json_type_to_string(reply->type)); + exit(2); + } + + struct ds ds = DS_EMPTY_INITIALIZER; + + if (fmt == UNIXCTL_OUTPUT_FMT_TEXT) { + ds_put_cstr(&ds, json_string(reply)); + } else { + json_to_ds(reply, JSSF_SORT | fmt_flags, &ds); + } + + if (ds_last(&ds) != EOF && ds_last(&ds) != '\n') { + ds_put_char(&ds, '\n'); + } + + return ds_steal_cstr(&ds); +} diff --git a/utilities/ovs-ctl.in b/utilities/ovs-ctl.in index eba9512fe8b..57abd3a5b45 100644 --- a/utilities/ovs-ctl.in +++ b/utilities/ovs-ctl.in @@ -103,8 +103,13 @@ set_system_ids () { action "Configuring Open vSwitch system IDs" "$@" $extra_ids } -check_force_cores () { - if test X"$FORCE_COREFILES" = Xyes; then +check_core_config () { + if test X"$DUMP_HUGEPAGES" = Xyes; then + echo 0x7f > /proc/self/coredump_filter + if test X"$FORCE_COREFILES" = Xyes; then + ulimit 
-c unlimited + fi + elif test X"$FORCE_COREFILES" = Xyes; then ulimit -c 67108864 fi } @@ -116,7 +121,7 @@ del_transient_ports () { } do_start_ovsdb () { - check_force_cores + check_core_config if daemon_is_running ovsdb-server; then log_success_msg "ovsdb-server is already running" @@ -151,8 +156,8 @@ do_start_ovsdb () { [ "$OVS_USER" != "" ] && set "$@" --user "$OVS_USER" [ "$OVSDB_SERVER_OPTIONS" != "" ] && set "$@" $OVSDB_SERVER_OPTIONS - start_daemon "$OVSDB_SERVER_PRIORITY" "$OVSDB_SERVER_WRAPPER" "$@" \ - || return 1 + start_daemon "$OVSDB_SERVER_PRIORITY" "$OVSDB_SERVER_WRAPPER" \ + "$OVSDB_SERVER_UMASK" "$@" || return 1 # Initialize database settings. ovs_vsctl -- init -- set Open_vSwitch . db-version="$schemaver" \ @@ -193,7 +198,7 @@ add_managers () { } do_start_forwarding () { - check_force_cores + check_core_config insert_mod_if_required || return 1 @@ -221,8 +226,8 @@ do_start_forwarding () { [ "$OVS_USER" != "" ] && set "$@" --user "$OVS_USER" [ "$OVS_VSWITCHD_OPTIONS" != "" ] &&set "$@" $OVS_VSWITCHD_OPTIONS - start_daemon "$OVS_VSWITCHD_PRIORITY" "$OVS_VSWITCHD_WRAPPER" "$@" || - return 1 + start_daemon "$OVS_VSWITCHD_PRIORITY" "$OVS_VSWITCHD_WRAPPER" \ + "$OVS_VSWITCHD_UMASK" "$@" || return 1 fi } @@ -330,6 +335,7 @@ set_defaults () { DAEMON_CWD=/ FORCE_COREFILES=yes + DUMP_HUGEPAGES=no MLOCKALL=yes SELF_CONFINEMENT=yes MONITOR=yes @@ -342,6 +348,8 @@ set_defaults () { OVS_VSWITCHD_WRAPPER= OVSDB_SERVER_OPTIONS= OVS_VSWITCHD_OPTIONS= + OVSDB_SERVER_UMASK= + OVS_VSWITCHD_UMASK= DB_FILE=$dbdir/conf.db DB_SOCK=$rundir/db.sock @@ -415,10 +423,17 @@ Other important options for "start", "restart" and "force-reload-kmod": add given key-value pair to Open_vSwitch external-ids --delete-bridges delete all bridges just before starting ovs-vswitchd --ovs-user="user[:group]" pass the --user flag to ovs daemons + --ovsdb-server-umask=MODE Set umask prior to run ovsdb-server daemon. + This is useful to manage daemon's sockets permissions. 
+ Default is not to change umask (inherited from shell). + --ovs-vswitchd-umask=MODE Set umask prior to run ovs-vswitchd daemon. + This is useful to manage daemon's sockets permissions. + Default is not to change umask (inherited from shell). Less important options for "start", "restart" and "force-reload-kmod": --daemon-cwd=DIR set working dir for OVS daemons (default: $DAEMON_CWD) --no-force-corefiles do not force on core dumps for OVS daemons + --dump-hugepages include hugepages in core dumps --no-mlockall do not lock all of ovs-vswitchd into memory --ovsdb-server-priority=NICE set ovsdb-server's niceness (default: $OVSDB_SERVER_PRIORITY) --ovsdb-server-options=OPTIONS additional options for ovsdb-server (example: '-vconsole:dbg -vfile:dbg') @@ -441,7 +456,7 @@ File location options: Options for "enable-protocol": --protocol=PROTOCOL protocol to enable with iptables (default: gre) --sport=PORT source port to match (for tcp or udp protocol) - --dport=PORT ddestination port to match (for tcp or udp protocol) + --dport=PORT destination port to match (for tcp or udp protocol) Option for "start-ovs-ipsec": --ike-daemon=IKE_DAEMON diff --git a/utilities/ovs-dpctl-top.in b/utilities/ovs-dpctl-top.in index fbe6e4f560a..ec57eccd66e 100755 --- a/utilities/ovs-dpctl-top.in +++ b/utilities/ovs-dpctl-top.in @@ -351,7 +351,7 @@ def args_get(): # None is a special value indicating to read flows from stdin. 
# This handles the case # ovs-dpctl dump-flows | ovs-dpctl-flows.py - parser.add_argument("-v", "--version", version="@VERSION@", + parser.add_argument("-v", "--version", version="@VERSION@@VERSION_SUFFIX@", action="version", help="show version") parser.add_argument("-f", "--flow-file", dest="flowFiles", default=None, action="append", @@ -1236,11 +1236,7 @@ def flows_script(args): if (args.flowFiles is None): logging.info("reading flows from stdin") - ihdl = os.fdopen(sys.stdin.fileno(), 'r', 0) - try: - flow_db = flows_read(ihdl, flow_db) - finally: - ihdl.close() + flow_db = flows_read(sys.stdin, flow_db) else: for flowFile in args.flowFiles: logging.info("reading flows from %s", flowFile) diff --git a/utilities/ovs-lib.in b/utilities/ovs-lib.in index 13477a6a9e9..d162227dc5e 100644 --- a/utilities/ovs-lib.in +++ b/utilities/ovs-lib.in @@ -70,7 +70,7 @@ ovs_ctl () { esac } -VERSION='@VERSION@' +VERSION='@VERSION@@VERSION_SUFFIX@' DAEMON_CWD=/ @@ -165,9 +165,9 @@ install_dir () { } start_daemon () { - priority=$1 - wrapper=$2 - shift; shift + priority=$1 && shift + wrapper=$1 && shift + umask=$1 && shift daemon=$1 strace="" @@ -223,8 +223,19 @@ start_daemon () { set nice -n "$priority" "$@" fi + # Set requested umask if any and turn previous value back. + if [ -n "$umask" ]; then + previuos_umask_value=$(umask) + umask "$umask" + fi + action "Starting $daemon" "$@" || return 1 + # If umask was set, turn umask value to previous value. + if [ -n "$umask" ]; then + umask "$previuos_umask_value" + fi + if test X"$strace" != X; then # Strace doesn't have the -D option so we attach after the fact. setsid $strace -o "$logdir/$daemon.strace.log" \ diff --git a/utilities/ovs-ofctl.8.in b/utilities/ovs-ofctl.8.in index 10a6a64de90..d0f99f2bb92 100644 --- a/utilities/ovs-ofctl.8.in +++ b/utilities/ovs-ofctl.8.in @@ -296,6 +296,40 @@ Flushes the connection tracking entries in \fIzone\fR on \fIswitch\fR. 
This command uses an Open vSwitch extension that is only in Open vSwitch 2.6 and later. . +.IP "\fBct\-flush \fIswitch [zone=N] [mark=X[/M]] [labels=Y[/N]] [ct-orig-tuple [ct-reply-tuple]]\fR +Flushes the connection entries on \fIswitch\fR based on \fIzone\fR, \fImark\fR, +\fIlabels\fR and connection tracking tuples \fIct-[orig|reply]-tuple\fR. +.IP +If \fIct-[orig|reply]-tuple\fR is not provided, flushes all the connection +entries. If \fIzone\fR is specified, only flushes the connections in +\fIzone\fR. If \fImark\fR or \fIlabels\fR is provided, it will flush +only entries that are matching specific \fImark/labels\fR. +.IP +If \fIct-[orig|reply]-tuple\fR is provided, flushes the connection entry +specified by \fIct-[orig|reply]-tuple\fR in \fIzone\fR. The zone defaults +to 0 if it is not provided. The \fImark\fR and \fIlabels\fR default to "0/0" +if they are not provided. The userspace connection tracker requires flushing +with the original pre-NATed tuple and a warning log will be otherwise +generated. The tuple can be partial and will remove all connections that are +matching on the specified fields. In order to specify only +\fIct-reply-tuple\fR, provide an empty string as \fIct-orig-tuple\fR. +.IP +Note: Currently there is a limitation for matching on ICMP, in order to partially +match on ICMP parameters the \fIct-[orig|reply]-tuple\fR has to include +either source or destination IP. +.IP +An example of an IPv4 ICMP \fIct-[orig|reply]-tuple\fR: +.IP +"ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2,ct_nw_proto=1,icmp_type=8,icmp_code=0,icmp_id=10" +.IP +An example of an IPv6 TCP \fIct-[orig|reply]-tuple\fR: +.IP +"ct_ipv6_src=fc00::1,ct_ipv6_dst=fc00::2,ct_nw_proto=6,ct_tp_src=1,ct_tp_dst=2" +.IP +This command uses an Open vSwitch extension that is only in Open vSwitch 3.1 +and later. Support for matching on \fImark\fR and \fIlabels\fR is only in +Open vSwitch 3.3 and later. +. .SS "OpenFlow Switch Flow Table Commands" .
These commands manage the flow table in an OpenFlow switch. In each diff --git a/utilities/ovs-ofctl.c b/utilities/ovs-ofctl.c index fe911458027..ba3458e55ad 100644 --- a/utilities/ovs-ofctl.c +++ b/utilities/ovs-ofctl.c @@ -48,6 +48,7 @@ #include "openvswitch/meta-flow.h" #include "openvswitch/ofp-actions.h" #include "openvswitch/ofp-bundle.h" +#include "openvswitch/ofp-ct.h" #include "openvswitch/ofp-errors.h" #include "openvswitch/ofp-group.h" #include "openvswitch/ofp-match.h" @@ -153,6 +154,12 @@ static int show_stats = 1; /* --pcap: Makes "compose-packet" print a pcap on stdout. */ static int print_pcap = 0; +/* --bare: Makes "compose-packet" print a bare hexified payload. */ +static int print_bare = 0; + +/* --bad-csum: Makes "compose-packet" generate an invalid checksum. */ +static int bad_csum = 0; + /* --raw: Makes "ofp-print" read binary data from stdin. */ static int raw = 0; @@ -172,7 +179,7 @@ main(int argc, char *argv[]) ctx.argc = argc - optind; ctx.argv = argv + optind; - daemon_become_new_user(false); + daemon_become_new_user(false, false); if (read_only) { ovs_cmdl_run_command_read_only(&ctx, get_all_commands()); } else { @@ -242,6 +249,8 @@ parse_options(int argc, char *argv[]) {"color", optional_argument, NULL, OPT_COLOR}, {"may-create", no_argument, NULL, OPT_MAY_CREATE}, {"pcap", no_argument, &print_pcap, 1}, + {"bare", no_argument, &print_bare, 1}, + {"bad-csum", no_argument, &bad_csum, 1}, {"raw", no_argument, &raw, 1}, {"read-only", no_argument, NULL, OPT_READ_ONLY}, DAEMON_LONG_OPTIONS, @@ -485,6 +494,11 @@ usage(void) " dump-ipfix-bridge SWITCH print ipfix stats of bridge\n" " dump-ipfix-flow SWITCH print flow ipfix of a bridge\n" " ct-flush-zone SWITCH ZONE flush conntrack entries in ZONE\n" + " ct-flush SWITCH [ZONE] [mark=X[/M]] [labels=Y[/N]]\n" + " [CT_ORIG_TUPLE [CT_REPLY_TUPLE]]\n" + " flush conntrack entries specified\n" + " by CT_ORIG/REPLY_TUPLE, ZONE, mark\n" + " and labels\n" "\nFor OpenFlow switches and controllers:\n" "
probe TARGET probe whether TARGET is up\n" " ping TARGET [N] latency of N-byte echos\n" @@ -2123,7 +2137,7 @@ monitor_vconn(struct vconn *vconn, bool reply_to_echo_requests, int error; daemon_save_fd(STDERR_FILENO); - daemonize_start(false); + daemonize_start(false, false); error = unixctl_server_create(unixctl_path, &server); if (error) { ovs_fatal(error, "failed to create unixctl server"); @@ -3050,6 +3064,30 @@ ofctl_ct_flush_zone(struct ovs_cmdl_context *ctx) vconn_close(vconn); } +static void +ofctl_ct_flush(struct ovs_cmdl_context *ctx) +{ + struct vconn *vconn; + struct ofp_ct_match match = {0}; + struct ds ds = DS_EMPTY_INITIALIZER; + uint16_t zone; + int args = ctx->argc - 2; + bool with_zone = false; + + if (args && !ofp_ct_match_parse((const char **) &ctx->argv[2], + args, &ds, &match, &with_zone, &zone)) { + ovs_fatal(0, "Failed to parse CT match: %s", ds_cstr(&ds)); + } + + open_vconn(ctx->argv[1], &vconn); + enum ofp_version version = vconn_get_version(vconn); + struct ofpbuf *msg = ofp_ct_match_encode(&match, with_zone ? &zone : NULL, + version); + ds_destroy(&ds); + transact_noreply(vconn, msg); + vconn_close(vconn); +} + static void ofctl_dump_ipfix_flow(struct ovs_cmdl_context *ctx) { @@ -4896,20 +4934,33 @@ ofctl_parse_key_value(struct ovs_cmdl_context *ctx) } } -/* "compose-packet [--pcap] FLOW [L7]": Converts the OpenFlow flow - * specification FLOW to a packet with flow_compose() and prints the hex bytes - * in the packet on stdout. Also verifies that the flow extracted from that - * packet matches the original FLOW. +/* "compose-packet [--pcap|--bare] [--bad-csum] FLOW [L7]": Converts the + * OpenFlow flow specification FLOW to a packet with flow_compose() and prints + * the hex bytes of the packet, with offsets, to stdout. + * + * With --pcap, prints the packet in pcap format, so that you can do something + * like "ovs-ofctl --pcap compose-packet udp | tcpdump -vvvv -r-" to use + * another tool to dump the packet contents. 
+ * + * With --bare, prints the packet as a single bare hex string with no + * spaces or offsets, so that you can pass the result directly to e.g. + * "ovs-appctl netdev-dummy/receive vif $(ovs-ofctl compose-packet --bare + * FLOW)" * - * With --pcap, prints the packet to stdout instead as a pcap file, so that you - * can do something like "ovs-ofctl --pcap compose-packet udp | tcpdump -vvvv - * -r-" to use another tool to dump the packet contents. + * With --bad-csum, produces a packet with an invalid IP checksum. (For IPv4.) + * + * Regardless of the mode, the command also verifies that the flow extracted + * from that packet matches the original FLOW. * * If L7 is specified, draws the L7 payload data from it, otherwise defaults to * 64 bytes of payload. */ static void ofctl_compose_packet(struct ovs_cmdl_context *ctx) { + if (print_pcap && print_bare) { + ovs_fatal(1, "--bare and --pcap are mutually exclusive"); + } + if (print_pcap && isatty(STDOUT_FILENO)) { ovs_fatal(1, "not writing pcap data to stdout; redirect to a file " "or pipe to tcpdump instead"); @@ -4937,7 +4988,7 @@ ofctl_compose_packet(struct ovs_cmdl_context *ctx) l7_len = dp_packet_size(&payload); l7 = dp_packet_steal_data(&payload); } - flow_compose(&p, &flow1, l7, l7_len); + flow_compose(&p, &flow1, l7, l7_len, bad_csum); free(l7); if (print_pcap) { @@ -4945,6 +4996,16 @@ ofctl_compose_packet(struct ovs_cmdl_context *ctx) ovs_pcap_write_header(p_file); ovs_pcap_write(p_file, &p); ovs_pcap_close(p_file); + } else if (print_bare) { + /* Binary to a bare hex string. */ + for (int i = 0; i < dp_packet_size(&p); i++) { + uint8_t val = ((uint8_t *) dp_packet_data(&p))[i]; + /* Don't use ds_put_hex because it adds 0x prefix as well as + * it doesn't guarantee an even number of payload characters, which + * may be important elsewhere (e.g. in netdev-dummy/receive). 
*/ + printf("%02" PRIx8, val); + } + } else { ovs_hex_dump(stdout, dp_packet_data(&p), dp_packet_size(&p), 0, false); } @@ -5061,7 +5122,11 @@ static const struct ovs_cmdl_command all_commands[] = { 1, 1, ofctl_dump_ipfix_flow, OVS_RO }, { "ct-flush-zone", "switch zone", - 2, 2, ofctl_ct_flush_zone, OVS_RO }, + 2, 2, ofctl_ct_flush_zone, OVS_RW }, + + { "ct-flush", "switch [zone=N] [mark=X[/M]] [labels=Y[/N]] " + "[ct-orig-tuple [ct-reply-tuple]]", + 1, 6, ofctl_ct_flush, OVS_RW }, { "ofp-parse", "file", 1, 1, ofctl_ofp_parse, OVS_RW }, diff --git a/utilities/ovs-parse-backtrace.in b/utilities/ovs-parse-backtrace.in index f44f05cd1e1..42f831eed51 100755 --- a/utilities/ovs-parse-backtrace.in +++ b/utilities/ovs-parse-backtrace.in @@ -51,7 +51,7 @@ def addr2line(binary, addr): def main(): - parser = optparse.OptionParser(version='@VERSION@', + parser = optparse.OptionParser(version='@VERSION@@VERSION_SUFFIX@', usage="usage: %prog [binary]", description="""\ Parses the output of ovs-appctl backtrace producing a more human readable diff --git a/utilities/ovs-pcap.in b/utilities/ovs-pcap.in index 6b5f63399ec..d0ca9478869 100755 --- a/utilities/ovs-pcap.in +++ b/utilities/ovs-pcap.in @@ -85,7 +85,7 @@ if __name__ == "__main__": if key in ['-h', '--help']: usage() elif key in ['-V', '--version']: - print("ovs-pcap (Open vSwitch) @VERSION@") + print("ovs-pcap (Open vSwitch) @VERSION@@VERSION_SUFFIX@") else: sys.exit(0) diff --git a/utilities/ovs-pki.in b/utilities/ovs-pki.in index e0ba910f94c..69060b4ace4 100755 --- a/utilities/ovs-pki.in +++ b/utilities/ovs-pki.in @@ -57,6 +57,77 @@ FreeBSD|NetBSD|Darwin) ;; esac +case $(uname -s) in +MINGW*|MSYS*) + chmod() + { + local PERM=$1 + local FILE=$2 + local INH= + + if test -d "${FILE}"; then + # Inheritance rules for folders: apply to a folder itself, + # subfolders and files within. 
+ INH='(OI)(CI)' + fi + + case "${PERM}" in + *700 | *600) + # Reset all own and inherited ACEs and grant full access to the + # "Creator Owner". We're giving full access even for 0600, + # because it doesn't matter for a use case of ovs-pki. + icacls "${FILE}" /inheritance:r /grant:r "*S-1-3-0:${INH}F" + ;; + *750) + # Reset all own and inherited ACEs, grant full access to the + # "Creator Owner" and a read+execute access to the "Creator Group". + icacls "${FILE}" /inheritance:r /grant:r \ + "*S-1-3-0:${INH}F" "*S-1-3-1:${INH}RX" + ;; + *) + echo >&2 "Unable to set ${PERM} mode for ${FILE}." + exit 1 + ;; + esac + } + + mkdir() + { + ARG_P= + PERM= + for arg; do + shift + case ${arg} in + -m?*) + PERM=${arg#??} + continue + ;; + -m) + PERM=$1 + shift + continue + ;; + -p) + ARG_P=-p + continue + ;; + *) + set -- "$@" "${arg}" + ;; + esac + done + + command mkdir ${ARG_P} $@ + if [ ${PERM} ]; then + for dir; do + shift + chmod ${PERM} ${dir} + done + fi + } + ;; +esac + for option; do # This option-parsing mechanism borrowed from a Autoconf-generated # configure script under the following license: @@ -118,7 +189,7 @@ EOF exit 0 ;; -V|--version) - echo "ovs-pki (Open vSwitch) @VERSION@" + echo "ovs-pki (Open vSwitch) @VERSION@@VERSION_SUFFIX@" exit 0 ;; --di*=*) @@ -318,7 +389,7 @@ EOF -extensions ca_cert -out cacert.pem \ -days 3650 -batch -keyfile private/cakey.pem -selfsign \ -infiles careq.pem 1>&3 2>&3 - chmod 0700 private/cakey.pem + chmod 0600 private/cakey.pem cd "$oldpwd" done @@ -466,14 +537,24 @@ CN = $cn [ v3_req ] subjectAltName = DNS:$cn EOF + # It is important to create private keys in $TMP because umask doesn't + # work on Windows and permissions there are inherited from the folder. + # umask itself is still needed though to ensure correct permissions + # on non-Windows platforms. if test $keytype = rsa; then - (umask 077 && openssl genrsa -out "$1-privkey.pem" $bits) 1>&3 2>&3 \ - || exit $? 
+ (umask 077 && openssl genrsa -out "$TMP/privkey.pem" $bits) \ + 1>&3 2>&3 || exit $? else must_exist "$dsaparam" - (umask 077 && openssl gendsa -out "$1-privkey.pem" "$dsaparam") \ + (umask 077 && openssl gendsa -out "$TMP/privkey.pem" "$dsaparam") \ 1>&3 2>&3 || exit $? fi + # Windows: applying permissions (ACEs) to the file itself, just in case. + # 'mv' should technically preserve all the inherited ACEs from a TMP + # folder, but it's better to not rely on that. + chmod 0600 "$TMP/privkey.pem" + mv "$TMP/privkey.pem" "$1-privkey.pem" + openssl req -config "$TMP/req.cnf" -new -text \ -key "$1-privkey.pem" -out "$1-req.pem" 1>&3 2>&3 } @@ -545,16 +626,9 @@ elif test "$command" = self-sign; then cat > "$TMP/v3.ext" <&3 || exit $? - - # Reset the permissions on the certificate to the user's default. - cat "$arg1-cert.pem.tmp" > "$arg1-cert.pem" - rm -f "$arg1-cert.pem.tmp" + openssl x509 -in "$arg1-req.pem" -out "$arg1-cert.pem" \ + -signkey "$arg1-privkey.pem" -req -days 3650 -text \ + -extfile $TMP/v3.ext 2>&3 || exit $? else echo "$0: $command command unknown; use --help for help" >&2 exit 1 diff --git a/utilities/ovs-sim.in b/utilities/ovs-sim.in index 08957bdf46f..779ea60aee1 100755 --- a/utilities/ovs-sim.in +++ b/utilities/ovs-sim.in @@ -131,7 +131,7 @@ EOF export -f as sim_add() { - if test "$1" == --help; then + if test "$1" = --help; then cat < 4 || + (BASH_VERSINFO[0] == 4 && BASH_VERSINFO[1] >= 4))); then + printf '%s\n' "${PS1@P}" + return + fi + # Original inspiration from # http://stackoverflow.com/questions/10060500/bash-how-to-evaluate-ps1-ps2, # but changed quite a lot to make it more robust. diff --git a/utilities/ovs-vsctl.8.in b/utilities/ovs-vsctl.8.in index 9e319aa1cf8..5ce949df496 100644 --- a/utilities/ovs-vsctl.8.in +++ b/utilities/ovs-vsctl.8.in @@ -354,7 +354,7 @@ Prints the name of the bridge that contains \fIiface\fR on standard output. . 
.SS "Conntrack Zone Commands" -These commands query and modify datapath CT zones and Timeout Policies. +These commands query and modify datapath CT zones, Timeout Policies and Limits. . .IP "[\fB\-\-may\-exist\fR] \fBadd\-zone\-tp \fIdatapath \fBzone=\fIzone_id \fIpolicies\fR" Creates a conntrack zone timeout policy with \fIzone_id\fR in @@ -365,20 +365,37 @@ packet and a 60-second policy for ICMP reply packets. See the \fBCT_Timeout_Policy\fR table in \fBovs-vswitchd.conf.db\fR(5) for the supported keys. .IP -Without \fB\-\-may\-exist\fR, attempting to add a \fIzone_id\fR that -already exists is an error. With \fB\-\-may\-exist\fR, -this command does nothing if \fIzone_id\fR already exists. +Without \fB\-\-may\-exist\fR, attempting to add a \fIpolicy\fR for +\fIzone_id\fR that already has a policy is an error. + With \fB\-\-may\-exist\fR, this command does nothing if policy for + \fIzone_id\fR already exists. . .IP "[\fB\-\-if\-exists\fR] \fBdel\-zone\-tp \fIdatapath \fBzone=\fIzone_id\fR" Delete the timeout policy associated with \fIzone_id\fR from \fIdatapath\fR. .IP -Without \fB\-\-if\-exists\fR, attempting to delete a zone that -does not exist is an error. With \fB\-\-if\-exists\fR, attempting to -delete a zone that does not exist has no effect. +Without \fB\-\-if\-exists\fR, attempting to delete a policy for zone that +does not exist or doesn't have a policy is an error. With +\fB\-\-if\-exists\fR, attempting to delete a a policy that does not +exist has no effect. . .IP "\fBlist\-zone\-tp \fIdatapath\fR" Prints the timeout policies of all zones in \fIdatapath\fR. . +.IP "\fBset\-zone\-limit \fIdatapath \fIzone_id\fR|\fBdefault \fIzone_limit\fR" +Sets a conntrack zone limit with \fIzone_id\fR|\fIdefault\fR in +\fIdatapath\fR. The \fIlimit\fR with value \fB0\fR means unlimited. +.IP +. +.IP "[\fB\-\-if\-exists\fR] \fBdel\-zone\-limit \fIdatapath \fIzone_id\fR|\fBdefault\fR" +Delete the limit associated with \fIzone_id\fR from \fIdatapath\fR. 
+.IP +Without \fB\-\-if\-exists\fR, attempting to delete a limit for zone that +does not exist or doesn't have a limit is an error. With \fB\-\-if\-exists\fR, +attempting to delete a limit that does not exist has no effect. +. +.IP "\fBlist\-zone\-limits \fIdatapath\fR" +Prints the limits of all zones in \fIdatapath\fR. +. .SS "Datapath Capabilities Command" The command query datapath capabilities. . diff --git a/utilities/ovs-vsctl.c b/utilities/ovs-vsctl.c index 1032089fc26..495be356524 100644 --- a/utilities/ovs-vsctl.c +++ b/utilities/ovs-vsctl.c @@ -180,6 +180,7 @@ main(int argc, char *argv[]) ovsdb_idl_set_shuffle_remotes(idl, shuffle_remotes); ovsdb_idl_set_remote(idl, db, retry); ovsdb_idl_set_leader_only(idl, leader_only); + ovsdb_idl_set_db_change_aware(idl, false); run_prerequisites(commands, n_commands, idl); /* Execute the commands. @@ -441,6 +442,13 @@ Auto Attach commands:\n\ Switch commands:\n\ emer-reset reset switch to known good state\n\ \n\ +Connection Tracking commands:\n\ + set-zone-limit DATAPATH ZONE|default LIMIT\n\ + set CT LIMIT for ZONE|default on DATAPATH\n\ + del-zone-limit DATAPATH ZONE|default\n\ + delete CT limit for ZONE|default on DATAPATH\n\ + list-zone-limits DATAPATH list all limits configured on DATAPATH\n\ +\n\ %s\ %s\ \n\ @@ -574,15 +582,18 @@ add_bridge_to_cache(struct vsctl_context *vsctl_ctx, struct ovsrec_bridge *br_cfg, const char *name, struct vsctl_bridge *parent, int vlan) { - struct vsctl_bridge *br = xmalloc(sizeof *br); + struct vsctl_bridge *br = xzalloc(sizeof *br); + br->br_cfg = br_cfg; br->name = xstrdup(name); ovs_list_init(&br->ports); br->parent = parent; br->vlan = vlan; hmap_init(&br->children); + if (parent) { struct vsctl_bridge *conflict = find_vlan_bridge(parent, vlan); + if (conflict) { VLOG_WARN("%s: bridge has multiple VLAN bridges (%s and %s) " "for VLAN %d, but only one is allowed", @@ -658,7 +669,7 @@ static struct vsctl_port * add_port_to_cache(struct vsctl_context *vsctl_ctx, struct 
vsctl_bridge *parent, struct ovsrec_port *port_cfg) { - struct vsctl_port *port; + struct vsctl_port *port = xzalloc(sizeof *port); if (port_cfg->tag && *port_cfg->tag >= 0 && *port_cfg->tag <= 4095) { @@ -670,7 +681,6 @@ add_port_to_cache(struct vsctl_context *vsctl_ctx, struct vsctl_bridge *parent, } } - port = xmalloc(sizeof *port); ovs_list_push_back(&parent->ports, &port->ports_node); ovs_list_init(&port->ifaces); port->port_cfg = port_cfg; @@ -817,6 +827,7 @@ vsctl_context_populate_cache(struct ctl_context *ctx) continue; } br = shash_find_data(&vsctl_ctx->bridges, br_cfg->name); + ovs_assert(br); for (j = 0; j < br_cfg->n_ports; j++) { struct ovsrec_port *port_cfg = br_cfg->ports[j]; struct vsctl_port *port; @@ -888,14 +899,23 @@ check_conflicts(struct vsctl_context *vsctl_ctx, const char *name, port = shash_find_data(&vsctl_ctx->ports, name); if (port) { - ctl_fatal("%s because a port named %s already exists on " - "bridge %s", msg, name, port->bridge->name); + if (port->bridge) { + ctl_fatal("%s because a port named %s already exists on " + "bridge %s", msg, name, port->bridge->name); + } else { + ctl_fatal("%s because a port named %s already exists", msg, name); + } } iface = shash_find_data(&vsctl_ctx->ifaces, name); if (iface) { - ctl_fatal("%s because an interface named %s already exists " - "on bridge %s", msg, name, iface->port->bridge->name); + if (iface->port->bridge) { + ctl_fatal("%s because an interface named %s already exists " + "on bridge %s", msg, name, iface->port->bridge->name); + } else { + ctl_fatal("%s because an interface named %s already exists", msg, + name); + } } free(msg); @@ -935,7 +955,7 @@ find_port(struct vsctl_context *vsctl_ctx, const char *name, bool must_exist) ovs_assert(vsctl_ctx->cache_valid); port = shash_find_data(&vsctl_ctx->ports, name); - if (port && !strcmp(name, port->bridge->name)) { + if (port && port->bridge && !strcmp(name, port->bridge->name)) { port = NULL; } if (must_exist && !port) { @@ -953,7 +973,8 @@ 
find_iface(struct vsctl_context *vsctl_ctx, const char *name, bool must_exist) ovs_assert(vsctl_ctx->cache_valid); iface = shash_find_data(&vsctl_ctx->ifaces, name); - if (iface && !strcmp(name, iface->port->bridge->name)) { + if (iface && iface->port->bridge && + !strcmp(name, iface->port->bridge->name)) { iface = NULL; } if (must_exist && !iface) { @@ -1288,8 +1309,8 @@ cmd_add_zone_tp(struct ctl_context *ctx) ctl_fatal("No timeout policy"); } - if (zone && !may_exist) { - ctl_fatal("zone id %"PRIu64" already exists", zone_id); + if (zone && zone->timeout_policy && !may_exist) { + ctl_fatal("zone id %"PRIu64" already has a policy", zone_id); } tp = create_timeout_policy(ctx, &ctx->argv[3], n_tps); @@ -1318,11 +1339,20 @@ cmd_del_zone_tp(struct ctl_context *ctx) } struct ovsrec_ct_zone *zone = find_ct_zone(dp, zone_id); - if (must_exist && !zone) { - ctl_fatal("zone id %"PRIu64" does not exist", zone_id); + if (must_exist && !(zone && zone->timeout_policy)) { + ctl_fatal("zone id %"PRIu64" does not have a policy", zone_id); } - if (zone) { + if (!zone) { + return; + } + + if (zone->limit) { + if (zone->timeout_policy) { + ovsrec_ct_timeout_policy_delete(zone->timeout_policy); + } + ovsrec_ct_zone_set_timeout_policy(zone, NULL); + } else { ovsrec_datapath_update_ct_zones_delkey(dp, zone_id); } } @@ -1357,12 +1387,118 @@ cmd_list_zone_tp(struct ctl_context *ctx) } } +static void +cmd_set_zone_limit(struct ctl_context *ctx) +{ + struct vsctl_context *vsctl_ctx = vsctl_context_cast(ctx); + int64_t zone_id = -1; + int64_t limit = -1; + + const char *dp_name = ctx->argv[1]; + + ovs_scan(ctx->argv[2], "%"SCNi64, &zone_id); + ovs_scan(ctx->argv[3], "%"SCNi64, &limit); + + struct ovsrec_datapath *dp = find_datapath(vsctl_ctx, dp_name); + if (!dp) { + ctl_fatal("datapath %s does not exist", dp_name); + } + + if (limit < 0 || limit > UINT32_MAX) { + ctl_fatal("limit (%"PRIi64") out of range", limit); + } + + if (!strcmp(ctx->argv[2], "default")) { + 
ovsrec_datapath_set_ct_zone_default_limit(dp, &limit, 1); + return; + } + + if (zone_id < 0 || zone_id > UINT16_MAX) { + ctl_fatal("zone_id (%"PRIi64") out of range", zone_id); + } + + struct ovsrec_ct_zone *zone = find_ct_zone(dp, zone_id); + if (!zone) { + zone = ovsrec_ct_zone_insert(ctx->txn); + ovsrec_datapath_update_ct_zones_setkey(dp, zone_id, zone); + } + + ovsrec_ct_zone_set_limit(zone, &limit, 1); +} + +static void +cmd_del_zone_limit(struct ctl_context *ctx) +{ + struct vsctl_context *vsctl_ctx = vsctl_context_cast(ctx); + int64_t zone_id = -1; + + bool must_exist = !shash_find(&ctx->options, "--if-exists"); + const char *dp_name = ctx->argv[1]; + + ovs_scan(ctx->argv[2], "%"SCNi64, &zone_id); + + struct ovsrec_datapath *dp = find_datapath(vsctl_ctx, dp_name); + if (!dp) { + ctl_fatal("datapath %s does not exist", dp_name); + } + + if (!strcmp(ctx->argv[2], "default")) { + if (must_exist && !dp->ct_zone_default_limit) { + ctl_fatal("datapath %s does not have a limit", dp_name); + } + + ovsrec_datapath_set_ct_zone_default_limit(dp, NULL, 0); + return; + } + + struct ovsrec_ct_zone *zone = find_ct_zone(dp, zone_id); + if (must_exist && !(zone && zone->limit)) { + ctl_fatal("zone_id %"PRIi64" does not have a limit", zone_id); + } + + if (!zone) { + return; + } + + if (zone->timeout_policy) { + ovsrec_ct_zone_set_limit(zone, NULL, 0); + } else { + ovsrec_datapath_update_ct_zones_delkey(dp, zone_id); + } +} + +static void +cmd_list_zone_limits(struct ctl_context *ctx) +{ + struct vsctl_context *vsctl_ctx = vsctl_context_cast(ctx); + + struct ovsrec_datapath *dp = find_datapath(vsctl_ctx, ctx->argv[1]); + if (!dp) { + ctl_fatal("datapath: %s record not found", ctx->argv[1]); + } + + if (dp->ct_zone_default_limit) { + ds_put_format(&ctx->output, "Default, Limit: %"PRIu64"\n", + *dp->ct_zone_default_limit); + } + + for (int i = 0; i < dp->n_ct_zones; i++) { + struct ovsrec_ct_zone *zone = dp->value_ct_zones[i]; + if (zone->limit) { + ds_put_format(&ctx->output,
"Zone: %"PRIu64", Limit: %"PRIu64"\n", + dp->key_ct_zones[i], *zone->limit); + } + } +} + static void pre_get_zone(struct ctl_context *ctx) { ovsdb_idl_add_column(ctx->idl, &ovsrec_open_vswitch_col_datapaths); ovsdb_idl_add_column(ctx->idl, &ovsrec_datapath_col_ct_zones); + ovsdb_idl_add_column(ctx->idl, &ovsrec_datapath_col_ct_zone_default_limit); ovsdb_idl_add_column(ctx->idl, &ovsrec_ct_zone_col_timeout_policy); + ovsdb_idl_add_column(ctx->idl, &ovsrec_ct_zone_col_limit); ovsdb_idl_add_column(ctx->idl, &ovsrec_ct_timeout_policy_col_timeouts); } @@ -2711,9 +2847,9 @@ post_db_reload_do_checks(const struct vsctl_context *vsctl_ctx) static void vsctl_context_init_command(struct vsctl_context *vsctl_ctx, - struct ctl_command *command) + struct ctl_command *command, bool last_command) { - ctl_context_init_command(&vsctl_ctx->base, command); + ctl_context_init_command(&vsctl_ctx->base, command, last_command); vsctl_ctx->verified_ports = false; } @@ -2859,7 +2995,8 @@ do_vsctl(const char *args, struct ctl_command *commands, size_t n_commands, } vsctl_context_init(&vsctl_ctx, NULL, idl, txn, ovs, symtab); for (c = commands; c < &commands[n_commands]; c++) { - vsctl_context_init_command(&vsctl_ctx, c); + vsctl_context_init_command(&vsctl_ctx, c, + c == &commands[n_commands - 1]); if (c->syntax->run) { (c->syntax->run)(&vsctl_ctx.base); } @@ -3144,6 +3281,14 @@ static const struct ctl_command_syntax vsctl_commands[] = { /* Datapath capabilities. */ {"list-dp-cap", 1, 1, "", pre_get_dp_cap, cmd_list_dp_cap, NULL, "", RO}, + /* CT zone limit. 
*/ + {"set-zone-limit", 3, 3, "ARG ARG ARG", pre_get_zone, cmd_set_zone_limit, + NULL, "", RW}, + {"del-zone-limit", 2, 2, "ARG ARG", pre_get_zone, cmd_del_zone_limit, NULL, + "--if-exists", RW}, + {"list-zone-limits", 1, 1, "ARG", pre_get_zone, cmd_list_zone_limits, NULL, + "", RO}, + {NULL, 0, 0, NULL, NULL, NULL, NULL, NULL, RO}, }; diff --git a/utilities/usdt-scripts/dpif_nl_exec_monitor.py b/utilities/usdt-scripts/dpif_nl_exec_monitor.py new file mode 100755 index 00000000000..0a9ff812334 --- /dev/null +++ b/utilities/usdt-scripts/dpif_nl_exec_monitor.py @@ -0,0 +1,662 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2022 Red Hat, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Script information: +# ------------------- +# dpif_nl_exec_monitor.py uses the dpif_netlink_operate__:op_flow_execute USDT +# probe to receive all DPIF_OP_EXECUTE operations that are queued for +# transmission over the netlink socket. It will do some basic decoding, and if +# requested a packet dump. +# +# Here is an example: +# +# # ./dpif_nl_exec_monitor.py --packet-decode decode +# Display DPIF_OP_EXECUTE operations being queued for transmission... +# TIME CPU COMM PID NL_SIZE +# 3124.516679897 1 ovs-vswitchd 8219 180 +# nlmsghdr : len = 0, type = 36, flags = 1, seq = 0, pid = 0 +# genlmsghdr: cmd = 3, version = 1, reserver = 0 +# ovs_header: dp_ifindex = 21 +# > Decode OVS_PACKET_ATTR_* TLVs: +# nla_len 46, nla_type OVS_PACKET_ATTR_PACKET[1], data: 00 00 00... 
+# nla_len 20, nla_type OVS_PACKET_ATTR_KEY[2], data: 08 00 02 00... +# > Decode OVS_KEY_ATTR_* TLVs: +# nla_len 8, nla_type OVS_KEY_ATTR_PRIORITY[2], data: 00 00... +# nla_len 8, nla_type OVS_KEY_ATTR_SKB_MARK[15], data: 00 00... +# nla_len 88, nla_type OVS_PACKET_ATTR_ACTIONS[3], data: 4c 00 03... +# > Decode OVS_ACTION_ATTR_* TLVs: +# nla_len 76, nla_type OVS_ACTION_ATTR_SET[3], data: 48 00... +# > Decode OVS_TUNNEL_KEY_ATTR_* TLVs: +# nla_len 12, nla_type OVS_TUNNEL_KEY_ATTR_ID[0], data:... +# nla_len 20, nla_type OVS_TUNNEL_KEY_ATTR_IPV6_DST[13], ... +# nla_len 5, nla_type OVS_TUNNEL_KEY_ATTR_TTL[4], data: 40 +# nla_len 4, nla_type OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT... +# nla_len 4, nla_type OVS_TUNNEL_KEY_ATTR_CSUM[6], data: +# nla_len 6, nla_type OVS_TUNNEL_KEY_ATTR_TP_DST[10],... +# nla_len 12, nla_type OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS... +# nla_len 8, nla_type OVS_ACTION_ATTR_OUTPUT[1], data: 02 00 00 00 +# - Dumping OVS_PACKET_ATR_PACKET data: +# ###[ Ethernet ]### +# dst = 00:00:00:00:ec:01 +# src = 04:f4:bc:28:57:00 +# type = IPv4 +# ###[ IP ]### +# version = 4 +# ihl = 5 +# tos = 0x0 +# len = 50 +# id = 0 +# flags = +# frag = 0 +# ttl = 127 +# proto = icmp +# chksum = 0x2767 +# src = 10.0.0.1 +# dst = 10.0.0.100 +# \options \ +# ###[ ICMP ]### +# type = echo-request +# code = 0 +# chksum = 0xf7f3 +# id = 0x0 +# seq = 0xc +# +# The example above dumps the full netlink and packet decode. However options +# exist to disable this. 
Here is the full list of supported options: +# +# usage: dpif_nl_exec_monitor.py [-h] [--buffer-page-count NUMBER] [-D [DEBUG]] +# [-d {none,hex,decode}] [-n {none,hex,nlraw}] +# [-p VSWITCHD_PID] [-s [64-2048]] +# [-w PCAP_FILE] +# +# optional arguments: +# -h, --help show this help message and exit +# --buffer-page-count NUMBER +# Number of BPF ring buffer pages, default 1024 +# -D [DEBUG], --debug [DEBUG] +# Enable eBPF debugging +# -d {none,hex,decode}, --packet-decode {none,hex,decode} +# Display packet content in selected mode, default none +# -n {none,hex,nlraw}, --nlmsg-decode {none,hex,nlraw} +# Display netlink message content in selected mode, +# default nlraw +# -p VSWITCHD_PID, --pid VSWITCHD_PID +# ovs-vswitch's PID +# -s [64-2048], --nlmsg-size [64-2048] +# Set maximum netlink message size to capture, default +# 512 +# -w PCAP_FILE, --pcap PCAP_FILE +# Write upcall packets to specified pcap file + +from bcc import BPF, USDT, USDTException +from os.path import exists +from scapy.all import hexdump, wrpcap +from scapy.layers.l2 import Ether + +import argparse +import psutil +import re +import struct +import sys +import time + +# +# Actual eBPF source code +# +ebpf_source = """ +#include + +#define MAX_NLMSG + +struct event_t { + u32 cpu; + u32 pid; + u64 ts; + u32 nl_size; + char comm[TASK_COMM_LEN]; + u8 nl_msg[MAX_NLMSG]; +}; + +struct ofpbuf { + void *base; + void *data; + uint32_t size; + + /* The actual structure is longer, but we are only interested in the + * first couple of entries. 
*/ +}; + +BPF_RINGBUF_OUTPUT(events, ); +BPF_TABLE("percpu_array", uint32_t, uint64_t, dropcnt, 1); + +int trace__op_flow_execute(struct pt_regs *ctx) { + struct ofpbuf nlbuf; + uint32_t size; + + bpf_usdt_readarg_p(5, ctx, &nlbuf, sizeof(nlbuf)); + + struct event_t *event = events.ringbuf_reserve(sizeof(struct event_t)); + if (!event) { + uint32_t type = 0; + uint64_t *value = dropcnt.lookup(&type); + if (value) + __sync_fetch_and_add(value, 1); + + return 1; + } + + event->ts = bpf_ktime_get_ns(); + event->cpu = bpf_get_smp_processor_id(); + event->pid = bpf_get_current_pid_tgid(); + bpf_get_current_comm(&event->comm, sizeof(event->comm)); + + event->nl_size = nlbuf.size; + if (event->nl_size > MAX_NLMSG) + size = MAX_NLMSG; + else + size = event->nl_size; + + bpf_probe_read(&event->nl_msg, size, nlbuf.data); + + events.ringbuf_submit(event, 0); + return 0; +}; +""" + + +# +# print_event() +# +def print_event(ctx, data, size): + event = b["events"].event(data) + print("{:<18.9f} {:<4} {:<16} {:<10} {:<10}". + format(event.ts / 1000000000, + event.cpu, + event.comm.decode("utf-8"), + event.pid, + event.nl_size)) + + # + # Dumping the netlink message data if requested. 
+ # + if event.nl_size < options.nlmsg_size: + nl_size = event.nl_size + else: + nl_size = options.nlmsg_size + + if options.nlmsg_decode == "hex": + # + # Abuse scapy's hex dump to dump flow key + # + print(re.sub("^", " " * 4, + hexdump(Ether(bytes(event.nl_msg)[:nl_size]), dump=True), + flags=re.MULTILINE)) + + if options.nlmsg_decode == "nlraw": + decode_result = decode_nlm(bytes(event.nl_msg)[:nl_size], dump=True) + else: + decode_result = decode_nlm(bytes(event.nl_msg)[:nl_size], dump=False) + + # + # Decode packet only if there is data + # + if "OVS_PACKET_ATTR_PACKET" not in decode_result: + return + + pkt_data = decode_result["OVS_PACKET_ATTR_PACKET"] + indent = 4 if options.nlmsg_decode != "nlraw" else 6 + + if options.packet_decode != "none": + print("{}- Dumping OVS_PACKET_ATR_PACKET data:".format(" " * indent)) + + if options.packet_decode == "hex": + print(re.sub("^", " " * indent, hexdump(pkt_data, dump=True), + flags=re.MULTILINE)) + + packet = Ether(pkt_data) + if options.packet_decode == "decode": + print(re.sub("^", " " * indent, packet.show(dump=True), + flags=re.MULTILINE)) + + if options.pcap is not None: + wrpcap(options.pcap, packet, append=True) + + +# +# decode_nlm_tlvs() +# +def decode_nlm_tlvs(tlvs, header=None, indent=4, dump=True, + attr_to_str_func=None, decode_tree=None): + bytes_left = len(tlvs) + result = {} + + if dump: + print("{}{}".format(" " * indent, header)) + + while bytes_left: + if bytes_left < 4: + if dump: + print("{}WARN: decode truncated; can't read header".format( + " " * indent)) + break + + nla_len, nla_type = struct.unpack("=HH", tlvs[:4]) + + if nla_len < 4: + if dump: + print("{}WARN: decode truncated; nla_len < 4".format( + " " * indent)) + break + + nla_data = tlvs[4:nla_len] + trunc = "" + + if attr_to_str_func is None: + nla_type_name = "type_{}".format(nla_type) + else: + nla_type_name = attr_to_str_func(nla_type) + + if nla_len > bytes_left: + trunc = "..." 
+ nla_data = nla_data[:(bytes_left - 4)] + else: + result[nla_type_name] = nla_data + + if dump: + print("{}nla_len {}, nla_type {}[{}], data: {}{}".format( + " " * indent, nla_len, nla_type_name, nla_type, + "".join("{:02x} ".format(b) for b in nla_data), trunc)) + + # + # If we have the full data, try to decode further + # + if trunc == "" and decode_tree is not None \ + and nla_type_name in decode_tree: + node = decode_tree[nla_type_name] + decode_nlm_tlvs(nla_data, + header=node["header"], + indent=indent + node["indent"], dump=dump, + attr_to_str_func=node["attr_str_func"], + decode_tree=node["decode_tree"]) + + if trunc != "": + if dump: + print("{}WARN: decode truncated; nla_len > msg_len[{}] ". + format(" " * indent, bytes_left)) + break + + # update next offset, but make sure it's aligned correctly + next_offset = (nla_len + 3) & ~(3) + tlvs = tlvs[next_offset:] + bytes_left -= next_offset + + return result + + +# +# decode_nlm() +# +def decode_nlm(msg, indent=4, dump=True): + result = {} + + # + # Decode 'struct nlmsghdr' + # + if dump: + print("{}nlmsghdr : len = {}, type = {}, flags = {}, seq = {}, " + "pid = {}".format(" " * indent, + *struct.unpack("=IHHII", msg[:16]))) + + msg = msg[16:] + + # + # Decode 'struct genlmsghdr' + # + if dump: + print("{}genlmsghdr: cmd = {}, version = {}, reserver = {}".format( + " " * indent, *struct.unpack("=BBH", msg[:4]))) + + msg = msg[4:] + + # + # Decode 'struct ovs_header' + # + if dump: + print("{}ovs_header: dp_ifindex = {}".format( + " " * indent, *struct.unpack("=I", msg[:4]))) + + msg = msg[4:] + + # + # Decode TLVs + # + nl_attr_tree = { + "OVS_PACKET_ATTR_KEY": { + "header": "> Decode OVS_KEY_ATTR_* TLVs:", + "indent": 4, + "attr_str_func": get_ovs_key_attr_str, + "decode_tree": None, + }, + "OVS_PACKET_ATTR_ACTIONS": { + "header": "> Decode OVS_ACTION_ATTR_* TLVs:", + "indent": 4, + "attr_str_func": get_ovs_action_attr_str, + "decode_tree": { + "OVS_ACTION_ATTR_SET": { + "header": "> Decode OVS_KEY_ATTR_*
TLVs:", + "indent": 4, + "attr_str_func": get_ovs_key_attr_str, + "decode_tree": { + "OVS_KEY_ATTR_TUNNEL": { + "header": "> Decode OVS_TUNNEL_KEY_ATTR_* TLVs:", + "indent": 4, + "attr_str_func": get_ovs_tunnel_key_attr_str, + "decode_tree": None, + }, + }, + }, + }, + }, + } + + result = decode_nlm_tlvs(msg, indent=indent + 2, dump=dump, + header="> Decode OVS_PACKET_ATTR_* TLVs:", + attr_to_str_func=get_ovs_pkt_attr_str, + decode_tree=nl_attr_tree) + return result + + +# +# get_ovs_pkt_attr_str() +# +def get_ovs_pkt_attr_str(attr): + ovs_pkt_attr = ["OVS_PACKET_ATTR_UNSPEC", + "OVS_PACKET_ATTR_PACKET", + "OVS_PACKET_ATTR_KEY", + "OVS_PACKET_ATTR_ACTIONS", + "OVS_PACKET_ATTR_USERDATA", + "OVS_PACKET_ATTR_EGRESS_TUN_KEY", + "OVS_PACKET_ATTR_UNUSED1", + "OVS_PACKET_ATTR_UNUSED2", + "OVS_PACKET_ATTR_PROBE", + "OVS_PACKET_ATTR_MRU", + "OVS_PACKET_ATTR_LEN", + "OVS_PACKET_ATTR_HASH"] + if attr < 0 or attr >= len(ovs_pkt_attr): + return "".format(attr) + + return ovs_pkt_attr[attr] + + +# +# get_ovs_key_attr_str() +# +def get_ovs_key_attr_str(attr): + ovs_key_attr = ["OVS_KEY_ATTR_UNSPEC", + "OVS_KEY_ATTR_ENCAP", + "OVS_KEY_ATTR_PRIORITY", + "OVS_KEY_ATTR_IN_PORT", + "OVS_KEY_ATTR_ETHERNET", + "OVS_KEY_ATTR_VLAN", + "OVS_KEY_ATTR_ETHERTYPE", + "OVS_KEY_ATTR_IPV4", + "OVS_KEY_ATTR_IPV6", + "OVS_KEY_ATTR_TCP", + "OVS_KEY_ATTR_UDP", + "OVS_KEY_ATTR_ICMP", + "OVS_KEY_ATTR_ICMPV6", + "OVS_KEY_ATTR_ARP", + "OVS_KEY_ATTR_ND", + "OVS_KEY_ATTR_SKB_MARK", + "OVS_KEY_ATTR_TUNNEL", + "OVS_KEY_ATTR_SCTP", + "OVS_KEY_ATTR_TCP_FLAGS", + "OVS_KEY_ATTR_DP_HASH", + "OVS_KEY_ATTR_RECIRC_ID", + "OVS_KEY_ATTR_MPLS", + "OVS_KEY_ATTR_CT_STATE", + "OVS_KEY_ATTR_CT_ZONE", + "OVS_KEY_ATTR_CT_MARK", + "OVS_KEY_ATTR_CT_LABELS", + "OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4", + "OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6", + "OVS_KEY_ATTR_NSH"] + + if attr < 0 or attr >= len(ovs_key_attr): + return "".format(attr) + + return ovs_key_attr[attr] + + +# +# get_ovs_action_attr_str() +# +def get_ovs_action_attr_str(attr): + 
ovs_action_attr = ["OVS_ACTION_ATTR_UNSPEC", + "OVS_ACTION_ATTR_OUTPUT", + "OVS_ACTION_ATTR_USERSPACE", + "OVS_ACTION_ATTR_SET", + "OVS_ACTION_ATTR_PUSH_VLAN", + "OVS_ACTION_ATTR_POP_VLAN", + "OVS_ACTION_ATTR_SAMPLE", + "OVS_ACTION_ATTR_RECIRC", + "OVS_ACTION_ATTR_HASH", + "OVS_ACTION_ATTR_PUSH_MPLS", + "OVS_ACTION_ATTR_POP_MPLS", + "OVS_ACTION_ATTR_SET_MASKED", + "OVS_ACTION_ATTR_CT", + "OVS_ACTION_ATTR_TRUNC", + "OVS_ACTION_ATTR_PUSH_ETH", + "OVS_ACTION_ATTR_POP_ETH", + "OVS_ACTION_ATTR_CT_CLEAR", + "OVS_ACTION_ATTR_PUSH_NSH", + "OVS_ACTION_ATTR_POP_NSH", + "OVS_ACTION_ATTR_METER", + "OVS_ACTION_ATTR_CLONE", + "OVS_ACTION_ATTR_CHECK_PKT_LEN", + "OVS_ACTION_ATTR_ADD_MPLS", + "OVS_ACTION_ATTR_TUNNEL_PUSH", + "OVS_ACTION_ATTR_TUNNEL_POP", + "OVS_ACTION_ATTR_DROP", + "OVS_ACTION_ATTR_LB_OUTPUT"] + if attr < 0 or attr >= len(ovs_action_attr): + return "".format(attr) + + return ovs_action_attr[attr] + + +# +# get_ovs_tunnel_key_attr_str() +# +def get_ovs_tunnel_key_attr_str(attr): + ovs_tunnel_key_attr = ["OVS_TUNNEL_KEY_ATTR_ID", + "OVS_TUNNEL_KEY_ATTR_IPV4_SRC", + "OVS_TUNNEL_KEY_ATTR_IPV4_DST", + "OVS_TUNNEL_KEY_ATTR_TOS", + "OVS_TUNNEL_KEY_ATTR_TTL", + "OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT", + "OVS_TUNNEL_KEY_ATTR_CSUM", + "OVS_TUNNEL_KEY_ATTR_OAM", + "OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS", + "OVS_TUNNEL_KEY_ATTR_TP_SRC", + "OVS_TUNNEL_KEY_ATTR_TP_DST", + "OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS", + "OVS_TUNNEL_KEY_ATTR_IPV6_SRC", + "OVS_TUNNEL_KEY_ATTR_IPV6_DST", + "OVS_TUNNEL_KEY_ATTR_PAD", + "OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS", + "OVS_TUNNEL_KEY_ATTR_GTPU_OPTS"] + if attr < 0 or attr >= len(ovs_tunnel_key_attr): + return "".format(attr) + + return ovs_tunnel_key_attr[attr] + + +# +# buffer_size_type() +# +def buffer_size_type(astr, min=64, max=2048): + value = int(astr) + if min <= value <= max: + return value + else: + raise argparse.ArgumentTypeError( + "value not in range {}-{}".format(min, max)) + + +# +# next_power_of_two() +# +def next_power_of_two(val): + np = 1 + while np 
< val: + np *= 2 + return np + + +# +# main() +# +def main(): + # + # Don't like these globals, but ctx passing does not seem to work with the + # existing open_ring_buffer() API :( + # + global b + global options + + # + # Argument parsing + # + parser = argparse.ArgumentParser() + + parser.add_argument("--buffer-page-count", + help="Number of BPF ring buffer pages, default 1024", + type=int, default=1024, metavar="NUMBER") + parser.add_argument("-D", "--debug", + help="Enable eBPF debugging", + type=int, const=0x3f, default=0, nargs="?") + parser.add_argument("-d", "--packet-decode", + help="Display packet content in selected mode, " + "default none", + choices=["none", "hex", "decode"], default="none") + parser.add_argument("-n", "--nlmsg-decode", + help="Display netlink message content in selected mode" + ", default nlraw", + choices=["none", "hex", "nlraw"], default="nlraw") + parser.add_argument("-p", "--pid", metavar="VSWITCHD_PID", + help="ovs-vswitch's PID", + type=int, default=None) + parser.add_argument("-s", "--nlmsg-size", + help="Set maximum netlink message size to capture, " + "default 512", type=buffer_size_type, default=512, + metavar="[64-2048]") + parser.add_argument("-w", "--pcap", metavar="PCAP_FILE", + help="Write upcall packets to specified pcap file", + type=str, default=None) + + options = parser.parse_args() + + # + # Find the PID of the ovs-vswitchd daemon if not specified. + # + if options.pid is None: + for proc in psutil.process_iter(): + if "ovs-vswitchd" in proc.name(): + if options.pid is not None: + print("ERROR: Multiple ovs-vswitchd daemons running, " + "use the -p option!") + sys.exit(-1) + + options.pid = proc.pid + + # + # Error checking on input parameters + # + if options.pid is None: + print("ERROR: Failed to find ovs-vswitchd's PID!") + sys.exit(-1) + + if options.pcap is not None: + if exists(options.pcap): + print("ERROR: Destination capture file \"{}\" already exists!". 
+ format(options.pcap)) + sys.exit(-1) + + options.buffer_page_count = next_power_of_two(options.buffer_page_count) + + # + # Attach the usdt probe + # + u = USDT(pid=int(options.pid)) + try: + u.enable_probe(probe="dpif_netlink_operate__:op_flow_execute", + fn_name="trace__op_flow_execute") + except USDTException as e: + print("ERROR: {}" + "ovs-vswitchd!".format( + (re.sub("^", " " * 7, str(e), flags=re.MULTILINE)).strip(). + replace("--with-dtrace or --enable-dtrace", + "--enable-usdt-probes"))) + sys.exit(-1) + + # + # Uncomment to see how arguments are decoded. + # print(u.get_text()) + # + + # + # Attach probe to running process + # + source = ebpf_source.replace("", str(options.nlmsg_size)) + source = source.replace("", + str(options.buffer_page_count)) + + b = BPF(text=source, usdt_contexts=[u], debug=options.debug) + + # + # Print header + # + print("Display DPIF_OP_EXECUTE operations being queued for transmission " + "onto the netlink socket.") + print("{:<18} {:<4} {:<16} {:<10} {:<10}".format( + "TIME", "CPU", "COMM", "PID", "NL_SIZE")) + + # + # Dump out all events + # + b["events"].open_ring_buffer(print_event) + while 1: + try: + b.ring_buffer_poll() + time.sleep(0.5) + except KeyboardInterrupt: + break + + dropcnt = b.get_table("dropcnt") + for k in dropcnt.keys(): + count = dropcnt.sum(k).value + if k.value == 0 and count > 0: + print("\nWARNING: Not all upcalls were captured, {} were dropped!" + "\n Increase the BPF ring buffer size with the " + "--buffer-page-count option.".format(count)) + + +# +# Start main() as the default entry point... +# +if __name__ == "__main__": + main() diff --git a/utilities/usdt-scripts/flow_reval_monitor.py b/utilities/usdt-scripts/flow_reval_monitor.py new file mode 100755 index 00000000000..28479a5650d --- /dev/null +++ b/utilities/usdt-scripts/flow_reval_monitor.py @@ -0,0 +1,982 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2022-2024 Redhat, Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Script information: +# ------------------- +# flow_reval_monitor.py uses the dpif_netlink_operate:flow_put and +# revalidator:flow_result USDT probes to monitor flow lifetimes and +# expiration events. By default, this will show all flow_put and flow +# expiration events, along with their reasons. This will look like so: +# +# TID TIME UFID EVENT/REASON +# 71828 1549.119959874 39f0f28f-33... Insert (put) flow to ovs kernel module. +# 71828 1549.420877223 850db41c-47... Insert (put) flow to ovs kernel module. +# 71828 1550.476923456 5bacfca9-fe... Insert (put) flow to ovs kernel module. +# 71832 1559.650192299 850db41c-47... Idle flow timed out +# 71832 1561.153332825 39f0f28f-33... Idle flow timed out +# 71832 1572.684316304 5bacfca9-fe... Idle flow timed out +# +# Flow key data can be printed using the --flow-keys option. This will +# print the equivalent datapath flow string. +# +# When filtering flows, the syntax is the same as used by +# `ovs-appctl dpctl/add-flow`. +# +# For a complete list of options, please use the '--help' or '-h' argument. 
+# +# Examples: +# +# To use the script on a running ovs-vswitchd to see flow keys and expiration +# events for flows with an ipv4 source of 192.168.10.10: +# $ ./flow_reval_monitor.py --flow-keys --filter-flows \ +# "ipv4(src=192.168.10.10)" +# TIME UFID EVENT/REASON +# 105082.457322742 ufid:f76fc899-376d-466b-bc74-0000b933eb97 flow_put +# ufid:f76fc899-376d-466b-bc74-0000b933eb97 has the following flow information: +# in_port(2), +# eth(src=0e:04:47:fc:74:51, dst=da:dc:c5:69:05:d7), \ +# eth_type(0x800), \ +# ipv4(src=192.168.10.10, dst=192.168.10.30, proto=1, tos=0, ttl=64,[...]), +# icmp(type=8, code=0) +# 105092.635450202 ufid:f76fc899-376d-466b-bc74-0000b933eb97 Flow timed out +# +# Notes: +# 1) No options are needed to attach when there is a single running instance +# of ovs-vswitchd. +# 2) If you're using the flow filtering option, it will only track flows that +# have been upcalled since the script began running. +# 3) When using the flow filtering option, the key size will likely need to +# be expanded to match on all the fields in the message. The default is +# kept small to keep the buffer copy sizes down when displaying +# flows (-k), but is hardcoded to 2048 when an actual filter (-l) is +# applied +# 4) The flow filtering format is a simplified form of the ODP syntax, and +# does not support masked matches, which means you will need to filter +# on exact details. The fields present are dependent on how the +# classifier and OFP rules form the ODP rules - not all fields may be +# present in a particular flow. +# 5) The flow_put filtering only happens for flows installed into the ovs +# kernel module. This means flows taking the HW offload path (ie: tc), +# or on DPDK side won't get matched. 
+ +try: + from bcc import BPF + from bcc import USDT + from bcc import USDTException +except ModuleNotFoundError: + print("ERROR: Can't find the BPF Compiler Collection Tools.") + print("Please install them before running this script.") + exit(1) + +from enum import IntEnum +from ipaddress import IPv4Address, IPv6Address +from pathlib import Path + +import argparse +import psutil +import re +import struct +import subprocess +import sys + +# +# eBPF source code +# +bpf_src = """ +#include + +#define MAX_KEY +#define FLOW_FILTER + +enum probe { }; + + + +struct event_t { + u64 ts; + u32 pid; + u32 result; + u32 reason; + u32 ufid[4]; + u64 key_size; + unsigned char key[MAX_KEY]; + enum probe probe; +}; + +BPF_HASH(watchlist, ovs_u128); +BPF_RINGBUF_OUTPUT(events, ); +BPF_TABLE("percpu_array", uint32_t, uint64_t, dropcnt, 1); + +/* Hack to make a 'static' like storage object. */ +BPF_TABLE("percpu_array", uint32_t, struct udpif_key, udpk, 1); + +static struct event_t *get_event(enum probe p) { + struct event_t *event = events.ringbuf_reserve(sizeof(struct event_t)); + + if (!event) { + dropcnt.increment(0); + return NULL; + } + + event->probe = p; + event->ts = bpf_ktime_get_ns(); + event->pid = bpf_get_current_pid_tgid(); + + return event; +} + +static int emit_flow_result(struct udpif_key *ukey, ovs_u128 ufid, + u32 result, u32 reason) { + struct event_t *event = NULL; + u64 *ufid_present = NULL; + + ufid_present = watchlist.lookup(&ufid); + if (FLOW_FILTER && !ufid_present) { + return 0; + } + + event = get_event(FLOW_RESULT); + if (!event) { + /* If we can't reserve the space in the ring buffer, return 1. 
*/ + return 1; + } + + event->result = result; + event->reason = reason; + bpf_probe_read(&event->ufid, sizeof ufid, &ufid); + events.ringbuf_submit(event, 0); + + return 0; +} + +int usdt__flow_result(struct pt_regs *ctx) { + struct udpif_key *ukey = NULL; + u32 reason = 0; + u32 result = 0; + ovs_u128 ufid; + u32 zero = 0; + + ukey = udpk.lookup(&zero); + if (!ukey) { + return 1; + } + bpf_usdt_readarg_p(2, ctx, ukey, sizeof(struct udpif_key)); + bpf_usdt_readarg(3, ctx, &result); + bpf_usdt_readarg(4, ctx, &reason); + ufid = ukey->ufid; + + return emit_flow_result(ukey, ufid, result, reason); +} + +int usdt__flow_sweep_result(struct pt_regs *ctx) { + struct udpif_key *ukey = NULL; + u32 reason = 0; + u32 result = 0; + ovs_u128 ufid; + u32 zero = 0; + + ukey = udpk.lookup(&zero); + if (!ukey) { + return 1; + } + bpf_usdt_readarg_p(2, ctx, ukey, sizeof(struct udpif_key)); + bpf_usdt_readarg(3, ctx, &result); + bpf_usdt_readarg(4, ctx, &reason); + ufid = ukey->ufid; + + return emit_flow_result(ukey, ufid, result, reason); +} + +int usdt__op_flow_put(struct pt_regs *ctx) { + struct dpif_flow_put put; + ovs_u128 ufid; + + struct event_t *event = get_event(OP_FLOW_PUT); + if (!event) { + /* If we can't reserve the space in the ring buffer, return 1. 
*/ + return 1; + } + + bpf_usdt_readarg_p(2, ctx, &put, sizeof put); + bpf_probe_read(&event->ufid, sizeof event->ufid, put.ufid); + bpf_probe_read(&ufid, sizeof ufid, &event->ufid); + if (put.key_len > MAX_KEY) { + put.key_len = MAX_KEY; + } + event->key_size = put.key_len; + bpf_probe_read(&event->key, put.key_len, put.key); + event->reason = 0; + events.ringbuf_submit(event, 0); + + watchlist.increment(ufid); + return 0; +} +""" + +Event = IntEnum("Event", ["OP_FLOW_PUT", "FLOW_RESULT"], start=0) +RevalResult = IntEnum( + "reval_result", + [ + "UKEY_KEEP", + "UKEY_DELETE", + "UKEY_MODIFY", + ], + start=0, +) + +# +# The below FdrReasons and FdrReasonStrings definitions can be found in the +# ofproto/ofproto-dpif-upcall.c file. Please keep them in sync. +# +FdrReasons = IntEnum( + "flow_del_reason", + [ + "FDR_NONE", + "FDR_AVOID_CACHING", + "FDR_BAD_ODP_FIT", + "FDR_FLOW_IDLE", + "FDR_FLOW_LIMIT", + "FDR_FLOW_WILDCARDED", + "FDR_NO_OFPROTO", + "FDR_PURGE", + "FDR_TOO_EXPENSIVE", + "FDR_UPDATE_FAIL", + "FDR_XLATION_ERROR", + ], + start=0, +) + +FdrReasonStrings = { + FdrReasons.FDR_NONE: "No delete reason specified", + FdrReasons.FDR_AVOID_CACHING: "Cache avoidance flag set", + FdrReasons.FDR_BAD_ODP_FIT: "Bad ODP flow fit", + FdrReasons.FDR_FLOW_IDLE: "Flow idle timeout", + FdrReasons.FDR_FLOW_LIMIT: "Kill all flows condition reached", + FdrReasons.FDR_FLOW_WILDCARDED: "Flow needs a narrower wildcard mask", + FdrReasons.FDR_NO_OFPROTO: "Bridge not found", + FdrReasons.FDR_PURGE: "User requested flow deletion", + FdrReasons.FDR_TOO_EXPENSIVE: "Too expensive to revalidate", + FdrReasons.FDR_UPDATE_FAIL: "Datapath update failed", + FdrReasons.FDR_XLATION_ERROR: "Flow translation error" +} + + +def err(msg, code=-1): + """Prints an error to stderr and exits""" + + print(msg, file=sys.stderr) + sys.exit(code) + + +def run_program(command): + """Invokes a new process and returns stdout. 
Note that this will honor + the PATH environment variable, so best to use it sparingly, or with a + full path to binary.""" + + try: + process = subprocess.run( + command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + encoding="utf8", + check=True, + ) + + except subprocess.CalledProcessError as perror: + return perror.returncode, perror.stdout + + return 0, process.stdout + + +def get_ovs_definitions(objects, pahole="pahole", pid=None): + """Uses `pahole` or similar utility to pull object definitions from a + running OVS process. The objects argument can either be a string + or can be a list of strings. Optionally, pass a specific `pahole` + binary to use rather than the default. PID needs to be set.""" + + if pid is None: + raise ValueError("A valid pid value should be supplied!") + + if not isinstance(objects, list): + objects = [objects] + + if len(objects) == 0: + raise ValueError("Must supply at least one object!") + + vswitchd = Path(f"/proc/{pid}/exe").resolve() + + object_str = ",".join(objects) + + def run_pahole(debug_file): + """Helper designed for running pahole, or something with compatible + output""" + + error, result = run_program( + [pahole, "-C", object_str, "--compile", debug_file] + ) + + if error: + if f"pahole: {debug_file}: Invalid argument" not in result: + err( + "ERROR: Pahole failed to get ovs-vswitchd data " + "structures!\n{}".format( + re.sub( + "^", " " * 7, result.rstrip(), flags=re.MULTILINE + ) + ) + ) + + return None + + if bool(re.search("pahole: type .* not found", result)): + return None + + return result + + def run_readelf(bin_file): + """Helper designed for running readelf or something with compatible + output""" + + error, result = run_program( + ["readelf", "-n", "--debug-dump=links", bin_file] + ) + + if error: + err( + "ERROR: Failed 'readelf' on \"{}\"!\n{}".format( + bin_file, re.sub("^", " " * 7, result, flags=re.MULTILINE) + ) + ) + + return result + + def get_debug_file(bin_file): + """Runs readelf 
against the binary, and attempts to find the associated + debuginfo file.""" + elf_result = run_readelf(bin_file) + match = re.search("Build ID: ([0-9a-fA-F]+)", elf_result) + if not match: + err("ERROR: Can't find build ID to read debug symbols!") + + dbg_file = "/usr/lib/debug/.build-id/{}/{}.debug".format( + match.group(1)[:2], match.group(1)[2:] + ) + + return dbg_file + + def get_from_shared_library(debug_file): + ovs_libs = [ + "libofproto", + "libopenvswitch", + "libovsdb", + "libsflow", + "libvtep", + ] + error, ldd_result = run_program(["ldd", debug_file]) + + if error: + err( + "ERROR: Failed 'ldd' on \"{}\"!\n{}".format( + debug_file, + re.sub("^", " " * 7, ldd_result, flags=re.MULTILINE), + ) + ) + + for lib in ovs_libs: + match = re.search( + r"^\s*{}.* => (.*) \(.*\)$".format(lib), + ldd_result, + flags=re.MULTILINE, + ) + if match is None: + continue + + result = run_pahole(match.group(1)) + if result is None: + result = run_pahole(get_debug_file(match.group(1))) + + if result: + return result + + return None + + # + # First try to find the debug data as part of the executable. + # + result = run_pahole(vswitchd) + + if result is None: + print(f'INFO: Failed to find debug info in "{vswitchd}"!') + + # + # Get additional .debug information if available. + # + dbg_file = get_debug_file(vswitchd) + result = run_pahole(dbg_file) + if result is None: + print(f'INFO: Failed to find debug info in "{dbg_file}"!') + + # + # Try to get information from shared libraries if used. + # + result = get_from_shared_library(vswitchd) + + if result is None: + err(f"ERROR: Failed to find needed data structures through {pahole}") + + # + # We need an empty _Atomic definition to avoid compiler complaints. + # + result = "#define _Atomic\n" + result + + # + # Remove the uint64_t definition as it conflicts with the kernel one. 
def format_ufid(ufid):
    """Formats a UFID object into a human readable form.

    The UFID is rendered as five dash-separated hexadecimal groups in the
    8-4-4-4-12 layout.  If ufid is None, "ufid:none" is returned instead.

    ufid must be indexable and provide integers at indexes 0 through 3
    (presumably the four 32-bit words of the OVS 128-bit UFID; the
    watchlist helper in this script unpacks keys with "=IIII").
    """
    if ufid is None:
        return "ufid:none"

    # Words 1 and 2 are each split into an upper and a lower 16-bit half.
    # The lower half of word 2 must be masked with 0xFFFF; masking with 0
    # would always print "0000" and hide part of the identifier.
    return "{:08x}-{:04x}-{:04x}-{:04x}-{:04x}{:08x}".format(
        ufid[0],
        ufid[1] >> 16,
        ufid[1] & 0xFFFF,
        ufid[2] >> 16,
        ufid[2] & 0xFFFF,
        ufid[3],
    )
def compare_flow_to_target(target, flow):
    """Routine to compare two flow keys.

    Returns True only when every key in `target` is matched by `flow`:
      - the key must exist in `flow`;
      - a target value of True acts as a wildcard (key presence is enough);
      - equal values match;
      - when both values are dicts they are compared recursively, with the
        same rules, so the target may list only a subset of the sub-fields.
    Any other combination is a mismatch and returns False.
    """

    for key in target:
        if key not in flow:
            return False
        if target[key] is True or target[key] == flow[key]:
            continue
        if isinstance(target[key], dict) and isinstance(flow[key], dict):
            # Only bail out on a mismatch; a matching nested dict must not
            # end the comparison, the remaining target keys still have to
            # be checked.
            if not compare_flow_to_target(target[key], flow[key]):
                return False
            continue
        return False
    return True


#
# parse_flow_str()
#
def parse_flow_str(flow_str):
    """Loosely parses an ODP flow key into a dict for further processing.

    The string is split on ")" into attributes.  For each attribute:
      - "name"            -> flow_dict["name"] = True
      - "name(value)"     -> flow_dict["name"] = "value"
      - "name(a=1,b=2)"   -> flow_dict["name"] = {"a": "1", "b": "2"}
    All values are kept as strings; no validation is done here.
    """

    f_list = [i.strip(", ") for i in flow_str.split(")")]
    if f_list[-1] == "":
        f_list = f_list[:-1]
    flow_dict = {}
    for e in f_list:
        split_list = e.split("(")
        k = split_list[0]
        if len(split_list) == 1:
            # Bare attribute name, no parentheses: presence-only match.
            flow_dict[k] = True
        elif split_list[1].count("=") == 0:
            flow_dict[k] = split_list[1]
        else:
            sub_dict = {}
            sublist = [i.strip() for i in split_list[1].split(",")]
            for subkey in sublist:
                brk = subkey.find("=")
                sub_dict[subkey[:brk]] = subkey[brk + 1:]
            flow_dict[k] = sub_dict
    return flow_dict


def print_expiration(event):
    """Prints a UFID eviction with a reason.

    Reads event.pid, event.ts (divided by 1e9 for display), event.ufid and
    event.reason.  Reasons missing from FdrReasonStrings are printed as
    "Unknown reason '<value>'".
    """
    ufid_str = format_ufid(event.ufid)

    try:
        reason = FdrReasonStrings[event.reason]
    except KeyError:
        reason = f"Unknown reason '{event.reason}'"

    print(
        "{:<10} {:<18.9f} {:<36} {:<17}".format(
            event.pid,
            event.ts / 1000000000,
            ufid_str,
            reason,
        )
    )


def decode_key(msg):
    """Decodes netlink OVS key attribute.

    Walks `msg` (bytes) as a sequence of attributes, each starting with a
    4-byte header of two native-order 16-bit values (length, type), where
    the length includes the header.  Returns a dict mapping the attribute
    name (see get_ovs_key_attr_str()) to the raw payload bytes.

    Parsing stops at the first attribute whose header is incomplete, whose
    length is below the header size, or whose length exceeds the remaining
    buffer; in that case an INFO line with the number of unparsed bytes is
    printed.
    """
    result = {}
    bytes_left = len(msg)
    while bytes_left >= 4:
        nla_len, nla_type = struct.unpack("=HH", msg[:4])
        if nla_len < 4 or nla_len > bytes_left:
            break
        result[get_ovs_key_attr_str(nla_type)] = msg[4:nla_len]
        # Attributes are aligned on 4 bytes.  Re-derive the remaining
        # length from the sliced buffer so an unpadded final attribute
        # cannot drive the counter negative and trigger a bogus
        # "truncated" message.
        msg = msg[(nla_len + 3) & (~3):]
        bytes_left = len(msg)
    if bytes_left:
        print(f"INFO: Buffer truncated with {bytes_left} bytes left.")
    return result


#
# get_ovs_key_attr_str()
#
def get_ovs_key_attr_str(attr):
    """Returns the short name for a numeric OVS key attribute type.

    The table is indexed by attribute number.  Numbers outside the table
    (negative, or greater than or equal to its length) yield the fallback
    string built from the number instead of raising IndexError.
    """
    ovs_key_attr = [
        "OVS_KEY_ATTR_UNSPEC",
        "encap",
        "skb_priority",
        "in_port",
        "eth",
        "vlan",
        "eth_type",
        "ipv4",
        "ipv6",
        "tcp",
        "udp",
        "icmp",
        "icmpv6",
        "arp",
        "nd",
        "skb_mark",
        "tunnel",
        "sctp",
        "tcp_flags",
        "dp_hash",
        "recirc_id",
        "mpls",
        "ct_state",
        "ct_zone",
        "ct_mark",
        "ct_label",
        "ct_tuple4",
        "ct_tuple6",
        "nsh",
    ]

    # Valid indexes are 0 .. len - 1, so len itself is already out of range.
    if attr < 0 or attr >= len(ovs_key_attr):
        return ": {}".format(attr)
    return ovs_key_attr[attr]
Throws a KeyError when it encounters an unknown flow + key.""" + + ret_str = "" + parseable = {} + skip = ["nsh", "tunnel", "mpls", "vlan"] + need_byte_swap = ["ct_label"] + ipv4addrs = ["ct_tuple4", "tunnel", "ipv4", "arp"] + ipv6addrs = ["ipv6", "nd", "ct_tuple6"] + macs = {"eth": [0, 1], "arp": [3, 4], "nd": [1, 2]} + fields = [ + ("OVS_KEY_ATTR_UNSPEC"), + ("encap",), + ("skb_priority", " 1: + data = list( + struct.unpack( + fields[attr][1], v[: struct.calcsize(fields[attr][1])] + ) + ) + if k in ipv4addrs: + if data[0].count(0) < 4: + data[0] = str(IPv4Address(data[0])) + else: + data[0] = b"\x00" + if data[1].count(0) < 4: + data[1] = str(IPv4Address(data[1])) + else: + data[1] = b"\x00" + if k in ipv6addrs: + if data[0].count(0) < 16: + data[0] = str(IPv6Address(data[0])) + else: + data[0] = b"\x00" + if data[1].count(0) < len(data[1]): + data[1] = str(IPv6Address(data[1])) + else: + data[1] = b"\x00" + if k in macs.keys(): + for e in macs[k]: + if data[e].count(0) == 6: + mac_str = b"\x00" + else: + mac_str = ":".join(["%02x" % i for i in data[e]]) + data[e] = mac_str + if decode and len(fields[attr]) > 2: + field_dict = dict(zip(fields[attr][2:], data)) + s = ", ".join(k + "=" + str(v) for k, v in field_dict.items()) + elif decode and k != "eth_type": + s = str(data[0]) + field_dict = s + else: + if decode: + s = hex(data[0]) + field_dict = s + ret_str += k + "(" + s + "), " + parseable[k] = field_dict + ret_str = ret_str[:-2] + return (parseable, ret_str) + + +def handle_event(ctx, data, size): + """Dispatches to the correct event handler based on the event probe + type. + + Once we grab the event, we have three cases. + 1. It's a revalidator probe and the reason is nonzero: A flow is expiring + 2. It's a revalidator probe and the reason is zero: flow revalidated + 3. It's a flow_put probe. + + We will ignore case 2, and report all others. 
+ """ + + event = b["events"].event(data) + if event.probe == Event.OP_FLOW_PUT: + handle_flow_put(event) + elif ( + event.probe == Event.FLOW_RESULT + and event.result == RevalResult.UKEY_DELETE + ): + print_expiration(event) + + +def main(): + # + # Don't like these globals, but ctx passing does not work with the existing + # open_ring_buffer() API :( + # + global b + global args + + # + # Argument parsing + # + parser = argparse.ArgumentParser() + parser.add_argument( + "--buffer-page-count", + help="Number of BPF ring buffer pages, default 1024", + type=int, + default=1024, + metavar="NUMBER", + ) + parser.add_argument( + "-f", + "--flow-key-size", + help="Set maximum flow key size to capture, " + "default 128 - see notes", + type=buffer_size_type, + default=128, + metavar="[128-2048]", + ) + parser.add_argument( + "-k", + "--flow-keys", + help="Print flow keys as flow strings", + action="store_true", + ) + parser.add_argument( + "-l", + "--filter-flows", + metavar="FLOW_STRING", + help="Filter flows that match the specified " "ODP-like flow", + type=str, + default=None, + nargs="*", + ) + parser.add_argument( + "-P", + "--pahole", + metavar="PAHOLE", + help="Pahole executable to use, default pahole", + type=str, + default="pahole", + ) + parser.add_argument( + "-p", + "--pid", + metavar="VSWITCHD_PID", + help="ovs-vswitchd's PID", + type=int, + default=None, + ) + parser.add_argument( + "-D", + "--debug", + help="Enable eBPF debugging", + type=int, + const=0x3F, + default=0, + nargs="?", + ) + args = parser.parse_args() + + # + # Find the PID of the ovs-vswitchd daemon if not specified. + # + if args.pid is None: + for proc in psutil.process_iter(): + if "ovs-vswitchd" in proc.name(): + if args.pid is not None: + err( + "Error: Multiple ovs-vswitchd daemons running, " + "use the -p option!" 
+ ) + + args.pid = proc.pid + # + # Error checking on input parameters + # + if args.pid is None: + err("ERROR: Failed to find ovs-vswitchd's PID!") + + # + # Attach the USDT probes + # + try: + u = USDT(pid=int(args.pid)) + u.enable_probe(probe="op_flow_put", fn_name="usdt__op_flow_put") + u.enable_probe(probe="flow_result", fn_name="usdt__flow_result") + u.enable_probe( + probe="flow_sweep_result", fn_name="usdt__flow_sweep_result" + ) + except USDTException as e: + err("Failed to attach probes due to:\n" + str(e)) + + # + # Attach the probes to the running process + # + source = bpf_src.replace( + "", str(args.buffer_page_count) + ) + + source = source.replace( + "", + get_ovs_definitions( + ["udpif_key", "ovs_u128", "dpif_flow_put"], + pid=args.pid, + pahole=args.pahole, + ), + ) + + if args.filter_flows is None: + filter_bool = 0 + + # Set the key size based on what the user wanted + source = source.replace("", str(args.flow_key_size)) + else: + filter_bool = 1 + args.filter_flows = parse_flow_str(args.filter_flows[0]) + + # Run through the parser to make sure we only filter on fields we + # understand + parse_flow_dict(args.filter_flows, False) + + # This is hardcoded here because it doesn't make sense to shrink the + # size, since the flow key might be missing fields that are matched in + # the flow filter. + source = source.replace("", "2048") + + source = source.replace("", str(filter_bool)) + + source = source.replace( + "", + "\n".join([f"{event.name} = {event.value}," for event in Event]), + ) + + b = BPF(text=source, usdt_contexts=[u], debug=args.debug) + + # + # Print header + # + print( + "{:<10} {:<18} {:<36} {:<17}".format( + "TID", "TIME", "UFID", "EVENT/REASON" + ) + ) + + # + # Dump out all events. 
+ # + b["events"].open_ring_buffer(handle_event) + while 1: + try: + b.ring_buffer_poll() + except KeyboardInterrupt: + break + + dropcnt = b.get_table("dropcnt") + for k in dropcnt.keys(): + count = dropcnt.sum(k).value + if k.value == 0 and count > 0: + print( + "\n# WARNING: Not all flow operations were captured, {} were" + " dropped!\n# Increase the BPF ring buffer size " + "with the --buffer-page-count option.".format(count) + ) + + +# +# Start main() as the default entry point +# +if __name__ == "__main__": + main() diff --git a/utilities/usdt-scripts/kernel_delay.py b/utilities/usdt-scripts/kernel_delay.py new file mode 100755 index 00000000000..de6b0c9de4d --- /dev/null +++ b/utilities/usdt-scripts/kernel_delay.py @@ -0,0 +1,1482 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2022,2023 Red Hat, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# Script information: +# ------------------- +# This script allows a developer to quickly identify if the issue at hand +# might be related to the kernel running out of resources or if it really is +# an Open vSwitch issue. +# +# For documentation see the kernel_delay.rst file. 
+# +# +# Dependencies: +# ------------- +# You need to install the BCC package for your specific platform or build it +# yourself using the following instructions: +# https://raw.githubusercontent.com/iovisor/bcc/master/INSTALL.md +# +# Python needs the following additional packages installed: +# - pytz +# - psutil +# +# You can either install your distribution specific package or use pip: +# pip install pytz psutil +# +import argparse +import datetime +import os +import pytz +import psutil +import re +import sys +import time + +import ctypes as ct + +try: + from bcc import BPF, USDT, USDTException + from bcc.syscall import syscalls, syscall_name +except ModuleNotFoundError: + print("ERROR: Can't find the BPF Compiler Collection (BCC) tools!") + sys.exit(os.EX_OSFILE) + +from enum import IntEnum + + +# +# Actual eBPF source code +# +EBPF_SOURCE = """ +#include +#include + +#define MONITOR_PID + +enum { + +}; + +struct event_t { + u64 ts; + u32 tid; + u32 id; + + int user_stack_id; + int kernel_stack_id; + + u32 syscall; + u64 entry_ts; +}; + +BPF_RINGBUF_OUTPUT(events, ); +BPF_STACK_TRACE(stack_traces, ); +BPF_TABLE("percpu_array", uint32_t, uint64_t, dropcnt, 1); +BPF_TABLE("percpu_array", uint32_t, uint64_t, trigger_miss, 1); + +BPF_ARRAY(capture_on, u64, 1); +static inline bool capture_enabled(u64 pid_tgid) { + int key = 0; + u64 *ret; + + if ((pid_tgid >> 32) != MONITOR_PID) + return false; + + ret = capture_on.lookup(&key); + return ret && *ret == 1; +} + +static inline bool capture_enabled__() { + int key = 0; + u64 *ret; + + ret = capture_on.lookup(&key); + return ret && *ret == 1; +} + +static struct event_t *get_event(uint32_t id) { + struct event_t *event = events.ringbuf_reserve(sizeof(struct event_t)); + + if (!event) { + dropcnt.increment(0); + return NULL; + } + + event->id = id; + event->ts = bpf_ktime_get_ns(); + event->tid = bpf_get_current_pid_tgid(); + + return event; +} + +static int start_trigger() { + int key = 0; + u64 *val = 
capture_on.lookup(&key); + + /* If the value is -1 we can't start as we are still processing the + * results in userspace. */ + if (!val || *val != 0) { + trigger_miss.increment(0); + return 0; + } + + struct event_t *event = get_event(EVENT_START_TRIGGER); + if (event) { + events.ringbuf_submit(event, 0); + *val = 1; + } else { + trigger_miss.increment(0); + } + return 0; +} + +static int stop_trigger() { + int key = 0; + u64 *val = capture_on.lookup(&key); + + if (!val || *val != 1) + return 0; + + struct event_t *event = get_event(EVENT_STOP_TRIGGER); + + if (event) + events.ringbuf_submit(event, 0); + + if (val) + *val = -1; + + return 0; +} + + + + + +/* + * For the syscall monitor the following probes get installed. + */ +struct syscall_data_t { + u64 count; + u64 total_ns; + u64 worst_ns; +}; + +struct syscall_data_key_t { + u32 pid; + u32 tid; + u32 syscall; +}; + +BPF_HASH(syscall_start, u64, u64); +BPF_HASH(syscall_data, struct syscall_data_key_t, struct syscall_data_t); + +TRACEPOINT_PROBE(raw_syscalls, sys_enter) { + u64 pid_tgid = bpf_get_current_pid_tgid(); + + if (!capture_enabled(pid_tgid)) + return 0; + + u64 t = bpf_ktime_get_ns(); + syscall_start.update(&pid_tgid, &t); + + return 0; +} + +TRACEPOINT_PROBE(raw_syscalls, sys_exit) { + struct syscall_data_t *val, zero = {}; + struct syscall_data_key_t key; + + u64 pid_tgid = bpf_get_current_pid_tgid(); + + if (!capture_enabled(pid_tgid)) + return 0; + + key.pid = pid_tgid >> 32; + key.tid = (u32)pid_tgid; + key.syscall = args->id; + + u64 *start_ns = syscall_start.lookup(&pid_tgid); + + if (!start_ns) + return 0; + + val = syscall_data.lookup_or_try_init(&key, &zero); + if (val) { + u64 delta = bpf_ktime_get_ns() - *start_ns; + val->count++; + val->total_ns += delta; + if (delta > val->worst_ns) + val->worst_ns = delta; + + if () { + struct event_t *event = get_event(EVENT_SYSCALL); + if (event) { + event->syscall = args->id; + event->entry_ts = *start_ns; + if () { + event->user_stack_id = 
stack_traces.get_stackid( + args, BPF_F_USER_STACK); + event->kernel_stack_id = stack_traces.get_stackid( + args, 0); + } + events.ringbuf_submit(event, 0); + } + } + } + return 0; +} + + +/* + * For measuring the thread stopped time, we need the following. + */ +struct stop_time_data_t { + u64 count; + u64 total_ns; + u64 worst_ns; +}; + +struct pid_tid_key_t { + u32 pid; + u32 tid; +}; + +BPF_HASH(stop_start, u64, u64); +BPF_HASH(stop_data, struct pid_tid_key_t, struct stop_time_data_t); + +static inline void thread_handle_stopped_run(u32 pid, u32 tgid, u64 ktime) +{ + u64 pid_tgid = (u64) tgid << 32 | pid; + u64 *start_ns = stop_start.lookup(&pid_tgid); + + if (!start_ns || *start_ns == 0) + return; + + struct stop_time_data_t *val, zero = {}; + struct pid_tid_key_t key = { .pid = tgid, + .tid = pid }; + + val = stop_data.lookup_or_try_init(&key, &zero); + if (val) { + u64 delta = ktime - *start_ns; + val->count++; + val->total_ns += delta; + if (delta > val->worst_ns) + val->worst_ns = delta; + } + *start_ns = 0; +} + + +/* + * For measuring the thread run time, we need the following. 
+ */ +struct run_time_data_t { + u64 count; + u64 total_ns; + u64 max_ns; + u64 min_ns; +}; + +BPF_HASH(run_start, u64, u64); +BPF_HASH(run_data, struct pid_tid_key_t, struct run_time_data_t); + +static inline void thread_start_run(u64 pid_tgid, u64 ktime) +{ + run_start.update(&pid_tgid, &ktime); +} + +static inline void thread_stop_run(u32 pid, u32 tgid, u64 ktime) +{ + u64 pid_tgid = (u64) tgid << 32 | pid; + u64 *start_ns = run_start.lookup(&pid_tgid); + + if (!start_ns || *start_ns == 0) + return; + + struct run_time_data_t *val, zero = {}; + struct pid_tid_key_t key = { .pid = tgid, + .tid = pid }; + + val = run_data.lookup_or_try_init(&key, &zero); + if (val) { + u64 delta = ktime - *start_ns; + val->count++; + val->total_ns += delta; + if (delta > val->max_ns) + val->max_ns = delta; + if (val->min_ns == 0 || delta < val->min_ns) + val->min_ns = delta; + } + *start_ns = 0; +} + + +/* + * For measuring the thread-ready delay, we need the following. + */ +struct ready_data_t { + u64 count; + u64 total_ns; + u64 worst_ns; +}; + +BPF_HASH(ready_start, u64, u64); +BPF_HASH(ready_data, struct pid_tid_key_t, struct ready_data_t); + +static inline int sched_wakeup__(u32 pid, u32 tgid) +{ + u64 pid_tgid = (u64) tgid << 32 | pid; + + if (!capture_enabled(pid_tgid)) + return 0; + + u64 t = bpf_ktime_get_ns(); + ready_start.update(&pid_tgid, &t); + + thread_handle_stopped_run(pid, tgid, t); + return 0; +} + +RAW_TRACEPOINT_PROBE(sched_wakeup) +{ + struct task_struct *t = (struct task_struct *)ctx->args[0]; + return sched_wakeup__(t->pid, t->tgid); +} + +RAW_TRACEPOINT_PROBE(sched_wakeup_new) +{ + struct task_struct *t = (struct task_struct *)ctx->args[0]; + return sched_wakeup__(t->pid, t->tgid); +} + +RAW_TRACEPOINT_PROBE(sched_switch) +{ + struct task_struct *prev = (struct task_struct *)ctx->args[1]; + struct task_struct *next= (struct task_struct *)ctx->args[2]; + u64 ktime = 0; + + if (!capture_enabled__()) + return 0; + + if (prev->tgid == MONITOR_PID) { + u64 
prev_pid_tgid = (u64)next->tgid << 32 | next->pid; + ktime = bpf_ktime_get_ns(); + + if (prev-> == TASK_RUNNING) + ready_start.update(&prev_pid_tgid, &ktime); + + if (prev-> & __TASK_STOPPED) + stop_start.update(&prev_pid_tgid, &ktime); + + thread_stop_run(prev->pid, prev->tgid, ktime); + } + + if (next->tgid != MONITOR_PID) + return 0; + + if (ktime == 0) + ktime = bpf_ktime_get_ns(); + + u64 pid_tgid = (u64)next->tgid << 32 | next->pid; + u64 *start_ns = ready_start.lookup(&pid_tgid); + + if (start_ns && *start_ns != 0) { + + struct ready_data_t *val, zero = {}; + struct pid_tid_key_t key = { .pid = next->tgid, + .tid = next->pid }; + + val = ready_data.lookup_or_try_init(&key, &zero); + if (val) { + u64 delta = ktime - *start_ns; + val->count++; + val->total_ns += delta; + if (delta > val->worst_ns) + val->worst_ns = delta; + } + *start_ns = 0; + } + + thread_start_run(pid_tgid, ktime); + return 0; +} + + +/* + * For measuring the hard irq time, we need the following. + */ +struct hardirq_start_data_t { + u64 start_ns; + char irq_name[32]; +}; + +struct hardirq_data_t { + u64 count; + u64 total_ns; + u64 worst_ns; +}; + +struct hardirq_data_key_t { + u32 pid; + u32 tid; + char irq_name[32]; +}; + +BPF_HASH(hardirq_start, u64, struct hardirq_start_data_t); +BPF_HASH(hardirq_data, struct hardirq_data_key_t, struct hardirq_data_t); + +TRACEPOINT_PROBE(irq, irq_handler_entry) +{ + u64 pid_tgid = bpf_get_current_pid_tgid(); + + if (!capture_enabled(pid_tgid)) + return 0; + + struct hardirq_start_data_t data = {}; + + data.start_ns = bpf_ktime_get_ns(); + TP_DATA_LOC_READ_STR(&data.irq_name, name, sizeof(data.irq_name)); + hardirq_start.update(&pid_tgid, &data); + return 0; +} + +TRACEPOINT_PROBE(irq, irq_handler_exit) +{ + u64 pid_tgid = bpf_get_current_pid_tgid(); + + if (!capture_enabled(pid_tgid)) + return 0; + + struct hardirq_start_data_t *data; + data = hardirq_start.lookup(&pid_tgid); + if (!data || data->start_ns == 0) + return 0; + + if (args->ret != 
IRQ_NONE) { + struct hardirq_data_t *val, zero = {}; + struct hardirq_data_key_t key = { .pid = pid_tgid >> 32, + .tid = (u32)pid_tgid }; + + bpf_probe_read_kernel(&key.irq_name, sizeof(key.irq_name), + data->irq_name); + val = hardirq_data.lookup_or_try_init(&key, &zero); + if (val) { + u64 delta = bpf_ktime_get_ns() - data->start_ns; + val->count++; + val->total_ns += delta; + if (delta > val->worst_ns) + val->worst_ns = delta; + } + } + + data->start_ns = 0; + return 0; +} + + +/* + * For measuring the soft irq time, we need the following. + */ +struct softirq_start_data_t { + u64 start_ns; + u32 vec_nr; +}; + +struct softirq_data_t { + u64 count; + u64 total_ns; + u64 worst_ns; +}; + +struct softirq_data_key_t { + u32 pid; + u32 tid; + u32 vec_nr; +}; + +BPF_HASH(softirq_start, u64, struct softirq_start_data_t); +BPF_HASH(softirq_data, struct softirq_data_key_t, struct softirq_data_t); + +TRACEPOINT_PROBE(irq, softirq_entry) +{ + u64 pid_tgid = bpf_get_current_pid_tgid(); + + if (!capture_enabled(pid_tgid)) + return 0; + + struct softirq_start_data_t data = {}; + + data.start_ns = bpf_ktime_get_ns(); + data.vec_nr = args->vec; + softirq_start.update(&pid_tgid, &data); + return 0; +} + +TRACEPOINT_PROBE(irq, softirq_exit) +{ + u64 pid_tgid = bpf_get_current_pid_tgid(); + + if (!capture_enabled(pid_tgid)) + return 0; + + struct softirq_start_data_t *data; + data = softirq_start.lookup(&pid_tgid); + if (!data || data->start_ns == 0) + return 0; + + struct softirq_data_t *val, zero = {}; + struct softirq_data_key_t key = { .pid = pid_tgid >> 32, + .tid = (u32)pid_tgid, + .vec_nr = data->vec_nr}; + + val = softirq_data.lookup_or_try_init(&key, &zero); + if (val) { + u64 delta = bpf_ktime_get_ns() - data->start_ns; + val->count++; + val->total_ns += delta; + if (delta > val->worst_ns) + val->worst_ns = delta; + } + + data->start_ns = 0; + return 0; +} +""" + + +# +# time_ns() +# +try: + from time import time_ns +except ImportError: + # For compatibility with Python 
<= v3.6. + def time_ns(): + now = datetime.datetime.now() + return int(now.timestamp() * 1e9) + + +# +# Probe class to use for the start/stop triggers +# +class Probe(object): + ''' + The goal for this object is to support as many as possible + probe/events as supported by BCC. See + https://github.com/iovisor/bcc/blob/master/docs/reference_guide.md#events--arguments + ''' + def __init__(self, probe, pid=None): + self.pid = pid + self.text_probe = probe + self._parse_text_probe() + + def __str__(self): + if self.probe_type == "usdt": + return "[{}]; {}:{}:{}".format(self.text_probe, self.probe_type, + self.usdt_provider, self.usdt_probe) + elif self.probe_type == "trace": + return "[{}]; {}:{}:{}".format(self.text_probe, self.probe_type, + self.trace_system, self.trace_event) + elif self.probe_type == "kprobe" or self.probe_type == "kretprobe": + return "[{}]; {}:{}".format(self.text_probe, self.probe_type, + self.kprobe_function) + elif self.probe_type == "uprobe" or self.probe_type == "uretprobe": + return "[{}]; {}:{}".format(self.text_probe, self.probe_type, + self.uprobe_function) + else: + return "[{}] <{}:unknown probe>".format(self.text_probe, + self.probe_type) + + def _raise(self, error): + raise ValueError("[{}]; {}".format(self.text_probe, error)) + + def _verify_kprobe_probe(self): + # Nothing to verify for now, just return. + return + + def _verify_trace_probe(self): + # Nothing to verify for now, just return. + return + + def _verify_uprobe_probe(self): + # Nothing to verify for now, just return. 
+ return + + def _verify_usdt_probe(self): + if not self.pid: + self._raise("USDT probes need a valid PID.") + + usdt = USDT(pid=self.pid) + + for probe in usdt.enumerate_probes(): + if probe.provider.decode("utf-8") == self.usdt_provider and \ + probe.name.decode("utf-8") == self.usdt_probe: + return + + self._raise("Can't find UDST probe '{}:{}'".format(self.usdt_provider, + self.usdt_probe)) + + def _parse_text_probe(self): + ''' + The text probe format is defined as follows: + : + + Types: + USDT: u|usdt:: + TRACE: t|trace:: + KPROBE: k|kprobe: + KRETPROBE: kr|kretprobe: + UPROBE: up|uprobe: + URETPROBE: ur|uretprobe: + ''' + args = self.text_probe.split(":") + if len(args) <= 1: + self._raise("Can't extract probe type.") + + if args[0] not in ["k", "kprobe", "kr", "kretprobe", "t", "trace", + "u", "usdt", "up", "uprobe", "ur", "uretprobe"]: + self._raise("Invalid probe type '{}'".format(args[0])) + + self.probe_type = "kprobe" if args[0] == "k" else args[0] + self.probe_type = "kretprobe" if args[0] == "kr" else self.probe_type + self.probe_type = "trace" if args[0] == "t" else self.probe_type + self.probe_type = "usdt" if args[0] == "u" else self.probe_type + self.probe_type = "uprobe" if args[0] == "up" else self.probe_type + self.probe_type = "uretprobe" if args[0] == "ur" else self.probe_type + + if self.probe_type == "usdt": + if len(args) != 3: + self._raise("Invalid number of arguments for USDT") + + self.usdt_provider = args[1] + self.usdt_probe = args[2] + self._verify_usdt_probe() + + elif self.probe_type == "trace": + if len(args) != 3: + self._raise("Invalid number of arguments for TRACE") + + self.trace_system = args[1] + self.trace_event = args[2] + self._verify_trace_probe() + + elif self.probe_type == "kprobe" or self.probe_type == "kretprobe": + if len(args) != 2: + self._raise("Invalid number of arguments for K(RET)PROBE") + self.kprobe_function = args[1] + self._verify_kprobe_probe() + + elif self.probe_type == "uprobe" or self.probe_type == 
"uretprobe": + if len(args) != 2: + self._raise("Invalid number of arguments for U(RET)PROBE") + self.uprobe_function = args[1] + self._verify_uprobe_probe() + + def _get_kprobe_c_code(self, function_name, function_content): + # + # The kprobe__* do not require a function name, so it's + # ignored in the code generation. + # + return """ +int {}__{}(struct pt_regs *ctx) {{ + {} +}} +""".format(self.probe_type, self.kprobe_function, function_content) + + def _get_trace_c_code(self, function_name, function_content): + # + # The TRACEPOINT_PROBE() do not require a function name, so it's + # ignored in the code generation. + # + return """ +TRACEPOINT_PROBE({},{}) {{ + {} +}} +""".format(self.trace_system, self.trace_event, function_content) + + def _get_uprobe_c_code(self, function_name, function_content): + return """ +int {}(struct pt_regs *ctx) {{ + {} +}} +""".format(function_name, function_content) + + def _get_usdt_c_code(self, function_name, function_content): + return """ +int {}(struct pt_regs *ctx) {{ + {} +}} +""".format(function_name, function_content) + + def get_c_code(self, function_name, function_content): + if self.probe_type == "kprobe" or self.probe_type == "kretprobe": + return self._get_kprobe_c_code(function_name, function_content) + elif self.probe_type == "trace": + return self._get_trace_c_code(function_name, function_content) + elif self.probe_type == "uprobe" or self.probe_type == "uretprobe": + return self._get_uprobe_c_code(function_name, function_content) + elif self.probe_type == "usdt": + return self._get_usdt_c_code(function_name, function_content) + + return "" + + def probe_name(self): + if self.probe_type == "kprobe" or self.probe_type == "kretprobe": + return "{}".format(self.kprobe_function) + elif self.probe_type == "trace": + return "{}:{}".format(self.trace_system, + self.trace_event) + elif self.probe_type == "uprobe" or self.probe_type == "uretprobe": + return "{}".format(self.uprobe_function) + elif self.probe_type == 
"usdt": + return "{}:{}".format(self.usdt_provider, + self.usdt_probe) + + return "" + + +# +# event_to_dict() +# +def event_to_dict(event): + return dict([(field, getattr(event, field)) + for (field, _) in event._fields_ + if isinstance(getattr(event, field), (int, bytes))]) + + +# +# Event enum +# +Event = IntEnum("Event", ["SYSCALL", "START_TRIGGER", "STOP_TRIGGER"], + start=0) + + +# +# process_event() +# +def process_event(ctx, data, size): + global start_trigger_ts + global stop_trigger_ts + + event = bpf["events"].event(data) + if event.id == Event.SYSCALL: + syscall_events.append({"tid": event.tid, + "ts_entry": event.entry_ts, + "ts_exit": event.ts, + "syscall": event.syscall, + "user_stack_id": event.user_stack_id, + "kernel_stack_id": event.kernel_stack_id}) + elif event.id == Event.START_TRIGGER: + # + # This event would have started the trigger already, so all we need to + # do is record the start timestamp. + # + start_trigger_ts = event.ts + + elif event.id == Event.STOP_TRIGGER: + # + # This event would have stopped the trigger already, so all we need to + # do is record the start timestamp. 
#
# get_vec_nr_name()
#
def get_vec_nr_name(vec_nr):
    """Returns the name of a softirq vector number.

    The table is indexed by vector number.  Numbers outside the table
    (negative, or greater than or equal to its length) return the fallback
    string instead of raising IndexError.
    """
    known_vec_nr = ["hi", "timer", "net_tx", "net_rx", "block", "irq_poll",
                    "tasklet", "sched", "hrtimer", "rcu"]

    # Valid indexes are 0 .. len - 1, so len itself is already out of range.
    if vec_nr < 0 or vec_nr >= len(known_vec_nr):
        # NOTE(review): the fallback string is empty as seen here; it may
        # have lost an "unknown" placeholder - confirm the intended text.
        return f""

    return known_vec_nr[vec_nr]
utc.strftime("%H:%M:%S")) + print(time_string) + + +# +# process_results() +# +def process_results(syscall_events=None, trigger_delta=None): + if trigger_delta: + print_timestamp("# Triggered sample dump, stop-start delta {:,} ns". + format(trigger_delta)) + else: + print_timestamp("# Sample dump") + + # + # First get a list of all threads we need to report on. + # + threads_syscall = {k.tid for k, _ in bpf["syscall_data"].items() + if k.syscall != 0xffffffff} + + threads_run = {k.tid for k, _ in bpf["run_data"].items() + if k.pid != 0xffffffff} + + threads_ready = {k.tid for k, _ in bpf["ready_data"].items() + if k.pid != 0xffffffff} + + threads_stopped = {k.tid for k, _ in bpf["stop_data"].items() + if k.pid != 0xffffffff} + + threads_hardirq = {k.tid for k, _ in bpf["hardirq_data"].items() + if k.pid != 0xffffffff} + + threads_softirq = {k.tid for k, _ in bpf["softirq_data"].items() + if k.pid != 0xffffffff} + + threads = sorted(threads_syscall | threads_run | threads_ready | + threads_stopped | threads_hardirq | threads_softirq, + key=lambda x: get_thread_name(options.pid, x)) + + # + # Print header... + # + print("{:10} {:16} {}".format("TID", "THREAD", "")) + print("{:10} {:16} {}".format("-" * 10, "-" * 16, "-" * 76)) + indent = 28 * " " + + # + # Print all events/statistics per threads. 
+ # + poll_id = [k for k, v in syscalls.items() if v == b"poll"][0] + for thread in threads: + + if thread != threads[0]: + print("") + + # + # SYSCALL_STATISTICS + # + print("{:10} {:16} {}\n{}{:20} {:>6} {:>10} {:>16} {:>16}".format( + thread, get_thread_name(options.pid, thread), + "[SYSCALL STATISTICS]", indent, + "NAME", "NUMBER", "COUNT", "TOTAL ns", "MAX ns")) + + total_count = 0 + total_ns = 0 + for k, v in sorted(filter(lambda t: t[0].tid == thread, + bpf["syscall_data"].items()), + key=lambda kv: -kv[1].total_ns): + + print("{}{:20.20} {:6} {:10} {:16,} {:16,}".format( + indent, syscall_name(k.syscall).decode("utf-8"), k.syscall, + v.count, v.total_ns, v.worst_ns)) + if k.syscall != poll_id: + total_count += v.count + total_ns += v.total_ns + + if total_count > 0: + print("{}{:20.20} {:6} {:10} {:16,}".format( + indent, "TOTAL( - poll):", "", total_count, total_ns)) + + # + # THREAD RUN STATISTICS + # + for k, v in filter(lambda t: t[0].tid == thread, + bpf["run_data"].items()): + + print("\n{:10} {:16} {}\n{}{:10} {:>16} {:>16} {:>16}".format( + "", "", "[THREAD RUN STATISTICS]", indent, + "SCHED_CNT", "TOTAL ns", "MIN ns", "MAX ns")) + + print("{}{:10} {:16,} {:16,} {:16,}".format( + indent, v.count, v.total_ns, v.min_ns, v.max_ns)) + break + + # + # THREAD READY STATISTICS + # + for k, v in filter(lambda t: t[0].tid == thread, + bpf["ready_data"].items()): + + print("\n{:10} {:16} {}\n{}{:10} {:>16} {:>16}".format( + "", "", "[THREAD READY STATISTICS]", indent, + "SCHED_CNT", "TOTAL ns", "MAX ns")) + + print("{}{:10} {:16,} {:16,}".format( + indent, v.count, v.total_ns, v.worst_ns)) + break + + # + # THREAD STOPPED STATISTICS + # + for k, v in filter(lambda t: t[0].tid == thread, + bpf["stop_data"].items()): + + print("\n{:10} {:16} {}\n{}{:10} {:>16} {:>16}".format( + "", "", "[THREAD STOPPED STATISTICS]", indent, + "STOP_CNT", "TOTAL ns", "MAX ns")) + + print("{}{:10} {:16,} {:16,}".format( + indent, v.count, v.total_ns, v.worst_ns)) + break + + # + 
# HARD IRQ STATISTICS + # + total_ns = 0 + total_count = 0 + header_printed = False + for k, v in sorted(filter(lambda t: t[0].tid == thread, + bpf["hardirq_data"].items()), + key=lambda kv: -kv[1].total_ns): + + if not header_printed: + print("\n{:10} {:16} {}\n{}{:20} {:>10} {:>16} {:>16}". + format("", "", "[HARD IRQ STATISTICS]", indent, + "NAME", "COUNT", "TOTAL ns", "MAX ns")) + header_printed = True + + print("{}{:20.20} {:10} {:16,} {:16,}".format( + indent, k.irq_name.decode("utf-8"), + v.count, v.total_ns, v.worst_ns)) + + total_count += v.count + total_ns += v.total_ns + + if total_count > 0: + print("{}{:20.20} {:10} {:16,}".format( + indent, "TOTAL:", total_count, total_ns)) + + # + # SOFT IRQ STATISTICS + # + total_ns = 0 + total_count = 0 + header_printed = False + for k, v in sorted(filter(lambda t: t[0].tid == thread, + bpf["softirq_data"].items()), + key=lambda kv: -kv[1].total_ns): + + if not header_printed: + print("\n{:10} {:16} {}\n" + "{}{:20} {:>7} {:>10} {:>16} {:>16}". 
+ format("", "", "[SOFT IRQ STATISTICS]", indent, + "NAME", "VECT_NR", "COUNT", "TOTAL ns", "MAX ns")) + header_printed = True + + print("{}{:20.20} {:>7} {:10} {:16,} {:16,}".format( + indent, get_vec_nr_name(k.vec_nr), k.vec_nr, + v.count, v.total_ns, v.worst_ns)) + + total_count += v.count + total_ns += v.total_ns + + if total_count > 0: + print("{}{:20.20} {:7} {:10} {:16,}".format( + indent, "TOTAL:", "", total_count, total_ns)) + + # + # Print events + # + lost_stack_traces = 0 + if syscall_events: + stack_traces = bpf.get_table("stack_traces") + + print("\n\n# SYSCALL EVENTS:" + "\n{}{:>19} {:>19} {:>10} {:16} {:>10} {}".format( + 2 * " ", "ENTRY (ns)", "EXIT (ns)", "TID", "COMM", + "DELTA (us)", "SYSCALL")) + print("{}{:19} {:19} {:10} {:16} {:10} {}".format( + 2 * " ", "-" * 19, "-" * 19, "-" * 10, "-" * 16, + "-" * 10, "-" * 16)) + for event in syscall_events: + print("{}{:19} {:19} {:10} {:16} {:10,} {}".format( + " " * 2, + event["ts_entry"], event["ts_exit"], event["tid"], + get_thread_name(options.pid, event["tid"]), + int((event["ts_exit"] - event["ts_entry"]) / 1000), + syscall_name(event["syscall"]).decode("utf-8"))) + # + # Not sure where to put this, but I'll add some info on stack + # traces here... Userspace stack traces are very limited due to + # the fact that bcc does not support dwarf backtraces. As OVS + # gets compiled without frame pointers we will not see much. + # If however, OVS does get built with frame pointers, we should not + # use the BPF_STACK_TRACE_BUILDID as it does not seem to handle + # the debug symbols correctly. Also, note that for kernel + # traces you should not use BPF_STACK_TRACE_BUILDID, so two + # buffers are needed. 
+ # + # Some info on manual dwarf walk support: + # https://github.com/iovisor/bcc/issues/3515 + # https://github.com/iovisor/bcc/pull/4463 + # + if options.stack_trace_size == 0: + continue + + if event["kernel_stack_id"] < 0 or event["user_stack_id"] < 0: + lost_stack_traces += 1 + + kernel_stack = stack_traces.walk(event["kernel_stack_id"]) \ + if event["kernel_stack_id"] >= 0 else [] + user_stack = stack_traces.walk(event["user_stack_id"]) \ + if event["user_stack_id"] >= 0 else [] + + for addr in kernel_stack: + print("{}{}".format( + " " * 10, + bpf.ksym(addr, show_module=True, + show_offset=True).decode("utf-8", "replace"))) + + for addr in user_stack: + addr_str = bpf.sym(addr, options.pid, show_module=True, + show_offset=True).decode("utf-8", "replace") + + if addr_str == "[unknown]": + addr_str += " 0x{:x}".format(addr) + + print("{}{}".format(" " * 10, addr_str)) + + # + # Print any footer messages. + # + if lost_stack_traces > 0: + print("\n#WARNING: We where not able to display {} stack traces!\n" + "# Consider increasing the stack trace size using\n" + "# the '--stack-trace-size' option.\n" + "# Note that this can also happen due to a stack id\n" + "# collision.".format(lost_stack_traces)) + + +# +# main() +# +def main(): + # + # Don't like these globals, but ctx passing does not seem to work with the + # existing open_ring_buffer() API :( + # + global bpf + global options + global syscall_events + global start_trigger_ts + global stop_trigger_ts + + start_trigger_ts = 0 + stop_trigger_ts = 0 + + # + # Argument parsing + # + parser = argparse.ArgumentParser() + + parser.add_argument("-D", "--debug", + help="Enable eBPF debugging", + type=int, const=0x3f, default=0, nargs="?") + parser.add_argument("-p", "--pid", metavar="VSWITCHD_PID", + help="ovs-vswitch's PID", + type=unsigned_int, default=None) + parser.add_argument("-s", "--syscall-events", metavar="DURATION_NS", + help="Record syscall events that take longer than " + "DURATION_NS. 
Omit the duration value to record all " + "syscall events", + type=unsigned_int, const=0, default=None, nargs="?") + parser.add_argument("--buffer-page-count", + help="Number of BPF ring buffer pages, default 1024", + type=unsigned_int, default=1024, metavar="NUMBER") + parser.add_argument("--sample-count", + help="Number of sample runs, default 1", + type=unsigned_nonzero_int, default=1, metavar="RUNS") + parser.add_argument("--sample-interval", + help="Delay between sample runs, default 0", + type=float, default=0, metavar="SECONDS") + parser.add_argument("--sample-time", + help="Sample time, default 0.5 seconds", + type=float, default=0.5, metavar="SECONDS") + parser.add_argument("--skip-syscall-poll-events", + help="Skip poll() syscalls with --syscall-events", + action="store_true") + parser.add_argument("--stack-trace-size", + help="Number of unique stack traces that can be " + "recorded, default 4096. 0 to disable", + type=unsigned_int, default=4096) + parser.add_argument("--start-trigger", metavar="TRIGGER", + help="Start trigger, see documentation for details", + type=str, default=None) + parser.add_argument("--stop-trigger", metavar="TRIGGER", + help="Stop trigger, see documentation for details", + type=str, default=None) + parser.add_argument("--trigger-delta", metavar="DURATION_NS", + help="Only report event when the trigger duration > " + "DURATION_NS, default 0 (all events)", + type=unsigned_int, const=0, default=0, nargs="?") + + options = parser.parse_args() + + # + # Find the PID of the ovs-vswitchd daemon if not specified. + # + if not options.pid: + for proc in psutil.process_iter(): + if "ovs-vswitchd" in proc.name(): + if options.pid: + print("ERROR: Multiple ovs-vswitchd daemons running, " + "use the -p option!") + sys.exit(os.EX_NOINPUT) + + options.pid = proc.pid + + # + # Error checking on input parameters. 
+ # + if not options.pid: + print("ERROR: Failed to find ovs-vswitchd's PID!") + sys.exit(os.EX_UNAVAILABLE) + + options.buffer_page_count = next_power_of_two(options.buffer_page_count) + + # + # Make sure we are running as root, or else we can not attach the probes. + # + if os.geteuid() != 0: + print("ERROR: We need to run as root to attached probes!") + sys.exit(os.EX_NOPERM) + + # + # Setup any of the start stop triggers + # + if options.start_trigger is not None: + try: + start_trigger = Probe(options.start_trigger, pid=options.pid) + except ValueError as e: + print(f"ERROR: Invalid start trigger {str(e)}") + sys.exit(os.EX_CONFIG) + else: + start_trigger = None + + if options.stop_trigger is not None: + try: + stop_trigger = Probe(options.stop_trigger, pid=options.pid) + except ValueError as e: + print(f"ERROR: Invalid stop trigger {str(e)}") + sys.exit(os.EX_CONFIG) + else: + stop_trigger = None + + # + # Attach probe to running process. + # + source = EBPF_SOURCE.replace("", "\n".join( + [" EVENT_{} = {},".format( + event.name, event.value) for event in Event])) + source = source.replace("", + str(options.buffer_page_count)) + source = source.replace("", str(options.pid)) + + if BPF.kernel_struct_has_field(b"task_struct", b"state") == 1: + source = source.replace("", "state") + else: + source = source.replace("", "__state") + + poll_id = [k for k, v in syscalls.items() if v == b"poll"][0] + if options.syscall_events is None: + syscall_trace_events = "false" + elif options.syscall_events == 0: + if not options.skip_syscall_poll_events: + syscall_trace_events = "true" + else: + syscall_trace_events = f"args->id != {poll_id}" + else: + syscall_trace_events = "delta > {}".format(options.syscall_events) + if options.skip_syscall_poll_events: + syscall_trace_events += f" && args->id != {poll_id}" + + source = source.replace("", + syscall_trace_events) + + source = source.replace("", + str(options.stack_trace_size)) + + source = source.replace("", "true" + if 
options.stack_trace_size > 0 else "false") + + # + # Handle start/stop probes + # + if start_trigger: + source = source.replace("", + start_trigger.get_c_code( + "start_trigger_probe", + "return start_trigger();")) + else: + source = source.replace("", "") + + if stop_trigger: + source = source.replace("", + stop_trigger.get_c_code( + "stop_trigger_probe", + "return stop_trigger();")) + else: + source = source.replace("", "") + + # + # Setup usdt or other probes that need handling trough the BFP class. + # + usdt = USDT(pid=int(options.pid)) + try: + if start_trigger and start_trigger.probe_type == "usdt": + usdt.enable_probe(probe=start_trigger.probe_name(), + fn_name="start_trigger_probe") + if stop_trigger and stop_trigger.probe_type == "usdt": + usdt.enable_probe(probe=stop_trigger.probe_name(), + fn_name="stop_trigger_probe") + + except USDTException as e: + print("ERROR: {}".format( + (re.sub("^", " " * 7, str(e), flags=re.MULTILINE)).strip(). + replace("--with-dtrace or --enable-dtrace", + "--enable-usdt-probes"))) + sys.exit(os.EX_OSERR) + + bpf = BPF(text=source, usdt_contexts=[usdt], debug=options.debug) + + if start_trigger: + try: + if start_trigger.probe_type == "uprobe": + bpf.attach_uprobe(name=f"/proc/{options.pid}/exe", + sym=start_trigger.probe_name(), + fn_name="start_trigger_probe", + pid=options.pid) + + if start_trigger.probe_type == "uretprobe": + bpf.attach_uretprobe(name=f"/proc/{options.pid}/exe", + sym=start_trigger.probe_name(), + fn_name="start_trigger_probe", + pid=options.pid) + except Exception as e: + print("ERROR: Failed attaching uprobe start trigger " + f"'{start_trigger.probe_name()}';\n {str(e)}") + sys.exit(os.EX_OSERR) + + if stop_trigger: + try: + if stop_trigger.probe_type == "uprobe": + bpf.attach_uprobe(name=f"/proc/{options.pid}/exe", + sym=stop_trigger.probe_name(), + fn_name="stop_trigger_probe", + pid=options.pid) + + if stop_trigger.probe_type == "uretprobe": + bpf.attach_uretprobe(name=f"/proc/{options.pid}/exe", + 
sym=stop_trigger.probe_name(), + fn_name="stop_trigger_probe", + pid=options.pid) + except Exception as e: + print("ERROR: Failed attaching uprobe stop trigger" + f"'{stop_trigger.probe_name()}';\n {str(e)}") + sys.exit(os.EX_OSERR) + + # + # If no triggers are configured use the delay configuration + # + bpf["events"].open_ring_buffer(process_event) + + sample_count = 0 + while sample_count < options.sample_count: + sample_count += 1 + syscall_events = [] + + if not options.start_trigger: + print_timestamp("# Start sampling") + start_capture() + stop_time = -1 if options.stop_trigger else \ + time_ns() + options.sample_time * 1000000000 + else: + # For start triggers the stop time depends on the start trigger + # time, or depends on the stop trigger if configured. + stop_time = -1 if options.stop_trigger else 0 + + while True: + keyboard_interrupt = False + try: + last_start_ts = start_trigger_ts + last_stop_ts = stop_trigger_ts + + if stop_time > 0: + delay = int((stop_time - time_ns()) / 1000000) + if delay <= 0: + break + else: + delay = -1 + + bpf.ring_buffer_poll(timeout=delay) + + if stop_time <= 0 and last_start_ts != start_trigger_ts: + print_timestamp( + "# Start sampling (trigger@{})".format( + start_trigger_ts)) + + if not options.stop_trigger: + stop_time = time_ns() + \ + options.sample_time * 1000000000 + + if last_stop_ts != stop_trigger_ts: + break + + except KeyboardInterrupt: + keyboard_interrupt = True + break + + if options.stop_trigger and not capture_running(): + print_timestamp("# Stop sampling (trigger@{})".format( + stop_trigger_ts)) + else: + print_timestamp("# Stop sampling") + + if stop_trigger_ts != 0 and start_trigger_ts != 0: + trigger_delta = stop_trigger_ts - start_trigger_ts + else: + trigger_delta = None + + if not trigger_delta or trigger_delta >= options.trigger_delta: + stop_capture(force=True) # Prevent a new trigger to start. 
+ process_results(syscall_events=syscall_events, + trigger_delta=trigger_delta) + elif trigger_delta: + sample_count -= 1 + print_timestamp("# Sample dump skipped, delta {:,} ns".format( + trigger_delta)) + + reset_capture() + stop_capture() + + if keyboard_interrupt: + break + + if options.sample_interval > 0: + time.sleep(options.sample_interval) + + # + # Report lost events. + # + dropcnt = bpf.get_table("dropcnt") + for k in dropcnt.keys(): + count = dropcnt.sum(k).value + if k.value == 0 and count > 0: + print("\n# WARNING: Not all events were captured, {} were " + "dropped!\n# Increase the BPF ring buffer size " + "with the --buffer-page-count option.".format(count)) + + if options.sample_count > 1: + trigger_miss = bpf.get_table("trigger_miss") + for k in trigger_miss.keys(): + count = trigger_miss.sum(k).value + if k.value == 0 and count > 0: + print("\n# WARNING: Not all start triggers were successful. " + "{} were missed due to\n# slow userspace " + "processing!".format(count)) + + +# +# Start main() as the default entry point... +# +if __name__ == "__main__": + main() diff --git a/utilities/usdt-scripts/kernel_delay.rst b/utilities/usdt-scripts/kernel_delay.rst new file mode 100644 index 00000000000..0f6f916a71e --- /dev/null +++ b/utilities/usdt-scripts/kernel_delay.rst @@ -0,0 +1,620 @@ +Troubleshooting Open vSwitch: Is the kernel to blame? +===================================================== +Often, when troubleshooting Open vSwitch (OVS) in the field, you might be left +wondering if the issue is really OVS-related, or if it's a problem with the +kernel being overloaded. Messages in the log like +``Unreasonably long XXXXms poll interval`` might suggest it's OVS, but from +experience, these are mostly related to an overloaded Linux Kernel. +The kernel_delay.py tool can help you quickly identify if the focus of your +investigation should be OVS or the Linux kernel. 
+ + +Introduction +------------ +``kernel_delay.py`` consists of a Python script that uses the BCC [#BCC]_ +framework to install eBPF probes. The data the eBPF probes collect will be +analyzed and presented to the user by the Python script. Some of the presented +data can also be captured by the individual scripts included in the BCC [#BCC]_ +framework. + +kernel_delay.py has two modes of operation: + +- In **time mode**, the tool runs for a specific time and collects the + information. +- In **trigger mode**, event collection can be started and/or stopped based on + a specific eBPF probe. Currently, the following probes are supported: + - USDT probes + - Kernel tracepoints + - kprobe + - kretprobe + - uprobe + - uretprobe + + +In addition, the option, ``--sample-count``, exists to specify how many +iterations you would like to do. When using triggers, you can also ignore +samples if they are less than a number of nanoseconds with the +``--trigger-delta`` option. The latter might be useful when debugging Linux +syscalls which take a long time to complete. More on this later. Finally, you +can configure the delay between two sample runs with the ``--sample-interval`` +option. + +Before getting into more details, you can run the tool without any options +to see what the output looks like. Notice that it will try to automatically +get the process ID of the running ``ovs-vswitchd``. You can override this +with the ``--pid`` option. + +.. 
code-block:: console + + $ sudo ./kernel_delay.py + # Start sampling @2023-06-08T12:17:22.725127 (10:17:22 UTC) + # Stop sampling @2023-06-08T12:17:23.224781 (10:17:23 UTC) + # Sample dump @2023-06-08T12:17:23.224855 (10:17:23 UTC) + TID THREAD + ---------- ---------------- ---------------------------------------------------------------------------- + 27090 ovs-vswitchd [SYSCALL STATISTICS] + + + 31741 revalidator122 [SYSCALL STATISTICS] + NAME NUMBER COUNT TOTAL ns MAX ns + poll 7 5 184,193,176 184,191,520 + recvmsg 47 494 125,208,756 310,331 + futex 202 8 18,768,758 4,023,039 + sendto 44 10 375,861 266,867 + sendmsg 46 4 43,294 11,213 + write 1 1 5,949 5,949 + getrusage 98 1 1,424 1,424 + read 0 1 1,292 1,292 + TOTAL( - poll): 519 144,405,334 + + [THREAD RUN STATISTICS] + SCHED_CNT TOTAL ns MIN ns MAX ns + 6 136,764,071 1,480 115,146,424 + + [THREAD READY STATISTICS] + SCHED_CNT TOTAL ns MAX ns + 7 11,334 6,636 + + [THREAD STOPPED STATISTICS] + STOP_CNT TOTAL ns MAX ns + 3 3,045,728,323 1,015,739,474 + + [HARD IRQ STATISTICS] + NAME COUNT TOTAL ns MAX ns + eno8303-rx-1 1 3,586 3,586 + TOTAL: 1 3,586 + + [SOFT IRQ STATISTICS] + NAME VECT_NR COUNT TOTAL ns MAX ns + net_rx 3 1 17,699 17,699 + sched 7 6 13,820 3,226 + rcu 9 16 13,586 1,554 + timer 1 3 10,259 3,815 + TOTAL: 26 55,364 + + +By default, the tool will run for half a second in `time mode`. To extend this +you can use the ``--sample-time`` option. + + +What will it report +------------------- +The above sample output separates the captured data on a per-thread basis. +For this, it displays the thread's id (``TID``) and name (``THREAD``), +followed by resource-specific data. Which are: + +- ``SYSCALL STATISTICS`` +- ``THREAD RUN STATISTICS`` +- ``THREAD READY STATISTICS`` +- ``THREAD STOPPED STATISTICS`` +- ``HARD IRQ STATISTICS`` +- ``SOFT IRQ STATISTICS`` + +The following sections will describe in detail what statistics they report. 
+ + +``SYSCALL STATISTICS`` +~~~~~~~~~~~~~~~~~~~~~~ +``SYSCALL STATISTICS`` tell you which Linux system calls got executed during +the measurement interval. This includes the number of times the syscall was +called (``COUNT``), the total time spent in the system calls (``TOTAL ns``), +and the worst-case duration of a single call (``MAX ns``). + +It also shows the total of all system calls, but it excludes the poll system +call, as the purpose of this call is to wait for activity on a set of sockets, +and usually, the thread gets swapped out. + +Note that it only counts calls that started and stopped during the +measurement interval! + + +``THREAD RUN STATISTICS`` +~~~~~~~~~~~~~~~~~~~~~~~~~ +``THREAD RUN STATISTICS`` tell you how long the thread was running on a CPU +during the measurement interval. + +Note that these statistics only count events where the thread started and +stopped running on a CPU during the measurement interval. For example, if +this was a PMD thread, you should see zero ``SCHED_CNT`` and ``TOTAL_ns``. +If not, there might be a misconfiguration. + + +``THREAD READY STATISTICS`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~ +``THREAD READY STATISTICS`` tell you the time between the thread being ready +to run and it actually running on the CPU. + +Note that these statistics only count events where the thread was getting +ready to run and started running during the measurement interval. + + +``THREAD STOPPED STATISTICS`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +``THREAD STOPPED STATISTICS`` reveal the number of instances where the thread +has been scheduled out while in the running state due to its transition to +the TASK_STOPPED state. + +This behavior can be replicated by manually placing the thread in the stopped +state and subsequently resuming it. For instance: + +.. 
code-block:: console + + # kill -STOP $(pidof ovs-vswitchd); \ + sleep 1; \ + kill -CONT $(pidof ovs-vswitchd); + +Note that these statistics only count events where the thread was running at +the time it was put into the stopped state. + + +``HARD IRQ STATISTICS`` +~~~~~~~~~~~~~~~~~~~~~~~ +``HARD IRQ STATISTICS`` tell you how much time was spent servicing hard +interrupts during the thread's run time. + +It shows the interrupt name (``NAME``), the number of interrupts (``COUNT``), +the total time spent in the interrupt handler (``TOTAL ns``), and the +worst-case duration (``MAX ns``). + + +``SOFT IRQ STATISTICS`` +~~~~~~~~~~~~~~~~~~~~~~~ +``SOFT IRQ STATISTICS`` tell you how much time was spent servicing soft +interrupts during the thread's run time. + +It shows the interrupt name (``NAME``), vector number (``VECT_NR``), the +number of interrupts (``COUNT``), the total time spent in the interrupt +handler (``TOTAL ns``), and the worst-case duration (``MAX ns``). + + +The ``--syscall-events`` option +------------------------------- +In addition to reporting global syscall statistics in ``SYSCALL STATISTICS``, +the tool can also report each individual syscall. This can be a useful +second step if the ``SYSCALL STATISTICS`` show high latency numbers. + +All you need to do is add the ``--syscall-events`` option, with or without +the additional ``DURATION_NS`` parameter. The ``DURATION_NS`` parameter +allows you to exclude events that take less than the supplied time. + +The ``--skip-syscall-poll-events`` option allows you to exclude poll +syscalls from the report. + +Below is an example run, note that the resource-specific data is removed +to highlight the syscall events: + +.. 
code-block:: console + + $ sudo ./kernel_delay.py --syscall-events 50000 --skip-syscall-poll-events + # Start sampling @2023-06-13T17:10:46.460874 (15:10:46 UTC) + # Stop sampling @2023-06-13T17:10:46.960727 (15:10:46 UTC) + # Sample dump @2023-06-13T17:10:46.961033 (15:10:46 UTC) + TID THREAD + ---------- ---------------- ---------------------------------------------------------------------------- + 3359686 ipf_clean2 [SYSCALL STATISTICS] + ... + 3359635 ovs-vswitchd [SYSCALL STATISTICS] + ... + 3359697 revalidator12 [SYSCALL STATISTICS] + ... + 3359698 revalidator13 [SYSCALL STATISTICS] + ... + 3359699 revalidator14 [SYSCALL STATISTICS] + ... + 3359700 revalidator15 [SYSCALL STATISTICS] + ... + + # SYSCALL EVENTS: + ENTRY (ns) EXIT (ns) TID COMM DELTA (us) SYSCALL + ------------------- ------------------- ---------- ---------------- ---------- ---------------- + 2161821694935486 2161821695031201 3359699 revalidator14 95 futex + syscall_exit_to_user_mode_prepare+0x161 [kernel] + syscall_exit_to_user_mode_prepare+0x161 [kernel] + syscall_exit_to_user_mode+0x9 [kernel] + do_syscall_64+0x68 [kernel] + entry_SYSCALL_64_after_hwframe+0x72 [kernel] + __GI___lll_lock_wait+0x30 [libc.so.6] + ovs_mutex_lock_at+0x18 [ovs-vswitchd] + [unknown] 0x696c003936313a63 + 2161821695276882 2161821695333687 3359698 revalidator13 56 futex + syscall_exit_to_user_mode_prepare+0x161 [kernel] + syscall_exit_to_user_mode_prepare+0x161 [kernel] + syscall_exit_to_user_mode+0x9 [kernel] + do_syscall_64+0x68 [kernel] + entry_SYSCALL_64_after_hwframe+0x72 [kernel] + __GI___lll_lock_wait+0x30 [libc.so.6] + ovs_mutex_lock_at+0x18 [ovs-vswitchd] + [unknown] 0x696c003134313a63 + 2161821695275820 2161821695405733 3359700 revalidator15 129 futex + syscall_exit_to_user_mode_prepare+0x161 [kernel] + syscall_exit_to_user_mode_prepare+0x161 [kernel] + syscall_exit_to_user_mode+0x9 [kernel] + do_syscall_64+0x68 [kernel] + entry_SYSCALL_64_after_hwframe+0x72 [kernel] + __GI___lll_lock_wait+0x30 [libc.so.6] 
+ ovs_mutex_lock_at+0x18 [ovs-vswitchd] + [unknown] 0x696c003936313a63 + 2161821695964969 2161821696052021 3359635 ovs-vswitchd 87 accept + syscall_exit_to_user_mode_prepare+0x161 [kernel] + syscall_exit_to_user_mode_prepare+0x161 [kernel] + syscall_exit_to_user_mode+0x9 [kernel] + do_syscall_64+0x68 [kernel] + entry_SYSCALL_64_after_hwframe+0x72 [kernel] + __GI_accept+0x4d [libc.so.6] + pfd_accept+0x3a [ovs-vswitchd] + [unknown] 0x7fff19f2bd00 + [unknown] 0xe4b8001f0f + +As you can see above, the output also shows the stack backtrace. You can +disable this using the ``--stack-trace-size 0`` option. + +As you can see above, the backtrace does not show a lot of useful information +due to the BCC [#BCC]_ toolkit not supporting DWARF decoding. So to further +analyze system call backtraces, you could use perf. The following perf +script can do this for you (refer to the embedded instructions): + +https://github.com/chaudron/perf_scripts/blob/master/analyze_perf_pmd_syscall.py + + +Using triggers +-------------- +The tool supports start and/or stop triggers. This will allow you to capture +statistics triggered by a specific event. The following combinations of +stop-and-start triggers can be used. + +If you only use ``--start-trigger``, the inspection starts when the trigger +happens and runs until the ``--sample-time`` number of seconds has passed. +The example below shows all the supported options in this scenario. + +.. code-block:: console + + $ sudo ./kernel_delay.py --start-trigger up:bridge_run --sample-time 4 \ + --sample-count 4 --sample-interval 1 + + +If you only use ``--stop-trigger``, the inspection starts immediately and +stops when the trigger happens. The example below shows all the supported +options in this scenario. + +.. 
code-block:: console + + $ sudo ./kernel_delay.py --stop-trigger upr:bridge_run \ + --sample-count 4 --sample-interval 1 + + +If you use both ``--start-trigger`` and ``--stop-trigger`` triggers, the +statistics are captured between the first occurrences of these two events. +The example below shows all the supported options in this scenario. + +.. code-block:: console + + $ sudo ./kernel_delay.py --start-trigger up:bridge_run \ + --stop-trigger upr:bridge_run \ + --sample-count 4 --sample-interval 1 \ + --trigger-delta 50000 + +What triggers are supported? Note that what ``kernel_delay.py`` calls triggers, +BCC [#BCC]_ calls events; these are eBPF tracepoints you can attach to. +For more details on the supported tracepoints, check out the BCC +documentation [#BCC_EVENT]_. + +The list below shows the supported triggers and their argument format: + +**USDT probes:** + [u|usdt]:{provider}:{probe} +**Kernel tracepoint:** + [t|trace]:{system}:{event} +**kprobe:** + [k|kprobe]:{kernel_function} +**kretprobe:** + [kr|kretprobe]:{kernel_function} +**uprobe:** + [up|uprobe]:{function} +**uretprobe:** + [upr|uretprobe]:{function} + +Here are a couple of trigger examples; more use-case-specific examples can be +found in the *Examples* section. + +.. code-block:: console + + --start|stop-trigger u:udpif_revalidator:start_dump + --start|stop-trigger t:openvswitch:ovs_dp_upcall + --start|stop-trigger k:ovs_dp_process_packet + --start|stop-trigger kr:ovs_dp_process_packet + --start|stop-trigger up:bridge_run + --start|stop-trigger upr:bridge_run + + +Examples +-------- +This section will give some examples of how to use this tool in real-world +scenarios. Let's start with the issue where Open vSwitch reports +``Unreasonably long XXXXms poll interval`` on your revalidator threads. Note +that there is a blog available explaining how the revalidator process works +in OVS [#REVAL_BLOG]_. + +First, let me explain this log message. 
It gets logged if the time delta +between two ``poll_block()`` calls is more than 1 second. In other words, +the process was spending a lot of time processing stuff that was made +available by the return of the ``poll_block()`` function. + +Do a run with the tool using the existing USDT revalidator probes as a start +and stop trigger (note that the resource-specific data is removed from the +non-revalidator threads): + +.. code-block:: console + + $ sudo ./kernel_delay.py --start-trigger u:udpif_revalidator:start_dump --stop-trigger u:udpif_revalidator:sweep_done + # Start sampling (trigger@791777093512008) @2023-06-14T14:52:00.110303 (12:52:00 UTC) + # Stop sampling (trigger@791778281498462) @2023-06-14T14:52:01.297975 (12:52:01 UTC) + # Triggered sample dump, stop-start delta 1,187,986,454 ns @2023-06-14T14:52:01.298021 (12:52:01 UTC) + TID THREAD + ---------- ---------------- ---------------------------------------------------------------------------- + 1457761 handler24 [SYSCALL STATISTICS] + NAME NUMBER COUNT TOTAL ns MAX ns + sendmsg 46 6110 123,274,761 41,776 + recvmsg 47 136299 99,397,508 49,896 + futex 202 51 7,655,832 7,536,776 + poll 7 4068 1,202,883 2,907 + getrusage 98 2034 586,602 1,398 + sendto 44 9 213,682 27,417 + TOTAL( - poll): 144503 231,128,385 + + [THREAD RUN STATISTICS] + SCHED_CNT TOTAL ns MIN ns MAX ns + + [THREAD READY STATISTICS] + SCHED_CNT TOTAL ns MAX ns + 1 1,438 1,438 + + [SOFT IRQ STATISTICS] + NAME VECT_NR COUNT TOTAL ns MAX ns + sched 7 21 59,145 3,769 + rcu 9 50 42,917 2,234 + TOTAL: 71 102,062 + 1457733 ovs-vswitchd [SYSCALL STATISTICS] + ... 
+ 1457792 revalidator55 [SYSCALL STATISTICS] + NAME NUMBER COUNT TOTAL ns MAX ns + futex 202 73 572,576,329 19,621,600 + recvmsg 47 815 296,697,618 405,338 + sendto 44 3 78,302 26,837 + sendmsg 46 3 38,712 13,250 + write 1 1 5,073 5,073 + TOTAL( - poll): 895 869,396,034 + + [THREAD RUN STATISTICS] + SCHED_CNT TOTAL ns MIN ns MAX ns + 48 394,350,393 1,729 140,455,796 + + [THREAD READY STATISTICS] + SCHED_CNT TOTAL ns MAX ns + 49 23,650 1,559 + + [SOFT IRQ STATISTICS] + NAME VECT_NR COUNT TOTAL ns MAX ns + sched 7 14 26,889 3,041 + rcu 9 28 23,024 1,600 + TOTAL: 42 49,913 + + +Above you see from the start of the output that the trigger took more than a +second (1,187,986,454 ns), which is already known from the output of +the ``ovs-appctl upcall/show`` command. + +From the *revalidator55*'s ``SYSCALL STATISTICS`` statistics you can see it +spent almost 870ms handling syscalls, and there were no poll() calls being +executed. The ``THREAD RUN STATISTICS`` statistics here are a bit misleading, +as it looks like OVS only spent 394ms on the CPU. But earlier, it was mentioned +that this time does not include the time being on the CPU at the start or stop +of an event. That is exactly the case here, because USDT probes were used. + +From the above data and maybe some ``top`` output, it can be determined that +the *revalidator55* thread is taking a lot of CPU time, probably because it +has to do a lot of revalidator work by itself. The solution here is to increase +the number of revalidator threads, so more work could be done in parallel. + +Here is another run of the same command in another scenario: + +.. 
code-block:: console + + $ sudo ./kernel_delay.py --start-trigger u:udpif_revalidator:start_dump --stop-trigger u:udpif_revalidator:sweep_done + # Start sampling (trigger@795160501758971) @2023-06-14T15:48:23.518512 (13:48:23 UTC) + # Stop sampling (trigger@795160764940201) @2023-06-14T15:48:23.781381 (13:48:23 UTC) + # Triggered sample dump, stop-start delta 263,181,230 ns @2023-06-14T15:48:23.781414 (13:48:23 UTC) + TID THREAD + ---------- ---------------- ---------------------------------------------------------------------------- + 1457733 ovs-vswitchd [SYSCALL STATISTICS] + ... + 1457792 revalidator55 [SYSCALL STATISTICS] + NAME NUMBER COUNT TOTAL ns MAX ns + recvmsg 47 284 193,422,110 46,248,418 + sendto 44 2 46,685 23,665 + sendmsg 46 2 24,916 12,703 + write 1 1 6,534 6,534 + TOTAL( - poll): 289 193,500,245 + + [THREAD RUN STATISTICS] + SCHED_CNT TOTAL ns MIN ns MAX ns + 2 47,333,558 331,516 47,002,042 + + [THREAD READY STATISTICS] + SCHED_CNT TOTAL ns MAX ns + 3 87,000,403 45,999,712 + + [SOFT IRQ STATISTICS] + NAME VECT_NR COUNT TOTAL ns MAX ns + sched 7 2 9,504 5,109 + TOTAL: 2 9,504 + + +Here you can see the revalidator run took about 263ms, which does not look +odd, however, the ``THREAD READY STATISTICS`` information shows that OVS was +waiting 87ms for a CPU to be run on. This means the revalidator process could +have finished 87ms faster. Looking at the ``MAX ns`` value, a worst-case delay +of almost 46ms can be seen, which hints at an overloaded system. + +One final example that uses a ``uprobe`` to get some statistics on a +``bridge_run()`` execution that takes more than 1ms. + +.. 
code-block:: console + + $ sudo ./kernel_delay.py --start-trigger up:bridge_run --stop-trigger ur:bridge_run --trigger-delta 1000000 + # Start sampling (trigger@2245245432101270) @2023-06-14T16:21:10.467919 (14:21:10 UTC) + # Stop sampling (trigger@2245245432414656) @2023-06-14T16:21:10.468296 (14:21:10 UTC) + # Sample dump skipped, delta 313,386 ns @2023-06-14T16:21:10.468419 (14:21:10 UTC) + # Start sampling (trigger@2245245505301745) @2023-06-14T16:21:10.540970 (14:21:10 UTC) + # Stop sampling (trigger@2245245506911119) @2023-06-14T16:21:10.542499 (14:21:10 UTC) + # Triggered sample dump, stop-start delta 1,609,374 ns @2023-06-14T16:21:10.542565 (14:21:10 UTC) + TID THREAD + ---------- ---------------- ---------------------------------------------------------------------------- + 3371035 [SYSCALL STATISTICS] + ... + 3371102 handler66 [SYSCALL STATISTICS] + ... + 3366258 ovs-vswitchd [SYSCALL STATISTICS] + NAME NUMBER COUNT TOTAL ns MAX ns + futex 202 43 403,469 199,312 + clone3 435 13 174,394 30,731 + munmap 11 8 115,774 21,861 + poll 7 5 92,969 38,307 + unlink 87 2 49,918 35,741 + mprotect 10 8 47,618 13,201 + accept 43 10 31,360 6,976 + mmap 9 8 30,279 5,776 + write 1 6 27,720 11,774 + rt_sigprocmask 14 28 12,281 970 + read 0 6 9,478 2,318 + recvfrom 45 3 7,024 4,024 + sendto 44 1 4,684 4,684 + getrusage 98 5 4,594 1,342 + close 3 2 2,918 1,627 + recvmsg 47 1 2,722 2,722 + TOTAL( - poll): 144 924,233 + + [THREAD RUN STATISTICS] + SCHED_CNT TOTAL ns MIN ns MAX ns + 13 817,605 5,433 524,376 + + [THREAD READY STATISTICS] + SCHED_CNT TOTAL ns MAX ns + 14 28,646 11,566 + + [SOFT IRQ STATISTICS] + NAME VECT_NR COUNT TOTAL ns MAX ns + rcu 9 1 2,838 2,838 + TOTAL: 1 2,838 + + 3371110 revalidator74 [SYSCALL STATISTICS] + ... + 3366311 urcu3 [SYSCALL STATISTICS] + ... + + +OVS removed some of the threads and their resource-specific data, but based +on the ```` thread name, you can determine that some +threads no longer exist. 
In the ``ovs-vswitchd`` thread, you can see some +``clone3`` syscalls, indicating threads were created. In this example, it was +due to the deletion of a bridge, which resulted in the recreation of the +revalidator and handler threads. + + +Use with OpenShift +------------------ +This section describes how you would use the tool on a node in an OpenShift +cluster. It assumes you have console access to the node, either directly or +through a debug container. + +A base fedora38 container will be used through podman, as this will allow the +use of some additional tools and packages needed. + +First the containers need to be started: + +.. code-block:: console + + [core@localhost ~]$ sudo podman run -it --rm \ + -e PS1='[(DEBUG)\u@\h \W]\$ ' \ + --privileged --network=host --pid=host \ + -v /lib/modules:/lib/modules:ro \ + -v /sys/kernel/debug:/sys/kernel/debug \ + -v /proc:/proc \ + -v /:/mnt/rootdir \ + quay.io/fedora/fedora:38-x86_64 + + [(DEBUG)root@localhost /]# + + +Next add the ``kernel_delay.py`` dependencies: + +.. code-block:: console + + [(DEBUG)root@localhost /]# dnf install -y bcc-tools perl-interpreter \ + python3-pytz python3-psutil + + +You need to install the devel, debug and source RPMs for your OVS and kernel +version: + +.. code-block:: console + + [(DEBUG)root@localhost home]# rpm -i \ + openvswitch2.17-debuginfo-2.17.0-67.el8fdp.x86_64.rpm \ + openvswitch2.17-debugsource-2.17.0-67.el8fdp.x86_64.rpm \ + kernel-devel-4.18.0-372.41.1.el8_6.x86_64.rpm + + +Now the tool can be started. Here the above ``bridge_run()`` example is used: + +..
code-block:: console + + [(DEBUG)root@localhost home]# ./kernel_delay.py --start-trigger up:bridge_run --stop-trigger ur:bridge_run + # Start sampling (trigger@75279117343513) @2023-06-15T11:44:07.628372 (11:44:07 UTC) + # Stop sampling (trigger@75279117443980) @2023-06-15T11:44:07.628529 (11:44:07 UTC) + # Triggered sample dump, stop-start delta 100,467 ns @2023-06-15T11:44:07.628569 (11:44:07 UTC) + TID THREAD + ---------- ---------------- ---------------------------------------------------------------------------- + 1246 ovs-vswitchd [SYSCALL STATISTICS] + NAME NUMBER COUNT TOTAL ns MAX ns + getdents64 217 2 8,560 8,162 + openat 257 1 6,951 6,951 + accept 43 4 6,942 3,763 + recvfrom 45 1 3,726 3,726 + recvmsg 47 2 2,880 2,188 + stat 4 2 1,946 1,384 + close 3 1 1,393 1,393 + fstat 5 1 1,324 1,324 + TOTAL( - poll): 14 33,722 + + [THREAD RUN STATISTICS] + SCHED_CNT TOTAL ns MIN ns MAX ns + + [THREAD READY STATISTICS] + SCHED_CNT TOTAL ns MAX ns + + +.. rubric:: Footnotes + +.. [#BCC] https://github.com/iovisor/bcc +.. [#BCC_EVENT] https://github.com/iovisor/bcc/blob/master/docs/reference_guide.md#events--arguments +.. [#REVAL_BLOG] https://developers.redhat.com/articles/2022/10/19/open-vswitch-revalidator-process-explained diff --git a/utilities/usdt-scripts/reval_monitor.py b/utilities/usdt-scripts/reval_monitor.py new file mode 100755 index 00000000000..5f69998c806 --- /dev/null +++ b/utilities/usdt-scripts/reval_monitor.py @@ -0,0 +1,894 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2022 Red Hat, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# +# Script information: +# ------------------- +# reval_monitor.py uses various user-defined tracepoints to get all the +# revalidator-process related variables and will display them in a (dynamic) +# graph. In addition, it will also dump the data to the console +# in a CSV format. Note that all the graphical output can be disabled. +# +# All the USDT events can be saved to a file and than can be used to +# replay the trace offline and still get the plots. +# +# The script can simple be invoked without any options, and it will try +# to find the running ovs-vswitchd instance: +# +# # ./reval_monitor.py +# # Starting trace @2022-09-20T04:07:43.588749 (08:07:43 UTC) +# ts_start, ts_complete, n_flows, n_reval_flows, avg_n_flows, max_n_flows, +# flow_limit, dump_duration, poll_wait, actual_wait +# 1741367714251645, 1741367714532545, 0, 0, 0, 10000, 69000, 1, 500, 500.52 +# 1741368215056961, 1741368215318223, 0, 0, 0, 10000, 69000, 1, 500, 500.55 +# 1741368715865871, 1741368716107089, 0, 0, 0, 10000, 69000, 1, 500, 499.48 +# ^C# Stopping trace @2022-09-20T04:07:49.893827 (08:07:49 UTC) +# +# IMPORTANT NOTE: This script only works when a single datapath is configured! +# 2nd IMPORTANT NOTE: ovs-vswitchd either needs to be built with debug info +# or the debug info package needs to be installed! +# +# The following are the available options: +# +# usage: reval_monitor.py [-h] [-c] [--buffer-page-count NUMBER] +# [-D [DEBUG]] [-g] [--no-ukey-count] +# [-p VSWITCHD_PID] [-P PAHOLE] [-r FILE] [-R] +# [-u SECONDS] [-w FILE] [-W FILE] +# +# options: +# -h, --help show this help message and exit +# -c, --compress-output +# Compress output, i.e. 
only dump changes in +# the dataset +# --buffer-page-count NUMBER +# Number of BPF ring buffer pages, default +# 1024 +# -D [DEBUG], --debug [DEBUG] +# Enable eBPF debugging +# -g, --no-gui Do not use the gui to display plots +# --no-ukey-count No revalidate_ukey() counting +# -p VSWITCHD_PID, --pid VSWITCHD_PID +# ovs-vswitch's PID +# -P PAHOLE, --pahole PAHOLE +# Pahole executable to use, default pahole +# -r FILE, --read-events FILE +# Read events from instead of +# installing tracepoints +# -R, --no-realtime-plots +# Do not show realtime plot while tracing +# -u SECONDS, --update-interval SECONDS +# Seconds to wait between real time update, +# default 1 +# -w FILE, --write-events FILE +# Write events to +# -W FILE, --write-charts FILE +# Write overall charts to .png + +# [-D [DEBUG]] [-g] [--no-ukey-count] +# [-p VSWITCHD_PID] [-r FILE] [-R] +# [-u SECONDS] [-w FILE] [-W FILE] +# +# The -g option disabled all GUI output of matplotlib, -R only disables the +# real-time plots. As real-time plots are rather slow, the -u option can be +# used to only update the graph every x seconds, which might speed up the +# processing. +# +# The --no-ukey-count option disables counting of the number of flows actually +# being revalidated against the current OpenFlow ruleset. This will not install +# the specific tracepoint which would be called for each flow being +# revalidated. +# +# What is plotted in the graphs (and dumped in the CSV output)? +# - n_flows: Number of flows active in the system. +# - n_reval_flows: Number of flows that where revalidated against the OpenFlow +# ruleset. +# - dump_duration: Time it took to dump and process all flows. +# - avg_n_flows: Average number of flows in the system. +# - max_n_flows: Maximum number of flows in the system. +# - flow_limit: Dynamic flow limit. +# - poll_wait: Time requested for the poll wait. +# - actual_wait: Time it took to be woken up. +# +# Dependencies: +# This script needs the 'readelf' binary to be available. 
In addition, it also +# needs pahole to be installed, and it needs a version that is equal or newer +# than the following commit on the next branch: +# +# https://git.kernel.org/pub/scm/devel/pahole/pahole.git/?h=next +# c55b13b9d785 ("WIP: Remove DW_TAG_atomic_type when encoding BTF") +# +# To use a locally compiled pahole the --pahole option can be used. +# For example: +# # ./reval_monitor.py --pahole ~/pahole/build/pahole -g +# Starting trace @2022-12-20T14:57:26.077815 (13:57:26 UTC) +# ts_start, ts_complete, n_flows, n_reval_flows, avg_n_flows, max_n_flows, \ +# flow_limit, dump_duration, poll_wait, actual_wait +# 4202771850983494, 4202771851472838, 0, 0, 0, 0, 10000, 1, 500, 15.06 +# 4202771866531996, 4202771867713366, 0, 0, 0, 0, 10000, 1, 500, 4.23 +# 4202771871941979, 4202771872749915, 0, 0, 0, 0, 10000, 1, 500, 500.02 +# 4202772372770361, 4202772373531820, 0, 0, 0, 0, 10000, 1, 500, 499.96 +# 4202772873487942, 4202772874514753, 0, 0, 0, 0, 10000, 1, 500, 500.01 +# 4202773374528435, 4202773375695054, 0, 0, 0, 0, 10000, 1, 500, 500.01 +# 4202773875701559, 4202773876880763, 0, 0, 0, 0, 10000, 1, 500, 500.04 +# 4202774376925058, 4202774377905799, 0, 0, 0, 0, 10000, 1, 500, 500.03 +# ^C# Stopping trace @2022-12-20T14:57:40.391730 (13:57:40 UTC) +# + +try: + from bcc import BPF, USDT, USDTException +except ModuleNotFoundError: + print("WARNING: Can't find the BPF Compiler Collection (BCC) tools!") + print(" This is NOT problem if you analyzing previously collected" + " data.\n") + +from collections import namedtuple +from enum import IntEnum +from pathlib import Path + +import argparse +import ast +import datetime +import re +import subprocess +import sys + +import pytz +import psutil +import matplotlib.pyplot as plt + +# +# Actual eBPF source code +# +EBPF_SOURCE = """ +#include + + + +enum { + +}; + +struct event_t { + u64 ts; + u32 pid; + u32 id; + u64 n_flows; + u32 avg_n_flows; + u32 max_n_flows; + u32 flow_limit; + u32 dump_duration; + u32 poll_wait; +}; 
+ + +BPF_RINGBUF_OUTPUT(events, ); +BPF_TABLE("percpu_array", uint32_t, uint64_t, dropcnt, 1); + +static struct event_t *get_event(uint32_t id) { + struct event_t *event = events.ringbuf_reserve(sizeof(struct event_t)); + + if (!event) { + dropcnt.increment(0); + return NULL; + } + + event->id = id; + event->ts = bpf_ktime_get_ns(); + event->pid = bpf_get_current_pid_tgid(); + + return event; +} + +int probe__start_dump(struct pt_regs *ctx) { + struct event_t *event = get_event(EVENT_START_DUMP); + if (!event) + return 1; + + events.ringbuf_submit(event, 0); + return 0; +}; + +int probe__sweep_done(struct pt_regs *ctx) { + struct udpif udpif; + + bpf_usdt_readarg_p(1, ctx, &udpif, sizeof(udpif)); + + struct event_t *event = get_event(EVENT_SWEEP_DONE); + if (!event) + return 1; + + event->avg_n_flows = udpif.avg_n_flows; + event->max_n_flows = udpif.max_n_flows; + event->flow_limit = udpif.flow_limit; + event->dump_duration = udpif.dump_duration; + + bpf_usdt_readarg(2, ctx, &event->n_flows); + bpf_usdt_readarg(3, ctx, &event->poll_wait); + + events.ringbuf_submit(event, 0); + return 0; +}; + +int probe__reval_entry(struct pt_regs *ctx) { + struct event_t *event = get_event(EVENT_REVAL_ENTRY); + if (!event) + return 1; + + events.ringbuf_submit(event, 0); + return 0; +}; +""" + + +# +# event_to_dict() +# +def event_to_dict(event): + return dict([(field, getattr(event, field)) + for (field, _) in event._fields_ + if isinstance(getattr(event, field), (int, bytes))]) + + +# +# print_csv_header() +# +def print_csv_header(): + print("ts_start, ts_complete, n_flows, n_reval_flows, avg_n_flows, " + "max_n_flows, flow_limit, dump_duration, poll_wait, actual_wait") + + +# +# Event enum +# +Event = IntEnum("Event", ["START_DUMP", + "SWEEP_DONE", + "REVAL_ENTRY"], start=0) + + +# +# process_event() +# +def process_event(ctx, data, size): + event = b['events'].event(data) + _process_event(event) + + +def _process_event(event): + global graph + + if export_file is not None: + 
export_file.write("event = {}\n".format(event_to_dict(event))) + + if event.id == Event.START_DUMP and not state['running']: + start = state["last_start"] + done = state["last_done"] + if done and start: + actual_wait = (event.ts - done.ts) / 1000000 + csv = "{}, {}, {}, {}, {}, {}, {}, {}, {}, {:.2f}".format( + start.ts, done.ts, done.n_flows, graph.ukey_count, + done.avg_n_flows, done.max_n_flows, done.flow_limit, + done.dump_duration, done.poll_wait, actual_wait) + + if graph.base_time == 0: + graph = graph._replace(base_time=done.ts) + + graph.time.append((done.ts - graph.base_time) / 1000000000) + graph.n_flows.append(done.n_flows) + graph.n_reval_flows.append(graph.ukey_count) + graph.avg_n_flows.append(done.avg_n_flows) + graph.max_n_flows.append(done.max_n_flows) + graph.flow_limit.append(done.flow_limit) + graph.dump_duration.append(done.dump_duration) + graph.poll_wait.append(done.poll_wait) + graph.actual_wait.append(actual_wait) + + if not options.no_gui and not options.no_realtime_plots: + updated_graph = dynamic_plot_update( + graph, refresh=options.update_interval) + if updated_graph is None: + raise KeyboardInterrupt + graph = updated_graph + + if options.compress_output: + last_csv = state["last_csv"] + if not last_csv or \ + csv.split(",")[2:-1] != last_csv.split(",")[2:-1] or \ + abs((event.ts - done.ts) / 1000000 - done.poll_wait) > 100: + print(csv) + else: + state["last_not_printed_csv"] = csv + + state["last_csv"] = csv + else: + print(csv) + + state["last_start"] = event + state['running'] = True + graph = graph._replace(ukey_count=0) + elif event.id == Event.SWEEP_DONE and state['running']: + state["last_done"] = event + state['running'] = False + elif event.id == Event.REVAL_ENTRY and state['running']: + graph = graph._replace(ukey_count=graph.ukey_count + 1) + + +# +# run_program() +# +def run_program(command): + try: + process = subprocess.run(command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + encoding='utf8', + check=True) 
+ + except subprocess.CalledProcessError as perror: + return perror.returncode, perror.stdout + + return 0, process.stdout + + +# +# get_ovs_definitions() +# +def get_ovs_definitions(objects, pahole="pahole", pid=None): + if pid is None: + raise ValueError("A valid pid value should be supplied!") + + if not isinstance(objects, list): + objects = [objects] + + if len(objects) == 0: + raise ValueError("Must supply at least one object!") + + vswitchd = Path("/proc/{}/exe".format(str(pid))).resolve() + + object_str = ','.join(objects) + + def run_pahole(debug_file): + error, result = run_program([pahole, "-C", object_str, "--compile", + debug_file]) + + if error: + if "pahole: {}: Invalid argument".format(debug_file) not in result: + print("ERROR: Pahole failed to get ovs-vswitchd data " + "structures!\n{}".format(re.sub('^', ' ' * 7, + result.rstrip(), + flags=re.MULTILINE))) + sys.exit(-1) + + return None + + if bool(re.search("pahole: type .* not found", result)): + return None + + return result + + def run_readelf(bin_file): + error, result = run_program(['readelf', "-n", + "--debug-dump=links", bin_file]) + + if error: + print("ERROR: Failed 'readelf' on \"{}\"!\n{}". + format(bin_file, re.sub('^', ' ' * 7, result, + flags=re.MULTILINE))) + sys.exit(-1) + + return result + + def get_debug_file(bin_file): + elf_result = run_readelf(bin_file) + match = re.search("Build ID: ([0-9a-fA-F]+)", elf_result) + if not match: + print("ERROR: Can't find build ID to read debug symbols!") + sys.exit(-1) + + dbg_file = "/usr/lib/debug/.build-id/{}/{}.debug".format( + match.group(1)[:2], match.group(1)[2:]) + + return dbg_file + + def get_from_shared_library(debug_file): + ovs_libs = ['libofproto', 'libopenvswitch', 'libovsdb', 'libsflow', + 'libvtep'] + error, ldd_result = run_program(['ldd', debug_file]) + + if error: + print("ERROR: Failed 'ldd' on \"{}\"!\n{}". 
+ format(debug_file, re.sub('^', ' ' * 7, ldd_result, + flags=re.MULTILINE))) + sys.exit(-1) + + for lib in ovs_libs: + match = re.search(r"^\s*{}.* => (.*) \(.*\)$".format(lib), + ldd_result, flags=re.MULTILINE) + if match is None: + continue + + result = run_pahole(match.group(1)) + if result is None: + result = run_pahole(get_debug_file(match.group(1))) + + if result: + return result + + return None + + # + # First try to find the debug data as part of the executable. + # + result = run_pahole(vswitchd) + + if result is None: + print("INFO: Failed to find debug info in \"{}\"!".format(vswitchd)) + + # + # Get additional .debug information if available. + # + dbg_file = get_debug_file(vswitchd) + result = run_pahole(dbg_file) + if result is None: + print("INFO: Failed to find debug info in \"{}\"!".format( + dbg_file)) + + # + # Try to get information from shared libraries if used. + # + result = get_from_shared_library(vswitchd) + + if result is None: + print("ERROR: Failed to find needed data structures through pahole!") + sys.exit(-1) + + # + # We need an empty _Atomic definition to avoid compiler complaints. + # + result = "#define _Atomic\n" + result + + # + # Remove the uint64_t definition as it conflicts with the kernel one. 
+ # + result = re.sub("^typedef.*uint64_t;$", "", result, flags=re.MULTILINE) + + return result + + +# +# next_power_of_two() +# +def next_power_of_two(val): + np = 1 + while np < val: + np *= 2 + return np + + +# +# dynamic_plot_init() +# +def dynamic_plot_init(real_time=True): + + if real_time: + lines = [] + fig, axs = plt.subplots(4, figsize=(19, 10)) + fig.suptitle('Revalidator Handling') + for ax in axs: + ax.grid() + + axs[0].set_ylabel("Numer of flows", weight='bold') + axs[1].set_ylabel("Time spend (ms)", weight='bold') + axs[2].set_ylabel("Numer of flows", weight='bold') + axs[3].set_ylabel("Time spend (ms)", weight='bold') + axs[3].set_xlabel("Time (seconds since start)", weight='bold') + + lines.append(axs[0].plot([], [], label="n_flows", marker='o')[0]) + lines.append(axs[0].plot([], [], label="n_reval_flows")[0]) + axs[0].legend(bbox_to_anchor=(1, 1), loc='upper left', + borderaxespad=0.5) + axs[0].set_xlim(0, 30) + axs[0].set_ylim(-4, 104) + + lines.append(axs[1].plot([], [], color="orange", + label="dump_duration")[0]) + axs[1].legend(bbox_to_anchor=(1, 1), + loc='upper left', borderaxespad=0.5) + axs[1].set_xlim(0, 30) + axs[1].set_ylim(-0.4, 10.4) + + lines.append(axs[2].plot([], [], label="avg_n_flows")[0]) + lines.append(axs[2].plot([], [], label="max_n_flows")[0]) + lines.append(axs[2].plot([], [], label="flow_limit")[0]) + axs[2].legend(bbox_to_anchor=(1, 1), loc='upper left', + borderaxespad=0.5) + axs[2].set_xlim(0, 30) + axs[2].set_ylim(-600, 15600) + + lines.append(axs[3].plot([], [], label="poll_wait")[0]) + lines.append(axs[3].plot([], [], label="actual_wait")[0]) + axs[3].legend(bbox_to_anchor=(1, 1), loc='upper left', + borderaxespad=0.5) + axs[3].set_xlim(0, 30) + axs[3].set_ylim(-20, 520) + + fig.tight_layout() + + plt.ion() + plt.show() + else: + fig = None + axs = None + lines = None + + graph_data = {"base_time": 0, + "l_index": 0, + "fig": fig, + "axs": axs, + "lines": lines, + "last_update": 0, + "ukey_count": 0, + "time": [], + 
"n_flows": [], + "n_reval_flows": [], + "avg_n_flows": [], + "max_n_flows": [], + "flow_limit": [], + "dump_duration": [], + "poll_wait": [], + "actual_wait": []} + + return namedtuple("GraphData", graph_data.keys())(*graph_data.values()) + + +# +# dynamic_plot_update() +# +def dynamic_plot_update(graph_data, refresh=1): + + if graph_data.last_update != 0 and \ + (graph_data.time[-1] - graph_data.last_update) < refresh: + return graph_data + + graph_data = graph_data._replace(last_update=graph_data.time[-1]) + + if (graph_data.time[-1] - graph_data.time[graph_data.l_index]) > 30: + for i in range(graph_data.l_index + 1, len(graph_data.time)): + if (graph_data.time[-1] - graph_data.time[i]) <= 30: + graph_data = graph_data._replace(l_index=i) + break + + for line in graph_data.lines: + line.set_xdata(graph_data.time[graph_data.l_index:]) + + graph_data.lines[0].set_ydata(graph_data.n_flows[graph_data.l_index:]) + graph_data.lines[1].set_ydata( + graph_data.n_reval_flows[graph_data.l_index:]) + graph_data.lines[2].set_ydata( + graph_data.dump_duration[graph_data.l_index:]) + graph_data.lines[3].set_ydata(graph_data.avg_n_flows[graph_data.l_index:]) + graph_data.lines[4].set_ydata(graph_data.max_n_flows[graph_data.l_index:]) + graph_data.lines[5].set_ydata(graph_data.flow_limit[graph_data.l_index:]) + graph_data.lines[6].set_ydata(graph_data.poll_wait[graph_data.l_index:]) + graph_data.lines[7].set_ydata(graph_data.actual_wait[graph_data.l_index:]) + + for ax in graph_data.axs: + if graph_data.l_index == 0: + ax.autoscale(enable=True, axis='y') + else: + ax.autoscale(enable=True) + + ax.relim(visible_only=True) + ax.autoscale_view(tight=True, scalex=True, scaley=True) + + try: + graph_data.fig.canvas.draw() + graph_data.fig.canvas.flush_events() + except KeyboardInterrupt: + return None + + return graph_data + + +# +# show_graph() +# +def show_graph(graph_data, gui=False, file_name=None): + + if len(graph_data.time) == 0 or (not gui and file_name is None): + return + 
+ plt.ioff() + + fig, (nf_ax, dd_ax, f_ax, t_ax) = plt.subplots(4, figsize=(19, 10)) + fig.suptitle('Revalidator Handling') + nf_ax.grid() + f_ax.grid() + dd_ax.grid() + t_ax.grid() + + nf_ax.set_ylabel("Numer of flows", weight='bold') + f_ax.set_ylabel("Numer of flows", weight='bold') + dd_ax.set_ylabel("Time spend (ms)", weight='bold') + t_ax.set_ylabel("Time spend (ms)", weight='bold') + t_ax.set_xlabel("Time (seconds since start)", weight='bold') + + nf_ax.plot(graph_data.time, graph_data.n_flows, label="n_flows") + nf_ax.plot(graph_data.time, graph_data.n_reval_flows, + label="n_reval_flows") + nf_ax.legend(bbox_to_anchor=(1, 1), loc='upper left', borderaxespad=0.5) + + dd_ax.plot(graph_data.time, graph_data.dump_duration, color="orange", + label="dump_duration") + dd_ax.legend(bbox_to_anchor=(1, 1), loc='upper left', borderaxespad=0.5) + + f_ax.plot(graph_data.time, graph_data.avg_n_flows, label="avg_n_flows") + f_ax.plot(graph_data.time, graph_data.max_n_flows, label="max_n_flows") + f_ax.plot(graph_data.time, graph_data.flow_limit, label="flow_limit") + f_ax.legend(bbox_to_anchor=(1, 1), loc='upper left', borderaxespad=0.5) + + t_ax.plot(graph_data.time, graph_data.poll_wait, label="poll_wait") + t_ax.plot(graph_data.time, graph_data.actual_wait, label="actual_wait") + t_ax.legend(bbox_to_anchor=(1, 1), loc='upper left', borderaxespad=0.5) + + fig.tight_layout() + + if file_name is not None and file_name != "": + fig.savefig(file_name + '.png') + + if gui: + try: + plt.show() + except KeyboardInterrupt: + pass + + plt.close(fig) + + +# +# process_events_from_file() +# +def process_events_from_file(file_name): + try: + with open(file_name, 'r') as fd: + print("- Reading events from \"{}\"...".format(file_name)) + + print_csv_header() + for entry in fd: + entry.rstrip() + if entry.startswith('event = {'): + event = ast.literal_eval(entry[8:]) + event = namedtuple("EventObject", + event.keys())(*event.values()) + try: + _process_event(event) + except 
KeyboardInterrupt: + break + + except (FileNotFoundError, PermissionError): + print("ERROR: Can't open file \"{}\" for reading!".format(file_name)) + sys.exit(-1) + + show_graph(graph, gui=not options.no_gui, file_name=options.write_charts) + + +# +# main() +# +def main(): + # + # Don't like these globals, but ctx passing does not seem to work with the + # existing open_ring_buffer() API :( + # + global b + global export_file + global options + global state + global graph + + # + # Argument parsing + # + parser = argparse.ArgumentParser() + + parser.add_argument("-c", "--compress-output", action="store_true", + help="Compress output, i.e. only dump changes in " + "the dataset") + parser.add_argument("--buffer-page-count", + help="Number of BPF ring buffer pages, default 1024", + type=int, default=1024, metavar="NUMBER") + parser.add_argument("-D", "--debug", + help="Enable eBPF debugging", + type=int, const=0x3f, default=0, nargs='?') + parser.add_argument("-g", "--no-gui", action="store_true", + help="Do not use the gui to display plots") + parser.add_argument("--no-ukey-count", action="store_true", + help="No revalidate_ukey() counting") + parser.add_argument("-p", "--pid", metavar="VSWITCHD_PID", + help="ovs-vswitch's PID", + type=int, default=None) + parser.add_argument("-P", "--pahole", metavar="PAHOLE", + help="Pahole executable to use, default pahole", + type=str, default="pahole") + parser.add_argument("-r", "--read-events", + help="Read events from instead of installing " + "tracepoints", type=str, default=None, metavar="FILE") + parser.add_argument("-R", "--no-realtime-plots", action="store_true", + help="Do not show realtime plot while tracing") + parser.add_argument("-u", "--update-interval", + help="Seconds to wait between real time update, " + "default 1", type=float, default=1, metavar="SECONDS") + parser.add_argument("-w", "--write-events", + help="Write events to ", + type=str, default=None, metavar="FILE") + parser.add_argument("-W", 
"--write-charts", + help="Write overall charts to .png", + type=str, default=None, metavar="FILE") + + options = parser.parse_args() + + # + # Find the PID of the ovs-vswitchd daemon if not specified. + # + if options.pid is None and options.read_events is None: + for proc in psutil.process_iter(): + if 'ovs-vswitchd' in proc.name(): + if options.pid is not None: + print("ERROR: Multiple ovs-vswitchd daemons running, " + "use the -p option!") + sys.exit(-1) + + options.pid = proc.pid + + # + # Error checking on input parameters. + # + if options.pid is None and options.read_events is None: + print("ERROR: Failed to find ovs-vswitchd's PID!") + sys.exit(-1) + + if options.read_events is not None and options.write_events is not None: + print("ERROR: Either supply the read or write events option, " + "not both!") + sys.exit(-1) + + options.buffer_page_count = next_power_of_two(options.buffer_page_count) + + # + # Define the state and graph. + # + state = {"last_start": None, + "last_done": None, + "running": False, + "last_csv": None, + "last_not_printed_csv": None} + + export_file = None + + graph = dynamic_plot_init(real_time=(not options.no_gui + and not options.no_realtime_plots)) + + # + # Process events from file if required. + # + if options.read_events is not None: + process_events_from_file(options.read_events) + sys.exit(0) + + # + # Open write handle if needed. + # + if options.write_events is not None: + try: + export_file = open(options.write_events, "w") + except (FileNotFoundError, IOError, PermissionError) as e: + print("ERROR: Can't create export file \"{}\": {}".format( + options.write_events, e.strerror)) + sys.exit(-1) + + # + # Attach the usdt probe. 
+ # + u = USDT(pid=int(options.pid)) + try: + u.enable_probe(probe="start_dump", fn_name="probe__start_dump") + u.enable_probe(probe="sweep_done", fn_name="probe__sweep_done") + if not options.no_ukey_count: + u.enable_probe(probe="revalidate_ukey__:entry", + fn_name="probe__reval_entry") + except USDTException as e: + print("ERROR: {}".format( + (re.sub('^', ' ' * 7, str(e), flags=re.MULTILINE)).strip(). + replace("--with-dtrace or --enable-dtrace", + "--enable-usdt-probes"))) + sys.exit(-1) + + # + # Attach probe to running process. + # + source = EBPF_SOURCE.replace("", "\n".join( + [" EVENT_{} = {},".format( + event.name, event.value) for event in Event])) + source = source.replace("", + str(options.buffer_page_count)) + source = source.replace("", + get_ovs_definitions("udpif", pid=options.pid, + pahole=options.pahole)) + + b = BPF(text=source, usdt_contexts=[u], debug=options.debug) + + # + # Print header. + # + ltz = datetime.datetime.now() + utc = ltz.astimezone(pytz.utc) + time_string = "# Starting trace @{} ({} UTC)".format( + ltz.isoformat(), utc.strftime("%H:%M:%S")) + + if export_file is not None: + export_file.write(time_string + "\n") + + print(time_string) + print_csv_header() + + # + # Process all events. + b['events'].open_ring_buffer(process_event) + while 1: + try: + b.ring_buffer_poll() + except KeyboardInterrupt: + break + + dropcnt = b.get_table("dropcnt") + for k in dropcnt.keys(): + count = dropcnt.sum(k).value + if k.value == 0 and count > 0: + print("\n# WARNING: Not all upcalls were captured, {} were " + "dropped!\n# Increase the BPF ring buffer size " + "with the --buffer-page-count option.".format(count)) + + # + # Display footer. 
+ # + if state["last_not_printed_csv"] is not None: + print(state["last_not_printed_csv"]) + + ltz = datetime.datetime.now() + utc = ltz.astimezone(pytz.utc) + time_string = "# Stopping trace @{} ({} UTC)".format( + ltz.isoformat(), utc.strftime("%H:%M:%S")) + + if export_file is not None: + export_file.write(time_string + "\n") + + print(time_string) + + # + # Close event file is used. + # + if options.write_events is not None: + export_file.close() + + # + # Do final graph if requested. + # + show_graph(graph, gui=not options.no_gui, file_name=options.write_charts) + + +# +# Start main() as the default entry point... +# +if __name__ == '__main__': + main() diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index 5f810270774..4d7c4f819f2 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -184,6 +184,8 @@ struct aa_mapping { /* Internal representation of conntrack zone configuration table in OVSDB. */ struct ct_zone { uint16_t zone_id; + int64_t limit; /* Limit of allowed entries. '-1' if not + * specified. */ struct simap tp; /* A map from timeout policy attribute to * timeout value. */ struct hmap_node node; /* Node in 'struct datapath' 'ct_zones' @@ -195,14 +197,15 @@ struct ct_zone { /* Internal representation of datapath configuration table in OVSDB. */ struct datapath { - char *type; /* Datapath type. */ - struct hmap ct_zones; /* Map of 'struct ct_zone' elements, indexed - * by 'zone'. */ - struct hmap_node node; /* Node in 'all_datapaths' hmap. */ - struct smap caps; /* Capabilities. */ - unsigned int last_used; /* The last idl_seqno that this 'datapath' - * used in OVSDB. This number is used for - * garbage collection. */ + char *type; /* Datapath type. */ + struct hmap ct_zones; /* Map of 'struct ct_zone' elements, + * indexed by 'zone'. */ + struct hmap_node node; /* Node in 'all_datapaths' hmap. */ + struct smap caps; /* Capabilities. */ + unsigned int last_used; /* The last idl_seqno that this 'datapath' + * used in OVSDB. 
This number is used for + * garbage collection. */ + int64_t ct_zone_default_limit; /* Default CT limit for all zones. */ }; /* All bridges, indexed by name. */ @@ -313,6 +316,7 @@ static void bridge_configure_mac_table(struct bridge *); static void bridge_configure_mcast_snooping(struct bridge *); static void bridge_configure_sflow(struct bridge *, int *sflow_bridge_number); static void bridge_configure_ipfix(struct bridge *); +static void bridge_configure_lsample(struct bridge *); static void bridge_configure_spanning_tree(struct bridge *); static void bridge_configure_tables(struct bridge *); static void bridge_configure_dp_desc(struct bridge *); @@ -698,6 +702,7 @@ ct_zone_alloc(uint16_t zone_id, struct ovsrec_ct_timeout_policy *tp_cfg) struct ct_zone *ct_zone = xzalloc(sizeof *ct_zone); ct_zone->zone_id = zone_id; + ct_zone->limit = -1; simap_init(&ct_zone->tp); get_timeout_policy_from_ovsrec(&ct_zone->tp, tp_cfg); return ct_zone; @@ -706,6 +711,14 @@ ct_zone_alloc(uint16_t zone_id, struct ovsrec_ct_timeout_policy *tp_cfg) static void ct_zone_remove_and_destroy(struct datapath *dp, struct ct_zone *ct_zone) { + if (!simap_is_empty(&ct_zone->tp)) { + ofproto_ct_del_zone_timeout_policy(dp->type, ct_zone->zone_id); + } + + if (ct_zone->limit > -1) { + ofproto_ct_zone_limit_update(dp->type, ct_zone->zone_id, NULL); + } + hmap_remove(&dp->ct_zones, &ct_zone->node); simap_destroy(&ct_zone->tp); free(ct_zone); @@ -742,6 +755,7 @@ datapath_create(const char *type) { struct datapath *dp = xzalloc(sizeof *dp); dp->type = xstrdup(type); + dp->ct_zone_default_limit = -1; hmap_init(&dp->ct_zones); hmap_insert(&all_datapaths, &dp->node, hash_string(type, 0)); smap_init(&dp->caps); @@ -758,6 +772,12 @@ datapath_destroy(struct datapath *dp) ct_zone_remove_and_destroy(dp, ct_zone); } + if (dp->ct_zone_default_limit > -1) { + ofproto_ct_zone_limit_update(dp->type, OVS_ZONE_LIMIT_DEFAULT_ZONE, + NULL); + } + + ofproto_ct_zone_limit_protection_update(dp->type, false); 
hmap_remove(&all_datapaths, &dp->node); hmap_destroy(&dp->ct_zones); free(dp->type); @@ -770,6 +790,7 @@ static void ct_zones_reconfigure(struct datapath *dp, struct ovsrec_datapath *dp_cfg) { struct ct_zone *ct_zone; + bool protected = false; /* Add new 'ct_zone's or update existing 'ct_zone's based on the database * state. */ @@ -779,29 +800,55 @@ ct_zones_reconfigure(struct datapath *dp, struct ovsrec_datapath *dp_cfg) struct ovsrec_ct_timeout_policy *tp_cfg = zone_cfg->timeout_policy; ct_zone = ct_zone_lookup(&dp->ct_zones, zone_id); - if (ct_zone) { - struct simap new_tp = SIMAP_INITIALIZER(&new_tp); - get_timeout_policy_from_ovsrec(&new_tp, tp_cfg); - if (update_timeout_policy(&ct_zone->tp, &new_tp)) { + if (!ct_zone) { + ct_zone = ct_zone_alloc(zone_id, tp_cfg); + hmap_insert(&dp->ct_zones, &ct_zone->node, hash_int(zone_id, 0)); + } + + struct simap new_tp = SIMAP_INITIALIZER(&new_tp); + get_timeout_policy_from_ovsrec(&new_tp, tp_cfg); + + if (update_timeout_policy(&ct_zone->tp, &new_tp)) { + if (simap_count(&ct_zone->tp)) { ofproto_ct_set_zone_timeout_policy(dp->type, ct_zone->zone_id, &ct_zone->tp); + } else { + ofproto_ct_del_zone_timeout_policy(dp->type, ct_zone->zone_id); } - } else { - ct_zone = ct_zone_alloc(zone_id, tp_cfg); - hmap_insert(&dp->ct_zones, &ct_zone->node, hash_int(zone_id, 0)); - ofproto_ct_set_zone_timeout_policy(dp->type, ct_zone->zone_id, - &ct_zone->tp); } + + int64_t desired_limit = zone_cfg->limit ? *zone_cfg->limit : -1; + if (ct_zone->limit != desired_limit) { + ofproto_ct_zone_limit_update(dp->type, zone_id, zone_cfg->limit); + ct_zone->limit = desired_limit; + } + ct_zone->last_used = idl_seqno; + + protected = protected || !!zone_cfg->limit; } /* Purge 'ct_zone's no longer found in the database. 
*/ HMAP_FOR_EACH_SAFE (ct_zone, node, &dp->ct_zones) { if (ct_zone->last_used != idl_seqno) { - ofproto_ct_del_zone_timeout_policy(dp->type, ct_zone->zone_id); ct_zone_remove_and_destroy(dp, ct_zone); } } + + /* Reconfigure default CT zone limit if needed. */ + int64_t default_limit = dp_cfg->ct_zone_default_limit + ? *dp_cfg->ct_zone_default_limit + : -1; + + if (dp->ct_zone_default_limit != default_limit) { + ofproto_ct_zone_limit_update(dp->type, OVS_ZONE_LIMIT_DEFAULT_ZONE, + dp_cfg->ct_zone_default_limit); + dp->ct_zone_default_limit = default_limit; + } + + protected = protected || !!dp_cfg->ct_zone_default_limit; + + ofproto_ct_zone_limit_protection_update(dp->type, protected); } static void @@ -868,6 +915,9 @@ bridge_reconfigure(const struct ovsrec_open_vswitch *ovs_cfg) ofproto_set_min_revalidate_pps( smap_get_uint(&ovs_cfg->other_config, "min-revalidate-pps", OFPROTO_MIN_REVALIDATE_PPS_DEFAULT)); + ofproto_set_offloaded_stats_delay( + smap_get_uint(&ovs_cfg->other_config, "offloaded-stats-delay", + OFPROTO_OFFLOADED_STATS_DELAY)); ofproto_set_vlan_limit(smap_get_int(&ovs_cfg->other_config, "vlan-limit", LEGACY_MAX_VLAN_HEADERS)); ofproto_set_bundle_idle_timeout(smap_get_uint(&ovs_cfg->other_config, @@ -876,6 +926,10 @@ bridge_reconfigure(const struct ovsrec_open_vswitch *ovs_cfg) smap_get_int(&ovs_cfg->other_config, "n-handler-threads", 0), smap_get_int(&ovs_cfg->other_config, "n-revalidator-threads", 0)); + ofproto_set_explicit_sampled_drops( + smap_get_bool(&ovs_cfg->other_config, "explicit-sampled-drops", + OFPROTO_EXPLICIT_SAMPLED_DROPS_DEFAULT)); + /* Destroy "struct bridge"s, "struct port"s, and "struct iface"s according * to 'ovs_cfg', with only very minimal configuration otherwise. 
* @@ -976,6 +1030,7 @@ bridge_reconfigure(const struct ovsrec_open_vswitch *ovs_cfg) bridge_configure_netflow(br); bridge_configure_sflow(br, &sflow_bridge_number); bridge_configure_ipfix(br); + bridge_configure_lsample(br); bridge_configure_spanning_tree(br); bridge_configure_tables(br); bridge_configure_dp_desc(br); @@ -1528,10 +1583,11 @@ ovsrec_ipfix_is_valid(const struct ovsrec_ipfix *ipfix) return ipfix && ipfix->n_targets > 0; } -/* Returns whether a Flow_Sample_Collector_Set row is valid. */ +/* Returns whether a Flow_Sample_Collector_Set row contains a valid IPFIX + * configuration. */ static bool -ovsrec_fscs_is_valid(const struct ovsrec_flow_sample_collector_set *fscs, - const struct bridge *br) +ovsrec_fscs_is_valid_ipfix(const struct ovsrec_flow_sample_collector_set *fscs, + const struct bridge *br) { return ovsrec_ipfix_is_valid(fscs->ipfix) && fscs->bridge == br->cfg; } @@ -1549,7 +1605,7 @@ bridge_configure_ipfix(struct bridge *br) const char *virtual_obs_id; OVSREC_FLOW_SAMPLE_COLLECTOR_SET_FOR_EACH(fe_cfg, idl) { - if (ovsrec_fscs_is_valid(fe_cfg, br)) { + if (ovsrec_fscs_is_valid_ipfix(fe_cfg, br)) { n_fe_opts++; } } @@ -1582,15 +1638,26 @@ bridge_configure_ipfix(struct bridge *br) if (be_cfg->cache_max_flows) { be_opts.cache_max_flows = *be_cfg->cache_max_flows; } + if (be_cfg->stats_interval) { + be_opts.stats_interval = *be_cfg->stats_interval; + } else { + be_opts.stats_interval = OFPROTO_IPFIX_DEFAULT_TEMPLATE_INTERVAL; + } + if (be_cfg->template_interval) { + be_opts.template_interval = *be_cfg->template_interval; + } else { + be_opts.template_interval = + OFPROTO_IPFIX_DEFAULT_TEMPLATE_INTERVAL; + } be_opts.enable_tunnel_sampling = smap_get_bool(&be_cfg->other_config, "enable-tunnel-sampling", true); - be_opts.enable_input_sampling = !smap_get_bool(&be_cfg->other_config, - "enable-input-sampling", false); + be_opts.enable_input_sampling = smap_get_bool(&be_cfg->other_config, + "enable-input-sampling", true); - 
be_opts.enable_output_sampling = !smap_get_bool(&be_cfg->other_config, - "enable-output-sampling", false); + be_opts.enable_output_sampling = smap_get_bool(&be_cfg->other_config, + "enable-output-sampling", true); virtual_obs_id = smap_get(&be_cfg->other_config, "virtual_obs_id"); be_opts.virtual_obs_id = nullable_xstrdup(virtual_obs_id); @@ -1601,7 +1668,7 @@ bridge_configure_ipfix(struct bridge *br) fe_opts = xcalloc(n_fe_opts, sizeof *fe_opts); opts = fe_opts; OVSREC_FLOW_SAMPLE_COLLECTOR_SET_FOR_EACH(fe_cfg, idl) { - if (ovsrec_fscs_is_valid(fe_cfg, br)) { + if (ovsrec_fscs_is_valid_ipfix(fe_cfg, br)) { opts->collector_set_id = fe_cfg->id; sset_init(&opts->targets); sset_add_array(&opts->targets, fe_cfg->ipfix->targets, @@ -1610,6 +1677,12 @@ bridge_configure_ipfix(struct bridge *br) ? *fe_cfg->ipfix->cache_active_timeout : 0; opts->cache_max_flows = fe_cfg->ipfix->cache_max_flows ? *fe_cfg->ipfix->cache_max_flows : 0; + opts->stats_interval = fe_cfg->ipfix->stats_interval + ? *fe_cfg->ipfix->stats_interval + : OFPROTO_IPFIX_DEFAULT_TEMPLATE_INTERVAL; + opts->template_interval = fe_cfg->ipfix->template_interval + ? *fe_cfg->ipfix->template_interval + : OFPROTO_IPFIX_DEFAULT_TEMPLATE_INTERVAL; opts->enable_tunnel_sampling = smap_get_bool( &fe_cfg->ipfix->other_config, "enable-tunnel-sampling", true); @@ -1641,6 +1714,71 @@ bridge_configure_ipfix(struct bridge *br) } } +/* Returns whether a Flow_Sample_Collector_Set row contains a valid local + * sampling configuration. */ +static bool +ovsrec_fscs_is_valid_local(const struct ovsrec_flow_sample_collector_set *fscs, + const struct bridge *br) +{ + return fscs->local_group_id && fscs->n_local_group_id == 1 && + fscs->bridge == br->cfg; +} + +/* Set local sample configuration on 'br'. 
*/ +static void +bridge_configure_lsample(struct bridge *br) +{ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + const struct ovsrec_flow_sample_collector_set *fscs; + struct ofproto_lsample_options *opts_array, *opts; + size_t n_opts = 0; + int ret; + + /* Iterate the Flow_Sample_Collector_Set table twice. + * First to get the number of valid configuration entries, then to process + * each of them and build an array of options. */ + OVSREC_FLOW_SAMPLE_COLLECTOR_SET_FOR_EACH (fscs, idl) { + if (ovsrec_fscs_is_valid_local(fscs, br)) { + n_opts++; + } + } + + if (n_opts == 0) { + ofproto_set_local_sample(br->ofproto, NULL, 0); + return; + } + + opts_array = xcalloc(n_opts, sizeof *opts_array); + opts = opts_array; + + OVSREC_FLOW_SAMPLE_COLLECTOR_SET_FOR_EACH (fscs, idl) { + if (!ovsrec_fscs_is_valid_local(fscs, br)) { + continue; + } + opts->collector_set_id = fscs->id; + opts->group_id = *fscs->local_group_id; + opts++; + } + + ret = ofproto_set_local_sample(br->ofproto, opts_array, n_opts); + + if (ret == EOPNOTSUPP) { + if (n_opts) { + VLOG_WARN_RL(&rl, + "bridge %s: ignoring local sampling configuration: " + "not supported by this datapath", + br->name); + } + } else if (ret) { + VLOG_ERR_RL(&rl, "bridge %s: error configuring local sampling: %s", + br->name, ovs_strerror(ret)); + } + + if (n_opts > 0) { + free(opts_array); + } +} + static void port_configure_stp(const struct ofproto *ofproto, struct port *port, struct ofproto_port_stp_settings *port_s, @@ -1714,11 +1852,12 @@ port_configure_stp(const struct ofproto *ofproto, struct port *port, if (config_str) { port_s->path_cost = strtoul(config_str, NULL, 10); } else { - enum netdev_features current; - unsigned int mbps; + uint32_t mbps; - netdev_get_features(iface->netdev, ¤t, NULL, NULL, NULL); - mbps = netdev_features_to_bps(current, 100 * 1000 * 1000) / 1000000; + netdev_get_speed(iface->netdev, &mbps, NULL); + if (!mbps) { + mbps = NETDEV_DEFAULT_BPS / 1000000; + } port_s->path_cost = 
stp_convert_speed_to_cost(mbps); } @@ -1797,11 +1936,12 @@ port_configure_rstp(const struct ofproto *ofproto, struct port *port, if (config_str) { port_s->path_cost = strtoul(config_str, NULL, 10); } else { - enum netdev_features current; - unsigned int mbps; + uint32_t mbps; - netdev_get_features(iface->netdev, ¤t, NULL, NULL, NULL); - mbps = netdev_features_to_bps(current, 100 * 1000 * 1000) / 1000000; + netdev_get_speed(iface->netdev, &mbps, NULL); + if (!mbps) { + mbps = NETDEV_DEFAULT_BPS / 1000000; + } port_s->path_cost = rstp_convert_speed_to_cost(mbps); } @@ -2626,6 +2766,7 @@ iface_refresh_netdev_status(struct iface *iface) struct eth_addr mac; int64_t bps, mtu_64, ifindex64, link_resets; int mtu, error; + uint32_t mbps; if (iface_is_synthetic(iface)) { return; @@ -2664,14 +2805,19 @@ iface_refresh_netdev_status(struct iface *iface) ovsrec_interface_set_link_resets(iface->cfg, &link_resets, 1); error = netdev_get_features(iface->netdev, ¤t, NULL, NULL, NULL); - bps = !error ? netdev_features_to_bps(current, 0) : 0; - if (bps) { + if (!error) { ovsrec_interface_set_duplex(iface->cfg, netdev_features_is_full_duplex(current) ? 
"full" : "half"); - ovsrec_interface_set_link_speed(iface->cfg, &bps, 1); } else { ovsrec_interface_set_duplex(iface->cfg, NULL); + } + + netdev_get_speed(iface->netdev, &mbps, NULL); + if (mbps) { + bps = mbps * 1000000ULL; + ovsrec_interface_set_link_speed(iface->cfg, &bps, 1); + } else { ovsrec_interface_set_link_speed(iface->cfg, NULL, 0); } @@ -2847,13 +2993,16 @@ iface_refresh_stats(struct iface *iface) IFACE_STAT(tx_512_to_1023_packets, "tx_512_to_1023_packets") \ IFACE_STAT(tx_1024_to_1522_packets, "tx_1024_to_1522_packets") \ IFACE_STAT(tx_1523_to_max_packets, "tx_1523_to_max_packets") \ + IFACE_STAT(multicast, "rx_multicast_packets") \ IFACE_STAT(tx_multicast_packets, "tx_multicast_packets") \ IFACE_STAT(rx_broadcast_packets, "rx_broadcast_packets") \ IFACE_STAT(tx_broadcast_packets, "tx_broadcast_packets") \ IFACE_STAT(rx_undersized_errors, "rx_undersized_errors") \ IFACE_STAT(rx_oversize_errors, "rx_oversize_errors") \ IFACE_STAT(rx_fragmented_errors, "rx_fragmented_errors") \ - IFACE_STAT(rx_jabber_errors, "rx_jabber_errors") + IFACE_STAT(rx_jabber_errors, "rx_jabber_errors") \ + IFACE_STAT(upcall_packets, "upcall_packets") \ + IFACE_STAT(upcall_errors, "upcall_errors") #define IFACE_STAT(MEMBER, NAME) + 1 enum { N_IFACE_STATS = IFACE_STATS }; @@ -3549,7 +3698,8 @@ bridge_run(void) vlog_enable_async(); - VLOG_INFO_ONCE("%s (Open vSwitch) %s", program_name, VERSION); + VLOG_INFO_ONCE("%s (Open vSwitch) %s", program_name, + VERSION VERSION_SUFFIX); } } @@ -5315,6 +5465,7 @@ mirror_configure(struct mirror *m) { const struct ovsrec_mirror *cfg = m->cfg; struct ofproto_mirror_settings s; + int ret; /* Set name. */ if (strcmp(cfg->name, m->name)) { @@ -5383,8 +5534,18 @@ mirror_configure(struct mirror *m) /* Get VLAN selection. */ s.src_vlans = vlan_bitmap_from_array(cfg->select_vlan, cfg->n_select_vlan); + /* Set the filter, mirror_set() will strdup this pointer. */ + s.filter = cfg->filter; + /* Configure. 
*/ - ofproto_mirror_register(m->bridge->ofproto, m, &s); + ret = ofproto_mirror_register(m->bridge->ofproto, m, &s); + if (ret == EOPNOTSUPP) { + VLOG_ERR("ofproto %s: does not support mirroring", + m->bridge->ofproto->name); + } else if (ret) { + VLOG_ERR("bridge %s: mirror %s configuration is invalid", + m->bridge->name, m->name); + } /* Clean up. */ if (s.srcs != s.dsts) { diff --git a/vswitchd/ovs-vswitchd.8.in b/vswitchd/ovs-vswitchd.8.in index 9569265fcb6..98e58951dcf 100644 --- a/vswitchd/ovs-vswitchd.8.in +++ b/vswitchd/ovs-vswitchd.8.in @@ -68,10 +68,11 @@ load the Open vSwitch kernel module. .PP .SH OPTIONS .IP "\fB\-\-mlockall\fR" -Causes \fBovs\-vswitchd\fR to call the \fBmlockall()\fR function, to -attempt to lock all of its process memory into physical RAM, -preventing the kernel from paging any of its memory to disk. This -helps to avoid networking interruptions due to system memory pressure. +Causes \fBovs\-vswitchd\fR to call the \fBmlockall()\fR function, to attempt to +lock all of its process memory into physical RAM on page faults (on allocation, +when running on Linux kernel 4.4 or older), preventing the kernel from paging +any of its memory to disk. This helps to avoid networking interruptions due to +system memory pressure. .IP Some systems do not support \fBmlockall()\fR at all, and other systems only allow privileged users, such as the superuser, to use it. @@ -81,6 +82,15 @@ unavailable or unsuccessful. .SS "DPDK Options" For details on initializing \fBovs\-vswitchd\fR to use DPDK ports, refer to the documentation or \fBovs\-vswitchd.conf.db\fR(5). +.SS "DPDK HW Access Options" +.IP "\fB\-\-hw\-rawio\-access\fR" +Tells \fBovs\-vswitchd\fR to retain the \fBCAP_SYS_RAWIO\fR capability, +to allow userspace drivers access to raw hardware memory. This will +also allow the \fBovs\-vswitchd\fR daemon to call \fBiopl()\fR and +\fBioperm()\fR functions as well as access memory devices to set port +access. 
This is a \fBvery\fR powerful capability, so generally only +enable as needed for specific hardware (for example mlx5 with full +hardware offload via rte_flow). .SS "Daemon Options" .ds DD \ \fBovs\-vswitchd\fR detaches only after it has connected to the \ diff --git a/vswitchd/ovs-vswitchd.c b/vswitchd/ovs-vswitchd.c index ac36165860a..be68ff7f7ef 100644 --- a/vswitchd/ovs-vswitchd.c +++ b/vswitchd/ovs-vswitchd.c @@ -60,28 +60,32 @@ VLOG_DEFINE_THIS_MODULE(vswitchd); -/* --mlockall: If set, locks all process memory into physical RAM, preventing +/* --mlockall: If set, locks all present process memory pages into physical + * RAM and all the new pages the moment they are faulted in, preventing * the kernel from paging any of its memory to disk. */ static bool want_mlockall; +/* --hw-rawio-access: If set, retains CAP_SYS_RAWIO privileges. */ +static bool hw_rawio_access; + static unixctl_cb_func ovs_vswitchd_exit; static char *parse_options(int argc, char *argv[], char **unixctl_path); OVS_NO_RETURN static void usage(void); -struct ovs_vswitchd_exit_args { - bool *exiting; - bool *cleanup; -}; +static struct ovs_vswitchd_exit_args { + struct unixctl_conn **conns; + size_t n_conns; + bool exiting; + bool cleanup; +} exit_args; int main(int argc, char *argv[]) { - char *unixctl_path = NULL; struct unixctl_server *unixctl; + char *unixctl_path = NULL; char *remote; - bool exiting, cleanup; - struct ovs_vswitchd_exit_args exit_args = {&exiting, &cleanup}; int retval; set_program_name(argv[0]); @@ -93,14 +97,20 @@ main(int argc, char *argv[]) remote = parse_options(argc, argv, &unixctl_path); fatal_ignore_sigpipe(); - daemonize_start(true); + daemonize_start(true, hw_rawio_access); if (want_mlockall) { #ifdef HAVE_MLOCKALL - if (mlockall(MCL_CURRENT | MCL_FUTURE)) { - VLOG_ERR("mlockall failed: %s", ovs_strerror(errno)); - } else { - set_memory_locked(); +/* MCL_ONFAULT introduced in Linux kernel 4.4. 
*/ +#ifndef MCL_ONFAULT +#define MCL_ONFAULT 4 +#endif + if (mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) { + if (mlockall(MCL_CURRENT | MCL_FUTURE)) { + VLOG_ERR("mlockall failed: %s", ovs_strerror(errno)); + } else { + set_all_memory_locked(); + } } #else VLOG_ERR("mlockall not supported on this system"); @@ -112,14 +122,12 @@ main(int argc, char *argv[]) exit(EXIT_FAILURE); } unixctl_command_register("exit", "[--cleanup]", 0, 1, - ovs_vswitchd_exit, &exit_args); + ovs_vswitchd_exit, NULL); bridge_init(remote); free(remote); - exiting = false; - cleanup = false; - while (!exiting) { + while (!exit_args.exiting) { OVS_USDT_PROBE(main, run_start); memory_run(); if (memory_should_report()) { @@ -138,16 +146,22 @@ main(int argc, char *argv[]) bridge_wait(); unixctl_server_wait(unixctl); netdev_wait(); - if (exiting) { + if (exit_args.exiting) { poll_immediate_wake(); } OVS_USDT_PROBE(main, poll_block); poll_block(); if (should_service_stop()) { - exiting = true; + exit_args.exiting = true; } } - bridge_exit(cleanup); + bridge_exit(exit_args.cleanup); + + for (size_t i = 0; i < exit_args.n_conns; i++) { + unixctl_command_reply(exit_args.conns[i], NULL); + } + free(exit_args.conns); + unixctl_server_destroy(unixctl); service_stop(); vlog_disable_async(); @@ -173,6 +187,7 @@ parse_options(int argc, char *argv[], char **unixctl_pathp) OPT_DPDK, SSL_OPTION_ENUMS, OPT_DUMMY_NUMA, + OPT_HW_RAWIO_ACCESS, #if defined(P4OVS) OPT_GRPC_ADDR, #endif @@ -192,6 +207,7 @@ parse_options(int argc, char *argv[], char **unixctl_pathp) {"disable-system-route", no_argument, NULL, OPT_DISABLE_SYSTEM_ROUTE}, {"dpdk", optional_argument, NULL, OPT_DPDK}, {"dummy-numa", required_argument, NULL, OPT_DUMMY_NUMA}, + {"hw-rawio-access", no_argument, NULL, OPT_HW_RAWIO_ACCESS}, #if defined(P4OVS) {"grpc-addr", optional_argument, NULL, 'g'}, #endif @@ -259,6 +275,10 @@ parse_options(int argc, char *argv[], char **unixctl_pathp) ovs_numa_set_dummy(optarg); break; + case OPT_HW_RAWIO_ACCESS: + 
hw_rawio_access = true; + break; + #if defined(P4OVS) case OPT_GRPC_ADDR: case 'g': @@ -317,10 +337,14 @@ usage(void) static void ovs_vswitchd_exit(struct unixctl_conn *conn, int argc, - const char *argv[], void *exit_args_) + const char *argv[], void *args OVS_UNUSED) { - struct ovs_vswitchd_exit_args *exit_args = exit_args_; - *exit_args->exiting = true; - *exit_args->cleanup = argc == 2 && !strcmp(argv[1], "--cleanup"); - unixctl_command_reply(conn, NULL); + exit_args.n_conns++; + exit_args.conns = xrealloc(exit_args.conns, + exit_args.n_conns * sizeof *exit_args.conns); + exit_args.conns[exit_args.n_conns - 1] = conn; + exit_args.exiting = true; + if (!exit_args.cleanup) { + exit_args.cleanup = argc == 2 && !strcmp(argv[1], "--cleanup"); + } } diff --git a/vswitchd/vswitch.ovsschema b/vswitchd/vswitch.ovsschema index 4873cfde72d..68689fe2a30 100644 --- a/vswitchd/vswitch.ovsschema +++ b/vswitchd/vswitch.ovsschema @@ -1,6 +1,6 @@ {"name": "Open_vSwitch", - "version": "8.3.0", - "cksum": "3781850481 26690", + "version": "8.7.0", + "cksum": "3751637058 27869", "tables": { "Open_vSwitch": { "columns": { @@ -280,7 +280,8 @@ "min": 0, "max": "unlimited"}}, "bfd_status": { "type": {"key": "string", "value": "string", - "min": 0, "max": "unlimited"}}, + "min": 0, "max": "unlimited"}, + "ephemeral": true}, "cfm_mpid": { "type": { "key": {"type": "integer"}, @@ -460,6 +461,9 @@ "type": {"key": "string", "value": "integer", "min": 0, "max": "unlimited"}, "ephemeral": true}, + "filter": { + "type": {"key": {"type": "string"}, + "min": 0, "max": 1}}, "external_ids": { "type": {"key": "string", "value": "string", "min": 0, "max": "unlimited"}}}}, @@ -530,6 +534,16 @@ "minInteger": 0, "maxInteger": 4294967295}, "min": 0, "max": 1}}, + "stats_interval": { + "type": {"key": {"type": "integer", + "minInteger": 1, + "maxInteger": 3600}, + "min": 0, "max": 1}}, + "template_interval": { + "type": {"key": {"type": "integer", + "minInteger": 1, + "maxInteger": 3600}, + "min": 0, 
"max": 1}}, "other_config": { "type": {"key": "string", "value": "string", "min": 0, "max": "unlimited"}}, @@ -551,6 +565,11 @@ "type": {"key": {"type": "uuid", "refTable": "IPFIX"}, "min": 0, "max": 1}}, + "local_group_id": { + "type": {"key": {"type": "integer", + "minInteger": 0, + "maxInteger": 4294967295}, + "min": 0, "max": 1}}, "external_ids": { "type": {"key": "string", "value": "string", "min": 0, "max": "unlimited"}}}, @@ -659,6 +678,11 @@ "capabilities": { "type": {"key": "string", "value": "string", "min": 0, "max": "unlimited"}}, + "ct_zone_default_limit": { + "type": { "key": {"type": "integer", + "minInteger": 0, + "maxInteger": 4294967295}, + "min": 0, "max": 1}}, "external_ids": { "type": {"key": "string", "value": "string", "min": 0, "max": "unlimited"}}}}, @@ -668,6 +692,11 @@ "type": {"key": {"type": "uuid", "refTable": "CT_Timeout_Policy"}, "min": 0, "max": 1}}, + "limit": { + "type": { "key": {"type": "integer", + "minInteger": 0, + "maxInteger": 4294967295}, + "min": 0, "max": 1}}, "external_ids": { "type": {"key": "string", "value": "string", "min": 0, "max": "unlimited"}}}}, diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index 36388e3c42d..36cb4e49516 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -205,16 +205,30 @@ + type='{"type": "integer", "minInteger": 0}'>

Set minimum pps that flow must have in order to be revalidated when revalidation duration exceeds half of max-revalidator config variable. + Setting to 0 means always revalidate flows regardless of pps.

The default is 5.

+ +

+ Set worst case delay (in ms) it might take before statistics of + offloaded flows are updated. Offloaded flows younger than this + delay will always be revalidated regardless of + . +

+

+ The default is 2000. +

+
+

@@ -788,6 +802,53 @@ The default value is 25%.

+ +

+ Specifies the maximum sleep time that will be requested in + microseconds per iteration for a PMD thread which has received zero + or a small amount of packets from the Rx queues it is polling. +

+

+ The actual sleep time requested is based on the load + of the Rx queues that the PMD polls and may be less than + the maximum value. +

+

+ The default value is 0 microseconds, which means + that the PMD will not sleep regardless of the load from the + Rx queues that it polls. +

+

+ The maximum value is 10000 microseconds. +

+

+ other_config:pmd-sleep-max=<pmd-sleep-list> +

+

where

+

+

    +
  • + <pmd-sleep-list> ::= NULL | <non-empty-list> +
  • +
  • + <non-empty-list> ::= <pmd-sleep-value> | + <pmd-sleep-value> , + <non-empty-list> +
  • +
  • + <pmd-sleep-value> ::= <global-default-sleep-value> | + <pmd-core-sleep-pair> +
  • +
  • + <global-default-sleep-value> ::= <max-sleep-time> +
  • +
  • + <pmd-core-sleep-pair> ::= <core> : + <max-sleep-time> +
  • +
+

+

@@ -808,6 +869,30 @@ The feature is considered experimental.

+ + +

+ When a flow is installed in the datapath with an empty action list, + it indicates an implicit "drop" action. Most datapaths report this + event for statistics and monitoring (in datapath-specific ways). +

+

+ However, if any of the per-bridge or per-flow sampling functionalities + are enabled (e.g: sFlow, IPFIX, local sampling), the action list might + not be empty, but contain an action to implement such functionality. + This makes the datapaths not report the packet drop. +

+

+ This knob makes Open vSwitch detect when the last datapath action + comes from these sampling features and add an explicit drop action at + the end to keep drop statistics accurate. +

+

+ The default value is false. +

+
+ @@ -2098,7 +2183,7 @@ - If a slave interface with this name exists in the bond and + If a member interface with this name exists in the bond and is up, it will be made active. Relevant only when is active-backup or if balance-tcp falls back @@ -2350,7 +2435,7 @@ lowest port-id is elected as the root. - The port path cost. The Port's contribution, when it is the Root Port, to the Root Path Cost for the Bridge. By default the @@ -2809,6 +2894,16 @@

+
srv6
+
+

+ Segment Routing IPv6 (SRv6) tunnel encapsulates L3 traffic as + "IPv6 in IPv6" or "IPv4 in IPv6" with Segment Routing Header (SRH) + defined in RFC 8754. The segment list in SRH can be set using a + SRv6 specific option. +

+
+
@@ -2817,8 +2912,8 @@

These options apply to interfaces with of geneve, bareudp, gre, - ip6gre, vxlan, lisp and - stt. + ip6gre, vxlan, lisp, + stt and srv6.

@@ -2831,7 +2926,8 @@ considered more specific than if a port defines one and another port defines the other. is not applicable for bareudp - tunnels. Hence it is not considered while identifying a bareudp tunnel. + and srv6 tunnels. Hence it is not considered while identifying + bareudp or srv6 tunnels.

@@ -2899,8 +2995,9 @@

- Optional, not applicable for bareudp. The key that - received packets must contain, one of: + Optional, not applicable for bareudp and + srv6. The key that received packets must contain, + one of:

    @@ -2929,8 +3026,9 @@

    - Optional, not applicable for bareudp. The key to be set - on outgoing packets, one of: + Optional, not applicable for bareudp and + srv6. The key to be set on outgoing packets, + one of:

      @@ -3133,9 +3231,15 @@

      Optional. Compute encapsulation header (either GRE or UDP) - checksums on outgoing packets. Default is disabled, set to - true to enable. Checksums present on incoming - packets will be validated regardless of this setting. + checksums on outgoing packets. When unset (the default value), + checksum computing for outgoing packets is enabled for UDP IPv6 + tunnels, and disabled for GRE and IPv4 UDP tunnels. When set to + false, no checksums will be computed for outgoing + tunnel encapsulation headers. When true, checksums + will be computed for all outgoing tunnel encapsulation headers. + Checksums present on incoming packets will be validated + regardless of this setting. Incoming packets without a checksum + will also be accepted regardless of this setting.

      @@ -3228,6 +3332,44 @@ + + +

      + Specifies the segment list in Segment Routing Header (SRH). + It consists of a comma-separated list of segments represented + in IPv6 format, e.g. "fc00:100::1,fc00:200::1,fc00:300::1". + Note that the first segment must be the same as + . +

      +
      + +

      + Optional. + This option controls how flowlabel in outer IPv6 header is + configured. It gives the benefit of IPv6 flow label based + load balancing, which is supported by some popular vendor + appliances. Like the net.ipv6.seg6_flowlabel sysctl, it is + one of the three values below: +

      +
        +
      • + By default, or if this option is copy, copy the + flowlabel of inner IPv6 header to the flowlabel of outer IPv6 + header. If inner header is not IPv6, it is set to 0. +
      • +
      • + If this option is zero, simply set flowlabel to 0. +
      • +
      • + If this option is compute, set flowlabel to a hash + over the L3/L4 fields of the inner packet. +
      • +
      +
      + +

      These options apply only to patch ports, that is, interfaces @@ -3430,6 +3572,50 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \

      This option may only be used with dpdk VF representors.

      + +

      + Configure hardware Rx queue steering policy. +

      +

      + This option takes one of the following values: +

      +
      +
      rss
      +
      + Distribution of ingress packets in all Rx queues according to the + RSS algorithm. This is the default behaviour. +
      +
      rss+lacp
      +
      + Distribution of ingress packets according to the RSS algorithm on + all but the last Rx queue. An extra Rx queue is allocated for LACP + packets. +
      +
      +

      + If the user has already configured multiple on the port, an additional one will + be allocated for the specified protocols. Even if the hardware cannot + satisfy the requested number of Rx queues, the last Rx + queue will be used. If only one Rx queue is available or if the + hardware does not support the rte_flow matchers/actions required to + redirect the selected protocols, custom rx-steering will + fall back to default rss mode. +

      +

      + This feature is mutually exclusive with + + as it may conflict with the offloaded flows. If both are enabled, + rx-steering will fall back to default rss + mode. +

      +

      + This option is only applicable to interfaces with type + dpdk. +

      +
      + @@ -3658,6 +3844,18 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \ Maximum number of VMDq pools. + + Number of Rx queues. + + + + Number of Tx queues. + + + + Whether Rx Checksum offload is enabled or not. + + Interface type ID according to IANA ifTYPE MIB definitions. @@ -3666,14 +3864,73 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \ Interface description string. - - Vendor ID of PCI device. + + Bus name and bus info such as Vendor ID and Device ID of PCI + device. + + + + Ethernet address set for this VF interface. Only reported for dpdk + VF representors. + + + + Hardware Rx queue steering policy in use. - - Device ID of PCI device. + + ID of rx steering queue. Only reported if rx-steering + is supported by hardware. + + IDs of rss queues. Only reported if rx-steering is + supported by hardware. + +
      + + +

      + dpdkvhostuser and dpdkvhostuserclient + netdev specific interface status information. +

      + + client (connecting) or server (listening) in the socket + communication. + + + virtio features bitmap as per virtio specification. + + + The number of available virtqueues. + + + The numa id of the device and guest memory. + + + The path to the socket used for communication. + + + Status of connection to the device. + + + Each virtqueue will have its size reported, where n is the + virtqueue number from 0..(num_of_vrings-1). + + + Whether userspace-tso is enabled or disabled. + +
      + + +

      + AF_XDP specific interface status options. +

      + + + XDP mode currently in use. See for description of possible values. +
      @@ -4420,12 +4677,12 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \

      - Set this value to true to configure interrupt mode for - Link State Change (LSC) detection instead of poll mode for the DPDK - interface. + Set this value to false to configure poll mode for + Link State Change (LSC) detection instead of interrupt mode for the + DPDK interface.

      - If this value is not set, poll mode is configured. + If this value is not set, interrupt mode is configured.

      This parameter has an effect only on netdev dpdk interfaces. @@ -4776,7 +5033,7 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \ Maximum rate shared by all queued traffic, in bit/s. Optional. If not specified, for physical interfaces, the default is the link rate. For other interfaces or if the link rate cannot be determined, the default - is currently 100 Mbps. + is currently 10 Gbps. @@ -4872,6 +5129,10 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \ Adds an independent loss probability to the packets outgoing from the chosen network interface. + + Adds the provided jitter to the latency of packets outgoing from the + chosen network interface. The jitter value is expressed in microseconds. + @@ -5025,6 +5286,21 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \ VLANs on which packets are selected for mirroring. An empty set selects packets on all VLANs. + +

      + When set, only packets that match are + selected for mirroring. Packets that do not match are ignored + by this mirror. The syntax is described + in ovs-fields(7). However, the in_port + field is not supported; should be + used to limit the mirror to a source port. +

      +

      + This filter is applied after , , , and + . +

      +
      @@ -6060,7 +6336,7 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \ and avoids recirculation of packet in datapath. It is supported only for balance-tcp bond mode in netdev datapath. The new action gives higher performance by using bond buckets instead of post - recirculation flows for selection of slave port from bond. By default + recirculation flows for selection of member port from bond. By default this new action is disabled, however it can be enabled by setting in table. @@ -6274,8 +6550,27 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \ translated to an ephemeral port. If there is no collision, no SNAT is performed. + + True if the datapath supports CT flush OpenFlow Nicira extension + called NXT_CT_FLUSH. The NXT_CT_FLUSH + extension allows flushing CT entries based on specified parameters. + + + True if the datapath supports OVS_ACTION_ATTR_PSAMPLE. If false, + local sampling will not be supported with the Linux kernel datapath. + + + Default connection tracking zone limit that is applied to all zones + that didn't specify the + explicitly. If the limit is unspecified the default limit + configuration for the datapath is left intact. The value 0 means + unlimited. + + The overall purpose of these columns is described under Common Columns at the beginning of this document. @@ -6292,6 +6587,12 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \ is not specified, it defaults to the timeout policy in the system. + + Connection tracking limit for this zone. If the limit is unspecified + the will be used. + The value 0 means unlimited. + + The overall purpose of these columns is described under Common Columns at the beginning of this document. @@ -6569,6 +6870,26 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \ disabled. + +

      + Interval (in seconds) for sending IPFIX exporting process statistics + according to IETF RFC 5101 Section 4.3. +

      +

      + Default value is 600 +

      +
      + + +

      + Interval (in seconds) for sending IPFIX Template information for each + Observation Domain ID. +

      +

      + Default value is 600 +

      +
      +

      @@ -6737,10 +7058,37 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \

- A set of IPFIX collectors of packet samples generated by OpenFlow - sample actions. This table is used only for IPFIX - flow-based sampling, not for per-bridge sampling (see the table for a description of the two forms). + A set of IPFIX or local sampling collectors of packet samples generated + by OpenFlow sample actions. +

+ +

+ If the column ipfix contains a reference to a + valid IPFIX entry, samples will be emitted via IPFIX. This mechanism + is known as flow-based IPFIX sampling, as opposed to bridge-based + sampling (see the table for a description of the + two forms). +

+ +

+ If the column local_group_id contains an integer and the + running datapath supports local sample emission, packets will be sent + to some local sample collector. Samples will contain the group number + specified by local_group_id which helps identify their + source as well as a 64-bit cookie resulting from the concatenation of the + observation_domain_id and the observation_point_id in network byte order. + + The way the sample is emitted and made available for local collectors + is datapath-specific. + + Currently only the Linux kernel datapath supports local sampling, which is + implemented by sending the packet to the psample netlink + multicast group. +

+ +

+ Note: both local_group_id and ipfix can be + configured simultaneously.

@@ -6759,6 +7107,12 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \ record per sampled packet to. + + Configuration of the sample group id to be used in local sampling. + + The overall purpose of these columns is described under Common Columns at the beginning of this document. diff --git a/vtep/vtep-ctl.c b/vtep/vtep-ctl.c index 99c4adcd53d..26b8540b4a6 100644 --- a/vtep/vtep-ctl.c +++ b/vtep/vtep-ctl.c @@ -1065,6 +1065,7 @@ vtep_ctl_context_populate_cache(struct ctl_context *ctx) continue; } ps = shash_find_data(&vtepctl_ctx->pswitches, ps_cfg->name); + ovs_assert(ps); for (j = 0; j < ps_cfg->n_ports; j++) { struct vteprec_physical_port *port_cfg = ps_cfg->ports[j]; struct vtep_ctl_port *port; @@ -1858,18 +1859,21 @@ del_mcast_entry(struct ctl_context *ctx, const char *encap, const char *dst_ip, bool local) { struct vtep_ctl_context *vtepctl_ctx = vtep_ctl_context_cast(ctx); + struct vteprec_physical_locator_set *ploc_set_cfg; + struct vteprec_physical_locator *ploc_cfg; struct vtep_ctl_mcast_mac *mcast_mac; struct shash *mcast_shash; - struct vteprec_physical_locator *ploc_cfg; - struct vteprec_physical_locator_set *ploc_set_cfg; + struct shash_node *mcast_node; mcast_shash = local ? &ls->mcast_local : &ls->mcast_remote; - mcast_mac = shash_find_data(mcast_shash, mac); - if (!mcast_mac) { + mcast_node = shash_find(mcast_shash, mac); + if (!mcast_node || !mcast_node->data) { return; } + mcast_mac = mcast_node->data; + ploc_cfg = find_ploc(vtepctl_ctx, encap, dst_ip); if (!ploc_cfg) { /* Couldn't find the physical locator, so just ignore. 
*/ @@ -1882,8 +1886,6 @@ del_mcast_entry(struct ctl_context *ctx, del_ploc_from_mcast_mac(mcast_mac, ploc_cfg); if (ovs_list_is_empty(&mcast_mac->locators)) { - struct shash_node *node = shash_find(mcast_shash, mac); - vteprec_physical_locator_set_delete(ploc_set_cfg); if (local) { @@ -1892,8 +1894,8 @@ del_mcast_entry(struct ctl_context *ctx, vteprec_mcast_macs_remote_delete(mcast_mac->remote_cfg); } - free(node->data); - shash_delete(mcast_shash, node); + free(mcast_node->data); + shash_delete(mcast_shash, mcast_node); } else { if (local) { vteprec_mcast_macs_local_set_locator_set(mcast_mac->local_cfg, @@ -2207,9 +2209,9 @@ static const struct ctl_table_class tables[VTEPREC_N_TABLES] = { static void vtep_ctl_context_init_command(struct vtep_ctl_context *vtepctl_ctx, - struct ctl_command *command) + struct ctl_command *command, bool last_command) { - ctl_context_init_command(&vtepctl_ctx->base, command); + ctl_context_init_command(&vtepctl_ctx->base, command, last_command); vtepctl_ctx->verified_ports = false; } @@ -2304,7 +2306,8 @@ do_vtep_ctl(const char *args, struct ctl_command *commands, } vtep_ctl_context_init(&vtepctl_ctx, NULL, idl, txn, vtep_global, symtab); for (c = commands; c < &commands[n_commands]; c++) { - vtep_ctl_context_init_command(&vtepctl_ctx, c); + vtep_ctl_context_init_command(&vtepctl_ctx, c, + c == &commands[n_commands - 1]); if (c->syntax->run) { (c->syntax->run)(&vtepctl_ctx.base); }