I have a few systems (workstations and servers) that I use to run multiple QEMU virtual machines for various purposes; all but one experience the same issue. When using a virtual network device for the guest — whether user-mode or a tap interface — the guest will freeze for several seconds at a time under high network load. This can sometimes disconnect the VM from the network for a minute or more. These systems run Debian, openSUSE, and Arch Linux of varying versions. The only system that does not experience this problem is a Debian 9 system with a single integrated Realtek NIC; all other systems use Intel NICs. Unfortunately I do not have access to that system to help diagnose my issue, as it was a former client's system.
Does anyone have experience with QEMU virtual network devices and/or Linux tap interfaces and have suggestions on how to improve this issue?
Workarounds that I use:
-
Cap network bandwidth
-
Use a physical NIC via vfio for each VM that needs high network bandwidth, this completely solves the issue but it not a practical solution for most of my use cases.
Remedies I have tried that brought little or no improvement:
-
QEMU I/O and vCPU thread isolation
-
Realtime Linux kernel with Realtime process priority
-
QEMU I/O and vCPU thread affinity pinning
-
Using virtio device model and guest driver
-
Every virtual network device model experiences the same problem
-
Using Message Signaled Interrupt mode for virtual NIC on guest
-
Try multiple guest OSes, issue present in Windows, Linux, MacOS, etc.
-
Try multiple host OSes, Issue present on all Linux Distros I have tried
-
Try multiple versions of QEMU
-
Disabling bonded network and using a single NIC
-
Using only 1 tap interface
-
Restrict VM to a single memory node and CPU
-
Using hugepages for QEMU VMs and memory preallocation
-
Removing dedicated router from network and using the host to route network traffic
-
Trying various MTU sizes
-
And more…
Below is the /etc/network/interfaces
configuration of one of my Debian 9 servers that experiences this problem: four bonded NICs that are bridged with the tap interfaces.
# /etc/network/interfaces - Debian 9 host: four physical NICs enslaved to
# bond0, which is bridged together with the QEMU tap interfaces on br0.
source /etc/network/interfaces.d/*
auto lo
iface lo inet loopback
# Physical NICs: no addresses of their own, each enslaved to bond0.
allow-hotplug enp7s0
auto enp7s0
iface enp7s0 inet manual
bond-master bond0
allow-hotplug enp8s0
auto enp8s0
iface enp8s0 inet manual
bond-master bond0
allow-hotplug enp131s0f0
auto enp131s0f0
iface enp131s0f0 inet manual
bond-master bond0
allow-hotplug enp131s0f1
auto enp131s0f1
iface enp131s0f1 inet manual
bond-master bond0
# bond-mode 0 = balance-rr (round-robin across all slaves).
# NOTE(review): balance-rr is the one bonding mode that can deliver a single
# flow's packets out of order, which triggers TCP retransmit stalls -
# plausibly related to the guest freezes described above; consider testing
# 802.3ad (LACP) or balance-tlb instead.
iface bond0 inet manual
bond-mode 0
bond-miimon 150
bond-downdelay 300
bond-updelay 300
# QEMU tap devices (one per guest NIC); created outside this file.
allow-hotplug tap0
allow-hotplug tap1
allow-hotplug tap2
allow-hotplug tap3
allow-hotplug tap4
allow-hotplug tap5
allow-hotplug tap6
allow-hotplug tap7
allow-hotplug tap8
allow-hotplug tap9
# Bridge carrying the bond and all taps; the host's own IP lives here.
auto br0
iface br0 inet static
pre-up ip link set bond0 up
bridge_ports bond0 tap0 tap1 tap2 tap3 tap4 tap5 tap6 tap7 tap8 tap9
bridge_stp on
bridge_maxwait 0
bridge_fd 0
# Pin the bridge MAC so it doesn't change as taps come and go.
hwaddress ether bc:ef:7b:03:32:21
address 10.0.0.100
netmask 255.255.255.0
network 10.0.0.0
gateway 10.0.0.1
A simplified version of a workstation's Windows 10 dGPU VM launch script.
#!/bin/bash
# Flush the page cache so enough contiguous free memory exists, then reserve
# 12 x 1GiB hugepages on NUMA node 1 for the guest. Must run as root.
sync; echo 1 > /proc/sys/vm/drop_caches
echo 12 > /sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages
# Give the kernel a moment to populate the pool before reading it back.
sleep 3
# MEM = number of 1GiB pages actually free; used below as the guest RAM size.
MEM=$(cat /sys/devices/system/node/node1/hugepages/hugepages-1048576kB/free_hugepages)
# Assemble the QEMU command line in $OPTS. Options are appended as plain text;
# $OPTS is later expanded unquoted so it word-splits into individual
# arguments (paths here must therefore stay free of whitespace).
# clear options
OPTS=""
# vm name
NAME="WIN10"
# CPU topology configuration
HOST_CPU_THREADS="8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31"
# qemu options
OPTS="$OPTS -name $NAME,debug-threads=on"
OPTS="$OPTS -serial none"
OPTS="$OPTS -parallel none"
OPTS="$OPTS -nodefaults"
OPTS="$OPTS -enable-kvm"
# kvm=off + hv_vendor_id hide the hypervisor signature from the guest;
# the hv_* enlightenments cheapen Windows timekeeping and spinlock handling.
OPTS="$OPTS -cpu host,kvm=off,hv_time,hv_vendor_id=null,hv_relaxed,hv_vapic,hv_spinlocks=0x1fff,pmu=off"
OPTS="$OPTS -smp 16,cores=8,sockets=1,threads=2"
# BUG FIX: was "-m $MEM\G", which expands to a literal backslash ("12\G")
# and is rejected by QEMU; ${MEM}G yields the intended "12G".
OPTS="$OPTS -m ${MEM}G"
OPTS="$OPTS -mem-path /dev/hugepages"
OPTS="$OPTS -mem-prealloc"
OPTS="$OPTS -rtc base=localtime,clock=host"
OPTS="$OPTS -drive if=virtio,format=raw,aio=threads,file=/media/null/scratch/vms/disk-images/windows-10.img"
OPTS="$OPTS -drive if=virtio,format=raw,aio=threads,file=/media/null/scratch/vms/disk-images/scratch.img"
OPTS="$OPTS -drive file=/dev/sdb,format=raw,if=virtio"
OPTS="$OPTS -device vfio-pci,host=00:1a.0" # USB bus 1
#OPTS="$OPTS -device vfio-pci,host=83:00.0" # Intel GBE 1
#OPTS="$OPTS -device vfio-pci,host=83:00.1" # Intel GBE 2
OPTS="$OPTS -device vfio-pci,host=05:00.0,multifunction=on,x-vga=on" # GTX 1070 GPU
OPTS="$OPTS -device vfio-pci,host=05:00.1" # GTX 1070 HDMI Audio
# NOTE(review): the legacy -net nic/-net tap pair routes packets through a
# QEMU-internal hub on the main loop; the modern
#   -device virtio-net-pci,netdev=n0  +  -netdev tap,id=n0,...,vhost=on
# form moves packet processing into the kernel (vhost-net) and is worth
# testing against the network stalls described above.
OPTS="$OPTS -net nic,model=virtio,macaddr=66:b3:11:1b:4e:98"
OPTS="$OPTS -net tap,ifname=tap0,script=no,downscript=no"
OPTS="$OPTS -monitor tcp:127.0.0.1:2562,server,nowait"
OPTS="$OPTS -display none"
OPTS="$OPTS -vga none"
# Switch every CPU's cpufreq governor to "performance" so cores do not
# downclock while the VM runs. Requires root; harmless no-op per missing file.
# Generalized: iterate over the governor files that actually exist instead of
# hard-coding 32 CPUs, so the function works on any host topology.
function vmcpu-performance {
echo "setting VM CPU cores to performance mode..."
for gov in /sys/devices/system/cpu/cpu[0-9]*/cpufreq/scaling_governor; do
  [ -e "$gov" ] || continue  # unmatched glob stays literal - skip it
  echo performance > "$gov"
done
}
# Release the 1GiB hugepages reserved on NUMA node 1 by shrinking the pool
# back to zero pages. Requires root.
memfree() {
  local pool=/sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages
  printf '0\n' > "$pool"
}
# Launch QEMU inside the qemu-vcpu cpuset: the child shell first writes its
# own PID ($$) into the cpuset's tasks file, then runs qemu with the
# assembled options. $OPTS is deliberately left unquoted inside the sh -c
# string so it word-splits into separate arguments. Blocks until QEMU exits.
run-vm() {
  printf 'running "%s" virtual machine...\n' "$NAME"
  sh -c "echo \$\$ > /sys/fs/cgroup/cpuset/qemu-vcpu/tasks && /usr/bin/qemu-system-x86_64 $OPTS"
}
function kernel-setup {
sysctl kernel.sched_rt_runtime_us=980000
sysctl kernel.sched_latency_ns=1000000
sysctl kernel.sched_min_granularity_ns=500000
sysctl kernel.sched_wakeup_granularity_ns=500000
}
# Create three cgroup-v1 cpusets and migrate all existing tasks off the VM
# CPUs. Layout (matches the -smp topology and $HOST_CPU_THREADS above):
#   system    - CPUs 0-6,16-22, mem node 0: all pre-existing host tasks
#   qemu-virt - CPUs 7,23, mem nodes 0-1:   QEMU emulator/I/O threads
#   qemu-vcpu - CPUs 8-15,24-31, mem node 1: vCPU threads
# Requires root and a cgroup-v1 cpuset hierarchy at /sys/fs/cgroup/cpuset.
function shield-create {
echo "creating system cpuset..."
mkdir /sys/fs/cgroup/cpuset/system
echo 0 > /sys/fs/cgroup/cpuset/system/cpuset.mems
echo 0-6,16-22 > /sys/fs/cgroup/cpuset/system/cpuset.cpus
# Move every task out of the root cpuset into "system"; xargs runs one echo
# per pid so each pid arrives as its own write to the tasks file.
cat /sys/fs/cgroup/cpuset/tasks | xargs -n1 -i echo {} > /sys/fs/cgroup/cpuset/system/tasks
# NOTE(review): this writes a bare newline to the root tasks file - looks
# like a no-op; confirm what it was intended to do.
echo > /sys/fs/cgroup/cpuset/tasks
echo "creating qemu-virt cpuset..."
mkdir /sys/fs/cgroup/cpuset/qemu-virt
echo 0-1 > /sys/fs/cgroup/cpuset/qemu-virt/cpuset.mems
echo 7,23 > /sys/fs/cgroup/cpuset/qemu-virt/cpuset.cpus
echo "creating qemu-vcpu cpuset..."
mkdir /sys/fs/cgroup/cpuset/qemu-vcpu
echo 1 > /sys/fs/cgroup/cpuset/qemu-vcpu/cpuset.mems
echo 8-15,24-31 > /sys/fs/cgroup/cpuset/qemu-vcpu/cpuset.cpus
# memory_migrate=1: move a task's pages onto the cpuset's mems when it joins.
echo 1 > /sys/fs/cgroup/cpuset/system/cpuset.memory_migrate
echo 1 > /sys/fs/cgroup/cpuset/qemu-virt/cpuset.memory_migrate
#echo 1 > /sys/fs/cgroup/cpuset/qemu-vcpu/cpuset.mem_exclusive
echo 1 > /sys/fs/cgroup/cpuset/qemu-vcpu/cpuset.memory_migrate
}
# Tear down the three cpusets created by shield-create: first return each
# cpuset's tasks to the root cpuset (one pid per write), then remove the
# now-empty directory (rmdir fails while a cpuset still has tasks).
function shield-destroy {
echo "removing system cpuset and returning threads to root cpuset..."
cat /sys/fs/cgroup/cpuset/system/tasks | xargs -n1 -i echo {} > /sys/fs/cgroup/cpuset/tasks
rmdir /sys/fs/cgroup/cpuset/system
echo "removing qemu-virt cpuset and returning threads to root cpuset..."
cat /sys/fs/cgroup/cpuset/qemu-virt/tasks | xargs -n1 -i echo {} > /sys/fs/cgroup/cpuset/tasks
rmdir /sys/fs/cgroup/cpuset/qemu-virt
echo "removing qemu-vcpu cpuset and returning threads to root cpuset..."
cat /sys/fs/cgroup/cpuset/qemu-vcpu/tasks | xargs -n1 -i echo {} > /sys/fs/cgroup/cpuset/tasks
rmdir /sys/fs/cgroup/cpuset/qemu-vcpu
}
# After giving the VM 20s to start, move QEMU's threads into the cpusets:
# the main process goes to qemu-virt, and each vCPU thread is moved into
# qemu-vcpu and pinned 1:1 to the corresponding host thread listed in
# $HOST_CPU_THREADS. Intended to run in the background alongside run-vm.
function shield-threads {
sleep 20 &&
echo "shielding vm cores from host processes..."
# Main QEMU PID: the pstree line matching the VM name, second comma field.
echo $(pstree -pa $(pidof qemu-system-x86_64) | grep $NAME | awk -F',' '{print $2}' | awk '{print $1}') > /sys/fs/cgroup/cpuset/qemu-virt/tasks ;
HOST_THREAD=0
# Collect the TIDs of the vCPU threads (pstree lines containing "CPU").
# NOTE(review): the first "grep CPU" pipes into a second pstree invocation,
# which ignores stdin - presumably a copy/paste artifact, so only the second
# pstree's output is used. Verify this actually selects the intended TIDs.
for PID in $(pstree -pa $(pstree -pa $(pidof qemu-system-x86_64) | grep $NAME | awk -F',' '{print $2}' | awk '{print $1}') | grep CPU | pstree -pa $(pstree -pa $(pidof qemu-system-x86_64) | grep $NAME | cut -d',' -f2 | cut -d' ' -f1) | grep CPU | sort | awk -F',' '{print $2}')
do let HOST_THREAD+=1
# Move the vCPU thread into the qemu-vcpu cpuset, then pin it to the
# HOST_THREAD-th entry of $HOST_CPU_THREADS.
echo $PID > /sys/fs/cgroup/cpuset/qemu-vcpu/tasks
echo "taskset -pc $(echo $HOST_CPU_THREADS | cut -d',' -f$HOST_THREAD) $PID" | bash
done
}
# After giving the VM 40s to start, set SCHED_FIFO priority 99 on every
# thread of the QEMU process for this VM. Intended to run in the background.
# NOTE(review): FIFO 99 outranks most kernel threads; combined with dedicated
# core pinning this can starve host housekeeping - worth ruling out as a
# contributor to the stalls being debugged.
function set-vm-realtime {
sleep 40 &&
echo "making virtual machine threads realtime..."
# Main QEMU PID for this VM (pstree line matching $NAME, second comma field).
VPS=$(pstree -pa $(pidof qemu-system-x86_64) | grep $NAME | cut -d',' -f2 | cut -d' ' -f1)
# Emit one "chrt -f -p 99 <tid>" command per thread and execute them.
pstree -pa $VPS | cut -d',' -f2 | cut -d' ' -f1 | xargs -L1 echo "chrt -f -p 99" | bash
}
#-------------------------------------------------------------------------------------------------------
# Main sequence: tune host, start background helpers, run the VM (blocking),
# then tear everything down once the guest powers off.
vmcpu-performance
kernel-setup
# NOTE(review): init-pci is not defined anywhere in this script - presumably
# dropped during simplification (likely vfio binding); confirm it exists in
# the full version or this line fails with "command not found".
init-pci
# Backgrounded: sleeps 40s, then makes the running QEMU's threads realtime.
set-vm-realtime &
shield-create
# Backgrounded: sleeps 20s, then moves/pins QEMU threads into the cpusets.
shield-threads &
# Blocks here until the guest shuts down.
run-vm
shield-destroy
memfree
exit 0