Hi everyone!
I am trying to set up a server with 2 VMs for doing graphics test workloads. One of the VMs has an AMD RX6800, the other one an Nvida RTX3080. The system has 64 core epyc CPU with about 80GB or working RAM (turns out the RAM was not officially supported so we only got 80 of the 128GB working. For the host OS I am using Ubuntu Server 20.04. I wasn’t sure what the best host OS was, but I found a guide of someone doing GPU passthrough with Ubuntu.
So far I have managed to get the VM with the AMD GPU set up all correctly and it’s working wonderfully. The only real issue I had was that I had to manually install the graphics driver via a .inf file.
Unfortunately, I have had less luck with the other VM. I first had blue screens, but fixed those by specifying the vfio driver in the grub bootloader for both the GPUs in the system (also including the audio PCI device). After that I am managed to get the GPU working with the latest driver (511.79) installed. I managed to run vkcube and vulkaninfo without issues. However, at some point the GPU will stop working and gives me an error 43 in the device manager. After rebooting the VM it will sometimes work again for a while, but it eventually errors.
From what I understand, error 43 was Nvidia blocking GPU passthrough on geforce by detecting if it was running in a VM or not. But that was unlocked almost a year ago in the driver as a ‘beta’ feature.
When googling around I can only find articles working around the error 43 by hiding the fact that it is running in a VM, but I don’t think that is related to my issue. I tried some of the workarounds, but they don’t seem to make a difference.
I was wondering, since there are a lot of people going GPU passthrough here, if anyone could help me debug this issue. I am not sure where I can even find any error information other than error 43 in windows.
Any help would be greatly appreciated.
Thanks!
PS For reference this is the XML of my NV VM configuration:
<domain type='kvm' id='7' xmlns:qemu='http://libvirt.org/schemas/domain/qemu/1.0'>
<name>gronhaj-win-rtx3080</name>
<uuid>d6835cd8-ed04-4c82-be75-2aa9b18e8598</uuid>
<metadata>
<libosinfo:libosinfo xmlns:libosinfo="http://libosinfo.org/xmlns/libvirt/domain/1.0">
<libosinfo:os id="http://microsoft.com/win/2k19"/>
</libosinfo:libosinfo>
</metadata>
<memory unit='KiB'>33554432</memory>
<currentMemory unit='KiB'>33554432</currentMemory>
<vcpu placement='static'>62</vcpu>
<resource>
<partition>/machine</partition>
</resource>
<os>
<type arch='x86_64' machine='pc-q35-4.2'>hvm</type>
</os>
<features>
<acpi/>
<apic/>
<hyperv>
<relaxed state='on'/>
<vapic state='on'/>
<spinlocks state='on' retries='8191'/>
</hyperv>
</features>
<cpu mode='custom' match='exact' check='full'>
<model fallback='forbid'>EPYC-Rome</model>
<vendor>AMD</vendor>
<topology sockets='1' cores='31' threads='2'/>
<feature policy='require' name='x2apic'/>
<feature policy='require' name='tsc-deadline'/>
<feature policy='require' name='hypervisor'/>
<feature policy='require' name='tsc_adjust'/>
<feature policy='require' name='spec-ctrl'/>
<feature policy='require' name='stibp'/>
<feature policy='require' name='arch-capabilities'/>
<feature policy='require' name='ssbd'/>
<feature policy='require' name='xsaves'/>
<feature policy='require' name='cmp_legacy'/>
<feature policy='require' name='ibrs'/>
<feature policy='require' name='amd-ssbd'/>
<feature policy='require' name='virt-ssbd'/>
<feature policy='require' name='rdctl-no'/>
<feature policy='require' name='skip-l1dfl-vmentry'/>
<feature policy='require' name='mds-no'/>
<feature policy='require' name='pschange-mc-no'/>
</cpu>
<clock offset='localtime'>
<timer name='rtc' tickpolicy='catchup'/>
<timer name='pit' tickpolicy='delay'/>
<timer name='hpet' present='no'/>
<timer name='hypervclock' present='yes'/>
</clock>
<on_poweroff>destroy</on_poweroff>
<on_reboot>restart</on_reboot>
<on_crash>destroy</on_crash>
<pm>
<suspend-to-mem enabled='no'/>
<suspend-to-disk enabled='no'/>
</pm>
<devices>
<emulator>/usr/bin/qemu-system-x86_64</emulator>
<disk type='file' device='cdrom'>
<driver name='qemu' type='raw'/>
<source file='/home/traverse/windows_server_2019_eval.iso' index='3'/>
<backingStore/>
<target dev='sda' bus='sata'/>
<readonly/>
<boot order='1'/>
<alias name='sata0-0-0'/>
<address type='drive' controller='0' bus='0' target='0' unit='0'/>
</disk>
<disk type='file' device='cdrom'>
<driver name='qemu' type='raw'/>
<source file='/home/traverse/virtio-win-0.1.215.iso' index='2'/>
<backingStore/>
<target dev='sdb' bus='sata'/>
<readonly/>
<alias name='sata0-0-1'/>
<address type='drive' controller='0' bus='0' target='0' unit='1'/>
</disk>
<disk type='file' device='disk'>
<driver name='qemu' type='raw'/>
<source file='/home/traverse/vm-disks/gronhaj-win-rtx3090.qcow2' index='1'/>
<backingStore/>
<target dev='vdb' bus='virtio'/>
<alias name='virtio-disk1'/>
<address type='pci' domain='0x0000' bus='0x03' slot='0x00' function='0x0'/>
</disk>
<controller type='usb' index='0' model='qemu-xhci' ports='15'>
<alias name='usb'/>
<address type='pci' domain='0x0000' bus='0x02' slot='0x00' function='0x0'/>
</controller>
<controller type='sata' index='0'>
<alias name='ide'/>
<address type='pci' domain='0x0000' bus='0x00' slot='0x1f' function='0x2'/>
</controller>
<controller type='pci' index='0' model='pcie-root'>
<alias name='pcie.0'/>
</controller>
<controller type='pci' index='1' model='pcie-root-port'>
<model name='pcie-root-port'/>
<target chassis='1' port='0x10'/>
<alias name='pci.1'/>
<address type='pci' domain='0x0000' bus='0x00' slot='0x02' function='0x0' multifunction='on'/>
</controller>
<controller type='pci' index='2' model='pcie-root-port'>
<model name='pcie-root-port'/>
<target chassis='2' port='0x11'/>
<alias name='pci.2'/>
<address type='pci' domain='0x0000' bus='0x00' slot='0x02' function='0x1'/>
</controller>
<controller type='pci' index='3' model='pcie-root-port'>
<model name='pcie-root-port'/>
<target chassis='3' port='0x12'/>
<alias name='pci.3'/>
<address type='pci' domain='0x0000' bus='0x00' slot='0x02' function='0x2'/>
</controller>
<controller type='pci' index='4' model='pcie-root-port'>
<model name='pcie-root-port'/>
<target chassis='4' port='0x13'/>
<alias name='pci.4'/>
<address type='pci' domain='0x0000' bus='0x00' slot='0x02' function='0x3'/>
</controller>
<controller type='pci' index='5' model='pcie-root-port'>
<model name='pcie-root-port'/>
<target chassis='5' port='0x14'/>
<alias name='pci.5'/>
<address type='pci' domain='0x0000' bus='0x00' slot='0x02' function='0x4'/>
</controller>
<interface type='direct'>
<mac address='52:54:00:5c:da:cf'/>
<source dev='bond0' mode='vepa'/>
<target dev='macvtap1'/>
<model type='e1000e'/>
<alias name='net0'/>
<address type='pci' domain='0x0000' bus='0x01' slot='0x00' function='0x0'/>
</interface>
<serial type='pty'>
<source path='/dev/pts/2'/>
<target type='isa-serial' port='0'>
<model name='isa-serial'/>
</target>
<alias name='serial0'/>
</serial>
<console type='pty' tty='/dev/pts/2'>
<source path='/dev/pts/2'/>
<target type='serial' port='0'/>
<alias name='serial0'/>
</console>
<input type='tablet' bus='usb'>
<alias name='input0'/>
<address type='usb' bus='0' port='1'/>
</input>
<input type='mouse' bus='ps2'>
<alias name='input1'/>
</input>
<input type='keyboard' bus='ps2'>
<alias name='input2'/>
</input>
<graphics type='vnc' port='5900' autoport='yes' listen='0.0.0.0'>
<listen type='address' address='0.0.0.0'/>
</graphics>
<video>
<model type='qxl' ram='65536' vram='65536' vgamem='16384' heads='1' primary='yes'/>
<alias name='video0'/>
<address type='pci' domain='0x0000' bus='0x00' slot='0x01' function='0x0'/>
</video>
<hostdev mode='subsystem' type='pci' managed='yes'>
<driver name='vfio'/>
<source>
<address domain='0x0000' bus='0x81' slot='0x00' function='0x0'/>
</source>
<alias name='hostdev0'/>
<address type='pci' domain='0x0000' bus='0x05' slot='0x00' function='0x0' multifunction='on'/>
</hostdev>
<hostdev mode='subsystem' type='pci' managed='yes'>
<driver name='vfio'/>
<source>
<address domain='0x0000' bus='0x81' slot='0x00' function='0x1'/>
</source>
<alias name='hostdev1'/>
<address type='pci' domain='0x0000' bus='0x05' slot='0x00' function='0x1'/>
</hostdev>
<memballoon model='virtio'>
<alias name='balloon0'/>
<address type='pci' domain='0x0000' bus='0x04' slot='0x00' function='0x0'/>
</memballoon>
</devices>
<seclabel type='dynamic' model='apparmor' relabel='yes'>
<label>libvirt-d6835cd8-ed04-4c82-be75-2aa9b18e8598</label>
<imagelabel>libvirt-d6835cd8-ed04-4c82-be75-2aa9b18e8598</imagelabel>
</seclabel>
<seclabel type='dynamic' model='dac' relabel='yes'>
<label>+64055:+108</label>
<imagelabel>+64055:+108</imagelabel>
</seclabel>
<qemu:commandline>
<qemu:arg value='-cpu'/>
<qemu:arg value='host,hv_time,kvm=off,hv_vendor_id=null'/>
<qemu:arg value='-machine'/>
<qemu:arg value='q35,kernel_irqchip=on'/>
</qemu:commandline>
</domain>