From 87ecf6eb717f4b5939fc0f7172516b3ddede8fe1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= Date: Wed, 8 Jan 2025 05:25:21 +0100 Subject: [PATCH 1/6] Revert "Mark the version as 4.19.1.0" This reverts commit 346e0bcfbffae725903369ca0821a28d87228c22. --- .qubesbuilder | 2 +- archlinux/PKGBUILD.in | 4 ++-- xen.spec.in | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.qubesbuilder b/.qubesbuilder index 36817db..dc441cf 100644 --- a/.qubesbuilder +++ b/.qubesbuilder @@ -13,6 +13,6 @@ source: # - git-url: https://gitlab.com/xen-project/xen.git - git-url: https://xenbits.xenproject.org/git-http/xen.git tag: RELEASE-@VERSION@ - git-basename: xen-@VERSION@.0 + git-basename: xen-@VERSION@ pubkeys: - xen.org-key.asc diff --git a/archlinux/PKGBUILD.in b/archlinux/PKGBUILD.in index c5cf0b9..ed1e286 100644 --- a/archlinux/PKGBUILD.in +++ b/archlinux/PKGBUILD.in @@ -2,7 +2,7 @@ _upstream_pkgver=@VERSION@ pkgname=qubes-vm-xen -pkgbase="xen-$_upstream_pkgver.0" +pkgbase="xen-$_upstream_pkgver" pkgver=${_upstream_pkgver/-/\~} pkgrel=@REL@ pkgdesc="Xen is a virtual machine monitor" @@ -23,7 +23,7 @@ _patches=( 1102-docs-xen-headers-use-alphabetical-sorting-for-incont.patch 1103-Strip-build-path-directories-in-tools-xen-and-xen-ar.patch ) -source=(xen-$_upstream_pkgver.0.tar.gz "${_patches[@]}") +source=(xen-$_upstream_pkgver.tar.gz "${_patches[@]}") md5sums=(SKIP SKIP SKIP SKIP SKIP SKIP SKIP SKIP) prepare() { diff --git a/xen.spec.in b/xen.spec.in index ad073f8..0d4e672 100644 --- a/xen.spec.in +++ b/xen.spec.in @@ -72,7 +72,7 @@ Release: %{?rctag}@REL@%{?dist} Epoch: 2001 License: GPLv2+ and LGPLv2+ and BSD URL: http://xen.org/ -Source0: https://downloads.xenproject.org/release/xen/%{upstream_version}/xen-%{upstream_version}.0.tar.gz +Source0: https://downloads.xenproject.org/release/xen/%{upstream_version}/xen-%{upstream_version}.tar.gz Source2: %{name}.logrotate # .config file for xen hypervisor Source3: config @@ -377,7 +377,7 @@ manage Xen virtual machines. %endif %prep -%autosetup -p1 -n %{name}-%{upstream_version}.0 +%autosetup -p1 -n %{name}-%{upstream_version} # copy xen hypervisor .config file to change settings cp -v %{SOURCE3} xen/.config From 3dbaa67c3169876ce44162c283e5beba41021e0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= Date: Wed, 8 Jan 2025 15:46:42 +0100 Subject: [PATCH 2/6] Update to Xen 4.20-unstable Take the current staging commit 25492368ea429fe3357748660c72456b9ba16528. Adjust patches, and drop those already included upstream. This invents 4.20.0-rc0 version, just to make the build scripts happy. 
--- .qubesbuilder | 3 +- 0203-xen.efi.build.patch | 6 +- ...xen-list-add-LIST_HEAD_RO_AFTER_INIT.patch | 34 -- ...or-marking-only-part-of-a-MMIO-page-.patch | 516 ------------------ ...-sub-page-ro-API-to-make-just-xhci-d.patch | 90 --- ...pport-for-SRSO_U-S_NO-and-SRSO_MSR_F.patch | 350 ------------ ...ent-early-exit-from-i8259-loop-detec.patch | 83 --- ...-t-use-EFI-s-GetTime-call-by-default.patch | 34 +- ...se-ACPI-for-CPUs-without-hardcoded-C.patch | 15 +- ...e-LZMA_BLOCK_SIZE-for-uncompressing-.patch | 62 --- ...ates-time-based-on-SOURCE_DATE_EPOCH.patch | 2 +- version | 2 +- xen.spec.in | 7 - 13 files changed, 31 insertions(+), 1173 deletions(-) delete mode 100644 0300-xen-list-add-LIST_HEAD_RO_AFTER_INIT.patch delete mode 100644 0301-x86-mm-add-API-for-marking-only-part-of-a-MMIO-page-.patch delete mode 100644 0302-drivers-char-Use-sub-page-ro-API-to-make-just-xhci-d.patch delete mode 100644 0305-x86-spec-ctrl-Support-for-SRSO_U-S_NO-and-SRSO_MSR_F.patch delete mode 100644 0306-x86-io-apic-prevent-early-exit-from-i8259-loop-detec.patch delete mode 100644 0630-tools-xg-increase-LZMA_BLOCK_SIZE-for-uncompressing-.patch diff --git a/.qubesbuilder b/.qubesbuilder index dc441cf..c38f532 100644 --- a/.qubesbuilder +++ b/.qubesbuilder @@ -12,7 +12,8 @@ source: # signature: https://downloads.xenproject.org/release/xen/@VERSION@/xen-@VERSION@.tar.gz.sig # - git-url: https://gitlab.com/xen-project/xen.git - git-url: https://xenbits.xenproject.org/git-http/xen.git - tag: RELEASE-@VERSION@ +# tag: RELEASE-@VERSION@ git-basename: xen-@VERSION@ + commit-id: 25492368ea429fe3357748660c72456b9ba16528 pubkeys: - xen.org-key.asc diff --git a/0203-xen.efi.build.patch b/0203-xen.efi.build.patch index 718e26b..76cc398 100644 --- a/0203-xen.efi.build.patch +++ b/0203-xen.efi.build.patch @@ -15,10 +15,10 @@ index 4f6c086988dd..0efc664bc919 100644 -c $(srctree)/$(efi-check).c -o $(efi-check).o,y) # Check if the linker supports PE. --EFI_LDFLAGS := $(patsubst -m%,-mi386pep,$(LDFLAGS)) --subsystem=10 -+#EFI_LDFLAGS := $(patsubst -m%,-mi386pep,$(LDFLAGS)) --subsystem=10 +-EFI_LDFLAGS := $(patsubst -m%,-mi386pep,$(LDFLAGS)) --subsystem=10 --enable-long-section-names ++#EFI_LDFLAGS := $(patsubst -m%,-mi386pep,$(LDFLAGS)) --subsystem=10 --enable-long-section-names +# use a reduced set of options from LDFLAGS -+EFI_LDFLAGS = --as-needed --build-id=sha1 -mi386pep --subsystem=10 ++EFI_LDFLAGS = --as-needed --build-id=sha1 -mi386pep --subsystem=10 --enable-long-section-names LD_PE_check_cmd = $(call ld-option,$(EFI_LDFLAGS) --image-base=0x100000000 -o $(efi-check).efi $(efi-check).o) XEN_BUILD_PE := $(LD_PE_check_cmd) diff --git a/0300-xen-list-add-LIST_HEAD_RO_AFTER_INIT.patch b/0300-xen-list-add-LIST_HEAD_RO_AFTER_INIT.patch deleted file mode 100644 index 5f26079..0000000 --- a/0300-xen-list-add-LIST_HEAD_RO_AFTER_INIT.patch +++ /dev/null @@ -1,34 +0,0 @@ -From 2e1e0504cc52901689d15517459163b4159c8110 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= - -Date: Tue, 23 Jul 2024 13:59:12 +0200 -Subject: [PATCH] xen/list: add LIST_HEAD_RO_AFTER_INIT -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Similar to LIST_HEAD_READ_MOSTLY. 
- -Signed-off-by: Marek Marczykowski-Górecki -Acked-by: Jan Beulich ---- - xen/include/xen/list.h | 3 +++ - 1 file changed, 3 insertions(+) - -diff --git a/xen/include/xen/list.h b/xen/include/xen/list.h -index 6506ac40893b..62169f46742e 100644 ---- a/xen/include/xen/list.h -+++ b/xen/include/xen/list.h -@@ -42,6 +42,9 @@ struct list_head { - #define LIST_HEAD_READ_MOSTLY(name) \ - struct list_head __read_mostly name = LIST_HEAD_INIT(name) - -+#define LIST_HEAD_RO_AFTER_INIT(name) \ -+ struct list_head __ro_after_init name = LIST_HEAD_INIT(name) -+ - static inline void INIT_LIST_HEAD(struct list_head *list) - { - list->next = list; --- -2.46.0 - diff --git a/0301-x86-mm-add-API-for-marking-only-part-of-a-MMIO-page-.patch b/0301-x86-mm-add-API-for-marking-only-part-of-a-MMIO-page-.patch deleted file mode 100644 index 282d5cf..0000000 --- a/0301-x86-mm-add-API-for-marking-only-part-of-a-MMIO-page-.patch +++ /dev/null @@ -1,516 +0,0 @@ -From 7439bbc83314e4d24a82afca4f6dbf1a1d002141 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= - -Date: Mon, 20 Mar 2023 21:19:25 +0100 -Subject: [PATCH] x86/mm: add API for marking only part of a MMIO page - read only -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -In some cases, only few registers on a page needs to be write-protected. -Examples include USB3 console (64 bytes worth of registers) or MSI-X's -PBA table (which doesn't need to span the whole table either), although -in the latter case the spec forbids placing other registers on the same -page. Current API allows only marking whole pages pages read-only, -which sometimes may cover other registers that guest may need to -write into. - -Currently, when a guest tries to write to an MMIO page on the -mmio_ro_ranges, it's either immediately crashed on EPT violation - if -that's HVM, or if PV, it gets #PF. In case of Linux PV, if access was -from userspace (like, /dev/mem), it will try to fixup by updating page -tables (that Xen again will force to read-only) and will hit that #PF -again (looping endlessly). Both behaviors are undesirable if guest could -actually be allowed the write. - -Introduce an API that allows marking part of a page read-only. Since -sub-page permissions are not a thing in page tables (they are in EPT, -but not granular enough), do this via emulation (or simply page fault -handler for PV) that handles writes that are supposed to be allowed. -The new subpage_mmio_ro_add() takes a start physical address and the -region size in bytes. Both start address and the size need to be 8-byte -aligned, as a practical simplification (allows using smaller bitmask, -and a smaller granularity isn't really necessary right now). -It will internally add relevant pages to mmio_ro_ranges, but if either -start or end address is not page-aligned, it additionally adds that page -to a list for sub-page R/O handling. The list holds a bitmask which -qwords are supposed to be read-only and an address where page is mapped -for write emulation - this mapping is done only on the first access. A -plain list is used instead of more efficient structure, because there -isn't supposed to be many pages needing this precise r/o control. - -The mechanism this API is plugged in is slightly different for PV and -HVM. For both paths, it's plugged into mmio_ro_emulated_write(). For PV, -it's already called for #PF on read-only MMIO page. 
For HVM however, EPT -violation on p2m_mmio_direct page results in a direct domain_crash() for -non hardware domains. To reach mmio_ro_emulated_write(), change how -write violations for p2m_mmio_direct are handled - specifically, check -if they relate to such partially protected page via -subpage_mmio_write_accept() and if so, call hvm_emulate_one_mmio() for -them too. This decodes what guest is trying write and finally calls -mmio_ro_emulated_write(). The EPT write violation is detected as -npfec.write_access and npfec.present both being true (similar to other -places), which may cover some other (future?) cases - if that happens, -emulator might get involved unnecessarily, but since it's limited to -pages marked with subpage_mmio_ro_add() only, the impact is minimal. -Both of those paths need an MFN to which guest tried to write (to check -which part of the page is supposed to be read-only, and where -the page is mapped for writes). This information currently isn't -available directly in mmio_ro_emulated_write(), but in both cases it is -already resolved somewhere higher in the call tree. Pass it down to -mmio_ro_emulated_write() via new mmio_ro_emulate_ctxt.mfn field. - -This may give a bit more access to the instruction emulator to HVM -guests (the change in hvm_hap_nested_page_fault()), but only for pages -explicitly marked with subpage_mmio_ro_add() - so, if the guest has a -passed through a device partially used by Xen. -As of the next patch, it applies only configuration explicitly -documented as not security supported. - -The subpage_mmio_ro_add() function cannot be called with overlapping -ranges, and on pages already added to mmio_ro_ranges separately. -Successful calls would result in correct handling, but error paths may -result in incorrect state (like pages removed from mmio_ro_ranges too -early). Debug build has asserts for relevant cases. - -Signed-off-by: Marek Marczykowski-Górecki ---- -Shadow mode is not tested, but I don't expect it to work differently than -HAP in areas related to this patch. 
- -Changes in v7: -- refuse misaligned start in release build too, to have release build - running what was tested in debug build -- simplify return from subpage_mmio_ro_add_page -Changes in v6: -- fix return type of subpage_mmio_find_page() -- change 'iter' pointer to 'new_entry' bool and move list_add() -- comment why different error handling for unaligned start / size -- code style -Changes in v5: -- use subpage_mmio_find_page helper, simplifying several functions -- use LIST_HEAD_RO_AFTER_INIT -- don't use subpage_ro_lock in __init -- drop #ifdef in mm.h -- return error on unaligned size in subpage_mmio_ro_add() instead of - extending the size (in release build) -Changes in v4: -- rename SUBPAGE_MMIO_RO_ALIGN to MMIO_RO_SUBPAGE_GRAN -- guard subpage_mmio_write_accept with CONFIG_HVM, as it's used only - there -- rename ro_qwords to ro_elems -- use unsigned arguments for subpage_mmio_ro_remove_page() -- use volatile for __iomem -- do not set mmio_ro_ctxt.mfn for mmcfg case -- comment where fields of mmio_ro_ctxt are used -- use bool for result of __test_and_set_bit -- do not open-code mfn_to_maddr() -- remove leftover RCU -- mention hvm_hap_nested_page_fault() explicitly in the commit message -Changes in v3: -- use unsigned int for loop iterators -- use __set_bit/__clear_bit when under spinlock -- avoid ioremap() under spinlock -- do not cast away const -- handle unaligned parameters in release build -- comment fixes -- remove RCU - the add functions are __init and actual usage is only - much later after domains are running -- add checks overlapping ranges in debug build and document the - limitations -- change subpage_mmio_ro_add() so the error path doesn't potentially - remove pages from mmio_ro_ranges -- move printing message to avoid one goto in - subpage_mmio_write_emulate() -Changes in v2: -- Simplify subpage_mmio_ro_add() parameters -- add to mmio_ro_ranges from within subpage_mmio_ro_add() -- use ioremap() instead of caller-provided fixmap -- use 8-bytes granularity (largest supported single write) and a bitmap - instead of a rangeset -- clarify commit message -- change how it's plugged in for HVM domain, to not change the behavior for - read-only parts (keep it hitting domain_crash(), instead of ignoring - write) -- remove unused subpage_mmio_ro_remove() ---- - xen/arch/x86/hvm/emulate.c | 2 +- - xen/arch/x86/hvm/hvm.c | 4 +- - xen/arch/x86/include/asm/mm.h | 23 +++ - xen/arch/x86/mm.c | 261 ++++++++++++++++++++++++++++++++ - xen/arch/x86/pv/ro-page-fault.c | 6 +- - 5 files changed, 291 insertions(+), 5 deletions(-) - -diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c -index 02e378365b40..7253a87032dd 100644 ---- a/xen/arch/x86/hvm/emulate.c -+++ b/xen/arch/x86/hvm/emulate.c -@@ -2734,7 +2734,7 @@ int hvm_emulate_one_mmio(unsigned long mfn, unsigned long gla) - .write = mmio_ro_emulated_write, - .validate = hvmemul_validate, - }; -- struct mmio_ro_emulate_ctxt mmio_ro_ctxt = { .cr2 = gla }; -+ struct mmio_ro_emulate_ctxt mmio_ro_ctxt = { .cr2 = gla, .mfn = _mfn(mfn) }; - struct hvm_emulate_ctxt ctxt; - const struct x86_emulate_ops *ops; - unsigned int seg, bdf; -diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c -index 7f4b627b1f5f..a108870558bf 100644 ---- a/xen/arch/x86/hvm/hvm.c -+++ b/xen/arch/x86/hvm/hvm.c -@@ -2016,8 +2016,8 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla, - goto out_put_gfn; - } - -- if ( (p2mt == p2m_mmio_direct) && is_hardware_domain(currd) && -- npfec.write_access && npfec.present && -+ if ( (p2mt == 
p2m_mmio_direct) && npfec.write_access && npfec.present && -+ (is_hardware_domain(currd) || subpage_mmio_write_accept(mfn, gla)) && - (hvm_emulate_one_mmio(mfn_x(mfn), gla) == X86EMUL_OKAY) ) - { - rc = 1; -diff --git a/xen/arch/x86/include/asm/mm.h b/xen/arch/x86/include/asm/mm.h -index 98b66edaca5e..a457f0d2b1b3 100644 ---- a/xen/arch/x86/include/asm/mm.h -+++ b/xen/arch/x86/include/asm/mm.h -@@ -522,9 +522,32 @@ extern struct rangeset *mmio_ro_ranges; - void memguard_guard_stack(void *p); - void memguard_unguard_stack(void *p); - -+/* -+ * Add more precise r/o marking for a MMIO page. Range specified here -+ * will still be R/O, but the rest of the page (not marked as R/O via another -+ * call) will have writes passed through. -+ * The start address and the size must be aligned to MMIO_RO_SUBPAGE_GRAN. -+ * -+ * This API cannot be used for overlapping ranges, nor for pages already added -+ * to mmio_ro_ranges separately. -+ * -+ * Since there is currently no subpage_mmio_ro_remove(), relevant device should -+ * not be hot-unplugged. -+ * -+ * Return values: -+ * - negative: error -+ * - 0: success -+ */ -+#define MMIO_RO_SUBPAGE_GRAN 8 -+int subpage_mmio_ro_add(paddr_t start, size_t size); -+bool subpage_mmio_write_accept(mfn_t mfn, unsigned long gla); -+ - struct mmio_ro_emulate_ctxt { - unsigned long cr2; -+ /* Used only for mmcfg case */ - unsigned int seg, bdf; -+ /* Used only for non-mmcfg case */ -+ mfn_t mfn; - }; - - int cf_check mmio_ro_emulated_write( -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index 95795567f2a5..cfd487d06474 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -152,6 +152,18 @@ struct rangeset *__read_mostly mmio_ro_ranges; - struct rangeset *__read_mostly mmio_ro_ranges; - - static uint32_t base_disallow_mask; -+ -+/* Handling sub-page read-only MMIO regions */ -+struct subpage_ro_range { -+ struct list_head list; -+ mfn_t mfn; -+ void __iomem *mapped; -+ DECLARE_BITMAP(ro_elems, PAGE_SIZE / MMIO_RO_SUBPAGE_GRAN); -+}; -+ -+static LIST_HEAD_RO_AFTER_INIT(subpage_ro_ranges); -+static DEFINE_SPINLOCK(subpage_ro_lock); -+ - /* Global bit is allowed to be set on L1 PTEs. Intended for user mappings. */ - #define L1_DISALLOW_MASK ((base_disallow_mask | _PAGE_GNTTAB) & ~_PAGE_GLOBAL) - -@@ -4912,6 +4923,253 @@ long arch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg) - return rc; - } - -+static struct subpage_ro_range *subpage_mmio_find_page(mfn_t mfn) -+{ -+ struct subpage_ro_range *entry; -+ -+ list_for_each_entry(entry, &subpage_ro_ranges, list) -+ if ( mfn_eq(entry->mfn, mfn) ) -+ return entry; -+ -+ return NULL; -+} -+ -+/* -+ * Mark part of the page as R/O. 
-+ * Returns: -+ * - 0 on success - first range in the page -+ * - 1 on success - subsequent range in the page -+ * - <0 on error -+ */ -+static int __init subpage_mmio_ro_add_page( -+ mfn_t mfn, -+ unsigned int offset_s, -+ unsigned int offset_e) -+{ -+ struct subpage_ro_range *entry = NULL; -+ bool new_entry = false; -+ unsigned int i; -+ -+ entry = subpage_mmio_find_page(mfn); -+ if ( !entry ) -+ { -+ entry = xzalloc(struct subpage_ro_range); -+ if ( !entry ) -+ return -ENOMEM; -+ entry->mfn = mfn; -+ list_add(&entry->list, &subpage_ro_ranges); -+ new_entry = true; -+ } -+ -+ for ( i = offset_s; i <= offset_e; i += MMIO_RO_SUBPAGE_GRAN ) -+ { -+ bool oldbit = __test_and_set_bit(i / MMIO_RO_SUBPAGE_GRAN, -+ entry->ro_elems); -+ ASSERT(!oldbit); -+ } -+ -+ return !new_entry; -+} -+ -+static void __init subpage_mmio_ro_remove_page( -+ mfn_t mfn, -+ unsigned int offset_s, -+ unsigned int offset_e) -+{ -+ struct subpage_ro_range *entry = NULL; -+ unsigned int i; -+ -+ entry = subpage_mmio_find_page(mfn); -+ if ( !entry ) -+ return; -+ -+ for ( i = offset_s; i <= offset_e; i += MMIO_RO_SUBPAGE_GRAN ) -+ __clear_bit(i / MMIO_RO_SUBPAGE_GRAN, entry->ro_elems); -+ -+ if ( !bitmap_empty(entry->ro_elems, PAGE_SIZE / MMIO_RO_SUBPAGE_GRAN) ) -+ return; -+ -+ list_del(&entry->list); -+ if ( entry->mapped ) -+ iounmap(entry->mapped); -+ xfree(entry); -+} -+ -+int __init subpage_mmio_ro_add( -+ paddr_t start, -+ size_t size) -+{ -+ mfn_t mfn_start = maddr_to_mfn(start); -+ paddr_t end = start + size - 1; -+ mfn_t mfn_end = maddr_to_mfn(end); -+ unsigned int offset_end = 0; -+ int rc; -+ bool subpage_start, subpage_end; -+ -+ ASSERT(IS_ALIGNED(start, MMIO_RO_SUBPAGE_GRAN)); -+ ASSERT(IS_ALIGNED(size, MMIO_RO_SUBPAGE_GRAN)); -+ if ( !IS_ALIGNED(start, MMIO_RO_SUBPAGE_GRAN) || -+ !IS_ALIGNED(size, MMIO_RO_SUBPAGE_GRAN) ) -+ return -EINVAL; -+ -+ if ( !size ) -+ return 0; -+ -+ if ( mfn_eq(mfn_start, mfn_end) ) -+ { -+ /* Both starting and ending parts handled at once */ -+ subpage_start = PAGE_OFFSET(start) || PAGE_OFFSET(end) != PAGE_SIZE - 1; -+ subpage_end = false; -+ } -+ else -+ { -+ subpage_start = PAGE_OFFSET(start); -+ subpage_end = PAGE_OFFSET(end) != PAGE_SIZE - 1; -+ } -+ -+ if ( subpage_start ) -+ { -+ offset_end = mfn_eq(mfn_start, mfn_end) ? 
-+ PAGE_OFFSET(end) : -+ (PAGE_SIZE - 1); -+ rc = subpage_mmio_ro_add_page(mfn_start, -+ PAGE_OFFSET(start), -+ offset_end); -+ if ( rc < 0 ) -+ goto err_unlock; -+ /* Check if not marking R/W part of a page intended to be fully R/O */ -+ ASSERT(rc || !rangeset_contains_singleton(mmio_ro_ranges, -+ mfn_x(mfn_start))); -+ } -+ -+ if ( subpage_end ) -+ { -+ rc = subpage_mmio_ro_add_page(mfn_end, 0, PAGE_OFFSET(end)); -+ if ( rc < 0 ) -+ goto err_unlock_remove; -+ /* Check if not marking R/W part of a page intended to be fully R/O */ -+ ASSERT(rc || !rangeset_contains_singleton(mmio_ro_ranges, -+ mfn_x(mfn_end))); -+ } -+ -+ rc = rangeset_add_range(mmio_ro_ranges, mfn_x(mfn_start), mfn_x(mfn_end)); -+ if ( rc ) -+ goto err_remove; -+ -+ return 0; -+ -+ err_remove: -+ if ( subpage_end ) -+ subpage_mmio_ro_remove_page(mfn_end, 0, PAGE_OFFSET(end)); -+ err_unlock_remove: -+ if ( subpage_start ) -+ subpage_mmio_ro_remove_page(mfn_start, PAGE_OFFSET(start), offset_end); -+ err_unlock: -+ return rc; -+} -+ -+static void __iomem *subpage_mmio_map_page( -+ struct subpage_ro_range *entry) -+{ -+ void __iomem *mapped_page; -+ -+ if ( entry->mapped ) -+ return entry->mapped; -+ -+ mapped_page = ioremap(mfn_to_maddr(entry->mfn), PAGE_SIZE); -+ -+ spin_lock(&subpage_ro_lock); -+ /* Re-check under the lock */ -+ if ( entry->mapped ) -+ { -+ spin_unlock(&subpage_ro_lock); -+ if ( mapped_page ) -+ iounmap(mapped_page); -+ return entry->mapped; -+ } -+ -+ entry->mapped = mapped_page; -+ spin_unlock(&subpage_ro_lock); -+ return entry->mapped; -+} -+ -+static void subpage_mmio_write_emulate( -+ mfn_t mfn, -+ unsigned int offset, -+ const void *data, -+ unsigned int len) -+{ -+ struct subpage_ro_range *entry; -+ volatile void __iomem *addr; -+ -+ entry = subpage_mmio_find_page(mfn); -+ if ( !entry ) -+ /* Do not print message for pages without any writable parts. */ -+ return; -+ -+ if ( test_bit(offset / MMIO_RO_SUBPAGE_GRAN, entry->ro_elems) ) -+ { -+ write_ignored: -+ gprintk(XENLOG_WARNING, -+ "ignoring write to R/O MMIO 0x%"PRI_mfn"%03x len %u\n", -+ mfn_x(mfn), offset, len); -+ return; -+ } -+ -+ addr = subpage_mmio_map_page(entry); -+ if ( !addr ) -+ { -+ gprintk(XENLOG_ERR, -+ "Failed to map page for MMIO write at 0x%"PRI_mfn"%03x\n", -+ mfn_x(mfn), offset); -+ return; -+ } -+ -+ switch ( len ) -+ { -+ case 1: -+ writeb(*(const uint8_t*)data, addr); -+ break; -+ case 2: -+ writew(*(const uint16_t*)data, addr); -+ break; -+ case 4: -+ writel(*(const uint32_t*)data, addr); -+ break; -+ case 8: -+ writeq(*(const uint64_t*)data, addr); -+ break; -+ default: -+ /* mmio_ro_emulated_write() already validated the size */ -+ ASSERT_UNREACHABLE(); -+ goto write_ignored; -+ } -+} -+ -+#ifdef CONFIG_HVM -+bool subpage_mmio_write_accept(mfn_t mfn, unsigned long gla) -+{ -+ unsigned int offset = PAGE_OFFSET(gla); -+ const struct subpage_ro_range *entry; -+ -+ entry = subpage_mmio_find_page(mfn); -+ if ( !entry ) -+ return false; -+ -+ if ( !test_bit(offset / MMIO_RO_SUBPAGE_GRAN, entry->ro_elems) ) -+ { -+ /* -+ * We don't know the write size at this point yet, so it could be -+ * an unaligned write, but accept it here anyway and deal with it -+ * later. 
-+ */ -+ return true; -+ } -+ -+ return false; -+} -+#endif -+ - int cf_check mmio_ro_emulated_write( - enum x86_segment seg, - unsigned long offset, -@@ -4930,6 +5188,9 @@ int cf_check mmio_ro_emulated_write( - return X86EMUL_UNHANDLEABLE; - } - -+ subpage_mmio_write_emulate(mmio_ro_ctxt->mfn, PAGE_OFFSET(offset), -+ p_data, bytes); -+ - return X86EMUL_OKAY; - } - -diff --git a/xen/arch/x86/pv/ro-page-fault.c b/xen/arch/x86/pv/ro-page-fault.c -index cad28ef928ad..2ea1a6ad489c 100644 ---- a/xen/arch/x86/pv/ro-page-fault.c -+++ b/xen/arch/x86/pv/ro-page-fault.c -@@ -333,8 +333,10 @@ static int mmio_ro_do_page_fault(struct x86_emulate_ctxt *ctxt, - ctxt->data = &mmio_ro_ctxt; - if ( pci_ro_mmcfg_decode(mfn_x(mfn), &mmio_ro_ctxt.seg, &mmio_ro_ctxt.bdf) ) - return x86_emulate(ctxt, &mmcfg_intercept_ops); -- else -- return x86_emulate(ctxt, &mmio_ro_emulate_ops); -+ -+ mmio_ro_ctxt.mfn = mfn; -+ -+ return x86_emulate(ctxt, &mmio_ro_emulate_ops); - } - - int pv_ro_page_fault(unsigned long addr, struct cpu_user_regs *regs) --- -2.46.0 - diff --git a/0302-drivers-char-Use-sub-page-ro-API-to-make-just-xhci-d.patch b/0302-drivers-char-Use-sub-page-ro-API-to-make-just-xhci-d.patch deleted file mode 100644 index a37b131..0000000 --- a/0302-drivers-char-Use-sub-page-ro-API-to-make-just-xhci-d.patch +++ /dev/null @@ -1,90 +0,0 @@ -From 278c3f5336a02f6c3235772271e364f9d50c6034 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= - -Date: Fri, 24 Mar 2023 18:24:41 +0100 -Subject: [PATCH] drivers/char: Use sub-page ro API to make just xhci - dbc cap RO -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Not the whole page, which may contain other registers too. The XHCI -specification describes DbC as designed to be controlled by a different -driver, but does not mandate placing registers on a separate page. In fact -on Tiger Lake and newer (at least), this page do contain other registers -that Linux tries to use. And with share=yes, a domU would use them too. -Without this patch, PV dom0 would fail to initialize the controller, -while HVM would be killed on EPT violation. - -With `share=yes`, this patch gives domU more access to the emulator -(although a HVM with any emulated device already has plenty of it). This -configuration is already documented as unsafe with untrusted guests and -not security supported. - -Signed-off-by: Marek Marczykowski-Górecki -Reviewed-by: Jan Beulich ---- -Changes in v4: -- restore mmio_ro_ranges in the fallback case -- set XHCI_SHARE_NONE in the fallback case -Changes in v3: -- indentation fix -- remove stale comment -- fallback to pci_ro_device() if subpage_mmio_ro_add() fails -- extend commit message -Changes in v2: - - adjust for simplified subpage_mmio_ro_add() API ---- - xen/drivers/char/xhci-dbc.c | 36 ++++++++++++++++++++++-------------- - 1 file changed, 22 insertions(+), 14 deletions(-) - -diff --git a/xen/drivers/char/xhci-dbc.c b/xen/drivers/char/xhci-dbc.c -index 8e2037f1a5f7..c45e4b6825cc 100644 ---- a/xen/drivers/char/xhci-dbc.c -+++ b/xen/drivers/char/xhci-dbc.c -@@ -1216,20 +1216,28 @@ static void __init cf_check dbc_uart_init_postirq(struct serial_port *port) - break; - } - #ifdef CONFIG_X86 -- /* -- * This marks the whole page as R/O, which may include other registers -- * unrelated to DbC. Xen needs only DbC area protected, but it seems -- * Linux's XHCI driver (as of 5.18) works without writting to the whole -- * page, so keep it simple. 
-- */ -- if ( rangeset_add_range(mmio_ro_ranges, -- PFN_DOWN((uart->dbc.bar_val & PCI_BASE_ADDRESS_MEM_MASK) + -- uart->dbc.xhc_dbc_offset), -- PFN_UP((uart->dbc.bar_val & PCI_BASE_ADDRESS_MEM_MASK) + -- uart->dbc.xhc_dbc_offset + -- sizeof(*uart->dbc.dbc_reg)) - 1) ) -- printk(XENLOG_INFO -- "Error while adding MMIO range of device to mmio_ro_ranges\n"); -+ if ( subpage_mmio_ro_add( -+ (uart->dbc.bar_val & PCI_BASE_ADDRESS_MEM_MASK) + -+ uart->dbc.xhc_dbc_offset, -+ sizeof(*uart->dbc.dbc_reg)) ) -+ { -+ printk(XENLOG_WARNING -+ "Error while marking MMIO range of XHCI console as R/O, " -+ "making the whole device R/O (share=no)\n"); -+ uart->dbc.share = XHCI_SHARE_NONE; -+ if ( pci_ro_device(0, uart->dbc.sbdf.bus, uart->dbc.sbdf.devfn) ) -+ printk(XENLOG_WARNING -+ "Failed to mark read-only %pp used for XHCI console\n", -+ &uart->dbc.sbdf); -+ if ( rangeset_add_range(mmio_ro_ranges, -+ PFN_DOWN((uart->dbc.bar_val & PCI_BASE_ADDRESS_MEM_MASK) + -+ uart->dbc.xhc_dbc_offset), -+ PFN_UP((uart->dbc.bar_val & PCI_BASE_ADDRESS_MEM_MASK) + -+ uart->dbc.xhc_dbc_offset + -+ sizeof(*uart->dbc.dbc_reg)) - 1) ) -+ printk(XENLOG_INFO -+ "Error while adding MMIO range of device to mmio_ro_ranges\n"); -+ } - #endif - } - --- -2.46.0 - diff --git a/0305-x86-spec-ctrl-Support-for-SRSO_U-S_NO-and-SRSO_MSR_F.patch b/0305-x86-spec-ctrl-Support-for-SRSO_U-S_NO-and-SRSO_MSR_F.patch deleted file mode 100644 index 4e697ee..0000000 --- a/0305-x86-spec-ctrl-Support-for-SRSO_U-S_NO-and-SRSO_MSR_F.patch +++ /dev/null @@ -1,350 +0,0 @@ -From db40e7b40bb68470684d6bef2c0318c448df34d8 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper -Date: Fri, 20 Dec 2024 19:34:24 +0000 -Subject: [PATCH 305/306] x86/spec-ctrl: Support for SRSO_U/S_NO and - SRSO_MSR_FIX -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -AMD have updated the SRSO whitepaper[1] with further information. These -features exist on AMD Zen5 CPUs and are necessary for Xen to use. - -The two features are in principle unrelated: - - * SRSO_U/S_NO is an enumeration saying that SRSO attacks can't cross the - User(CPL3) / Supervisor(CPL<3) boundary. i.e. Xen don't need to use - IBPB-on-entry for PV64. PV32 guests are explicitly unsupported for - speculative issues, and excluded from consideration for simplicity. - - * SRSO_MSR_FIX is an enumeration identifying that the BP_SPEC_REDUCE bit is - available in MSR_BP_CFG. When set, SRSO attacks can't cross the host/guest - boundary. i.e. Xen don't need to use IBPB-on-entry for HVM. - -Extend ibpb_calculations() to account for these when calculating -opt_ibpb_entry_{pv,hvm} defaults. Add a `bp-spec-reduce=` option to -control the use of BP_SPEC_REDUCE, with it active by default. - -Because MSR_BP_CFG is core-scoped with a race condition updating it, repurpose -amd_check_erratum_1485() into amd_check_bp_cfg() and calculate all updates at -once. - -Xen also needs to to advertise SRSO_U/S_NO to guests to allow the guest kernel -to skip SRSO mitigations too: - - * This is trivial for HVM guests. It is also is accurate for PV32 guests - too, but we have already excluded them from consideration, and do so again - here to simplify the policy logic. - - * As written, SRSO_U/S_NO does not help for the PV64 user->kernel boundary. 
- However, after discussing with AMD, an implementation detail of having - BP_SPEC_REDUCE active causes the PV64 user->kernel boundary to have the - property described by SRSO_U/S_NO, so we can advertise SRSO_U/S_NO to - guests when the BP_SPEC_REDUCE precondition is met. - -Finally, fix a typo in the SRSO_NO's comment. - -[1] https://www.amd.com/content/dam/amd/en/documents/corporate/cr/speculative-return-stack-overflow-whitepaper.pdf - -Signed-off-by: Andrew Cooper -Reviewed-by: Roger Pau Monné ---- - docs/misc/xen-command-line.pandoc | 9 +++- - xen/arch/x86/cpu-policy.c | 21 +++++++++ - xen/arch/x86/cpu/amd.c | 29 +++++++++--- - xen/arch/x86/include/asm/msr-index.h | 1 + - xen/arch/x86/include/asm/spec_ctrl.h | 1 + - xen/arch/x86/spec_ctrl.c | 51 ++++++++++++++++----- - xen/include/public/arch-x86/cpufeatureset.h | 4 +- - 7 files changed, 96 insertions(+), 20 deletions(-) - -diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc -index 2096ae5841de..1944847172d7 100644 ---- a/docs/misc/xen-command-line.pandoc -+++ b/docs/misc/xen-command-line.pandoc -@@ -2392,7 +2392,7 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`). - > {ibrs,ibpb,ssbd,psfd, - > eager-fpu,l1d-flush,branch-harden,srb-lock, - > unpriv-mmio,gds-mit,div-scrub,lock-harden, --> bhi-dis-s}= ]` -+> bhi-dis-s,bp-spec-reduce}= ]` - - Controls for speculative execution sidechannel mitigations. By default, Xen - will pick the most appropriate mitigations based on compiled in support, -@@ -2541,6 +2541,13 @@ boolean can be used to force or prevent Xen from using speculation barriers to - protect lock critical regions. This mitigation won't be engaged by default, - and needs to be explicitly enabled on the command line. - -+On hardware supporting SRSO_MSR_FIX, the `bp-spec-reduce=` option can be used -+to force or prevent Xen from using MSR_BP_CFG.BP_SPEC_REDUCE to mitigate the -+SRSO (Speculative Return Stack Overflow) vulnerability. Xen will use -+bp-spec-reduce when available, as it is preferable to using `ibpb-entry=hvm` -+to mitigate SRSO for HVM guests, and because it is a prerequisite to advertise -+SRSO_U/S_NO to PV guests. -+ - ### sync_console - > `= ` - -diff --git a/xen/arch/x86/cpu-policy.c b/xen/arch/x86/cpu-policy.c -index 304dc20cfab8..1722f5f90392 100644 ---- a/xen/arch/x86/cpu-policy.c -+++ b/xen/arch/x86/cpu-policy.c -@@ -14,6 +14,7 @@ - #include - #include - #include -+#include - #include - - struct cpu_policy __read_mostly raw_cpu_policy; -@@ -605,6 +606,26 @@ static void __init calculate_pv_max_policy(void) - __clear_bit(X86_FEATURE_IBRS, fs); - } - -+ /* -+ * SRSO_U/S_NO means that the CPU is not vulnerable to SRSO attacks across -+ * the User (CPL3)/Supervisor (CPL<3) boundary. -+ * -+ * PV32 guests are unsupported for speculative issues, and excluded from -+ * consideration for simplicity. -+ * -+ * The PV64 user/kernel boundary is CPL3 on both sides, so SRSO_U/S_NO -+ * won't convey the meaning that a PV kernel expects. -+ * -+ * After discussions with AMD, an implementation detail of having -+ * BP_SPEC_REDUCE active causes the PV64 user/kernel boundary to have a -+ * property compatible with the meaning of SRSO_U/S_NO. -+ * -+ * If BP_SPEC_REDUCE isn't active, remove SRSO_U/S_NO from the PV max -+ * policy, which will cause it to filter out of PV default too. 
-+ */ -+ if ( !boot_cpu_has(X86_FEATURE_SRSO_MSR_FIX) || !opt_bp_spec_reduce ) -+ __clear_bit(X86_FEATURE_SRSO_US_NO, fs); -+ - guest_common_max_feature_adjustments(fs); - guest_common_feature_adjustments(fs); - -diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c -index ab92333673b9..c448997be551 100644 ---- a/xen/arch/x86/cpu/amd.c -+++ b/xen/arch/x86/cpu/amd.c -@@ -1009,16 +1009,33 @@ static void cf_check fam17_disable_c6(void *arg) - wrmsrl(MSR_AMD_CSTATE_CFG, val & mask); - } - --static void amd_check_erratum_1485(void) -+static void amd_check_bp_cfg(void) - { -- uint64_t val, chickenbit = (1 << 5); -+ uint64_t val, new = 0; - -- if (cpu_has_hypervisor || boot_cpu_data.x86 != 0x19 || !is_zen4_uarch()) -+ /* -+ * AMD Erratum #1485. Set bit 5, as instructed. -+ */ -+ if (!cpu_has_hypervisor && boot_cpu_data.x86 == 0x19 && is_zen4_uarch()) -+ new |= (1 << 5); -+ -+ /* -+ * On hardware supporting SRSO_MSR_FIX, activate BP_SPEC_REDUCE by -+ * default. This lets us do two things: -+ * -+ * 1) Avoid IBPB-on-entry to mitigate SRSO attacks from HVM guests. -+ * 2) Advertise SRSO_US_NO to PV guests. -+ */ -+ if (boot_cpu_has(X86_FEATURE_SRSO_MSR_FIX) && opt_bp_spec_reduce) -+ new |= BP_CFG_SPEC_REDUCE; -+ -+ /* Avoid reading BP_CFG if we don't intend to change anything. */ -+ if (!new) - return; - - rdmsrl(MSR_AMD64_BP_CFG, val); - -- if (val & chickenbit) -+ if ((val & new) == new) - return; - - /* -@@ -1027,7 +1044,7 @@ static void amd_check_erratum_1485(void) - * same time before the chickenbit is set. It's benign because the - * value being written is the same on both. - */ -- wrmsrl(MSR_AMD64_BP_CFG, val | chickenbit); -+ wrmsrl(MSR_AMD64_BP_CFG, val | new); - } - - static void cf_check init_amd(struct cpuinfo_x86 *c) -@@ -1297,7 +1314,7 @@ static void cf_check init_amd(struct cpuinfo_x86 *c) - disable_c1_ramping(); - - amd_check_zenbleed(); -- amd_check_erratum_1485(); -+ amd_check_bp_cfg(); - - if (fam17_c6_disabled) - fam17_disable_c6(NULL); -diff --git a/xen/arch/x86/include/asm/msr-index.h b/xen/arch/x86/include/asm/msr-index.h -index 17dd857af802..b324356fd550 100644 ---- a/xen/arch/x86/include/asm/msr-index.h -+++ b/xen/arch/x86/include/asm/msr-index.h -@@ -412,6 +412,7 @@ - #define AMD64_DE_CFG_LFENCE_SERIALISE (_AC(1, ULL) << 1) - #define MSR_AMD64_EX_CFG 0xc001102cU - #define MSR_AMD64_BP_CFG 0xc001102eU -+#define BP_CFG_SPEC_REDUCE (_AC(1, ULL) << 4) - #define MSR_AMD64_DE_CFG2 0xc00110e3U - - #define MSR_AMD64_DR0_ADDRESS_MASK 0xc0011027U -diff --git a/xen/arch/x86/include/asm/spec_ctrl.h b/xen/arch/x86/include/asm/spec_ctrl.h -index 72347ef2b959..077225418956 100644 ---- a/xen/arch/x86/include/asm/spec_ctrl.h -+++ b/xen/arch/x86/include/asm/spec_ctrl.h -@@ -90,6 +90,7 @@ extern int8_t opt_xpti_hwdom, opt_xpti_domu; - - extern bool cpu_has_bug_l1tf; - extern int8_t opt_pv_l1tf_hwdom, opt_pv_l1tf_domu; -+extern bool opt_bp_spec_reduce; - - /* - * The L1D address mask, which might be wider than reported in CPUID, and the -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 40f6ae017010..35351044f901 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -83,6 +83,7 @@ static bool __initdata opt_unpriv_mmio; - static bool __ro_after_init opt_verw_mmio; - static int8_t __initdata opt_gds_mit = -1; - static int8_t __initdata opt_div_scrub = -1; -+bool __ro_after_init opt_bp_spec_reduce = true; - - static int __init cf_check parse_spec_ctrl(const char *s) - { -@@ -143,6 +144,7 @@ static int __init cf_check parse_spec_ctrl(const char 
*s) - opt_unpriv_mmio = false; - opt_gds_mit = 0; - opt_div_scrub = 0; -+ opt_bp_spec_reduce = false; - } - else if ( val > 0 ) - rc = -EINVAL; -@@ -363,6 +365,8 @@ static int __init cf_check parse_spec_ctrl(const char *s) - opt_gds_mit = val; - else if ( (val = parse_boolean("div-scrub", s, ss)) >= 0 ) - opt_div_scrub = val; -+ else if ( (val = parse_boolean("bp-spec-reduce", s, ss)) >= 0 ) -+ opt_bp_spec_reduce = val; - else - rc = -EINVAL; - -@@ -505,7 +509,7 @@ static void __init print_details(enum ind_thunk thunk) - * Hardware read-only information, stating immunity to certain issues, or - * suggestions of which mitigation to use. - */ -- printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", -+ printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", - (caps & ARCH_CAPS_RDCL_NO) ? " RDCL_NO" : "", - (caps & ARCH_CAPS_EIBRS) ? " EIBRS" : "", - (caps & ARCH_CAPS_RSBA) ? " RSBA" : "", -@@ -529,10 +533,11 @@ static void __init print_details(enum ind_thunk thunk) - (e8b & cpufeat_mask(X86_FEATURE_BTC_NO)) ? " BTC_NO" : "", - (e8b & cpufeat_mask(X86_FEATURE_IBPB_RET)) ? " IBPB_RET" : "", - (e21a & cpufeat_mask(X86_FEATURE_IBPB_BRTYPE)) ? " IBPB_BRTYPE" : "", -- (e21a & cpufeat_mask(X86_FEATURE_SRSO_NO)) ? " SRSO_NO" : ""); -+ (e21a & cpufeat_mask(X86_FEATURE_SRSO_NO)) ? " SRSO_NO" : "", -+ (e21a & cpufeat_mask(X86_FEATURE_SRSO_US_NO)) ? " SRSO_US_NO" : ""); - - /* Hardware features which need driving to mitigate issues. */ -- printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", -+ printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", - (e8b & cpufeat_mask(X86_FEATURE_IBPB)) || - (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBPB" : "", - (e8b & cpufeat_mask(X86_FEATURE_IBRS)) || -@@ -551,7 +556,8 @@ static void __init print_details(enum ind_thunk thunk) - (caps & ARCH_CAPS_FB_CLEAR_CTRL) ? " FB_CLEAR_CTRL" : "", - (caps & ARCH_CAPS_GDS_CTRL) ? " GDS_CTRL" : "", - (caps & ARCH_CAPS_RFDS_CLEAR) ? " RFDS_CLEAR" : "", -- (e21a & cpufeat_mask(X86_FEATURE_SBPB)) ? " SBPB" : ""); -+ (e21a & cpufeat_mask(X86_FEATURE_SBPB)) ? " SBPB" : "", -+ (e21a & cpufeat_mask(X86_FEATURE_SRSO_MSR_FIX)) ? " SRSO_MSR_FIX" : ""); - - /* Compiled-in support which pertains to mitigations. */ - if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) || -@@ -1120,7 +1126,7 @@ static void __init div_calculations(bool hw_smt_enabled) - - static void __init ibpb_calculations(void) - { -- bool def_ibpb_entry = false; -+ bool def_ibpb_entry_pv = false, def_ibpb_entry_hvm = false; - - /* Check we have hardware IBPB support before using it... */ - if ( !boot_cpu_has(X86_FEATURE_IBRSB) && !boot_cpu_has(X86_FEATURE_IBPB) ) -@@ -1145,22 +1151,43 @@ static void __init ibpb_calculations(void) - * Confusion. Mitigate with IBPB-on-entry. - */ - if ( !boot_cpu_has(X86_FEATURE_BTC_NO) ) -- def_ibpb_entry = true; -+ def_ibpb_entry_pv = def_ibpb_entry_hvm = true; - - /* -- * Further to BTC, Zen3/4 CPUs suffer from Speculative Return Stack -- * Overflow in most configurations. Mitigate with IBPB-on-entry if we -- * have the microcode that makes this an effective option. -+ * In addition to BTC, Zen3 and later CPUs suffer from Speculative -+ * Return Stack Overflow in most configurations. If we have microcode -+ * that makes IBPB-on-entry an effective mitigation, see about using -+ * it. 
- */ - if ( !boot_cpu_has(X86_FEATURE_SRSO_NO) && - boot_cpu_has(X86_FEATURE_IBPB_BRTYPE) ) -- def_ibpb_entry = true; -+ { -+ /* -+ * SRSO_U/S_NO is a subset of SRSO_NO, identifying that SRSO isn't -+ * possible across the User (CPL3) / Supervisor (CPL<3) boundary. -+ * -+ * Ignoring PV32 (not security supported for speculative issues), -+ * this means we only need to use IBPB-on-entry for PV guests on -+ * hardware which doesn't enumerate SRSO_US_NO. -+ */ -+ if ( !boot_cpu_has(X86_FEATURE_SRSO_US_NO) ) -+ def_ibpb_entry_pv = true; -+ -+ /* -+ * SRSO_MSR_FIX enumerates that we can use MSR_BP_CFG.SPEC_REDUCE -+ * to mitigate SRSO across the host/guest boundary. We only need -+ * to use IBPB-on-entry for HVM guests if we haven't enabled this -+ * control. -+ */ -+ if ( !boot_cpu_has(X86_FEATURE_SRSO_MSR_FIX) || !opt_bp_spec_reduce ) -+ def_ibpb_entry_hvm = true; -+ } - } - - if ( opt_ibpb_entry_pv == -1 ) -- opt_ibpb_entry_pv = IS_ENABLED(CONFIG_PV) && def_ibpb_entry; -+ opt_ibpb_entry_pv = IS_ENABLED(CONFIG_PV) && def_ibpb_entry_pv; - if ( opt_ibpb_entry_hvm == -1 ) -- opt_ibpb_entry_hvm = IS_ENABLED(CONFIG_HVM) && def_ibpb_entry; -+ opt_ibpb_entry_hvm = IS_ENABLED(CONFIG_HVM) && def_ibpb_entry_hvm; - - if ( opt_ibpb_entry_pv ) - { -diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h -index d9eba5e9a714..9c98e4992861 100644 ---- a/xen/include/public/arch-x86/cpufeatureset.h -+++ b/xen/include/public/arch-x86/cpufeatureset.h -@@ -312,7 +312,9 @@ XEN_CPUFEATURE(FSRSC, 11*32+19) /*A Fast Short REP SCASB */ - XEN_CPUFEATURE(AMD_PREFETCHI, 11*32+20) /*A PREFETCHIT{0,1} Instructions */ - XEN_CPUFEATURE(SBPB, 11*32+27) /*A Selective Branch Predictor Barrier */ - XEN_CPUFEATURE(IBPB_BRTYPE, 11*32+28) /*A IBPB flushes Branch Type predictions too */ --XEN_CPUFEATURE(SRSO_NO, 11*32+29) /*A Hardware not vulenrable to Speculative Return Stack Overflow */ -+XEN_CPUFEATURE(SRSO_NO, 11*32+29) /*A Hardware not vulnerable to Speculative Return Stack Overflow */ -+XEN_CPUFEATURE(SRSO_US_NO, 11*32+30) /*A! Hardware not vulnerable to SRSO across the User/Supervisor boundary */ -+XEN_CPUFEATURE(SRSO_MSR_FIX, 11*32+31) /* MSR_BP_CFG.BP_SPEC_REDUCE available */ - - /* Intel-defined CPU features, CPUID level 0x00000007:1.ebx, word 12 */ - XEN_CPUFEATURE(INTEL_PPIN, 12*32+ 0) /* Protected Processor Inventory Number */ --- -2.46.0 - diff --git a/0306-x86-io-apic-prevent-early-exit-from-i8259-loop-detec.patch b/0306-x86-io-apic-prevent-early-exit-from-i8259-loop-detec.patch deleted file mode 100644 index 33000ba..0000000 --- a/0306-x86-io-apic-prevent-early-exit-from-i8259-loop-detec.patch +++ /dev/null @@ -1,83 +0,0 @@ -From d4c0b38763f75845693855d1ac419af94866eece Mon Sep 17 00:00:00 2001 -From: Roger Pau Monne -Date: Mon, 16 Dec 2024 19:33:29 +0100 -Subject: [PATCH 306/306] x86/io-apic: prevent early exit from i8259 loop - detection -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Avoid exiting early from the loop when a pin that could be connected to the -i8259 is found, as such early exit would leave the EOI handler translation -array only partially allocated and/or initialized. - -Otherwise on systems with multiple IO-APICs and an unmasked ExtINT pin on -any IO-APIC that's no the last one the following NULL pointer dereference -triggers: - -(XEN) Enabling APIC mode. 
Using 2 I/O APICs -(XEN) ----[ Xen-4.20-unstable x86_64 debug=y Not tainted ]---- -(XEN) CPU: 0 -(XEN) RIP: e008:[] __ioapic_write_entry+0x83/0x95 -[...] -(XEN) Xen call trace: -(XEN) [] R __ioapic_write_entry+0x83/0x95 -(XEN) [] F amd_iommu_ioapic_update_ire+0x1ea/0x273 -(XEN) [] F iommu_update_ire_from_apic+0xa/0xc -(XEN) [] F __ioapic_write_entry+0x93/0x95 -(XEN) [] F arch/x86/io_apic.c#clear_IO_APIC_pin+0x7c/0x10e -(XEN) [] F arch/x86/io_apic.c#clear_IO_APIC+0x2d/0x61 -(XEN) [] F enable_IO_APIC+0x2e3/0x34f -(XEN) [] F smp_prepare_cpus+0x254/0x27a -(XEN) [] F __start_xen+0x1ce1/0x23ae -(XEN) [] F __high_start+0x8e/0x90 -(XEN) -(XEN) Pagetable walk from 0000000000000000: -(XEN) L4[0x000] = 000000007dbfd063 ffffffffffffffff -(XEN) L3[0x000] = 000000007dbfa063 ffffffffffffffff -(XEN) L2[0x000] = 000000007dbcc063 ffffffffffffffff -(XEN) L1[0x000] = 0000000000000000 ffffffffffffffff -(XEN) -(XEN) **************************************** -(XEN) Panic on CPU 0: -(XEN) FATAL PAGE FAULT -(XEN) [error_code=0002] -(XEN) Faulting linear address: 0000000000000000 -(XEN) **************************************** -(XEN) -(XEN) Reboot in five seconds... - -Reported-by: Sergii Dmytruk -Fixes: 86001b3970fe ('x86/io-apic: fix directed EOI when using AMD-Vi interrupt remapping') -Signed-off-by: Roger Pau Monné -Reviewed-by: Jan Beulich -(cherry picked from commit f38fd27c4ceadf7ec4e82e82d0731b6ea415c51e) ---- - xen/arch/x86/io_apic.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/xen/arch/x86/io_apic.c b/xen/arch/x86/io_apic.c -index d2a313c4ac72..f8b2aad9cba5 100644 ---- a/xen/arch/x86/io_apic.c -+++ b/xen/arch/x86/io_apic.c -@@ -1307,14 +1307,14 @@ void __init enable_IO_APIC(void) - /* If the interrupt line is enabled and in ExtInt mode - * I have found the pin where the i8259 is connected. - */ -- if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { -+ if ( ioapic_i8259.pin == -1 && entry.mask == 0 && -+ entry.delivery_mode == dest_ExtINT ) -+ { - ioapic_i8259.apic = apic; - ioapic_i8259.pin = pin; -- goto found_i8259; - } - } - } -- found_i8259: - /* Look to see what if the MP table has reported the ExtINT */ - /* If we could not find the appropriate pin by looking at the ioapic - * the i8259 probably is not connected the ioapic but give the --- -2.46.0 - diff --git a/0604-x86-time-Don-t-use-EFI-s-GetTime-call-by-default.patch b/0604-x86-time-Don-t-use-EFI-s-GetTime-call-by-default.patch index 15ee9ff..ff1dc3f 100644 --- a/0604-x86-time-Don-t-use-EFI-s-GetTime-call-by-default.patch +++ b/0604-x86-time-Don-t-use-EFI-s-GetTime-call-by-default.patch @@ -19,34 +19,30 @@ diff --git a/xen/arch/x86/time.c b/xen/arch/x86/time.c index a97d78484105..45f6baf6270b 100644 --- a/xen/arch/x86/time.c +++ b/xen/arch/x86/time.c -@@ -1245,20 +1245,25 @@ static void __get_cmos_time(struct rtc_time *rtc) - rtc->year += 100; +@@ -1552,6 +1552,9 @@ static const char *__init wallclock_type + return ""; } - + +/* EFI's GetTime() is frequently broken so don't use it by default. 
*/ +#undef USE_EFI_GET_TIME + - static unsigned long get_cmos_time(void) + static void __init probe_wallclock(void) { -- unsigned long res, flags; -+ unsigned long flags; - struct rtc_time rtc; - unsigned int seconds = 60; - static bool __read_mostly cmos_rtc_probe; - boolean_param("cmos-rtc-probe", cmos_rtc_probe); - + ASSERT(wallclock_source == WALLCLOCK_UNSET); +@@ -1561,11 +1564,13 @@ static void __init probe_wallclock(void) + wallclock_source = WALLCLOCK_XEN; + return; + } +#ifdef USE_EFI_GET_TIME - if ( efi_enabled(EFI_RS) ) + if ( efi_enabled(EFI_RS) && efi_get_time() ) { -- res = efi_get_time(); -+ unsigned long res = efi_get_time(); - if ( res ) - return res; + wallclock_source = WALLCLOCK_EFI; + return; } +#endif - - if ( likely(!(acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC)) ) - cmos_rtc_probe = false; + if ( cmos_rtc_probe() ) + { + wallclock_source = WALLCLOCK_CMOS; -- 2.44.0 diff --git a/0628-x86-mwait-idle-Use-ACPI-for-CPUs-without-hardcoded-C.patch b/0628-x86-mwait-idle-Use-ACPI-for-CPUs-without-hardcoded-C.patch index b108088..5bcf416 100644 --- a/0628-x86-mwait-idle-Use-ACPI-for-CPUs-without-hardcoded-C.patch +++ b/0628-x86-mwait-idle-Use-ACPI-for-CPUs-without-hardcoded-C.patch @@ -312,15 +312,18 @@ diff --git a/xen/arch/x86/include/asm/cpuidle.h b/xen/arch/x86/include/asm/cpuid index 707b3e948d45..3f5cd40fd596 100644 --- a/xen/arch/x86/include/asm/cpuidle.h +++ b/xen/arch/x86/include/asm/cpuidle.h -@@ -15,7 +15,7 @@ extern void (*lapic_timer_on)(void); - +@@ -15,9 +15,9 @@ extern void (*lapic_timer_on)(void); extern uint64_t (*cpuidle_get_tick)(void); + #ifdef CONFIG_INTEL -int mwait_idle_init(struct notifier_block *nfb); -+int mwait_idle_init(struct notifier_block *nfb, bool); - int cpuidle_init_cpu(unsigned int cpu); - void cf_check default_dead_idle(void); - void cf_check acpi_dead_idle(void); ++int mwait_idle_init(struct notifier_block *nfb, bool from_acpi); + #else +-static inline int mwait_idle_init(struct notifier_block *nfb) ++static inline int mwait_idle_init(struct notifier_block *nfb, bool from_acpi) + { + return -ENODEV; + } -- 2.44.0 diff --git a/0630-tools-xg-increase-LZMA_BLOCK_SIZE-for-uncompressing-.patch b/0630-tools-xg-increase-LZMA_BLOCK_SIZE-for-uncompressing-.patch deleted file mode 100644 index 18aa820..0000000 --- a/0630-tools-xg-increase-LZMA_BLOCK_SIZE-for-uncompressing-.patch +++ /dev/null @@ -1,62 +0,0 @@ -From b3262b7069a51e460a9f044eec4fc5e2e5758db2 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= - -Date: Tue, 8 Oct 2024 23:24:31 +0200 -Subject: [PATCH] tools/xg: increase LZMA_BLOCK_SIZE for uncompressing the - kernel -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Linux 6.12-rc2 fails to decompress with the current 128MiB, contrary to -the code comment. It results in a failure like this: - - domainbuilder: detail: xc_dom_kernel_file: filename="/var/lib/qubes/vm-kernels/6.12-rc2-1.1.fc37/vmlinuz" - domainbuilder: detail: xc_dom_malloc_filemap : 12104 kB - domainbuilder: detail: xc_dom_module_file: filename="/var/lib/qubes/vm-kernels/6.12-rc2-1.1.fc37/initramfs" - domainbuilder: detail: xc_dom_malloc_filemap : 7711 kB - domainbuilder: detail: xc_dom_boot_xen_init: ver 4.19, caps xen-3.0-x86_64 hvm-3.0-x86_32 hvm-3.0-x86_32p hvm-3.0-x86_64 - domainbuilder: detail: xc_dom_parse_image: called - domainbuilder: detail: xc_dom_find_loader: trying multiboot-binary loader ... 
- domainbuilder: detail: loader probe failed - domainbuilder: detail: xc_dom_find_loader: trying HVM-generic loader ... - domainbuilder: detail: loader probe failed - domainbuilder: detail: xc_dom_find_loader: trying Linux bzImage loader ... - domainbuilder: detail: _xc_try_lzma_decode: XZ decompression error: Memory usage limit reached - xc: error: panic: xg_dom_bzimageloader.c:761: xc_dom_probe_bzimage_kernel unable to XZ decompress kernel: Invalid kernel - domainbuilder: detail: loader probe failed - domainbuilder: detail: xc_dom_find_loader: trying ELF-generic loader ... - domainbuilder: detail: loader probe failed - xc: error: panic: xg_dom_core.c:689: xc_dom_find_loader: no loader found: Invalid kernel - libxl: error: libxl_dom.c:566:libxl__build_dom: xc_dom_parse_image failed - -The important part: XZ decompression error: Memory usage limit reached - -This looks to be related to the following change in Linux: -8653c909922743bceb4800e5cc26087208c9e0e6 ("xz: use 128 MiB dictionary and force single-threaded mode") - -Fix this by increasing the block size to 256MiB. And remove the -misleading comment (from lack of better ideas). - -Signed-off-by: Marek Marczykowski-Górecki ---- - tools/libs/guest/xg_dom_bzimageloader.c | 3 +-- - 1 file changed, 1 insertion(+), 2 deletions(-) - -diff --git a/tools/libs/guest/xg_dom_bzimageloader.c b/tools/libs/guest/xg_dom_bzimageloader.c -index c6ee6d83e7c6..1fb4e5a1f728 100644 ---- a/tools/libs/guest/xg_dom_bzimageloader.c -+++ b/tools/libs/guest/xg_dom_bzimageloader.c -@@ -272,8 +272,7 @@ static int _xc_try_lzma_decode( - return retval; - } - --/* 128 Mb is the minimum size (half-way) documented to work for all inputs. */ --#define LZMA_BLOCK_SIZE (128*1024*1024) -+#define LZMA_BLOCK_SIZE (256*1024*1024) - - static int xc_try_xz_decode( - struct xc_dom_image *dom, void **blob, size_t *size) --- -2.46.0 - diff --git a/1100-Define-build-dates-time-based-on-SOURCE_DATE_EPOCH.patch b/1100-Define-build-dates-time-based-on-SOURCE_DATE_EPOCH.patch index 9fdac44..9cfe865 100644 --- a/1100-Define-build-dates-time-based-on-SOURCE_DATE_EPOCH.patch +++ b/1100-Define-build-dates-time-based-on-SOURCE_DATE_EPOCH.patch @@ -55,7 +55,7 @@ index 84cba171cd6b..d94df1cd88b6 100644 -SMBIOS_REL_DATE ?= $(shell date +%m/%d/%Y) +SMBIOS_REL_DATE ?= $(shell date $(DATE_EPOCH_OPTS) +%m/%d/%Y) - CFLAGS += $(CFLAGS_xeninclude) -fno-pic + CFLAGS += $(CFLAGS_xeninclude) -fno-pic -mregparm=3 diff --git a/tools/firmware/vgabios/Makefile b/tools/firmware/vgabios/Makefile index 3284812fdec8..4350ef402127 100644 diff --git a/version b/version index 1fc0e81..554b652 100644 --- a/version +++ b/version @@ -1 +1 @@ -4.19.1 +4.20.0-rc0 diff --git a/xen.spec.in b/xen.spec.in index 0d4e672..e50a8d0 100644 --- a/xen.spec.in +++ b/xen.spec.in @@ -97,11 +97,6 @@ Patch0202: 0202-Add-xen.cfg-options-for-mapbs-and-noexitboot.patch Patch0203: 0203-xen.efi.build.patch # Backports (300+) -Patch0300: 0300-xen-list-add-LIST_HEAD_RO_AFTER_INIT.patch -Patch0301: 0301-x86-mm-add-API-for-marking-only-part-of-a-MMIO-page-.patch -Patch0302: 0302-drivers-char-Use-sub-page-ro-API-to-make-just-xhci-d.patch -Patch0305: 0305-x86-spec-ctrl-Support-for-SRSO_U-S_NO-and-SRSO_MSR_F.patch -Patch0306: 0306-x86-io-apic-prevent-early-exit-from-i8259-loop-detec.patch # Security fixes (500+) @@ -141,8 +136,6 @@ Patch0627: 0627-x86-msr-Allow-hardware-domain-to-read-package-C-stat.patch Patch0628: 0628-x86-mwait-idle-Use-ACPI-for-CPUs-without-hardcoded-C.patch Patch0629: 0629-libxl_pci-Pass-power_mgmt-via-QMP.patch -Patch0630: 
0630-tools-xg-increase-LZMA_BLOCK_SIZE-for-uncompressing-.patch - # Qubes specific patches Patch1000: 1000-Do-not-access-network-during-the-build.patch Patch1001: 1001-hotplug-store-block-params-for-cleanup.patch From aeb98de6b91abd026f86c49f179dc0445c2f189c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= Date: Wed, 8 Jan 2025 15:48:28 +0100 Subject: [PATCH 3/6] Fix build with Fedora-provided cflags See patch description for details. This solves similar problem as https://github.com/TrenchBoot/xen/pull/19/commits/11aee1127ff14acb08d066a3a123760ddcbbe8c3 --- ...text-gap-diff-to-work-with-64-bytes-.patch | 44 +++++++++++++++++++ xen.spec.in | 2 + 2 files changed, 46 insertions(+) create mode 100644 0630-x86-boot-adjust-text-gap-diff-to-work-with-64-bytes-.patch diff --git a/0630-x86-boot-adjust-text-gap-diff-to-work-with-64-bytes-.patch b/0630-x86-boot-adjust-text-gap-diff-to-work-with-64-bytes-.patch new file mode 100644 index 0000000..bf4fd4c --- /dev/null +++ b/0630-x86-boot-adjust-text-gap-diff-to-work-with-64-bytes-.patch @@ -0,0 +1,44 @@ +From 77d5991de867a1b2d694147958d77c51a2b989ae Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= + +Date: Wed, 8 Jan 2025 15:37:13 +0100 +Subject: [PATCH] x86/boot: adjust text gap/diff to work with 64-bytes + alignment too +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Xen compiled with -mtune=generic has .text alignment set to 64-bytes. +Setting text_diff to non-64-bytes-aligned number breaks stuff: + + Traceback (most recent call last): + File "/builddir/build/BUILD/xen-4.20.0-build/xen-4.20.0-rc0/xen/./tools/combine_two_binaries.py", line 96, in + raise Exception('File sizes do not match') + Exception: File sizes do not match: 70160 != 4080 + 66048 + +Adjust the numbers as suggested by Frediano to work with 64-bytes and +even 128-bytes alignment. + +Signed-off-by: Marek Marczykowski-Górecki +--- + xen/arch/x86/boot/Makefile | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/xen/arch/x86/boot/Makefile b/xen/arch/x86/boot/Makefile +index d45787665907..80c32163fbbd 100644 +--- a/xen/arch/x86/boot/Makefile ++++ b/xen/arch/x86/boot/Makefile +@@ -40,8 +40,8 @@ LD32 := $(LD) $(subst x86_64,i386,$(LDFLAGS_DIRECT)) + # are affected by both text_diff and text_gap. Ensure the sum of gap and diff + # is greater than 2^16 so that any 16bit relocations if present in the object + # file turns into a build-time error. 
+-text_gap := 0x010200 +-text_diff := 0x408020 ++text_gap := 0x010240 ++text_diff := 0x608040 + + $(obj)/build32.base.lds: AFLAGS-y += -DGAP=$(text_gap) -DTEXT_DIFF=$(text_diff) + $(obj)/build32.offset.lds: AFLAGS-y += -DGAP=$(text_gap) -DTEXT_DIFF=$(text_diff) -DAPPLY_OFFSET +-- +2.46.0 + diff --git a/xen.spec.in b/xen.spec.in index e50a8d0..8252188 100644 --- a/xen.spec.in +++ b/xen.spec.in @@ -136,6 +136,8 @@ Patch0627: 0627-x86-msr-Allow-hardware-domain-to-read-package-C-stat.patch Patch0628: 0628-x86-mwait-idle-Use-ACPI-for-CPUs-without-hardcoded-C.patch Patch0629: 0629-libxl_pci-Pass-power_mgmt-via-QMP.patch +Patch0630: 0630-x86-boot-adjust-text-gap-diff-to-work-with-64-bytes-.patch + # Qubes specific patches Patch1000: 1000-Do-not-access-network-during-the-build.patch Patch1001: 1001-hotplug-store-block-params-for-cleanup.patch From 03d4c203a2a40bfa12835a27699560efec1eea48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= Date: Wed, 8 Jan 2025 21:09:36 +0100 Subject: [PATCH 4/6] Apply PV-IOMMU series --- ...s-Add-a-design-document-for-PV-IOMMU.patch | 140 + ...-a-design-document-for-IOMMU-subsyst.patch | 437 +++ ...Introduce-redesigned-IOMMU-subsystem.patch | 1904 +++++++++++ ...d-Port-IOMMU-driver-to-new-subsystem.patch | 2965 +++++++++++++++++ ...troduce-PV-IOMMU-hypercall-interface.patch | 965 ++++++ xen.spec.in | 6 + 6 files changed, 6417 insertions(+) create mode 100644 0400-docs-designs-Add-a-design-document-for-PV-IOMMU.patch create mode 100644 0401-docs-designs-Add-a-design-document-for-IOMMU-subsyst.patch create mode 100644 0402-IOMMU-Introduce-redesigned-IOMMU-subsystem.patch create mode 100644 0403-VT-d-Port-IOMMU-driver-to-new-subsystem.patch create mode 100644 0404-xen-public-Introduce-PV-IOMMU-hypercall-interface.patch diff --git a/0400-docs-designs-Add-a-design-document-for-PV-IOMMU.patch b/0400-docs-designs-Add-a-design-document-for-PV-IOMMU.patch new file mode 100644 index 0000000..3ca961b --- /dev/null +++ b/0400-docs-designs-Add-a-design-document-for-PV-IOMMU.patch @@ -0,0 +1,140 @@ +From 0b9a21e9fba41427921031f97346ead3f6b2a8d6 Mon Sep 17 00:00:00 2001 +From: Teddy Astie +Date: Mon, 4 Nov 2024 14:28:38 +0000 +Subject: [PATCH 400/404] docs/designs: Add a design document for PV-IOMMU + +Some operating systems want to use IOMMU to implement various features (e.g +VFIO) or DMA protection. +This patch introduce a proposal for IOMMU paravirtualization for Dom0. + +Signed-off-by Teddy Astie +--- + docs/designs/pv-iommu.md | 116 +++++++++++++++++++++++++++++++++++++++ + 1 file changed, 116 insertions(+) + create mode 100644 docs/designs/pv-iommu.md + +diff --git a/docs/designs/pv-iommu.md b/docs/designs/pv-iommu.md +new file mode 100644 +index 000000000000..7df9fa0b9489 +--- /dev/null ++++ b/docs/designs/pv-iommu.md +@@ -0,0 +1,116 @@ ++# IOMMU paravirtualization for Dom0 ++ ++Status: Experimental ++ ++# Background ++ ++By default, Xen only uses the IOMMU for itself, either to make device adress ++space coherent with guest adress space (x86 HVM/PVH) or to prevent devices ++from doing DMA outside it's expected memory regions including the hypervisor ++(x86 PV). ++ ++A limitation is that guests (especially privildged ones) may want to use ++IOMMU hardware in order to implement features such as DMA protection and ++VFIO [1] as IOMMU functionality is not available outside of the hypervisor ++currently. 
++
++[1] VFIO - "Virtual Function I/O" - https://www.kernel.org/doc/html/latest/driver-api/vfio.html
++
++# Design
++
++The operating system may want access to various IOMMU features such as
++context management and DMA remapping. We can create a new hypercall that gives
++the guest access to a paravirtualized IOMMU interface.
++
++This feature is only meant to be available for Dom0: DomUs have emulated
++devices that are not real hardware and can't be managed on the Xen side, so we
++can't rely on the hardware IOMMU to enforce DMA remapping for them.
++
++This interface is exposed under the `iommu_op` hypercall.
++
++In addition, Xen domains are modified in order to allow several IOMMU contexts
++to exist, including a default one that implements the default behavior (e.g.
++hardware-assisted paging) and can't be modified by the guest. DomUs cannot have
++extra contexts, and therefore act as if they only have the default context.
++
++Each IOMMU context within a Xen domain is identified using a domain-specific
++context number that is used in the Xen IOMMU subsystem and the hypercall
++interface.
++
++The number of IOMMU contexts a domain can use is specified by either the
++toolstack or the domain itself.
++
++# IOMMU operations
++
++## Initialize PV-IOMMU
++
++Initialize PV-IOMMU for the domain.
++It can only be called once.
++
++## Alloc context
++
++Create a new IOMMU context for the guest and return the context number to the
++guest.
++Fail if the IOMMU context limit of the guest is reached.
++
++A flag can be specified to create an identity mapping.
++
++## Free context
++
++Destroy an IOMMU context created previously.
++It is not possible to free the default context.
++
++Reattach the context's devices to the default context if requested by the
++guest.
++
++Fail if there is a device in the context and the reattach-to-default flag is
++not specified.
++
++## Reattach device
++
++Reattach a device to another IOMMU context (including the default one).
++The target IOMMU context number must be valid and the context allocated.
++
++The guest needs to specify the PCI SBDF of a device it has access to.
++
++## Map/unmap page
++
++Map/unmap a page on a context.
++The guest needs to specify a gfn and target dfn to map.
++
++Refuse to create the mapping if one already exists for the same dfn.
++
++## Lookup page
++
++Get the gfn mapped by a specific dfn.
++
++## Remote command
++
++Make a PV-IOMMU operation on behalf of another domain.
++Especially useful for implementing IOMMU emulation (e.g. using QEMU)
++or initializing PV-IOMMU with enforced limits.
++
++# Implementation considerations
++
++## Hypercall batching
++
++In order to avoid unneeded hypercalls and IOMMU flushes, it is advisable to
++be able to batch some critical IOMMU operations (e.g. mapping/unmapping
++multiple pages).
++
++## Hardware without IOMMU support
++
++The operating system needs to be aware of the PV-IOMMU capability, and of
++whether it is able to create contexts. Some operating systems may critically
++fail if they are unable to create a new IOMMU context, which is expected to
++happen when no IOMMU hardware is available.
++
++The hypercall interface needs a way to advertise the ability to create and
++manage IOMMU contexts, including the number of contexts the guest is able to
++use. Using this information, Dom0 may decide whether or not to use the
++PV-IOMMU interface.
++
++## Page pool for contexts
++
++In order to prevent a buggy Dom0 from unexpectedly starving the hypervisor of
++memory, a bounded, preallocated page pool can be used (see the sketch below).
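++
++A minimal sketch of the idea, using illustrative names only (the series itself
++sizes a per-domain page arena at PV-IOMMU initialization time):
++
++```c
++/* Sketch only: helper names are illustrative, not the actual implementation. */
++struct ctx_page_pool {
++    struct page_list_head free_pages; /* filled once, at PV-IOMMU init */
++    unsigned long nr_free;
++};
++
++/* Reserve 2^order pages up front, when PV-IOMMU is initialized. */
++static int ctx_pool_init(struct ctx_page_pool *pool, unsigned int order)
++{
++    unsigned long i;
++
++    INIT_PAGE_LIST_HEAD(&pool->free_pages);
++
++    for ( i = 0; i < (1UL << order); i++ )
++    {
++        struct page_info *pg = alloc_domheap_page(NULL, 0);
++
++        if ( !pg )
++            return -ENOMEM;
++
++        page_list_add_tail(pg, &pool->free_pages);
++        pool->nr_free++;
++    }
++
++    return 0;
++}
++
++/* Page-table pages for non-default contexts never come from the heap. */
++static struct page_info *ctx_pool_alloc(struct ctx_page_pool *pool)
++{
++    struct page_info *pg = page_list_remove_head(&pool->free_pages);
++
++    if ( pg )
++        pool->nr_free--;
++
++    /* NULL surfaces as -ENOMEM to the guest; Xen's own memory is untouched. */
++    return pg;
++}
++```
++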
We can preallocate the pages the contexts will use and make ++map/unmap use these pages instead of allocating them dynamically. ++ +-- +2.46.0 + diff --git a/0401-docs-designs-Add-a-design-document-for-IOMMU-subsyst.patch b/0401-docs-designs-Add-a-design-document-for-IOMMU-subsyst.patch new file mode 100644 index 0000000..d4acbab --- /dev/null +++ b/0401-docs-designs-Add-a-design-document-for-IOMMU-subsyst.patch @@ -0,0 +1,437 @@ +From e68760dd296108259247af8ad218200af830c324 Mon Sep 17 00:00:00 2001 +From: Teddy Astie +Date: Mon, 4 Nov 2024 14:28:38 +0000 +Subject: [PATCH 401/404] docs/designs: Add a design document for IOMMU + subsystem redesign + +Current IOMMU subsystem has some limitations that make PV-IOMMU practically impossible. +One of them is the assumtion that each domain is bound to a single "IOMMU domain", which +also causes complications with quarantine implementation. + +Moreover, current IOMMU subsystem is not entirely well-defined, for instance, the behavior +of map_page between ARM SMMUv3 and x86 VT-d/AMD-Vi greatly differs. On ARM, it can modifies +the domain page table while on x86, it may be forbidden (e.g using HAP with PVH), or only +modifying the devices PoV (e.g using PV). + +The goal of this redesign is to define more explicitely the behavior and interface of the +IOMMU subsystem while allowing PV-IOMMU to be effectively implemented. + +Signed-off-by Teddy Astie +--- + docs/designs/iommu-contexts.md | 403 +++++++++++++++++++++++++++++++++ + 1 file changed, 403 insertions(+) + create mode 100644 docs/designs/iommu-contexts.md + +diff --git a/docs/designs/iommu-contexts.md b/docs/designs/iommu-contexts.md +new file mode 100644 +index 000000000000..9d6fb9554953 +--- /dev/null ++++ b/docs/designs/iommu-contexts.md +@@ -0,0 +1,403 @@ ++# IOMMU context management in Xen ++ ++Status: Experimental ++Revision: 0 ++ ++# Background ++ ++The design for *IOMMU paravirtualization for Dom0* [1] explains that some guests may ++want to access to IOMMU features. In order to implement this in Xen, several adjustments ++needs to be made to the IOMMU subsystem. ++ ++This "hardware IOMMU domain" is currently implemented on a per-domain basis such as each ++domain actually has a specific *hardware IOMMU domain*, this design aims to allow a ++single Xen domain to manage several "IOMMU context", and allow some domains (e.g Dom0 ++[1]) to modify their IOMMU contexts. ++ ++In addition to this, quarantine feature can be refactored into using IOMMU contexts ++to reduce the complexity of platform-specific implementations and ensuring more ++consistency across platforms. ++ ++# IOMMU context ++ ++We define a "IOMMU context" as being a *hardware IOMMU domain*, but named as a context ++to avoid confusion with Xen domains. ++It represents some hardware-specific data structure that contains mappings from a device ++frame-number to a machine frame-number (e.g using a pagetable) that can be applied to ++a device using IOMMU hardware. ++ ++This structure is bound to a Xen domain, but a Xen domain may have several IOMMU context. ++These contexts may be modifiable using the interface as defined in [1] aside some ++specific cases (e.g modifying default context). ++ ++This is implemented in Xen as a new structure that will hold context-specific ++data. 
++ ++```c ++struct iommu_context { ++ u16 id; /* Context id (0 means default context) */ ++ struct list_head devices; ++ ++ struct arch_iommu_context arch; ++ ++ bool opaque; /* context can't be modified nor accessed (e.g HAP) */ ++}; ++``` ++ ++A context is identified by a number that is domain-specific and may be used by IOMMU ++users such as PV-IOMMU by the guest. ++ ++struct arch_iommu_context is splited from struct arch_iommu ++ ++```c ++struct arch_iommu_context ++{ ++ spinlock_t pgtables_lock; ++ struct page_list_head pgtables; ++ ++ union { ++ /* Intel VT-d */ ++ struct { ++ uint64_t pgd_maddr; /* io page directory machine address */ ++ domid_t *didmap; /* per-iommu DID */ ++ unsigned long *iommu_bitmap; /* bitmap of iommu(s) that the context uses */ ++ } vtd; ++ /* AMD IOMMU */ ++ struct { ++ struct page_info *root_table; ++ } amd; ++ }; ++}; ++ ++struct arch_iommu ++{ ++ spinlock_t mapping_lock; /* io page table lock */ ++ struct { ++ struct page_list_head list; ++ spinlock_t lock; ++ } pgtables; ++ ++ struct list_head identity_maps; ++ ++ union { ++ /* Intel VT-d */ ++ struct { ++ /* no more context-specific values */ ++ unsigned int agaw; /* adjusted guest address width, 0 is level 2 30-bit */ ++ } vtd; ++ /* AMD IOMMU */ ++ struct { ++ unsigned int paging_mode; ++ struct guest_iommu *g_iommu; ++ } amd; ++ }; ++}; ++``` ++ ++IOMMU context information is now carried by iommu_context rather than being integrated to ++struct arch_iommu. ++ ++# Xen domain IOMMU structure ++ ++`struct domain_iommu` is modified to allow multiples context within a single Xen domain ++to exist : ++ ++```c ++struct iommu_context_list { ++ uint16_t count; /* Context count excluding default context */ ++ ++ /* if count > 0 */ ++ ++ uint64_t *bitmap; /* bitmap of context allocation */ ++ struct iommu_context *map; /* Map of contexts */ ++}; ++ ++struct domain_iommu { ++ /* ... */ ++ ++ struct iommu_context default_ctx; ++ struct iommu_context_list other_contexts; ++ ++ /* ... */ ++} ++``` ++ ++default_ctx is a special context with id=0 that holds the page table mapping the entire ++domain, which basically preserve the previous behavior. All devices are expected to be ++bound to this context during initialization. ++ ++Along with this default context that always exist, we use a pool of contexts that has a ++fixed size at domain initialization, where contexts can be allocated (if possible), and ++have a id matching their position in the map (considering that id != 0). ++These contexts may be used by IOMMU contexts users such as PV-IOMMU or quarantine domain ++(DomIO). ++ ++# Platform independent context management interface ++ ++A new platform independant interface is introduced in Xen hypervisor to allow ++IOMMU contexts users to create and manage contexts within domains. ++ ++```c ++/* Direct context access functions (not supposed to be used directly) */ ++struct iommu_context *iommu_get_context(struct domain *d, u16 ctx_no); ++void iommu_put_context(struct iommu_context *ctx); ++ ++/* Flag for default context initialization */ ++#define IOMMU_CONTEXT_INIT_default (1 << 0) ++ ++/* Flag for quarantine contexts (scratch page, DMA Abort mode, ...) 
*/ ++#define IOMMU_CONTEXT_INIT_quarantine (1 << 1) ++ ++int iommu_context_init(struct domain *d, struct iommu_context *ctx, u16 ctx_no, u32 flags); ++ ++/* Flag to specify that devices will need to be reattached to default domain */ ++#define IOMMU_TEARDOWN_REATTACH_DEFAULT (1 << 0) ++ ++/* ++ * Flag to specify that the context needs to be destroyed preemptively ++ * (multiple calls to iommu_context_teardown will be required) ++ */ ++#define IOMMU_TEARDOWN_PREEMPT (1 << 1) ++ ++int iommu_context_teardown(struct domain *d, struct iommu_context *ctx, u32 flags); ++ ++/* Allocate a new context, uses CONTEXT_INIT flags */ ++int iommu_context_alloc(struct domain *d, u16 *ctx_no, u32 flags); ++ ++/* Free a context, uses CONTEXT_TEARDOWN flags */ ++int iommu_context_free(struct domain *d, u16 ctx_no, u32 flags); ++ ++/* Move a device from one context to another, including between different domains. */ ++int iommu_reattach_context(struct domain *prev_dom, struct domain *next_dom, ++ device_t *dev, u16 ctx_no); ++ ++/* Add a device to a context for first initialization */ ++int iommu_attach_context(struct domain *d, device_t *dev, u16 ctx_no); ++ ++/* Remove a device from a context, effectively removing it from the IOMMU. */ ++int iommu_detach_context(struct domain *d, device_t *dev); ++``` ++ ++This interface will use a new interface with drivers to implement these features. ++ ++Some existing functions will have a new parameter to specify on what context to do the operation. ++- iommu_map (iommu_legacy_map untouched) ++- iommu_unmap (iommu_legacy_unmap untouched) ++- iommu_lookup_page ++- iommu_iotlb_flush ++ ++These functions will modify the iommu_context structure to accomodate with the ++operations applied, these functions will be used to replace some operations previously ++made in the IOMMU driver. ++ ++# IOMMU platform_ops interface changes ++ ++The IOMMU driver needs to expose a way to create and manage IOMMU contexts, the approach ++taken here is to modify the interface to allow specifying a IOMMU context on operations, ++and at the same time, simplifying the interface by relying more on iommu ++platform-independent code. ++ ++Added functions in iommu_ops ++ ++```c ++/* Initialize a context (creating page tables, allocating hardware, structures, ...) */ ++int (*context_init)(struct domain *d, struct iommu_context *ctx, ++ u32 flags); ++/* Destroy a context, assumes no device is bound to the context. */ ++int (*context_teardown)(struct domain *d, struct iommu_context *ctx, ++ u32 flags); ++/* Put a device in a context (assumes the device is not attached to another context) */ ++int (*attach)(struct domain *d, device_t *dev, ++ struct iommu_context *ctx); ++/* Remove a device from a context, and from the IOMMU. */ ++int (*detach)(struct domain *d, device_t *dev, ++ struct iommu_context *prev_ctx); ++/* Move the device from a context to another, including if the new context is in ++ another domain. d corresponds to the target domain. */ ++int (*reattach)(struct domain *d, device_t *dev, ++ struct iommu_context *prev_ctx, ++ struct iommu_context *ctx); ++ ++#ifdef CONFIG_HAS_PCI ++/* Specific interface for phantom function devices. */ ++int (*add_devfn)(struct domain *d, struct pci_dev *pdev, u16 devfn, ++ struct iommu_context *ctx); ++int (*remove_devfn)(struct domain *d, struct pci_dev *pdev, u16 devfn, ++ struct iommu_context *ctx); ++#endif ++ ++/* Changes in existing to use a specified iommu_context. 
*/ ++int __must_check (*map_page)(struct domain *d, dfn_t dfn, mfn_t mfn, ++ unsigned int flags, ++ unsigned int *flush_flags, ++ struct iommu_context *ctx); ++int __must_check (*unmap_page)(struct domain *d, dfn_t dfn, ++ unsigned int order, ++ unsigned int *flush_flags, ++ struct iommu_context *ctx); ++int __must_check (*lookup_page)(struct domain *d, dfn_t dfn, mfn_t *mfn, ++ unsigned int *flags, ++ struct iommu_context *ctx); ++ ++int __must_check (*iotlb_flush)(struct domain *d, ++ struct iommu_context *ctx, dfn_t dfn, ++ unsigned long page_count, ++ unsigned int flush_flags); ++ ++void (*clear_root_pgtable)(struct domain *d, struct iommu_context *ctx); ++``` ++ ++These functions are redundant with existing functions, therefore, the following functions ++are replaced with new equivalents : ++- quarantine_init : platform-independent code and IOMMU_CONTEXT_INIT_quarantine flag ++- add_device : attach and add_devfn (phantom) ++- assign_device : attach and add_devfn (phantom) ++- remove_device : detach and remove_devfn (phantom) ++- reassign_device : reattach ++ ++Some functionnal differences with previous functions, the following should be handled ++by platform-independent/arch-specific code instead of IOMMU driver : ++- identity mappings (unity mappings and rmrr) ++- device list in context and domain ++- domain of a device ++- quarantine ++ ++The idea behind this is to implement IOMMU context features while simplifying IOMMU ++drivers implementations and ensuring more consistency between IOMMU drivers. ++ ++## Phantom function handling ++ ++PCI devices may use additionnal devfn to do DMA operations, in order to support such ++devices, an interface is added to map specific device functions without implying that ++the device is mapped to a new context (that may cause duplicates in Xen data structures). ++ ++Functions add_devfn and remove_devfn allows to map a iommu context on specific devfn ++for a pci device, without altering platform-independent data structures. ++ ++It is important for the reattach operation to care about these devices, in order ++to prevent devices from being partially reattached to the new context (see XSA-449 [2]) ++by using a all-or-nothing approach for reattaching such devices. ++ ++# Quarantine refactoring using IOMMU contexts ++ ++The quarantine mecanism can be entirely reimplemented using IOMMU context, making ++it simpler, more consistent between platforms, ++ ++Quarantine is currently only supported with x86 platforms and works by creating a ++single *hardware IOMMU domain* per quarantined device. All the quarantine logic is ++the implemented in a platform-specific fashion while actually implementing the same ++concepts : ++ ++The *hardware IOMMU context* data structures for quarantine are currently stored in ++the device structure itself (using arch_pci_dev) and IOMMU driver needs to care about ++whether we are dealing with quarantine operations or regular operations (often dealt ++using macros such as QUARANTINE_SKIP or DEVICE_PGTABLE). ++ ++The page table that will apply on the quarantined device is created reserved device ++regions, and adding mappings to a scratch page if enabled (quarantine=scratch-page). ++ ++A new approach we can use is allowing the quarantine domain (DomIO) to manage IOMMU ++contexts, and implement all the quarantine logic using IOMMU contexts. ++ ++That way, the quarantine implementation can be platform-independent, thus have a more ++consistent implementation between platforms. 
It will also allows quarantine to work ++with other IOMMU implementations without having to implement platform-specific behavior. ++Moreover, quarantine operations can be implemented using regular context operations ++instead of relying on driver-specific code. ++ ++Quarantine implementation can be summarised as ++ ++```c ++int iommu_quarantine_dev_init(device_t *dev) ++{ ++ int ret; ++ u16 ctx_no; ++ ++ if ( !iommu_quarantine ) ++ return -EINVAL; ++ ++ ret = iommu_context_alloc(dom_io, &ctx_no, IOMMU_CONTEXT_INIT_quarantine); ++ ++ if ( ret ) ++ return ret; ++ ++ /** TODO: Setup scratch page, mappings... */ ++ ++ ret = iommu_reattach_context(dev->domain, dom_io, dev, ctx_no); ++ ++ if ( ret ) ++ { ++ ASSERT(!iommu_context_free(dom_io, ctx_no, 0)); ++ return ret; ++ } ++ ++ return ret; ++} ++``` ++ ++# Platform-specific considerations ++ ++## Reference counters on target pages ++ ++When mapping a guest page onto a IOMMU context, we need to make sure that ++this page is not reused for something else while being actually referenced ++by a IOMMU context. One way of doing it is incrementing the reference counter ++of each target page we map (excluding reserved regions), and decrementing it ++when the mapping isn't used anymore. ++ ++One consideration to have is when destroying the context while having existing ++mappings in it. We can walk through the entire page table and decrement the ++reference counter of all mappings. All of that assumes that there is no reserved ++region mapped (which should be the case as a requirement of teardown, or as a ++consequence of REATTACH_DEFAULT flag). ++ ++Another consideration is that the "cleanup mappings" operation may take a lot ++of time depending on the complexity of the page table. Making the teardown operation preemptable can allow the hypercall to be preempted if needed also preventing a malicious ++guest from stalling a CPU in a teardown operation with a specially crafted IOMMU ++context (e.g with several 1G superpages). ++ ++## Limit the amount of pages IOMMU contexts can use ++ ++In order to prevent a (eventually malicious) guest from causing too much allocations ++in Xen, we can enforce limits on the memory the IOMMU subsystem can use for IOMMU context. ++A possible implementation can be to preallocate a reasonably large chunk of memory ++and split it into pages for use by the IOMMU subsystem only for non-default IOMMU ++contexts (e.g PV-IOMMU interface), if this limitation is overcome, some operations ++may fail from the guest side. These limitations shouldn't impact "usual" operations ++of the IOMMU subsystem (e.g default context initialization). ++ ++## x86 Architecture ++ ++TODO ++ ++### Intel VT-d ++ ++VT-d uses DID to tag the *IOMMU domain* applied to a device and assumes that all entries ++with the same DID uses the same page table (i.e same IOMMU context). ++Under certain circonstances (e.g DRHD with DID limit below 16-bits), the *DID* is ++transparently converted into a DRHD-specific DID using a map managed internally. ++ ++The current implementation of the code reuses the Xen domain_id as DID. ++However, by using multiples IOMMU contexts per domain, we can't use the domain_id for ++contexts (otherwise, different page tables will be mapped with the same DID). 
++The following strategy is used : ++- on the default context, reuse the domain_id (the default context is unique per domain) ++- on non-default context, use a id allocated in the pseudo_domid map, (actually used by ++quarantine) which is a DID outside of Xen domain_id range ++ ++### AMD-Vi ++ ++TODO ++ ++## Device-tree platforms ++ ++### SMMU and SMMUv3 ++ ++TODO ++ ++* * * ++ ++[1] See pv-iommu.md ++ ++[2] pci: phantom functions assigned to incorrect contexts ++https://xenbits.xen.org/xsa/advisory-449.html +\ No newline at end of file +-- +2.46.0 + diff --git a/0402-IOMMU-Introduce-redesigned-IOMMU-subsystem.patch b/0402-IOMMU-Introduce-redesigned-IOMMU-subsystem.patch new file mode 100644 index 0000000..1f55a5a --- /dev/null +++ b/0402-IOMMU-Introduce-redesigned-IOMMU-subsystem.patch @@ -0,0 +1,1904 @@ +From 5a6c3e24142e3b2eda546e1e1e277b9a5efa374e Mon Sep 17 00:00:00 2001 +From: Teddy Astie +Date: Mon, 4 Nov 2024 14:28:40 +0000 +Subject: [PATCH 402/404] IOMMU: Introduce redesigned IOMMU subsystem + +Based on docs/designs/iommu-contexts.md, implement the redesigned IOMMU subsystem. + +Signed-off-by Teddy Astie +--- + xen/arch/x86/domain.c | 2 +- + xen/arch/x86/mm/p2m-ept.c | 2 +- + xen/arch/x86/pv/dom0_build.c | 4 +- + xen/arch/x86/tboot.c | 4 +- + xen/common/memory.c | 4 +- + xen/drivers/passthrough/Makefile | 3 + + xen/drivers/passthrough/context.c | 711 +++++++++++++++++++++++++++ + xen/drivers/passthrough/iommu.c | 396 ++++++--------- + xen/drivers/passthrough/pci.c | 117 +---- + xen/drivers/passthrough/quarantine.c | 49 ++ + xen/include/xen/iommu.h | 117 ++++- + xen/include/xen/pci.h | 3 + + 12 files changed, 1032 insertions(+), 380 deletions(-) + create mode 100644 xen/drivers/passthrough/context.c + create mode 100644 xen/drivers/passthrough/quarantine.c + +diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c +index 78a13e6812c9..9b1946cbc0a1 100644 +--- a/xen/arch/x86/domain.c ++++ b/xen/arch/x86/domain.c +@@ -2392,7 +2392,7 @@ int domain_relinquish_resources(struct domain *d) + + PROGRESS(iommu_pagetables): + +- ret = iommu_free_pgtables(d); ++ ret = iommu_free_pgtables(d, iommu_default_context(d)); + if ( ret ) + return ret; + +diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c +index 21728397f9ac..5ddeefb82675 100644 +--- a/xen/arch/x86/mm/p2m-ept.c ++++ b/xen/arch/x86/mm/p2m-ept.c +@@ -974,7 +974,7 @@ out: + rc = iommu_iotlb_flush(d, _dfn(gfn), 1ul << order, + (iommu_flags ? IOMMU_FLUSHF_added : 0) | + (vtd_pte_present ? IOMMU_FLUSHF_modified +- : 0)); ++ : 0), 0); + else if ( need_iommu_pt_sync(d) ) + rc = iommu_flags ? + iommu_legacy_map(d, _dfn(gfn), mfn, 1ul << order, iommu_flags) : +diff --git a/xen/arch/x86/pv/dom0_build.c b/xen/arch/x86/pv/dom0_build.c +index f54d1da5c6f4..345e6bec083f 100644 +--- a/xen/arch/x86/pv/dom0_build.c ++++ b/xen/arch/x86/pv/dom0_build.c +@@ -77,7 +77,7 @@ static __init void mark_pv_pt_pages_rdonly(struct domain *d, + * iommu_memory_setup() ended up mapping them. + */ + if ( need_iommu_pt_sync(d) && +- iommu_unmap(d, _dfn(mfn_x(page_to_mfn(page))), 1, 0, flush_flags) ) ++ iommu_unmap(d, _dfn(mfn_x(page_to_mfn(page))), 1, 0, flush_flags, 0) ) + BUG(); + + /* Read-only mapping + PGC_allocated + page-table page. 
*/ +@@ -128,7 +128,7 @@ static void __init iommu_memory_setup(struct domain *d, const char *what, + + while ( (rc = iommu_map(d, _dfn(mfn_x(mfn)), mfn, nr, + IOMMUF_readable | IOMMUF_writable | IOMMUF_preempt, +- flush_flags)) > 0 ) ++ flush_flags, 0)) > 0 ) + { + mfn = mfn_add(mfn, rc); + nr -= rc; +diff --git a/xen/arch/x86/tboot.c b/xen/arch/x86/tboot.c +index d5db60d335e3..25a5a6641261 100644 +--- a/xen/arch/x86/tboot.c ++++ b/xen/arch/x86/tboot.c +@@ -218,9 +218,9 @@ static void tboot_gen_domain_integrity(const uint8_t key[TB_KEY_SIZE], + + if ( is_iommu_enabled(d) && is_vtd ) + { +- const struct domain_iommu *dio = dom_iommu(d); ++ struct domain_iommu *dio = dom_iommu(d); + +- update_iommu_mac(&ctx, dio->arch.vtd.pgd_maddr, ++ update_iommu_mac(&ctx, iommu_default_context(d)->arch.vtd.pgd_maddr, + agaw_to_level(dio->arch.vtd.agaw)); + } + } +diff --git a/xen/common/memory.c b/xen/common/memory.c +index a6f2f6d1b348..acf305bcd0fd 100644 +--- a/xen/common/memory.c ++++ b/xen/common/memory.c +@@ -926,7 +926,7 @@ int xenmem_add_to_physmap(struct domain *d, struct xen_add_to_physmap *xatp, + this_cpu(iommu_dont_flush_iotlb) = 0; + + ret = iommu_iotlb_flush(d, _dfn(xatp->idx - done), done, +- IOMMU_FLUSHF_modified); ++ IOMMU_FLUSHF_modified, 0); + if ( unlikely(ret) && rc >= 0 ) + rc = ret; + +@@ -940,7 +940,7 @@ int xenmem_add_to_physmap(struct domain *d, struct xen_add_to_physmap *xatp, + put_page(pages[i]); + + ret = iommu_iotlb_flush(d, _dfn(xatp->gpfn - done), done, +- IOMMU_FLUSHF_added | IOMMU_FLUSHF_modified); ++ IOMMU_FLUSHF_added | IOMMU_FLUSHF_modified, 0); + if ( unlikely(ret) && rc >= 0 ) + rc = ret; + } +diff --git a/xen/drivers/passthrough/Makefile b/xen/drivers/passthrough/Makefile +index a1621540b78d..69327080abe6 100644 +--- a/xen/drivers/passthrough/Makefile ++++ b/xen/drivers/passthrough/Makefile +@@ -4,6 +4,9 @@ obj-$(CONFIG_X86) += x86/ + obj-$(CONFIG_ARM) += arm/ + + obj-y += iommu.o ++obj-y += context.o ++obj-y += quarantine.o ++ + obj-$(CONFIG_HAS_PCI) += pci.o + obj-$(CONFIG_HAS_DEVICE_TREE) += device_tree.o + obj-$(CONFIG_HAS_PCI) += ats.o +diff --git a/xen/drivers/passthrough/context.c b/xen/drivers/passthrough/context.c +new file mode 100644 +index 000000000000..edf660b617a9 +--- /dev/null ++++ b/xen/drivers/passthrough/context.c +@@ -0,0 +1,711 @@ ++/* ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms and conditions of the GNU General Public License, ++ * version 2, as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ * more details. ++ * ++ * You should have received a copy of the GNU General Public License along with ++ * this program; If not, see . ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++bool iommu_check_context(struct domain *d, u16 ctx_no) { ++ struct domain_iommu *hd = dom_iommu(d); ++ ++ if (ctx_no == 0) ++ return 1; /* Default context always exist. 
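++ (non-default contexts are tracked in the other_contexts allocation bitmap)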
*/ ++ ++ if ((ctx_no - 1) >= hd->other_contexts.count) ++ return 0; /* out of bounds */ ++ ++ return test_bit(ctx_no - 1, hd->other_contexts.bitmap); ++} ++ ++struct iommu_context *iommu_get_context(struct domain *d, u16 ctx_no) { ++ struct domain_iommu *hd = dom_iommu(d); ++ struct iommu_context *ctx; ++ ++ if ( !iommu_check_context(d, ctx_no) ) ++ return NULL; ++ ++ if (ctx_no == 0) ++ ctx = &hd->default_ctx; ++ else ++ ctx = &hd->other_contexts.map[ctx_no - 1]; ++ ++ rspin_lock(&ctx->lock); ++ /* Check if the context is still valid at this point */ ++ if ( unlikely(!iommu_check_context(d, ctx_no)) ) ++ { ++ /* Context has been destroyed in between */ ++ rspin_unlock(&ctx->lock); ++ return NULL; ++ } ++ ++ return ctx; ++} ++ ++void iommu_put_context(struct iommu_context *ctx) ++{ ++ rspin_unlock(&ctx->lock); ++} ++ ++static unsigned int mapping_order(const struct domain_iommu *hd, ++ dfn_t dfn, mfn_t mfn, unsigned long nr) ++{ ++ unsigned long res = dfn_x(dfn) | mfn_x(mfn); ++ unsigned long sizes = hd->platform_ops->page_sizes; ++ unsigned int bit = ffsl(sizes) - 1, order = 0; ++ ++ ASSERT(bit == PAGE_SHIFT); ++ ++ while ( (sizes = (sizes >> bit) & ~1) ) ++ { ++ unsigned long mask; ++ ++ bit = ffsl(sizes) - 1; ++ mask = (1UL << bit) - 1; ++ if ( nr <= mask || (res & mask) ) ++ break; ++ order += bit; ++ nr >>= bit; ++ res >>= bit; ++ } ++ ++ return order; ++} ++ ++static long _iommu_map(struct domain *d, dfn_t dfn0, mfn_t mfn0, ++ unsigned long page_count, unsigned int flags, ++ unsigned int *flush_flags, struct iommu_context *ctx) ++{ ++ struct domain_iommu *hd = dom_iommu(d); ++ unsigned long i; ++ unsigned int order, j = 0; ++ int rc = 0; ++ ++ if ( !is_iommu_enabled(d) ) ++ return 0; ++ ++ ASSERT(!IOMMUF_order(flags)); ++ ++ for ( i = 0; i < page_count; i += 1UL << order ) ++ { ++ dfn_t dfn = dfn_add(dfn0, i); ++ mfn_t mfn = mfn_add(mfn0, i); ++ ++ order = mapping_order(hd, dfn, mfn, page_count - i); ++ ++ if ( (flags & IOMMUF_preempt) && ++ ((!(++j & 0xfff) && general_preempt_check()) || ++ i > LONG_MAX - (1UL << order)) ) ++ return i; ++ ++ rc = iommu_call(hd->platform_ops, map_page, d, dfn, mfn, ++ flags | IOMMUF_order(order), flush_flags, ctx); ++ ++ if ( likely(!rc) ) ++ continue; ++ ++ if ( !d->is_shutting_down && printk_ratelimit() ) ++ printk(XENLOG_ERR ++ "d%d: IOMMU mapping dfn %"PRI_dfn" to mfn %"PRI_mfn" failed: %d\n", ++ d->domain_id, dfn_x(dfn), mfn_x(mfn), rc); ++ ++ /* while statement to satisfy __must_check */ ++ while ( iommu_unmap(d, dfn0, i, 0, flush_flags, ctx->id) ) ++ break; ++ ++ if ( !ctx->id && !is_hardware_domain(d) ) ++ domain_crash(d); ++ ++ break; ++ } ++ ++ /* ++ * Something went wrong so, if we were dealing with more than a single ++ * page, flush everything and clear flush flags. 
++ */ ++ if ( page_count > 1 && unlikely(rc) && ++ !iommu_iotlb_flush_all(d, *flush_flags) ) ++ *flush_flags = 0; ++ ++ return rc; ++} ++ ++long iommu_map(struct domain *d, dfn_t dfn0, mfn_t mfn0, ++ unsigned long page_count, unsigned int flags, ++ unsigned int *flush_flags, u16 ctx_no) ++{ ++ struct iommu_context *ctx; ++ long ret; ++ ++ if ( !(ctx = iommu_get_context(d, ctx_no)) ) ++ return -ENOENT; ++ ++ ret = _iommu_map(d, dfn0, mfn0, page_count, flags, flush_flags, ctx); ++ ++ iommu_put_context(ctx); ++ ++ return ret; ++} ++ ++int iommu_legacy_map(struct domain *d, dfn_t dfn, mfn_t mfn, ++ unsigned long page_count, unsigned int flags) ++{ ++ struct iommu_context *ctx; ++ unsigned int flush_flags = 0; ++ int rc = 0; ++ ++ ASSERT(!(flags & IOMMUF_preempt)); ++ ++ if ( dom_iommu(d)->no_dma ) ++ return 0; ++ ++ ctx = iommu_get_context(d, 0); ++ ++ if ( !ctx->opaque ) ++ { ++ rc = iommu_map(d, dfn, mfn, page_count, flags, &flush_flags, 0); ++ ++ if ( !this_cpu(iommu_dont_flush_iotlb) && !rc ) ++ rc = iommu_iotlb_flush(d, dfn, page_count, flush_flags, 0); ++ } ++ ++ iommu_put_context(ctx); ++ ++ return rc; ++} ++ ++static long _iommu_unmap(struct domain *d, dfn_t dfn0, unsigned long page_count, ++ unsigned int flags, unsigned int *flush_flags, ++ struct iommu_context *ctx) ++{ ++ struct domain_iommu *hd = dom_iommu(d); ++ unsigned long i; ++ unsigned int order, j = 0; ++ int rc = 0; ++ ++ if ( !is_iommu_enabled(d) ) ++ return 0; ++ ++ ASSERT(!(flags & ~IOMMUF_preempt)); ++ ++ for ( i = 0; i < page_count; i += 1UL << order ) ++ { ++ dfn_t dfn = dfn_add(dfn0, i); ++ int err; ++ ++ order = mapping_order(hd, dfn, _mfn(0), page_count - i); ++ ++ if ( (flags & IOMMUF_preempt) && ++ ((!(++j & 0xfff) && general_preempt_check()) || ++ i > LONG_MAX - (1UL << order)) ) ++ return i; ++ ++ err = iommu_call(hd->platform_ops, unmap_page, d, dfn, ++ flags | IOMMUF_order(order), flush_flags, ++ ctx); ++ ++ if ( likely(!err) ) ++ continue; ++ ++ if ( !d->is_shutting_down && printk_ratelimit() ) ++ printk(XENLOG_ERR ++ "d%d: IOMMU unmapping dfn %"PRI_dfn" failed: %d\n", ++ d->domain_id, dfn_x(dfn), err); ++ ++ if ( !rc ) ++ rc = err; ++ ++ if ( !ctx->id && !is_hardware_domain(d) ) ++ { ++ domain_crash(d); ++ break; ++ } ++ } ++ ++ /* ++ * Something went wrong so, if we were dealing with more than a single ++ * page, flush everything and clear flush flags. 
++ */ ++ if ( page_count > 1 && unlikely(rc) && ++ !iommu_iotlb_flush_all(d, *flush_flags) ) ++ *flush_flags = 0; ++ ++ return rc; ++} ++ ++long iommu_unmap(struct domain *d, dfn_t dfn0, unsigned long page_count, ++ unsigned int flags, unsigned int *flush_flags, ++ u16 ctx_no) ++{ ++ struct iommu_context *ctx; ++ long ret; ++ ++ if ( !(ctx = iommu_get_context(d, ctx_no)) ) ++ return -ENOENT; ++ ++ ret = _iommu_unmap(d, dfn0, page_count, flags, flush_flags, ctx); ++ ++ iommu_put_context(ctx); ++ ++ return ret; ++} ++ ++int iommu_legacy_unmap(struct domain *d, dfn_t dfn, unsigned long page_count) ++{ ++ unsigned int flush_flags = 0; ++ struct iommu_context *ctx; ++ int rc; ++ ++ if ( dom_iommu(d)->no_dma ) ++ return 0; ++ ++ ctx = iommu_get_context(d, 0); ++ ++ if ( ctx->opaque ) ++ return 0; ++ ++ rc = iommu_unmap(d, dfn, page_count, 0, &flush_flags, 0); ++ ++ if ( !this_cpu(iommu_dont_flush_iotlb) && !rc ) ++ rc = iommu_iotlb_flush(d, dfn, page_count, flush_flags, 0); ++ ++ iommu_put_context(ctx); ++ ++ return rc; ++} ++ ++int iommu_lookup_page(struct domain *d, dfn_t dfn, mfn_t *mfn, ++ unsigned int *flags, u16 ctx_no) ++{ ++ struct domain_iommu *hd = dom_iommu(d); ++ struct iommu_context *ctx; ++ int ret = 0; ++ ++ if ( !is_iommu_enabled(d) || !hd->platform_ops->lookup_page ) ++ return -EOPNOTSUPP; ++ ++ if ( !(ctx = iommu_get_context(d, ctx_no)) ) ++ return -ENOENT; ++ ++ ret = iommu_call(hd->platform_ops, lookup_page, d, dfn, mfn, flags, ctx); ++ ++ iommu_put_context(ctx); ++ return ret; ++} ++ ++int iommu_iotlb_flush(struct domain *d, dfn_t dfn, unsigned long page_count, ++ unsigned int flush_flags, u16 ctx_no) ++{ ++ struct domain_iommu *hd = dom_iommu(d); ++ struct iommu_context *ctx; ++ int rc; ++ ++ if ( !is_iommu_enabled(d) || !hd->platform_ops->iotlb_flush || ++ !page_count || !flush_flags ) ++ return 0; ++ ++ if ( dfn_eq(dfn, INVALID_DFN) ) ++ return -EINVAL; ++ ++ if ( !(ctx = iommu_get_context(d, ctx_no)) ) ++ return -ENOENT; ++ ++ rc = iommu_call(hd->platform_ops, iotlb_flush, d, ctx, dfn, page_count, ++ flush_flags); ++ if ( unlikely(rc) ) ++ { ++ if ( !d->is_shutting_down && printk_ratelimit() ) ++ printk(XENLOG_ERR ++ "d%d: IOMMU IOTLB flush failed: %d, dfn %"PRI_dfn", page count %lu flags %x\n", ++ d->domain_id, rc, dfn_x(dfn), page_count, flush_flags); ++ ++ if ( !ctx->id && !is_hardware_domain(d) ) ++ domain_crash(d); ++ } ++ ++ iommu_put_context(ctx); ++ ++ return rc; ++} ++ ++int iommu_context_init(struct domain *d, struct iommu_context *ctx, u16 ctx_no, ++ u32 flags) ++{ ++ if ( !dom_iommu(d)->platform_ops->context_init ) ++ return -ENOSYS; ++ ++ INIT_LIST_HEAD(&ctx->devices); ++ ctx->id = ctx_no; ++ ctx->dying = false; ++ ctx->opaque = false; /* assume opaque by default */ ++ ++ return iommu_call(dom_iommu(d)->platform_ops, context_init, d, ctx, flags); ++} ++ ++int iommu_context_alloc(struct domain *d, u16 *ctx_no, u32 flags) ++{ ++ unsigned int i; ++ int ret; ++ struct domain_iommu *hd = dom_iommu(d); ++ struct iommu_context *ctx; ++ ++ do { ++ i = find_first_zero_bit(hd->other_contexts.bitmap, hd->other_contexts.count); ++ ++ if ( i >= hd->other_contexts.count ) ++ return -ENOSPC; ++ ++ ctx = &hd->other_contexts.map[i]; ++ ++ /* Try to lock the mutex, can fail on concurrent accesses */ ++ if ( !rspin_trylock(&ctx->lock) ) ++ continue; ++ ++ /* We can now set it as used, we keep the lock for initialization. 
*/ ++ set_bit(i, hd->other_contexts.bitmap); ++ } while (0); ++ ++ *ctx_no = i + 1; ++ ++ ret = iommu_context_init(d, ctx, *ctx_no, flags); ++ ++ if ( ret ) ++ clear_bit(*ctx_no, hd->other_contexts.bitmap); ++ ++ iommu_put_context(ctx); ++ return ret; ++} ++ ++/** ++ * Attach dev phantom functions to ctx, override any existing ++ * mapped context. ++ */ ++static int iommu_reattach_phantom(struct domain *d, device_t *dev, ++ struct iommu_context *ctx) ++{ ++ int ret = 0; ++ uint8_t devfn = dev->devfn; ++ struct domain_iommu *hd = dom_iommu(d); ++ ++ while ( dev->phantom_stride ) ++ { ++ devfn += dev->phantom_stride; ++ ++ if ( PCI_SLOT(devfn) != PCI_SLOT(dev->devfn) ) ++ break; ++ ++ ret = iommu_call(hd->platform_ops, add_devfn, d, dev, devfn, ctx); ++ ++ if ( ret ) ++ break; ++ } ++ ++ return ret; ++} ++ ++/** ++ * Detach all device phantom functions. ++ */ ++static int iommu_detach_phantom(struct domain *d, device_t *dev) ++{ ++ int ret = 0; ++ uint8_t devfn = dev->devfn; ++ struct domain_iommu *hd = dom_iommu(d); ++ ++ while ( dev->phantom_stride ) ++ { ++ devfn += dev->phantom_stride; ++ ++ if ( PCI_SLOT(devfn) != PCI_SLOT(dev->devfn) ) ++ break; ++ ++ ret = iommu_call(hd->platform_ops, remove_devfn, d, dev, devfn); ++ ++ if ( ret ) ++ break; ++ } ++ ++ return ret; ++} ++ ++int iommu_attach_context(struct domain *d, device_t *dev, u16 ctx_no) ++{ ++ struct iommu_context *ctx = NULL; ++ int ret, rc; ++ ++ if ( !(ctx = iommu_get_context(d, ctx_no)) ) ++ { ++ ret = -ENOENT; ++ goto unlock; ++ } ++ ++ pcidevs_lock(); ++ ++ if ( ctx->dying ) ++ { ++ ret = -EINVAL; ++ goto unlock; ++ } ++ ++ ret = iommu_call(dom_iommu(d)->platform_ops, attach, d, dev, ctx); ++ ++ if ( ret ) ++ goto unlock; ++ ++ /* See iommu_reattach_context() */ ++ rc = iommu_reattach_phantom(d, dev, ctx); ++ ++ if ( rc ) ++ { ++ printk(XENLOG_ERR "IOMMU: Unable to attach %pp phantom functions\n", ++ &dev->sbdf); ++ ++ if( iommu_call(dom_iommu(d)->platform_ops, detach, d, dev, ctx) ++ || iommu_detach_phantom(d, dev) ) ++ { ++ printk(XENLOG_ERR "IOMMU: Improperly detached %pp\n", &dev->sbdf); ++ WARN(); ++ } ++ ++ ret = -EIO; ++ goto unlock; ++ } ++ ++ dev->context = ctx_no; ++ list_add(&dev->context_list, &ctx->devices); ++ ++unlock: ++ pcidevs_unlock(); ++ ++ if ( ctx ) ++ iommu_put_context(ctx); ++ ++ return ret; ++} ++ ++int iommu_detach_context(struct domain *d, device_t *dev) ++{ ++ struct iommu_context *ctx; ++ int ret, rc; ++ ++ if ( !dev->domain ) ++ { ++ printk(XENLOG_WARNING "IOMMU: Trying to detach a non-attached device\n"); ++ WARN(); ++ return 0; ++ } ++ ++ /* Make sure device is actually in the domain. */ ++ ASSERT(d == dev->domain); ++ ++ pcidevs_lock(); ++ ++ ctx = iommu_get_context(d, dev->context); ++ ASSERT(ctx); /* device is using an invalid context ? ++ dev->context invalid ? 
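++ (an attached device must always reference a live context via dev->context)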
*/ ++ ++ ret = iommu_call(dom_iommu(d)->platform_ops, detach, d, dev, ctx); ++ ++ if ( ret ) ++ goto unlock; ++ ++ rc = iommu_detach_phantom(d, dev); ++ ++ if ( rc ) ++ printk(XENLOG_WARNING "IOMMU: " ++ "Improperly detached device functions (%d)\n", rc); ++ ++ list_del(&dev->context_list); ++ ++unlock: ++ pcidevs_unlock(); ++ iommu_put_context(ctx); ++ return ret; ++} ++ ++int iommu_reattach_context(struct domain *prev_dom, struct domain *next_dom, ++ device_t *dev, u16 ctx_no) ++{ ++ u16 prev_ctx_no; ++ device_t *ctx_dev; ++ struct domain_iommu *prev_hd, *next_hd; ++ struct iommu_context *prev_ctx = NULL, *next_ctx = NULL; ++ int ret, rc; ++ bool same_domain; ++ ++ /* Make sure we actually are doing something meaningful */ ++ BUG_ON(!prev_dom && !next_dom); ++ ++ /// TODO: Do such cases exists ? ++ // /* Platform ops must match */ ++ // if (dom_iommu(prev_dom)->platform_ops != dom_iommu(next_dom)->platform_ops) ++ // return -EINVAL; ++ ++ if ( !prev_dom ) ++ return iommu_attach_context(next_dom, dev, ctx_no); ++ ++ if ( !next_dom ) ++ return iommu_detach_context(prev_dom, dev); ++ ++ prev_hd = dom_iommu(prev_dom); ++ next_hd = dom_iommu(next_dom); ++ ++ pcidevs_lock(); ++ ++ same_domain = prev_dom == next_dom; ++ ++ prev_ctx_no = dev->context; ++ ++ if ( !same_domain && (ctx_no == prev_ctx_no) ) ++ { ++ printk(XENLOG_DEBUG ++ "IOMMU: Reattaching %pp to same IOMMU context c%hu\n", ++ &dev, ctx_no); ++ ret = 0; ++ goto unlock; ++ } ++ ++ if ( !(prev_ctx = iommu_get_context(prev_dom, prev_ctx_no)) ) ++ { ++ ret = -ENOENT; ++ goto unlock; ++ } ++ ++ if ( !(next_ctx = iommu_get_context(next_dom, ctx_no)) ) ++ { ++ ret = -ENOENT; ++ goto unlock; ++ } ++ ++ if ( next_ctx->dying ) ++ { ++ ret = -EINVAL; ++ goto unlock; ++ } ++ ++ ret = iommu_call(prev_hd->platform_ops, reattach, next_dom, dev, prev_ctx, ++ next_ctx); ++ ++ if ( ret ) ++ goto unlock; ++ ++ /* ++ * We need to do special handling for phantom devices as they ++ * also use some other PCI functions behind the scenes. ++ */ ++ rc = iommu_reattach_phantom(next_dom, dev, next_ctx); ++ ++ if ( rc ) ++ { ++ /** ++ * Device is being partially reattached (we have primary function and ++ * maybe some phantom functions attached to next_ctx, some others to prev_ctx), ++ * some functions of the device will be attached to next_ctx. ++ */ ++ printk(XENLOG_WARNING "IOMMU: " ++ "Device %pp improperly reattached due to phantom function" ++ " reattach failure between %dd%dc and %dd%dc (%d)\n", dev, ++ prev_dom->domain_id, prev_ctx->id, next_dom->domain_id, ++ next_dom->domain_id, rc); ++ ++ /* Try reattaching to previous context, reverting into a consistent state. */ ++ if ( iommu_call(prev_hd->platform_ops, reattach, prev_dom, dev, next_ctx, ++ prev_ctx) || iommu_reattach_phantom(prev_dom, dev, prev_ctx) ) ++ { ++ printk(XENLOG_ERR "Unable to reattach %pp back to %dd%dc\n", ++ &dev->sbdf, prev_dom->domain_id, prev_ctx->id); ++ ++ if ( !is_hardware_domain(prev_dom) ) ++ domain_crash(prev_dom); ++ ++ if ( prev_dom != next_dom && !is_hardware_domain(next_dom) ) ++ domain_crash(next_dom); ++ ++ rc = -EIO; ++ } ++ ++ ret = rc; ++ goto unlock; ++ } ++ ++ /* Remove device from previous context, and add it to new one. 
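++ (pcidevs_lock and both context locks are held at this point)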
*/ ++ list_for_each_entry(ctx_dev, &prev_ctx->devices, context_list) ++ { ++ if ( ctx_dev == dev ) ++ { ++ list_del(&ctx_dev->context_list); ++ list_add(&ctx_dev->context_list, &next_ctx->devices); ++ break; ++ } ++ } ++ ++ if (!ret) ++ dev->context = ctx_no; /* update device context*/ ++ ++unlock: ++ pcidevs_unlock(); ++ ++ if ( prev_ctx ) ++ iommu_put_context(prev_ctx); ++ ++ if ( next_ctx ) ++ iommu_put_context(next_ctx); ++ ++ return ret; ++} ++ ++int iommu_context_teardown(struct domain *d, struct iommu_context *ctx, u32 flags) ++{ ++ struct domain_iommu *hd = dom_iommu(d); ++ ++ if ( !hd->platform_ops->context_teardown ) ++ return -ENOSYS; ++ ++ ctx->dying = true; ++ ++ /* first reattach devices back to default context if needed */ ++ if ( flags & IOMMU_TEARDOWN_REATTACH_DEFAULT ) ++ { ++ struct pci_dev *device; ++ list_for_each_entry(device, &ctx->devices, context_list) ++ iommu_reattach_context(d, d, device, 0); ++ } ++ else if (!list_empty(&ctx->devices)) ++ return -EBUSY; /* there is a device in context */ ++ ++ return iommu_call(hd->platform_ops, context_teardown, d, ctx, flags); ++} ++ ++int iommu_context_free(struct domain *d, u16 ctx_no, u32 flags) ++{ ++ int ret; ++ struct domain_iommu *hd = dom_iommu(d); ++ struct iommu_context *ctx; ++ ++ if ( ctx_no == 0 ) ++ return -EINVAL; ++ ++ if ( !(ctx = iommu_get_context(d, ctx_no)) ) ++ return -ENOENT; ++ ++ ret = iommu_context_teardown(d, ctx, flags); ++ ++ if ( !ret ) ++ clear_bit(ctx_no - 1, hd->other_contexts.bitmap); ++ ++ iommu_put_context(ctx); ++ return ret; ++} +diff --git a/xen/drivers/passthrough/iommu.c b/xen/drivers/passthrough/iommu.c +index 9e74a1fc72fa..e109ebe40470 100644 +--- a/xen/drivers/passthrough/iommu.c ++++ b/xen/drivers/passthrough/iommu.c +@@ -12,15 +12,18 @@ + * this program; If not, see . 
+ */ + ++#include ++#include ++#include ++#include + #include ++#include + #include +-#include +-#include +-#include + #include +-#include + #include +-#include ++#include ++#include ++#include + + #ifdef CONFIG_X86 + #include +@@ -35,26 +38,11 @@ bool __read_mostly force_iommu; + bool __read_mostly iommu_verbose; + static bool __read_mostly iommu_crash_disable; + +-#define IOMMU_quarantine_none 0 /* aka false */ +-#define IOMMU_quarantine_basic 1 /* aka true */ +-#define IOMMU_quarantine_scratch_page 2 +-#ifdef CONFIG_HAS_PCI +-uint8_t __read_mostly iommu_quarantine = +-# if defined(CONFIG_IOMMU_QUARANTINE_NONE) +- IOMMU_quarantine_none; +-# elif defined(CONFIG_IOMMU_QUARANTINE_BASIC) +- IOMMU_quarantine_basic; +-# elif defined(CONFIG_IOMMU_QUARANTINE_SCRATCH_PAGE) +- IOMMU_quarantine_scratch_page; +-# endif +-#else +-# define iommu_quarantine IOMMU_quarantine_none +-#endif /* CONFIG_HAS_PCI */ +- + static bool __hwdom_initdata iommu_hwdom_none; + bool __hwdom_initdata iommu_hwdom_strict; + bool __read_mostly iommu_hwdom_passthrough; + bool __hwdom_initdata iommu_hwdom_inclusive; ++bool __read_mostly iommu_hwdom_no_dma = false; + int8_t __hwdom_initdata iommu_hwdom_reserved = -1; + + #ifndef iommu_hap_pt_share +@@ -172,6 +160,8 @@ static int __init cf_check parse_dom0_iommu_param(const char *s) + iommu_hwdom_reserved = val; + else if ( !cmdline_strcmp(s, "none") ) + iommu_hwdom_none = true; ++ else if ( (val = parse_boolean("dma", s, ss)) >= 0 ) ++ iommu_hwdom_no_dma = !val; + else + rc = -EINVAL; + +@@ -193,6 +183,98 @@ static void __hwdom_init check_hwdom_reqs(struct domain *d) + arch_iommu_check_autotranslated_hwdom(d); + } + ++int iommu_domain_pviommu_init(struct domain *d, uint16_t nb_ctx, uint32_t arena_order) ++{ ++ struct domain_iommu *hd = dom_iommu(d); ++ int rc; ++ ++ BUG_ON(nb_ctx == 0); /* sanity check (prevent underflow) */ ++ ++ /* ++ * hd->other_contexts.count is always reported as 0 during initialization ++ * preventing misuse of partially initialized IOMMU contexts. ++ */ ++ ++ if ( atomic_cmpxchg(&hd->other_contexts.initialized, 0, 1) == 1 ) ++ return -EACCES; ++ ++ if ( (nb_ctx - 1) > 0 ) { ++ /* Initialize context bitmap */ ++ size_t i; ++ ++ hd->other_contexts.bitmap = xzalloc_array(unsigned long, ++ BITS_TO_LONGS(nb_ctx - 1)); ++ ++ if (!hd->other_contexts.bitmap) ++ { ++ rc = -ENOMEM; ++ goto cleanup; ++ } ++ ++ hd->other_contexts.map = xzalloc_array(struct iommu_context, nb_ctx - 1); ++ ++ if (!hd->other_contexts.map) ++ { ++ rc = -ENOMEM; ++ goto cleanup; ++ } ++ ++ for (i = 0; i < (nb_ctx - 1); i++) ++ rspin_lock_init(&hd->other_contexts.map[i].lock); ++ } ++ ++ rc = arch_iommu_pviommu_init(d, nb_ctx, arena_order); ++ ++ if ( rc ) ++ goto cleanup; ++ ++ /* Make sure initialization is complete before making it visible to other CPUs. */ ++ smp_wmb(); ++ ++ hd->other_contexts.count = nb_ctx - 1; ++ ++ printk(XENLOG_INFO "Dom%d uses %lu IOMMU contexts (%llu pages arena)\n", ++ d->domain_id, (unsigned long)nb_ctx, 1llu << arena_order); ++ ++ return 0; ++ ++cleanup: ++ /* TODO: Reset hd->other_contexts.initialized */ ++ if ( hd->other_contexts.bitmap ) ++ { ++ xfree(hd->other_contexts.bitmap); ++ hd->other_contexts.bitmap = NULL; ++ } ++ ++ if ( hd->other_contexts.map ) ++ { ++ xfree(hd->other_contexts.map); ++ hd->other_contexts.bitmap = NULL; ++ } ++ ++ return rc; ++} ++ ++int iommu_domain_pviommu_teardown(struct domain *d) ++{ ++ struct domain_iommu *hd = dom_iommu(d); ++ int i; ++ /* FIXME: Potential race condition with remote_op ? 
*/ ++ ++ for (i = 0; i < hd->other_contexts.count; i++) ++ WARN_ON(iommu_context_free(d, i, IOMMU_TEARDOWN_REATTACH_DEFAULT) != ENOENT); ++ ++ hd->other_contexts.count = 0; ++ ++ if ( hd->other_contexts.bitmap ) ++ xfree(hd->other_contexts.bitmap); ++ ++ if ( hd->other_contexts.map ) ++ xfree(hd->other_contexts.map); ++ ++ return 0; ++} ++ + int iommu_domain_init(struct domain *d, unsigned int opts) + { + struct domain_iommu *hd = dom_iommu(d); +@@ -208,6 +290,8 @@ int iommu_domain_init(struct domain *d, unsigned int opts) + hd->node = NUMA_NO_NODE; + #endif + ++ rspin_lock_init(&hd->default_ctx.lock); ++ + ret = arch_iommu_domain_init(d); + if ( ret ) + return ret; +@@ -236,6 +320,23 @@ int iommu_domain_init(struct domain *d, unsigned int opts) + + ASSERT(!(hd->need_sync && hd->hap_pt_share)); + ++ if ( hd->no_dma ) ++ { ++ /* No-DMA mode is exclusive with HAP and sync_pt. */ ++ hd->hap_pt_share = false; ++ hd->need_sync = false; ++ } ++ ++ hd->allow_pv_iommu = true; ++ ++ iommu_context_init(d, &hd->default_ctx, 0, IOMMU_CONTEXT_INIT_default); ++ ++ rwlock_init(&hd->other_contexts.lock); ++ hd->other_contexts.initialized = (atomic_t)ATOMIC_INIT(0); ++ hd->other_contexts.count = 0; ++ hd->other_contexts.bitmap = NULL; ++ hd->other_contexts.map = NULL; ++ + return 0; + } + +@@ -249,13 +350,12 @@ static void cf_check iommu_dump_page_tables(unsigned char key) + + for_each_domain(d) + { +- if ( is_hardware_domain(d) || !is_iommu_enabled(d) ) ++ if ( !is_iommu_enabled(d) ) + continue; + + if ( iommu_use_hap_pt(d) ) + { + printk("%pd sharing page tables\n", d); +- continue; + } + + iommu_vcall(dom_iommu(d)->platform_ops, dump_page_tables, d); +@@ -274,10 +374,13 @@ void __hwdom_init iommu_hwdom_init(struct domain *d) + iommu_vcall(hd->platform_ops, hwdom_init, d); + } + +-static void iommu_teardown(struct domain *d) ++void iommu_domain_destroy(struct domain *d) + { + struct domain_iommu *hd = dom_iommu(d); + ++ if ( !is_iommu_enabled(d) ) ++ return; ++ + /* + * During early domain creation failure, we may reach here with the + * ops not yet initialized. 
+@@ -286,222 +389,9 @@ static void iommu_teardown(struct domain *d) + return; + + iommu_vcall(hd->platform_ops, teardown, d); +-} +- +-void iommu_domain_destroy(struct domain *d) +-{ +- if ( !is_iommu_enabled(d) ) +- return; +- +- iommu_teardown(d); + + arch_iommu_domain_destroy(d); +-} +- +-static unsigned int mapping_order(const struct domain_iommu *hd, +- dfn_t dfn, mfn_t mfn, unsigned long nr) +-{ +- unsigned long res = dfn_x(dfn) | mfn_x(mfn); +- unsigned long sizes = hd->platform_ops->page_sizes; +- unsigned int bit = ffsl(sizes) - 1, order = 0; +- +- ASSERT(bit == PAGE_SHIFT); +- +- while ( (sizes = (sizes >> bit) & ~1) ) +- { +- unsigned long mask; +- +- bit = ffsl(sizes) - 1; +- mask = (1UL << bit) - 1; +- if ( nr <= mask || (res & mask) ) +- break; +- order += bit; +- nr >>= bit; +- res >>= bit; +- } +- +- return order; +-} +- +-long iommu_map(struct domain *d, dfn_t dfn0, mfn_t mfn0, +- unsigned long page_count, unsigned int flags, +- unsigned int *flush_flags) +-{ +- const struct domain_iommu *hd = dom_iommu(d); +- unsigned long i; +- unsigned int order, j = 0; +- int rc = 0; +- +- if ( !is_iommu_enabled(d) ) +- return 0; +- +- ASSERT(!IOMMUF_order(flags)); +- +- for ( i = 0; i < page_count; i += 1UL << order ) +- { +- dfn_t dfn = dfn_add(dfn0, i); +- mfn_t mfn = mfn_add(mfn0, i); +- +- order = mapping_order(hd, dfn, mfn, page_count - i); +- +- if ( (flags & IOMMUF_preempt) && +- ((!(++j & 0xfff) && general_preempt_check()) || +- i > LONG_MAX - (1UL << order)) ) +- return i; +- +- rc = iommu_call(hd->platform_ops, map_page, d, dfn, mfn, +- flags | IOMMUF_order(order), flush_flags); +- +- if ( likely(!rc) ) +- continue; +- +- if ( !d->is_shutting_down && printk_ratelimit() ) +- printk(XENLOG_ERR +- "d%d: IOMMU mapping dfn %"PRI_dfn" to mfn %"PRI_mfn" failed: %d\n", +- d->domain_id, dfn_x(dfn), mfn_x(mfn), rc); +- +- /* while statement to satisfy __must_check */ +- while ( iommu_unmap(d, dfn0, i, 0, flush_flags) ) +- break; +- +- if ( !is_hardware_domain(d) ) +- domain_crash(d); +- +- break; +- } +- +- /* +- * Something went wrong so, if we were dealing with more than a single +- * page, flush everything and clear flush flags. 
+- */ +- if ( page_count > 1 && unlikely(rc) && +- !iommu_iotlb_flush_all(d, *flush_flags) ) +- *flush_flags = 0; +- +- return rc; +-} +- +-int iommu_legacy_map(struct domain *d, dfn_t dfn, mfn_t mfn, +- unsigned long page_count, unsigned int flags) +-{ +- unsigned int flush_flags = 0; +- int rc; +- +- ASSERT(!(flags & IOMMUF_preempt)); +- rc = iommu_map(d, dfn, mfn, page_count, flags, &flush_flags); +- +- if ( !this_cpu(iommu_dont_flush_iotlb) && !rc ) +- rc = iommu_iotlb_flush(d, dfn, page_count, flush_flags); +- +- return rc; +-} +- +-long iommu_unmap(struct domain *d, dfn_t dfn0, unsigned long page_count, +- unsigned int flags, unsigned int *flush_flags) +-{ +- const struct domain_iommu *hd = dom_iommu(d); +- unsigned long i; +- unsigned int order, j = 0; +- int rc = 0; +- +- if ( !is_iommu_enabled(d) ) +- return 0; +- +- ASSERT(!(flags & ~IOMMUF_preempt)); +- +- for ( i = 0; i < page_count; i += 1UL << order ) +- { +- dfn_t dfn = dfn_add(dfn0, i); +- int err; +- +- order = mapping_order(hd, dfn, _mfn(0), page_count - i); +- +- if ( (flags & IOMMUF_preempt) && +- ((!(++j & 0xfff) && general_preempt_check()) || +- i > LONG_MAX - (1UL << order)) ) +- return i; +- +- err = iommu_call(hd->platform_ops, unmap_page, d, dfn, +- flags | IOMMUF_order(order), flush_flags); +- +- if ( likely(!err) ) +- continue; +- +- if ( !d->is_shutting_down && printk_ratelimit() ) +- printk(XENLOG_ERR +- "d%d: IOMMU unmapping dfn %"PRI_dfn" failed: %d\n", +- d->domain_id, dfn_x(dfn), err); +- +- if ( !rc ) +- rc = err; +- +- if ( !is_hardware_domain(d) ) +- { +- domain_crash(d); +- break; +- } +- } +- +- /* +- * Something went wrong so, if we were dealing with more than a single +- * page, flush everything and clear flush flags. +- */ +- if ( page_count > 1 && unlikely(rc) && +- !iommu_iotlb_flush_all(d, *flush_flags) ) +- *flush_flags = 0; +- +- return rc; +-} +- +-int iommu_legacy_unmap(struct domain *d, dfn_t dfn, unsigned long page_count) +-{ +- unsigned int flush_flags = 0; +- int rc = iommu_unmap(d, dfn, page_count, 0, &flush_flags); +- +- if ( !this_cpu(iommu_dont_flush_iotlb) && !rc ) +- rc = iommu_iotlb_flush(d, dfn, page_count, flush_flags); +- +- return rc; +-} +- +-int iommu_lookup_page(struct domain *d, dfn_t dfn, mfn_t *mfn, +- unsigned int *flags) +-{ +- const struct domain_iommu *hd = dom_iommu(d); +- +- if ( !is_iommu_enabled(d) || !hd->platform_ops->lookup_page ) +- return -EOPNOTSUPP; +- +- return iommu_call(hd->platform_ops, lookup_page, d, dfn, mfn, flags); +-} +- +-int iommu_iotlb_flush(struct domain *d, dfn_t dfn, unsigned long page_count, +- unsigned int flush_flags) +-{ +- const struct domain_iommu *hd = dom_iommu(d); +- int rc; +- +- if ( !is_iommu_enabled(d) || !hd->platform_ops->iotlb_flush || +- !page_count || !flush_flags ) +- return 0; +- +- if ( dfn_eq(dfn, INVALID_DFN) ) +- return -EINVAL; +- +- rc = iommu_call(hd->platform_ops, iotlb_flush, d, dfn, page_count, +- flush_flags); +- if ( unlikely(rc) ) +- { +- if ( !d->is_shutting_down && printk_ratelimit() ) +- printk(XENLOG_ERR +- "d%d: IOMMU IOTLB flush failed: %d, dfn %"PRI_dfn", page count %lu flags %x\n", +- d->domain_id, rc, dfn_x(dfn), page_count, flush_flags); +- +- if ( !is_hardware_domain(d) ) +- domain_crash(d); +- } +- +- return rc; ++ iommu_domain_pviommu_teardown(d); + } + + int iommu_iotlb_flush_all(struct domain *d, unsigned int flush_flags) +@@ -513,7 +403,7 @@ int iommu_iotlb_flush_all(struct domain *d, unsigned int flush_flags) + !flush_flags ) + return 0; + +- rc = iommu_call(hd->platform_ops, iotlb_flush, d, 
INVALID_DFN, 0, ++ rc = iommu_call(hd->platform_ops, iotlb_flush, d, NULL, INVALID_DFN, 0, + flush_flags | IOMMU_FLUSHF_all); + if ( unlikely(rc) ) + { +@@ -529,24 +419,6 @@ int iommu_iotlb_flush_all(struct domain *d, unsigned int flush_flags) + return rc; + } + +-int iommu_quarantine_dev_init(device_t *dev) +-{ +- const struct domain_iommu *hd = dom_iommu(dom_io); +- +- if ( !iommu_quarantine || !hd->platform_ops->quarantine_init ) +- return 0; +- +- return iommu_call(hd->platform_ops, quarantine_init, +- dev, iommu_quarantine == IOMMU_quarantine_scratch_page); +-} +- +-static int __init iommu_quarantine_init(void) +-{ +- dom_io->options |= XEN_DOMCTL_CDF_iommu; +- +- return iommu_domain_init(dom_io, 0); +-} +- + int __init iommu_setup(void) + { + int rc = -ENODEV; +@@ -682,6 +554,16 @@ bool iommu_has_feature(struct domain *d, enum iommu_feature feature) + return is_iommu_enabled(d) && test_bit(feature, dom_iommu(d)->features); + } + ++uint64_t iommu_get_max_iova(struct domain *d) ++{ ++ struct domain_iommu *hd = dom_iommu(d); ++ ++ if ( !hd->platform_ops->get_max_iova ) ++ return 0; ++ ++ return iommu_call(hd->platform_ops, get_max_iova, d); ++} ++ + #define MAX_EXTRA_RESERVED_RANGES 20 + struct extra_reserved_range { + unsigned long start; +diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c +index 777c6b1a7fdc..49d014d90fd5 100644 +--- a/xen/drivers/passthrough/pci.c ++++ b/xen/drivers/passthrough/pci.c +@@ -1,6 +1,6 @@ + /* + * Copyright (C) 2008, Netronome Systems, Inc. +- * ++ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. +@@ -289,14 +289,14 @@ static void apply_quirks(struct pci_dev *pdev) + * Device [8086:2fc0] + * Erratum HSE43 + * CONFIG_TDP_NOMINAL CSR Implemented at Incorrect Offset +- * http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v3-spec-update.html ++ * http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v3-spec-update.html + */ + { PCI_VENDOR_ID_INTEL, 0x2fc0 }, + /* + * Devices [8086:6f60,6fa0,6fc0] + * Errata BDF2 / BDX2 + * PCI BARs in the Home Agent Will Return Non-Zero Values During Enumeration +- * http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v4-spec-update.html ++ * http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v4-spec-update.html + */ + { PCI_VENDOR_ID_INTEL, 0x6f60 }, + { PCI_VENDOR_ID_INTEL, 0x6fa0 }, +@@ -911,8 +911,8 @@ static int deassign_device(struct domain *d, uint16_t seg, uint8_t bus, + devfn += pdev->phantom_stride; + if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) ) + break; +- ret = iommu_call(hd->platform_ops, reassign_device, d, target, devfn, +- pci_to_dev(pdev)); ++ ret = iommu_call(hd->platform_ops, add_devfn, d, pci_to_dev(pdev), devfn, ++ &target->iommu.default_ctx); + if ( ret ) + goto out; + } +@@ -921,9 +921,8 @@ static int deassign_device(struct domain *d, uint16_t seg, uint8_t bus, + vpci_deassign_device(pdev); + write_unlock(&d->pci_lock); + +- devfn = pdev->devfn; +- ret = iommu_call(hd->platform_ops, reassign_device, d, target, devfn, +- pci_to_dev(pdev)); ++ ret = iommu_reattach_context(pdev->domain, target, pci_to_dev(pdev), 0); ++ + if ( ret ) + goto out; + +@@ -931,6 +930,7 @@ static int deassign_device(struct domain *d, uint16_t seg, uint8_t bus, + pdev->quarantine = false; + + pdev->fault.count = 0; ++ pdev->domain = target; + + write_lock(&target->pci_lock); + /* Re-assign back to 
hardware_domain */ +@@ -1180,25 +1180,18 @@ struct setup_hwdom { + static void __hwdom_init setup_one_hwdom_device(const struct setup_hwdom *ctxt, + struct pci_dev *pdev) + { +- u8 devfn = pdev->devfn; + int err; + +- do { +- err = ctxt->handler(devfn, pdev); +- if ( err ) +- { +- printk(XENLOG_ERR "setup %pp for d%d failed (%d)\n", +- &pdev->sbdf, ctxt->d->domain_id, err); +- if ( devfn == pdev->devfn ) +- return; +- } +- devfn += pdev->phantom_stride; +- } while ( devfn != pdev->devfn && +- PCI_SLOT(devfn) == PCI_SLOT(pdev->devfn) ); ++ err = ctxt->handler(pdev->devfn, pdev); ++ ++ if ( err ) ++ goto done; + + write_lock(&ctxt->d->pci_lock); + err = vpci_assign_device(pdev); + write_unlock(&ctxt->d->pci_lock); ++ ++done: + if ( err ) + printk(XENLOG_ERR "setup of vPCI for d%d failed: %d\n", + ctxt->d->domain_id, err); +@@ -1370,12 +1363,7 @@ static int cf_check _dump_pci_devices(struct pci_seg *pseg, void *arg) + list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list ) + { + printk("%pp - ", &pdev->sbdf); +-#ifdef CONFIG_X86 +- if ( pdev->domain == dom_io ) +- printk("DomIO:%x", pdev->arch.pseudo_domid); +- else +-#endif +- printk("%pd", pdev->domain); ++ printk("%pd", pdev->domain); + printk(" - node %-3d", (pdev->node != NUMA_NO_NODE) ? pdev->node : -1); + pdev_dump_msi(pdev); + printk("\n"); +@@ -1402,8 +1390,6 @@ __initcall(setup_dump_pcidevs); + static int iommu_add_device(struct pci_dev *pdev) + { + const struct domain_iommu *hd; +- int rc; +- unsigned int devfn = pdev->devfn; + + if ( !pdev->domain ) + return -EINVAL; +@@ -1414,20 +1400,7 @@ static int iommu_add_device(struct pci_dev *pdev) + if ( !is_iommu_enabled(pdev->domain) ) + return 0; + +- rc = iommu_call(hd->platform_ops, add_device, devfn, pci_to_dev(pdev)); +- if ( rc || !pdev->phantom_stride ) +- return rc; +- +- for ( ; ; ) +- { +- devfn += pdev->phantom_stride; +- if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) ) +- return 0; +- rc = iommu_call(hd->platform_ops, add_device, devfn, pci_to_dev(pdev)); +- if ( rc ) +- printk(XENLOG_WARNING "IOMMU: add %pp failed (%d)\n", +- &PCI_SBDF(pdev->seg, pdev->bus, devfn), rc); +- } ++ return iommu_attach_context(pdev->domain, pci_to_dev(pdev), 0); + } + + static int iommu_enable_device(struct pci_dev *pdev) +@@ -1449,36 +1422,13 @@ static int iommu_enable_device(struct pci_dev *pdev) + + static int iommu_remove_device(struct pci_dev *pdev) + { +- const struct domain_iommu *hd; +- u8 devfn; +- + if ( !pdev->domain ) + return -EINVAL; + +- hd = dom_iommu(pdev->domain); + if ( !is_iommu_enabled(pdev->domain) ) + return 0; + +- for ( devfn = pdev->devfn ; pdev->phantom_stride; ) +- { +- int rc; +- +- devfn += pdev->phantom_stride; +- if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) ) +- break; +- rc = iommu_call(hd->platform_ops, remove_device, devfn, +- pci_to_dev(pdev)); +- if ( !rc ) +- continue; +- +- printk(XENLOG_ERR "IOMMU: remove %pp failed (%d)\n", +- &PCI_SBDF(pdev->seg, pdev->bus, devfn), rc); +- return rc; +- } +- +- devfn = pdev->devfn; +- +- return iommu_call(hd->platform_ops, remove_device, devfn, pci_to_dev(pdev)); ++ return iommu_detach_context(pdev->domain, pdev); + } + + static int device_assigned(u16 seg, u8 bus, u8 devfn) +@@ -1506,7 +1456,6 @@ static int device_assigned(u16 seg, u8 bus, u8 devfn) + /* Caller should hold the pcidevs_lock */ + static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) + { +- const struct domain_iommu *hd = dom_iommu(d); + struct pci_dev *pdev; + int rc = 0; + +@@ -1544,17 +1493,7 @@ static int 
assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) + + pdev->fault.count = 0; + +- rc = iommu_call(hd->platform_ops, assign_device, d, devfn, pci_to_dev(pdev), +- flag); +- +- while ( pdev->phantom_stride && !rc ) +- { +- devfn += pdev->phantom_stride; +- if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) ) +- break; +- rc = iommu_call(hd->platform_ops, assign_device, d, devfn, +- pci_to_dev(pdev), flag); +- } ++ rc = iommu_reattach_context(pdev->domain, d, pci_to_dev(pdev), 0); + + if ( rc ) + goto done; +@@ -1564,27 +1503,9 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) + write_unlock(&d->pci_lock); + + done: +- if ( rc ) +- { +- printk(XENLOG_G_WARNING "%pd: assign %s(%pp) failed (%d)\n", +- d, devfn != pdev->devfn ? "phantom function " : "", +- &PCI_SBDF(seg, bus, devfn), rc); + +- if ( devfn != pdev->devfn && deassign_device(d, seg, bus, pdev->devfn) ) +- { +- /* +- * Device with phantom functions that failed to both assign and +- * rollback. Mark the device as broken and crash the target domain, +- * as the state of the functions at this point is unknown and Xen +- * has no way to assert consistent context assignment among them. +- */ +- pdev->broken = true; +- if ( !is_hardware_domain(d) && d != dom_io ) +- domain_crash(d); +- } +- } + /* The device is assigned to dom_io so mark it as quarantined */ +- else if ( d == dom_io ) ++ if ( !rc && d == dom_io ) + pdev->quarantine = true; + + return rc; +diff --git a/xen/drivers/passthrough/quarantine.c b/xen/drivers/passthrough/quarantine.c +new file mode 100644 +index 000000000000..b58f136ad81b +--- /dev/null ++++ b/xen/drivers/passthrough/quarantine.c +@@ -0,0 +1,49 @@ ++#include ++#include ++#include ++ ++#ifdef CONFIG_HAS_PCI ++uint8_t __read_mostly iommu_quarantine = ++# if defined(CONFIG_IOMMU_QUARANTINE_NONE) ++ IOMMU_quarantine_none; ++# elif defined(CONFIG_IOMMU_QUARANTINE_BASIC) ++ IOMMU_quarantine_basic; ++# elif defined(CONFIG_IOMMU_QUARANTINE_SCRATCH_PAGE) ++ IOMMU_quarantine_scratch_page; ++# endif ++#else ++# define iommu_quarantine IOMMU_quarantine_none ++#endif /* CONFIG_HAS_PCI */ ++ ++int iommu_quarantine_dev_init(device_t *dev) ++{ ++ int ret; ++ u16 ctx_no; ++ ++ if ( !iommu_quarantine ) ++ return 0; ++ ++ ret = iommu_context_alloc(dom_io, &ctx_no, IOMMU_CONTEXT_INIT_quarantine); ++ ++ if ( ret ) ++ return ret; ++ ++ /** TODO: Setup scratch page, mappings... */ ++ ++ ret = iommu_reattach_context(dev->domain, dom_io, dev, ctx_no); ++ ++ if ( ret ) ++ { ++ ASSERT(!iommu_context_free(dom_io, ctx_no, 0)); ++ return ret; ++ } ++ ++ return ret; ++} ++ ++int __init iommu_quarantine_init(void) ++{ ++ dom_io->options |= XEN_DOMCTL_CDF_iommu; ++ ++ return iommu_domain_init(dom_io, 0); ++} +diff --git a/xen/include/xen/iommu.h b/xen/include/xen/iommu.h +index b928c67e1995..f74f3b107578 100644 +--- a/xen/include/xen/iommu.h ++++ b/xen/include/xen/iommu.h +@@ -52,7 +52,11 @@ static inline bool dfn_eq(dfn_t x, dfn_t y) + #ifdef CONFIG_HAS_PASSTHROUGH + extern bool iommu_enable, iommu_enabled; + extern bool force_iommu, iommu_verbose; ++ + /* Boolean except for the specific purposes of drivers/passthrough/iommu.c. 
*/ ++#define IOMMU_quarantine_none 0 /* aka false */ ++#define IOMMU_quarantine_basic 1 /* aka true */ ++#define IOMMU_quarantine_scratch_page 2 + extern uint8_t iommu_quarantine; + #else + #define iommu_enabled false +@@ -106,6 +110,7 @@ extern bool iommu_debug; + extern bool amd_iommu_perdev_intremap; + + extern bool iommu_hwdom_strict, iommu_hwdom_passthrough, iommu_hwdom_inclusive; ++extern bool iommu_hwdom_no_dma; + extern int8_t iommu_hwdom_reserved; + + extern unsigned int iommu_dev_iotlb_timeout; +@@ -161,11 +166,10 @@ enum + */ + long __must_check iommu_map(struct domain *d, dfn_t dfn0, mfn_t mfn0, + unsigned long page_count, unsigned int flags, +- unsigned int *flush_flags); ++ unsigned int *flush_flags, u16 ctx_no); + long __must_check iommu_unmap(struct domain *d, dfn_t dfn0, + unsigned long page_count, unsigned int flags, +- unsigned int *flush_flags); +- ++ unsigned int *flush_flags, u16 ctx_no); + int __must_check iommu_legacy_map(struct domain *d, dfn_t dfn, mfn_t mfn, + unsigned long page_count, + unsigned int flags); +@@ -173,11 +177,12 @@ int __must_check iommu_legacy_unmap(struct domain *d, dfn_t dfn, + unsigned long page_count); + + int __must_check iommu_lookup_page(struct domain *d, dfn_t dfn, mfn_t *mfn, +- unsigned int *flags); ++ unsigned int *flags, u16 ctx_no); + + int __must_check iommu_iotlb_flush(struct domain *d, dfn_t dfn, + unsigned long page_count, +- unsigned int flush_flags); ++ unsigned int flush_flags, ++ u16 ctx_no); + int __must_check iommu_iotlb_flush_all(struct domain *d, + unsigned int flush_flags); + +@@ -250,20 +255,30 @@ struct page_info; + */ + typedef int iommu_grdm_t(xen_pfn_t start, xen_ulong_t nr, u32 id, void *ctxt); + ++struct iommu_context; ++ + struct iommu_ops { + unsigned long page_sizes; + int (*init)(struct domain *d); + void (*hwdom_init)(struct domain *d); +- int (*quarantine_init)(device_t *dev, bool scratch_page); +- int (*add_device)(uint8_t devfn, device_t *dev); ++ int (*context_init)(struct domain *d, struct iommu_context *ctx, ++ u32 flags); ++ int (*context_teardown)(struct domain *d, struct iommu_context *ctx, ++ u32 flags); ++ int (*attach)(struct domain *d, device_t *dev, ++ struct iommu_context *ctx); ++ int (*detach)(struct domain *d, device_t *dev, ++ struct iommu_context *prev_ctx); ++ int (*reattach)(struct domain *d, device_t *dev, ++ struct iommu_context *prev_ctx, ++ struct iommu_context *ctx); ++ + int (*enable_device)(device_t *dev); +- int (*remove_device)(uint8_t devfn, device_t *dev); +- int (*assign_device)(struct domain *d, uint8_t devfn, device_t *dev, +- uint32_t flag); +- int (*reassign_device)(struct domain *s, struct domain *t, +- uint8_t devfn, device_t *dev); + #ifdef CONFIG_HAS_PCI + int (*get_device_group_id)(uint16_t seg, uint8_t bus, uint8_t devfn); ++ int (*add_devfn)(struct domain *d, struct pci_dev *pdev, u16 devfn, ++ struct iommu_context *ctx); ++ int (*remove_devfn)(struct domain *d, struct pci_dev *pdev, u16 devfn); + #endif /* HAS_PCI */ + + void (*teardown)(struct domain *d); +@@ -274,12 +289,15 @@ struct iommu_ops { + */ + int __must_check (*map_page)(struct domain *d, dfn_t dfn, mfn_t mfn, + unsigned int flags, +- unsigned int *flush_flags); ++ unsigned int *flush_flags, ++ struct iommu_context *ctx); + int __must_check (*unmap_page)(struct domain *d, dfn_t dfn, + unsigned int order, +- unsigned int *flush_flags); ++ unsigned int *flush_flags, ++ struct iommu_context *ctx); + int __must_check (*lookup_page)(struct domain *d, dfn_t dfn, mfn_t *mfn, +- unsigned int *flags); ++ 
unsigned int *flags, ++ struct iommu_context *ctx); + + #ifdef CONFIG_X86 + int (*enable_x2apic)(void); +@@ -292,14 +310,15 @@ struct iommu_ops { + int (*setup_hpet_msi)(struct msi_desc *msi_desc); + + void (*adjust_irq_affinities)(void); +- void (*clear_root_pgtable)(struct domain *d); ++ void (*clear_root_pgtable)(struct domain *d, struct iommu_context *ctx); + int (*update_ire_from_msi)(struct msi_desc *msi_desc, struct msi_msg *msg); + #endif /* CONFIG_X86 */ + + int __must_check (*suspend)(void); + void (*resume)(void); + void (*crash_shutdown)(void); +- int __must_check (*iotlb_flush)(struct domain *d, dfn_t dfn, ++ int __must_check (*iotlb_flush)(struct domain *d, ++ struct iommu_context *ctx, dfn_t dfn, + unsigned long page_count, + unsigned int flush_flags); + int (*get_reserved_device_memory)(iommu_grdm_t *func, void *ctxt); +@@ -314,6 +333,8 @@ struct iommu_ops { + */ + int (*dt_xlate)(device_t *dev, const struct dt_phandle_args *args); + #endif ++ ++ uint64_t (*get_max_iova)(struct domain *d); + }; + + /* +@@ -343,11 +364,39 @@ extern int iommu_get_extra_reserved_device_memory(iommu_grdm_t *func, + # define iommu_vcall iommu_call + #endif + ++struct iommu_context { ++ u16 id; /* Context id (0 means default context) */ ++ rspinlock_t lock; /* context lock */ ++ ++ struct list_head devices; ++ ++ struct arch_iommu_context arch; ++ ++ bool opaque; /* context can't be modified nor accessed (e.g HAP) */ ++ bool dying; /* the context is tearing down */ ++}; ++ ++struct iommu_context_list { ++ atomic_t initialized; /* has/is context list being initialized ? */ ++ rwlock_t lock; /* prevent concurrent destruction and access of contexts */ ++ uint16_t count; /* Context count excluding default context */ ++ ++ /* if count > 0 */ ++ ++ uint64_t *bitmap; /* bitmap of context allocation */ ++ struct iommu_context *map; /* Map of contexts */ ++}; ++ ++ + struct domain_iommu { ++ + #ifdef CONFIG_HAS_PASSTHROUGH + struct arch_iommu arch; + #endif + ++ struct iommu_context default_ctx; ++ struct iommu_context_list other_contexts; ++ + /* iommu_ops */ + const struct iommu_ops *platform_ops; + +@@ -365,6 +414,12 @@ struct domain_iommu { + /* SAF-2-safe enum constant in arithmetic operation */ + DECLARE_BITMAP(features, IOMMU_FEAT_count); + ++ /* Do the IOMMU block all DMA on default context (implies !has_pt_share) ? */ ++ bool no_dma; ++ ++ /* Is the domain allowed to use PV-IOMMU ? */ ++ bool allow_pv_iommu; ++ + /* Does the guest share HAP mapping with the IOMMU? */ + bool hap_pt_share; + +@@ -380,6 +435,7 @@ struct domain_iommu { + #define dom_iommu(d) (&(d)->iommu) + #define iommu_set_feature(d, f) set_bit(f, dom_iommu(d)->features) + #define iommu_clear_feature(d, f) clear_bit(f, dom_iommu(d)->features) ++#define iommu_default_context(d) (&dom_iommu(d)->default_ctx) /* does not lock ! */ + + /* Are we using the domain P2M table as its IOMMU pagetable? 
*/ + #define iommu_use_hap_pt(d) (IS_ENABLED(CONFIG_HVM) && \ +@@ -401,10 +457,14 @@ static inline int iommu_do_domctl(struct xen_domctl *domctl, struct domain *d, + } + #endif + ++int iommu_domain_pviommu_init(struct domain *d, uint16_t nb_ctx, uint32_t arena_order); ++ + int __must_check iommu_suspend(void); + void iommu_resume(void); + void iommu_crash_shutdown(void); + int iommu_get_reserved_device_memory(iommu_grdm_t *func, void *ctxt); ++ ++int __init iommu_quarantine_init(void); + int iommu_quarantine_dev_init(device_t *dev); + + #ifdef CONFIG_HAS_PCI +@@ -414,6 +474,27 @@ int iommu_do_pci_domctl(struct xen_domctl *domctl, struct domain *d, + + void iommu_dev_iotlb_flush_timeout(struct domain *d, struct pci_dev *pdev); + ++uint64_t iommu_get_max_iova(struct domain *d); ++ ++struct iommu_context *iommu_get_context(struct domain *d, u16 ctx_no); ++void iommu_put_context(struct iommu_context *ctx); ++ ++#define IOMMU_CONTEXT_INIT_default (1 << 0) ++#define IOMMU_CONTEXT_INIT_quarantine (1 << 1) ++int iommu_context_init(struct domain *d, struct iommu_context *ctx, u16 ctx_no, u32 flags); ++ ++#define IOMMU_TEARDOWN_REATTACH_DEFAULT (1 << 0) ++#define IOMMU_TEARDOWN_PREEMPT (1 << 1) ++int iommu_context_teardown(struct domain *d, struct iommu_context *ctx, u32 flags); ++ ++int iommu_context_alloc(struct domain *d, u16 *ctx_no, u32 flags); ++int iommu_context_free(struct domain *d, u16 ctx_no, u32 flags); ++ ++int iommu_reattach_context(struct domain *prev_dom, struct domain *next_dom, ++ device_t *dev, u16 ctx_no); ++int iommu_attach_context(struct domain *d, device_t *dev, u16 ctx_no); ++int iommu_detach_context(struct domain *d, device_t *dev); ++ + /* + * The purpose of the iommu_dont_flush_iotlb optional cpu flag is to + * avoid unecessary iotlb_flush in the low level IOMMU code. +@@ -429,6 +510,8 @@ DECLARE_PER_CPU(bool, iommu_dont_flush_iotlb); + extern struct spinlock iommu_pt_cleanup_lock; + extern struct page_list_head iommu_pt_cleanup_list; + ++int arch_iommu_pviommu_init(struct domain *d, uint16_t nb_ctx, uint32_t arena_order); ++int arch_iommu_pviommu_teardown(struct domain *d); + bool arch_iommu_use_permitted(const struct domain *d); + + #ifdef CONFIG_X86 +diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h +index f784e9116059..a421ead1a423 100644 +--- a/xen/include/xen/pci.h ++++ b/xen/include/xen/pci.h +@@ -97,6 +97,7 @@ struct pci_dev_info { + struct pci_dev { + struct list_head alldevs_list; + struct list_head domain_list; ++ struct list_head context_list; + + struct list_head msi_list; + +@@ -104,6 +105,8 @@ struct pci_dev { + + struct domain *domain; + ++ uint16_t context; /* IOMMU context number of domain */ ++ + const union { + struct { + uint8_t devfn; +-- +2.46.0 + diff --git a/0403-VT-d-Port-IOMMU-driver-to-new-subsystem.patch b/0403-VT-d-Port-IOMMU-driver-to-new-subsystem.patch new file mode 100644 index 0000000..0001ef4 --- /dev/null +++ b/0403-VT-d-Port-IOMMU-driver-to-new-subsystem.patch @@ -0,0 +1,2965 @@ +From 074b5fd3767e8dc53cfd9506bb0a62b32869f6da Mon Sep 17 00:00:00 2001 +From: Teddy Astie +Date: Mon, 4 Nov 2024 14:28:40 +0000 +Subject: [PATCH 403/404] VT-d: Port IOMMU driver to new subsystem + +Port the driver with guidances specified in iommu-contexts.md. + +Add a arena-based allocator for allocating a fixed chunk of memory and +split it into 4k pages for use by the IOMMU contexts. This chunk size +is configurable with X86_ARENA_ORDER and dom0-iommu=arena-order=N. 
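+
+A rough usage sketch of the arena API declared in asm/arena.h below ("d",
+"order" and the error handling here are placeholders, not code taken from
+this patch):
+
+    struct iommu_arena arena;
+    struct page_info *pg;
+
+    /* Carve a 2^order page chunk out of the domheap, owned by domain d. */
+    if ( iommu_arena_initialize(&arena, d, order, 0) )
+        return -ENOMEM;
+
+    /* Hand out single 4k pages for per-context IOMMU page tables... */
+    pg = iommu_arena_allocate_page(&arena);
+
+    /* ... and give them back when the context is torn down. */
+    iommu_arena_free_page(&arena, pg);
+
+    /* Release the chunk; with check=true this returns -EBUSY while pages
+       are still allocated. */
+    iommu_arena_teardown(&arena, true);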
+ +Signed-off-by Teddy Astie +--- + xen/arch/x86/include/asm/arena.h | 54 + + xen/arch/x86/include/asm/iommu.h | 58 +- + xen/arch/x86/include/asm/pci.h | 17 - + xen/drivers/passthrough/vtd/Makefile | 2 +- + xen/drivers/passthrough/vtd/extern.h | 14 +- + xen/drivers/passthrough/vtd/iommu.c | 1479 +++++++++----------------- + xen/drivers/passthrough/vtd/quirks.c | 20 +- + xen/drivers/passthrough/x86/Makefile | 1 + + xen/drivers/passthrough/x86/arena.c | 157 +++ + xen/drivers/passthrough/x86/iommu.c | 270 +++-- + 10 files changed, 984 insertions(+), 1088 deletions(-) + create mode 100644 xen/arch/x86/include/asm/arena.h + create mode 100644 xen/drivers/passthrough/x86/arena.c + +diff --git a/xen/arch/x86/include/asm/arena.h b/xen/arch/x86/include/asm/arena.h +new file mode 100644 +index 000000000000..7555b100e0b8 +--- /dev/null ++++ b/xen/arch/x86/include/asm/arena.h +@@ -0,0 +1,54 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/** ++ * Simple arena-based page allocator. ++ */ ++ ++#ifndef __XEN_IOMMU_ARENA_H__ ++#define __XEN_IOMMU_ARENA_H__ ++ ++#include "xen/domain.h" ++#include "xen/atomic.h" ++#include "xen/mm-frame.h" ++#include "xen/types.h" ++ ++/** ++ * struct page_arena: Page arena structure ++ */ ++struct iommu_arena { ++ /* mfn of the first page of the memory region */ ++ mfn_t region_start; ++ /* bitmap of allocations */ ++ unsigned long *map; ++ ++ /* Order of the arena */ ++ unsigned int order; ++ ++ /* Used page count */ ++ atomic_t used_pages; ++}; ++ ++/** ++ * Initialize a arena using domheap allocator. ++ * @param [out] arena Arena to allocate ++ * @param [in] domain domain that has ownership of arena pages ++ * @param [in] order order of the arena (power of two of the size) ++ * @param [in] memflags Flags for domheap_alloc_pages() ++ * @return -ENOMEM on arena allocation error, 0 otherwise ++ */ ++int iommu_arena_initialize(struct iommu_arena *arena, struct domain *domain, ++ unsigned int order, unsigned int memflags); ++ ++/** ++ * Teardown a arena. ++ * @param [out] arena arena to allocate ++ * @param [in] check check for existing allocations ++ * @return -EBUSY if check is specified ++ */ ++int iommu_arena_teardown(struct iommu_arena *arena, bool check); ++ ++struct page_info *iommu_arena_allocate_page(struct iommu_arena *arena); ++bool iommu_arena_free_page(struct iommu_arena *arena, struct page_info *page); ++ ++#define iommu_arena_size(arena) (1LLU << (arena)->order) ++ ++#endif +diff --git a/xen/arch/x86/include/asm/iommu.h b/xen/arch/x86/include/asm/iommu.h +index 8dc464fbd3ca..533bb8d77742 100644 +--- a/xen/arch/x86/include/asm/iommu.h ++++ b/xen/arch/x86/include/asm/iommu.h +@@ -2,14 +2,18 @@ + #ifndef __ARCH_X86_IOMMU_H__ + #define __ARCH_X86_IOMMU_H__ + ++#include + #include + #include + #include + #include ++#include + #include + #include + #include + ++#include "arena.h" ++ + #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48 + + struct g2m_ioport { +@@ -31,27 +35,45 @@ typedef uint64_t daddr_t; + #define dfn_to_daddr(dfn) __dfn_to_daddr(dfn_x(dfn)) + #define daddr_to_dfn(daddr) _dfn(__daddr_to_dfn(daddr)) + +-struct arch_iommu ++struct arch_iommu_context + { +- spinlock_t mapping_lock; /* io page table lock */ +- struct { +- struct page_list_head list; +- spinlock_t lock; +- } pgtables; +- ++ struct page_list_head pgtables; + struct list_head identity_maps; + ++ /* Queue for freeing pages */ ++ struct page_list_head free_queue; ++ ++ /* Is this context reusing domain P2M ? 
*/ ++ bool hap_context; ++ + union { + /* Intel VT-d */ + struct { + uint64_t pgd_maddr; /* io page directory machine address */ ++ domid_t *didmap; /* per-iommu DID */ ++ unsigned long *iommu_bitmap; /* bitmap of iommu(s) that the context uses */ ++ uint32_t superpage_progress; /* superpage progress during teardown */ ++ } vtd; ++ /* AMD IOMMU */ ++ struct { ++ struct page_info *root_table; ++ } amd; ++ }; ++}; ++ ++struct arch_iommu ++{ ++ struct iommu_arena pt_arena; /* allocator for non-default contexts */ ++ ++ union { ++ /* Intel VT-d */ ++ struct { + unsigned int agaw; /* adjusted guest address width, 0 is level 2 30-bit */ +- unsigned long *iommu_bitmap; /* bitmap of iommu(s) that the domain uses */ + } vtd; + /* AMD IOMMU */ + struct { + unsigned int paging_mode; +- struct page_info *root_table; ++ struct guest_iommu *g_iommu; + } amd; + }; + }; +@@ -109,10 +131,13 @@ static inline void iommu_disable_x2apic(void) + iommu_vcall(&iommu_ops, disable_x2apic); + } + +-int iommu_identity_mapping(struct domain *d, p2m_access_t p2ma, +- paddr_t base, paddr_t end, ++int iommu_identity_mapping(struct domain *d, struct iommu_context *ctx, ++ p2m_access_t p2ma, paddr_t base, paddr_t end, + unsigned int flag); +-void iommu_identity_map_teardown(struct domain *d); ++void iommu_identity_map_teardown(struct domain *d, struct iommu_context *ctx); ++bool iommu_identity_map_check(struct domain *d, struct iommu_context *ctx, ++ mfn_t mfn); ++ + + extern bool untrusted_msi; + +@@ -128,14 +153,19 @@ unsigned long *iommu_init_domid(domid_t reserve); + domid_t iommu_alloc_domid(unsigned long *map); + void iommu_free_domid(domid_t domid, unsigned long *map); + +-int __must_check iommu_free_pgtables(struct domain *d); ++struct iommu_context; ++int __must_check iommu_free_pgtables(struct domain *d, struct iommu_context *ctx); + struct domain_iommu; + struct page_info *__must_check iommu_alloc_pgtable(struct domain_iommu *hd, ++ struct iommu_context *ctx, + uint64_t contig_mask); +-void iommu_queue_free_pgtable(struct domain_iommu *hd, struct page_info *pg); ++void iommu_queue_free_pgtable(struct iommu_context *ctx, struct page_info *pg); + + /* Check [start, end] unity map range for correctness. */ + bool iommu_unity_region_ok(const char *prefix, mfn_t start, mfn_t end); ++int arch_iommu_context_init(struct domain *d, struct iommu_context *ctx, u32 flags); ++int arch_iommu_context_teardown(struct domain *d, struct iommu_context *ctx, u32 flags); ++int arch_iommu_flush_free_queue(struct domain *d, struct iommu_context *ctx); + + #endif /* !__ARCH_X86_IOMMU_H__ */ + /* +diff --git a/xen/arch/x86/include/asm/pci.h b/xen/arch/x86/include/asm/pci.h +index fd5480d67d43..214c1a0948a8 100644 +--- a/xen/arch/x86/include/asm/pci.h ++++ b/xen/arch/x86/include/asm/pci.h +@@ -15,23 +15,6 @@ + + struct arch_pci_dev { + vmask_t used_vectors; +- /* +- * These fields are (de)initialized under pcidevs-lock. Other uses of +- * them don't race (de)initialization and hence don't strictly need any +- * locking. +- */ +- union { +- /* Subset of struct arch_iommu's fields, to be used in dom_io. 
*/ +- struct { +- uint64_t pgd_maddr; +- } vtd; +- struct { +- struct page_info *root_table; +- } amd; +- }; +- domid_t pseudo_domid; +- mfn_t leaf_mfn; +- struct page_list_head pgtables_list; + }; + + int pci_conf_write_intercept(unsigned int seg, unsigned int bdf, +diff --git a/xen/drivers/passthrough/vtd/Makefile b/xen/drivers/passthrough/vtd/Makefile +index fde7555fac07..81e1f46179b9 100644 +--- a/xen/drivers/passthrough/vtd/Makefile ++++ b/xen/drivers/passthrough/vtd/Makefile +@@ -5,4 +5,4 @@ obj-y += dmar.o + obj-y += utils.o + obj-y += qinval.o + obj-y += intremap.o +-obj-y += quirks.o ++obj-y += quirks.o +\ No newline at end of file +diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h +index c16583c951d5..6cdde26efb53 100644 +--- a/xen/drivers/passthrough/vtd/extern.h ++++ b/xen/drivers/passthrough/vtd/extern.h +@@ -80,12 +80,10 @@ uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node); + void free_pgtable_maddr(u64 maddr); + void *map_vtd_domain_page(u64 maddr); + void unmap_vtd_domain_page(const void *va); +-int domain_context_mapping_one(struct domain *domain, struct vtd_iommu *iommu, +- uint8_t bus, uint8_t devfn, +- const struct pci_dev *pdev, domid_t domid, +- paddr_t pgd_maddr, unsigned int mode); +-int domain_context_unmap_one(struct domain *domain, struct vtd_iommu *iommu, +- uint8_t bus, uint8_t devfn); ++int apply_context_single(struct domain *domain, struct iommu_context *ctx, ++ struct vtd_iommu *iommu, uint8_t bus, uint8_t devfn); ++int unapply_context_single(struct domain *domain, struct vtd_iommu *iommu, ++ uint8_t bus, uint8_t devfn); + int cf_check intel_iommu_get_reserved_device_memory( + iommu_grdm_t *func, void *ctxt); + +@@ -106,8 +104,8 @@ void platform_quirks_init(void); + void vtd_ops_preamble_quirk(struct vtd_iommu *iommu); + void vtd_ops_postamble_quirk(struct vtd_iommu *iommu); + int __must_check me_wifi_quirk(struct domain *domain, uint8_t bus, +- uint8_t devfn, domid_t domid, paddr_t pgd_maddr, +- unsigned int mode); ++ uint8_t devfn, domid_t domid, ++ unsigned int mode, struct iommu_context *ctx); + void pci_vtd_quirk(const struct pci_dev *); + void quirk_iommu_caps(struct vtd_iommu *iommu); + +diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c +index 27a4d1640189..4e803735c318 100644 +--- a/xen/drivers/passthrough/vtd/iommu.c ++++ b/xen/drivers/passthrough/vtd/iommu.c +@@ -20,6 +20,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -30,15 +31,22 @@ + #include + #include + #include ++#include ++#include + #include ++#include ++#include ++#include + + #include + #include + #include + #include +-#include + #include + #include ++#include ++#include ++#include + + #include "iommu.h" + #include "dmar.h" +@@ -49,14 +57,6 @@ + #define CONTIG_MASK DMA_PTE_CONTIG_MASK + #include + +-/* dom_io is used as a sentinel for quarantined devices */ +-#define QUARANTINE_SKIP(d, pgd_maddr) ((d) == dom_io && !(pgd_maddr)) +-#define DEVICE_DOMID(d, pdev) ((d) != dom_io ? (d)->domain_id \ +- : (pdev)->arch.pseudo_domid) +-#define DEVICE_PGTABLE(d, pdev) ((d) != dom_io \ +- ? 
dom_iommu(d)->arch.vtd.pgd_maddr \ +- : (pdev)->arch.vtd.pgd_maddr) +- + bool __read_mostly iommu_igfx = true; + bool __read_mostly iommu_qinval = true; + #ifndef iommu_snoop +@@ -69,7 +69,6 @@ static unsigned int __ro_after_init min_pt_levels = UINT_MAX; + static struct tasklet vtd_fault_tasklet; + + static int cf_check setup_hwdom_device(u8 devfn, struct pci_dev *); +-static void setup_hwdom_rmrr(struct domain *d); + + static bool domid_mapping(const struct vtd_iommu *iommu) + { +@@ -209,26 +208,14 @@ static bool any_pdev_behind_iommu(const struct domain *d, + * clear iommu in iommu_bitmap and clear domain_id in domid_bitmap. + */ + static void check_cleanup_domid_map(const struct domain *d, ++ const struct iommu_context *ctx, + const struct pci_dev *exclude, + struct vtd_iommu *iommu) + { +- bool found; +- +- if ( d == dom_io ) +- return; +- +- found = any_pdev_behind_iommu(d, exclude, iommu); +- /* +- * Hidden devices are associated with DomXEN but usable by the hardware +- * domain. Hence they need considering here as well. +- */ +- if ( !found && is_hardware_domain(d) ) +- found = any_pdev_behind_iommu(dom_xen, exclude, iommu); +- +- if ( !found ) ++ if ( !any_pdev_behind_iommu(d, exclude, iommu) ) + { +- clear_bit(iommu->index, dom_iommu(d)->arch.vtd.iommu_bitmap); +- cleanup_domid_map(d->domain_id, iommu); ++ clear_bit(iommu->index, ctx->arch.vtd.iommu_bitmap); ++ cleanup_domid_map(ctx->arch.vtd.didmap[iommu->index], iommu); + } + } + +@@ -315,8 +302,9 @@ static u64 bus_to_context_maddr(struct vtd_iommu *iommu, u8 bus) + * PTE for the requested address, + * - for target == 0 the full PTE contents below PADDR_BITS limit. + */ +-static uint64_t addr_to_dma_page_maddr(struct domain *domain, daddr_t addr, +- unsigned int target, ++static uint64_t addr_to_dma_page_maddr(struct domain *domain, ++ struct iommu_context *ctx, ++ daddr_t addr, unsigned int target, + unsigned int *flush_flags, bool alloc) + { + struct domain_iommu *hd = dom_iommu(domain); +@@ -326,10 +314,9 @@ static uint64_t addr_to_dma_page_maddr(struct domain *domain, daddr_t addr, + u64 pte_maddr = 0; + + addr &= (((u64)1) << addr_width) - 1; +- ASSERT(spin_is_locked(&hd->arch.mapping_lock)); + ASSERT(target || !alloc); + +- if ( !hd->arch.vtd.pgd_maddr ) ++ if ( !ctx->arch.vtd.pgd_maddr ) + { + struct page_info *pg; + +@@ -337,13 +324,13 @@ static uint64_t addr_to_dma_page_maddr(struct domain *domain, daddr_t addr, + goto out; + + pte_maddr = level; +- if ( !(pg = iommu_alloc_pgtable(hd, 0)) ) ++ if ( !(pg = iommu_alloc_pgtable(hd, ctx, 0)) ) + goto out; + +- hd->arch.vtd.pgd_maddr = page_to_maddr(pg); ++ ctx->arch.vtd.pgd_maddr = page_to_maddr(pg); + } + +- pte_maddr = hd->arch.vtd.pgd_maddr; ++ pte_maddr = ctx->arch.vtd.pgd_maddr; + parent = map_vtd_domain_page(pte_maddr); + while ( level > target ) + { +@@ -379,7 +366,7 @@ static uint64_t addr_to_dma_page_maddr(struct domain *domain, daddr_t addr, + } + + pte_maddr = level - 1; +- pg = iommu_alloc_pgtable(hd, DMA_PTE_CONTIG_MASK); ++ pg = iommu_alloc_pgtable(hd, ctx, DMA_PTE_CONTIG_MASK); + if ( !pg ) + break; + +@@ -431,38 +418,25 @@ static uint64_t addr_to_dma_page_maddr(struct domain *domain, daddr_t addr, + return pte_maddr; + } + +-static paddr_t domain_pgd_maddr(struct domain *d, paddr_t pgd_maddr, +- unsigned int nr_pt_levels) ++static paddr_t get_context_pgd(struct domain *d, struct iommu_context *ctx, ++ unsigned int nr_pt_levels) + { +- struct domain_iommu *hd = dom_iommu(d); + unsigned int agaw; ++ paddr_t pgd_maddr = ctx->arch.vtd.pgd_maddr; + +- 
ASSERT(spin_is_locked(&hd->arch.mapping_lock)); +- +- if ( pgd_maddr ) +- /* nothing */; +- else if ( iommu_use_hap_pt(d) ) ++ if ( !ctx->arch.vtd.pgd_maddr ) + { +- pagetable_t pgt = p2m_get_pagetable(p2m_get_hostp2m(d)); ++ /* ++ * Ensure we have pagetables allocated down to the smallest ++ * level the loop below may need to run to. ++ */ ++ addr_to_dma_page_maddr(d, ctx, 0, min_pt_levels, NULL, true); + +- pgd_maddr = pagetable_get_paddr(pgt); ++ if ( !ctx->arch.vtd.pgd_maddr ) ++ return 0; + } +- else +- { +- if ( !hd->arch.vtd.pgd_maddr ) +- { +- /* +- * Ensure we have pagetables allocated down to the smallest +- * level the loop below may need to run to. +- */ +- addr_to_dma_page_maddr(d, 0, min_pt_levels, NULL, true); +- +- if ( !hd->arch.vtd.pgd_maddr ) +- return 0; +- } + +- pgd_maddr = hd->arch.vtd.pgd_maddr; +- } ++ pgd_maddr = ctx->arch.vtd.pgd_maddr; + + /* Skip top level(s) of page tables for less-than-maximum level DRHDs. */ + for ( agaw = level_to_agaw(4); +@@ -730,28 +704,18 @@ static int __must_check iommu_flush_all(void) + return rc; + } + +-static int __must_check cf_check iommu_flush_iotlb(struct domain *d, dfn_t dfn, ++static int __must_check cf_check iommu_flush_iotlb(struct domain *d, ++ struct iommu_context *ctx, ++ dfn_t dfn, + unsigned long page_count, + unsigned int flush_flags) + { +- struct domain_iommu *hd = dom_iommu(d); + struct acpi_drhd_unit *drhd; + struct vtd_iommu *iommu; + bool flush_dev_iotlb; + int iommu_domid; + int ret = 0; + +- if ( flush_flags & IOMMU_FLUSHF_all ) +- { +- dfn = INVALID_DFN; +- page_count = 0; +- } +- else +- { +- ASSERT(page_count && !dfn_eq(dfn, INVALID_DFN)); +- ASSERT(flush_flags); +- } +- + /* + * No need pcideves_lock here because we have flush + * when assign/deassign device +@@ -762,13 +726,20 @@ static int __must_check cf_check iommu_flush_iotlb(struct domain *d, dfn_t dfn, + + iommu = drhd->iommu; + +- if ( !test_bit(iommu->index, hd->arch.vtd.iommu_bitmap) ) +- continue; ++ if ( ctx ) ++ { ++ if ( !test_bit(iommu->index, ctx->arch.vtd.iommu_bitmap) ) ++ continue; ++ ++ iommu_domid = get_iommu_did(ctx->arch.vtd.didmap[iommu->index], iommu, true); ++ ++ if ( iommu_domid == -1 ) ++ continue; ++ } ++ else ++ iommu_domid = 0; + + flush_dev_iotlb = !!find_ats_dev_drhd(iommu); +- iommu_domid = get_iommu_did(d->domain_id, iommu, !d->is_dying); +- if ( iommu_domid == -1 ) +- continue; + + if ( !page_count || (page_count & (page_count - 1)) || + dfn_eq(dfn, INVALID_DFN) || !IS_ALIGNED(dfn_x(dfn), page_count) ) +@@ -787,10 +758,13 @@ static int __must_check cf_check iommu_flush_iotlb(struct domain *d, dfn_t dfn, + ret = rc; + } + ++ if ( !ret && ctx ) ++ arch_iommu_flush_free_queue(d, ctx); ++ + return ret; + } + +-static void queue_free_pt(struct domain_iommu *hd, mfn_t mfn, unsigned int level) ++static void queue_free_pt(struct iommu_context *ctx, mfn_t mfn, unsigned int level) + { + if ( level > 1 ) + { +@@ -799,13 +773,13 @@ static void queue_free_pt(struct domain_iommu *hd, mfn_t mfn, unsigned int level + + for ( i = 0; i < PTE_NUM; ++i ) + if ( dma_pte_present(pt[i]) && !dma_pte_superpage(pt[i]) ) +- queue_free_pt(hd, maddr_to_mfn(dma_pte_addr(pt[i])), ++ queue_free_pt(ctx, maddr_to_mfn(dma_pte_addr(pt[i])), + level - 1); + + unmap_domain_page(pt); + } + +- iommu_queue_free_pgtable(hd, mfn_to_page(mfn)); ++ iommu_queue_free_pgtable(ctx, mfn_to_page(mfn)); + } + + static int iommu_set_root_entry(struct vtd_iommu *iommu) +@@ -1436,11 +1410,6 @@ static int cf_check intel_iommu_domain_init(struct domain *d) + { + struct 
domain_iommu *hd = dom_iommu(d); + +- hd->arch.vtd.iommu_bitmap = xzalloc_array(unsigned long, +- BITS_TO_LONGS(nr_iommus)); +- if ( !hd->arch.vtd.iommu_bitmap ) +- return -ENOMEM; +- + hd->arch.vtd.agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH); + + return 0; +@@ -1451,7 +1420,7 @@ static void __hwdom_init cf_check intel_iommu_hwdom_init(struct domain *d) + struct acpi_drhd_unit *drhd; + + setup_hwdom_pci_devices(d, setup_hwdom_device); +- setup_hwdom_rmrr(d); ++ + /* Make sure workarounds are applied before enabling the IOMMU(s). */ + arch_iommu_hwdom_init(d); + +@@ -1468,32 +1437,22 @@ static void __hwdom_init cf_check intel_iommu_hwdom_init(struct domain *d) + } + } + +-/* +- * This function returns +- * - a negative errno value upon error, +- * - zero upon success when previously the entry was non-present, or this isn't +- * the "main" request for a device (pdev == NULL), or for no-op quarantining +- * assignments, +- * - positive (one) upon success when previously the entry was present and this +- * is the "main" request for a device (pdev != NULL). ++/** ++ * Apply a context on a device. ++ * @param domain Domain of the context ++ * @param iommu IOMMU hardware to use (must match device iommu) ++ * @param ctx IOMMU context to apply ++ * @param devfn PCI device function (may be different to pdev) + */ +-int domain_context_mapping_one( +- struct domain *domain, +- struct vtd_iommu *iommu, +- uint8_t bus, uint8_t devfn, const struct pci_dev *pdev, +- domid_t domid, paddr_t pgd_maddr, unsigned int mode) ++int apply_context_single(struct domain *domain, struct iommu_context *ctx, ++ struct vtd_iommu *iommu, uint8_t bus, uint8_t devfn) + { +- struct domain_iommu *hd = dom_iommu(domain); + struct context_entry *context, *context_entries, lctxt; +- __uint128_t old; ++ __uint128_t res, old; + uint64_t maddr; +- uint16_t seg = iommu->drhd->segment, prev_did = 0; +- struct domain *prev_dom = NULL; ++ uint16_t seg = iommu->drhd->segment, prev_did = 0, did; + int rc, ret; +- bool flush_dev_iotlb; +- +- if ( QUARANTINE_SKIP(domain, pgd_maddr) ) +- return 0; ++ bool flush_dev_iotlb, overwrite_entry = false; + + ASSERT(pcidevs_locked()); + spin_lock(&iommu->lock); +@@ -1502,28 +1461,15 @@ int domain_context_mapping_one( + context = &context_entries[devfn]; + old = (lctxt = *context).full; + +- if ( context_present(lctxt) ) +- { +- domid_t domid; ++ did = ctx->arch.vtd.didmap[iommu->index]; + ++ if ( context_present(*context) ) ++ { + prev_did = context_domain_id(lctxt); +- domid = did_to_domain_id(iommu, prev_did); +- if ( domid < DOMID_FIRST_RESERVED ) +- prev_dom = rcu_lock_domain_by_id(domid); +- else if ( pdev ? 
domid == pdev->arch.pseudo_domid : domid > DOMID_MASK ) +- prev_dom = rcu_lock_domain(dom_io); +- if ( !prev_dom ) +- { +- spin_unlock(&iommu->lock); +- unmap_vtd_domain_page(context_entries); +- dprintk(XENLOG_DEBUG VTDPREFIX, +- "no domain for did %u (nr_dom %u)\n", +- prev_did, cap_ndoms(iommu->cap)); +- return -ESRCH; +- } ++ overwrite_entry = true; + } + +- if ( iommu_hwdom_passthrough && is_hardware_domain(domain) ) ++ if ( iommu_hwdom_passthrough && is_hardware_domain(domain) && !ctx->id ) + { + context_set_translation_type(lctxt, CONTEXT_TT_PASS_THRU); + } +@@ -1531,16 +1477,10 @@ int domain_context_mapping_one( + { + paddr_t root; + +- spin_lock(&hd->arch.mapping_lock); +- +- root = domain_pgd_maddr(domain, pgd_maddr, iommu->nr_pt_levels); ++ root = get_context_pgd(domain, ctx, iommu->nr_pt_levels); + if ( !root ) + { +- spin_unlock(&hd->arch.mapping_lock); +- spin_unlock(&iommu->lock); + unmap_vtd_domain_page(context_entries); +- if ( prev_dom ) +- rcu_unlock_domain(prev_dom); + return -ENOMEM; + } + +@@ -1549,98 +1489,39 @@ int domain_context_mapping_one( + context_set_translation_type(lctxt, CONTEXT_TT_DEV_IOTLB); + else + context_set_translation_type(lctxt, CONTEXT_TT_MULTI_LEVEL); +- +- spin_unlock(&hd->arch.mapping_lock); + } + +- rc = context_set_domain_id(&lctxt, domid, iommu); ++ rc = context_set_domain_id(&lctxt, did, iommu); + if ( rc ) +- { +- unlock: +- spin_unlock(&iommu->lock); +- unmap_vtd_domain_page(context_entries); +- if ( prev_dom ) +- rcu_unlock_domain(prev_dom); +- return rc; +- } +- +- if ( !prev_dom ) +- { +- context_set_address_width(lctxt, level_to_agaw(iommu->nr_pt_levels)); +- context_set_fault_enable(lctxt); +- context_set_present(lctxt); +- } +- else if ( prev_dom == domain ) +- { +- ASSERT(lctxt.full == context->full); +- rc = !!pdev; + goto unlock; +- } +- else +- { +- ASSERT(context_address_width(lctxt) == +- level_to_agaw(iommu->nr_pt_levels)); +- ASSERT(!context_fault_disable(lctxt)); +- } +- +- if ( cpu_has_cx16 ) +- { +- __uint128_t res = cmpxchg16b(context, &old, &lctxt.full); + +- /* +- * Hardware does not update the context entry behind our backs, +- * so the return value should match "old". +- */ +- if ( res != old ) +- { +- if ( pdev ) +- check_cleanup_domid_map(domain, pdev, iommu); +- printk(XENLOG_ERR +- "%pp: unexpected context entry %016lx_%016lx (expected %016lx_%016lx)\n", +- &PCI_SBDF(seg, bus, devfn), +- (uint64_t)(res >> 64), (uint64_t)res, +- (uint64_t)(old >> 64), (uint64_t)old); +- rc = -EILSEQ; +- goto unlock; +- } +- } +- else if ( !prev_dom || !(mode & MAP_WITH_RMRR) ) +- { +- context_clear_present(*context); +- iommu_sync_cache(context, sizeof(*context)); ++ context_set_address_width(lctxt, level_to_agaw(iommu->nr_pt_levels)); ++ context_set_fault_enable(lctxt); ++ context_set_present(lctxt); + +- write_atomic(&context->hi, lctxt.hi); +- /* No barrier should be needed between these two. */ +- write_atomic(&context->lo, lctxt.lo); +- } +- else /* Best effort, updating DID last. */ +- { +- /* +- * By non-atomically updating the context entry's DID field last, +- * during a short window in time TLB entries with the old domain ID +- * but the new page tables may be inserted. This could affect I/O +- * of other devices using this same (old) domain ID. Such updating +- * therefore is not a problem if this was the only device associated +- * with the old domain ID. Diverting I/O of any of a dying domain's +- * devices to the quarantine page tables is intended anyway. 
+- */ +- if ( !(mode & (MAP_OWNER_DYING | MAP_SINGLE_DEVICE)) ) +- printk(XENLOG_WARNING VTDPREFIX +- " %pp: reassignment may cause %pd data corruption\n", +- &PCI_SBDF(seg, bus, devfn), prev_dom); ++ res = cmpxchg16b(context, &old, &lctxt.full); + +- write_atomic(&context->lo, lctxt.lo); +- /* No barrier should be needed between these two. */ +- write_atomic(&context->hi, lctxt.hi); ++ /* ++ * Hardware does not update the context entry behind our backs, ++ * so the return value should match "old". ++ */ ++ if ( res != old ) ++ { ++ printk(XENLOG_ERR ++ "%pp: unexpected context entry %016lx_%016lx (expected %016lx_%016lx)\n", ++ &PCI_SBDF(seg, bus, devfn), ++ (uint64_t)(res >> 64), (uint64_t)res, ++ (uint64_t)(old >> 64), (uint64_t)old); ++ rc = -EILSEQ; ++ goto unlock; + } + + iommu_sync_cache(context, sizeof(struct context_entry)); +- spin_unlock(&iommu->lock); + + rc = iommu_flush_context_device(iommu, prev_did, PCI_BDF(bus, devfn), +- DMA_CCMD_MASK_NOBIT, !prev_dom); ++ DMA_CCMD_MASK_NOBIT, !overwrite_entry); + flush_dev_iotlb = !!find_ats_dev_drhd(iommu); +- ret = iommu_flush_iotlb_dsi(iommu, prev_did, !prev_dom, flush_dev_iotlb); ++ ret = iommu_flush_iotlb_dsi(iommu, prev_did, !overwrite_entry, flush_dev_iotlb); + + /* + * The current logic for returns: +@@ -1656,230 +1537,55 @@ int domain_context_mapping_one( + if ( rc > 0 ) + rc = 0; + +- set_bit(iommu->index, hd->arch.vtd.iommu_bitmap); ++ set_bit(iommu->index, ctx->arch.vtd.iommu_bitmap); + + unmap_vtd_domain_page(context_entries); ++ spin_unlock(&iommu->lock); + + if ( !seg && !rc ) +- rc = me_wifi_quirk(domain, bus, devfn, domid, pgd_maddr, mode); +- +- if ( rc && !(mode & MAP_ERROR_RECOVERY) ) +- { +- if ( !prev_dom || +- /* +- * Unmapping here means DEV_TYPE_PCI devices with RMRRs (if such +- * exist) would cause problems if such a region was actually +- * accessed. +- */ +- (prev_dom == dom_io && !pdev) ) +- ret = domain_context_unmap_one(domain, iommu, bus, devfn); +- else +- ret = domain_context_mapping_one(prev_dom, iommu, bus, devfn, pdev, +- DEVICE_DOMID(prev_dom, pdev), +- DEVICE_PGTABLE(prev_dom, pdev), +- (mode & MAP_WITH_RMRR) | +- MAP_ERROR_RECOVERY) < 0; +- +- if ( !ret && pdev && pdev->devfn == devfn ) +- check_cleanup_domid_map(domain, pdev, iommu); +- } ++ rc = me_wifi_quirk(domain, bus, devfn, did, 0, ctx); + +- if ( prev_dom ) +- rcu_unlock_domain(prev_dom); ++ return rc; + +- return rc ?: pdev && prev_dom; ++ unlock: ++ unmap_vtd_domain_page(context_entries); ++ spin_unlock(&iommu->lock); ++ return rc; + } + +-static const struct acpi_drhd_unit *domain_context_unmap( +- struct domain *d, uint8_t devfn, struct pci_dev *pdev); +- +-static int domain_context_mapping(struct domain *domain, u8 devfn, +- struct pci_dev *pdev) ++int apply_context(struct domain *d, struct iommu_context *ctx, ++ struct pci_dev *pdev, u8 devfn) + { + const struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev); +- const struct acpi_rmrr_unit *rmrr; +- paddr_t pgd_maddr = DEVICE_PGTABLE(domain, pdev); +- domid_t orig_domid = pdev->arch.pseudo_domid; + int ret = 0; +- unsigned int i, mode = 0; +- uint16_t seg = pdev->seg, bdf; +- uint8_t bus = pdev->bus, secbus; +- +- /* +- * Generally we assume only devices from one node to get assigned to a +- * given guest. But even if not, by replacing the prior value here we +- * guarantee that at least some basic allocations for the device being +- * added will get done against its node. 
Any further allocations for +- * this or other devices may be penalized then, but some would also be +- * if we left other than NUMA_NO_NODE untouched here. +- */ +- if ( drhd && drhd->iommu->node != NUMA_NO_NODE ) +- dom_iommu(domain)->node = drhd->iommu->node; +- +- ASSERT(pcidevs_locked()); +- +- for_each_rmrr_device( rmrr, bdf, i ) +- { +- if ( rmrr->segment != pdev->seg || bdf != pdev->sbdf.bdf ) +- continue; + +- mode |= MAP_WITH_RMRR; +- break; +- } ++ if ( !drhd ) ++ return -EINVAL; + +- if ( domain != pdev->domain && pdev->domain != dom_io ) ++ if ( pdev->type == DEV_TYPE_PCI_HOST_BRIDGE || ++ pdev->type == DEV_TYPE_PCIe_BRIDGE || ++ pdev->type == DEV_TYPE_PCIe2PCI_BRIDGE || ++ pdev->type == DEV_TYPE_LEGACY_PCI_BRIDGE ) + { +- if ( pdev->domain->is_dying ) +- mode |= MAP_OWNER_DYING; +- else if ( drhd && +- !any_pdev_behind_iommu(pdev->domain, pdev, drhd->iommu) && +- !pdev->phantom_stride ) +- mode |= MAP_SINGLE_DEVICE; ++ printk(XENLOG_WARNING VTDPREFIX " Ignoring apply_context on PCI bridge\n"); ++ return 0; + } + +- switch ( pdev->type ) +- { +- bool prev_present; +- +- case DEV_TYPE_PCI_HOST_BRIDGE: +- if ( iommu_debug ) +- printk(VTDPREFIX "%pd:Hostbridge: skip %pp map\n", +- domain, &PCI_SBDF(seg, bus, devfn)); +- if ( !is_hardware_domain(domain) ) +- return -EPERM; +- break; +- +- case DEV_TYPE_PCIe_BRIDGE: +- case DEV_TYPE_PCIe2PCI_BRIDGE: +- case DEV_TYPE_LEGACY_PCI_BRIDGE: +- break; +- +- case DEV_TYPE_PCIe_ENDPOINT: +- if ( !drhd ) +- return -ENODEV; +- +- if ( iommu_quarantine && orig_domid == DOMID_INVALID ) +- { +- pdev->arch.pseudo_domid = +- iommu_alloc_domid(drhd->iommu->pseudo_domid_map); +- if ( pdev->arch.pseudo_domid == DOMID_INVALID ) +- return -ENOSPC; +- } +- +- if ( iommu_debug ) +- printk(VTDPREFIX "%pd:PCIe: map %pp\n", +- domain, &PCI_SBDF(seg, bus, devfn)); +- ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, pdev, +- DEVICE_DOMID(domain, pdev), pgd_maddr, +- mode); +- if ( ret > 0 ) +- ret = 0; +- if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 ) +- enable_ats_device(pdev, &drhd->iommu->ats_devices); +- +- break; +- +- case DEV_TYPE_PCI: +- if ( !drhd ) +- return -ENODEV; +- +- if ( iommu_quarantine && orig_domid == DOMID_INVALID ) +- { +- pdev->arch.pseudo_domid = +- iommu_alloc_domid(drhd->iommu->pseudo_domid_map); +- if ( pdev->arch.pseudo_domid == DOMID_INVALID ) +- return -ENOSPC; +- } +- +- if ( iommu_debug ) +- printk(VTDPREFIX "%pd:PCI: map %pp\n", +- domain, &PCI_SBDF(seg, bus, devfn)); +- +- ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, +- pdev, DEVICE_DOMID(domain, pdev), +- pgd_maddr, mode); +- if ( ret < 0 ) +- break; +- prev_present = ret; +- +- if ( (ret = find_upstream_bridge(seg, &bus, &devfn, &secbus)) < 1 ) +- { +- if ( !ret ) +- break; +- ret = -ENXIO; +- } +- /* +- * Strictly speaking if the device is the only one behind this bridge +- * and the only one with this (secbus,0,0) tuple, it could be allowed +- * to be re-assigned regardless of RMRR presence. But let's deal with +- * that case only if it is actually found in the wild. Note that +- * dealing with this just here would still not render the operation +- * secure. +- */ +- else if ( prev_present && (mode & MAP_WITH_RMRR) && +- domain != pdev->domain ) +- ret = -EOPNOTSUPP; +- +- /* +- * Mapping a bridge should, if anything, pass the struct pci_dev of +- * that bridge. Since bridges don't normally get assigned to guests, +- * their owner would be the wrong one. Pass NULL instead. 
+- */ +- if ( ret >= 0 ) +- ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, +- NULL, DEVICE_DOMID(domain, pdev), +- pgd_maddr, mode); +- +- /* +- * Devices behind PCIe-to-PCI/PCIx bridge may generate different +- * requester-id. It may originate from devfn=0 on the secondary bus +- * behind the bridge. Map that id as well if we didn't already. +- * +- * Somewhat similar as for bridges, we don't want to pass a struct +- * pci_dev here - there may not even exist one for this (secbus,0,0) +- * tuple. If there is one, without properly working device groups it +- * may again not have the correct owner. +- */ +- if ( !ret && pdev_type(seg, bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE && +- (secbus != pdev->bus || pdev->devfn != 0) ) +- ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0, +- NULL, DEVICE_DOMID(domain, pdev), +- pgd_maddr, mode); +- +- if ( ret ) +- { +- if ( !prev_present ) +- domain_context_unmap(domain, devfn, pdev); +- else if ( pdev->domain != domain ) /* Avoid infinite recursion. */ +- domain_context_mapping(pdev->domain, devfn, pdev); +- } ++ ASSERT(pcidevs_locked()); + +- break; ++ ret = apply_context_single(d, ctx, drhd->iommu, pdev->bus, devfn); + +- default: +- dprintk(XENLOG_ERR VTDPREFIX, "%pd:unknown(%u): %pp\n", +- domain, pdev->type, &PCI_SBDF(seg, bus, devfn)); +- ret = -EINVAL; +- break; +- } ++ if ( !ret && ats_device(pdev, drhd) > 0 ) ++ enable_ats_device(pdev, &drhd->iommu->ats_devices); + + if ( !ret && devfn == pdev->devfn ) + pci_vtd_quirk(pdev); + +- if ( ret && drhd && orig_domid == DOMID_INVALID ) +- { +- iommu_free_domid(pdev->arch.pseudo_domid, +- drhd->iommu->pseudo_domid_map); +- pdev->arch.pseudo_domid = DOMID_INVALID; +- } +- + return ret; + } + +-int domain_context_unmap_one( +- struct domain *domain, +- struct vtd_iommu *iommu, +- uint8_t bus, uint8_t devfn) ++int unapply_context_single(struct domain *domain, struct vtd_iommu *iommu, ++ uint8_t bus, uint8_t devfn) + { + struct context_entry *context, *context_entries; + u64 maddr; +@@ -1931,8 +1637,8 @@ int domain_context_unmap_one( + unmap_vtd_domain_page(context_entries); + + if ( !iommu->drhd->segment && !rc ) +- rc = me_wifi_quirk(domain, bus, devfn, DOMID_INVALID, 0, +- UNMAP_ME_PHANTOM_FUNC); ++ rc = me_wifi_quirk(domain, bus, devfn, DOMID_INVALID, UNMAP_ME_PHANTOM_FUNC, ++ NULL); + + if ( rc && !is_hardware_domain(domain) && domain != dom_io ) + { +@@ -1950,143 +1656,28 @@ int domain_context_unmap_one( + return rc; + } + +-static const struct acpi_drhd_unit *domain_context_unmap( +- struct domain *domain, +- uint8_t devfn, +- struct pci_dev *pdev) +-{ +- const struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev); +- struct vtd_iommu *iommu = drhd ? drhd->iommu : NULL; +- int ret; +- uint16_t seg = pdev->seg; +- uint8_t bus = pdev->bus, tmp_bus, tmp_devfn, secbus; +- +- switch ( pdev->type ) +- { +- case DEV_TYPE_PCI_HOST_BRIDGE: +- if ( iommu_debug ) +- printk(VTDPREFIX "%pd:Hostbridge: skip %pp unmap\n", +- domain, &PCI_SBDF(seg, bus, devfn)); +- return ERR_PTR(is_hardware_domain(domain) ? 
0 : -EPERM); +- +- case DEV_TYPE_PCIe_BRIDGE: +- case DEV_TYPE_PCIe2PCI_BRIDGE: +- case DEV_TYPE_LEGACY_PCI_BRIDGE: +- return ERR_PTR(0); +- +- case DEV_TYPE_PCIe_ENDPOINT: +- if ( !iommu ) +- return ERR_PTR(-ENODEV); +- +- if ( iommu_debug ) +- printk(VTDPREFIX "%pd:PCIe: unmap %pp\n", +- domain, &PCI_SBDF(seg, bus, devfn)); +- ret = domain_context_unmap_one(domain, iommu, bus, devfn); +- if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 ) +- disable_ats_device(pdev); +- +- break; +- +- case DEV_TYPE_PCI: +- if ( !iommu ) +- return ERR_PTR(-ENODEV); +- +- if ( iommu_debug ) +- printk(VTDPREFIX "%pd:PCI: unmap %pp\n", +- domain, &PCI_SBDF(seg, bus, devfn)); +- ret = domain_context_unmap_one(domain, iommu, bus, devfn); +- if ( ret ) +- break; +- +- tmp_bus = bus; +- tmp_devfn = devfn; +- if ( (ret = find_upstream_bridge(seg, &tmp_bus, &tmp_devfn, +- &secbus)) < 1 ) +- { +- if ( ret ) +- { +- ret = -ENXIO; +- if ( !domain->is_dying && +- !is_hardware_domain(domain) && domain != dom_io ) +- { +- domain_crash(domain); +- /* Make upper layers continue in a best effort manner. */ +- ret = 0; +- } +- } +- break; +- } +- +- ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn); +- /* PCIe to PCI/PCIx bridge */ +- if ( !ret && pdev_type(seg, tmp_bus, tmp_devfn) == DEV_TYPE_PCIe2PCI_BRIDGE ) +- ret = domain_context_unmap_one(domain, iommu, secbus, 0); +- +- break; +- +- default: +- dprintk(XENLOG_ERR VTDPREFIX, "%pd:unknown(%u): %pp\n", +- domain, pdev->type, &PCI_SBDF(seg, bus, devfn)); +- return ERR_PTR(-EINVAL); +- } +- +- if ( !ret && pdev->devfn == devfn && +- !QUARANTINE_SKIP(domain, pdev->arch.vtd.pgd_maddr) ) +- check_cleanup_domid_map(domain, pdev, iommu); +- +- return drhd; +-} +- +-static void cf_check iommu_clear_root_pgtable(struct domain *d) ++static void cf_check iommu_clear_root_pgtable(struct domain *d, struct iommu_context *ctx) + { +- struct domain_iommu *hd = dom_iommu(d); +- +- spin_lock(&hd->arch.mapping_lock); +- hd->arch.vtd.pgd_maddr = 0; +- spin_unlock(&hd->arch.mapping_lock); ++ ctx->arch.vtd.pgd_maddr = 0; + } + + static void cf_check iommu_domain_teardown(struct domain *d) + { +- struct domain_iommu *hd = dom_iommu(d); ++ struct iommu_context *ctx = iommu_default_context(d); + const struct acpi_drhd_unit *drhd; + + if ( list_empty(&acpi_drhd_units) ) + return; + +- iommu_identity_map_teardown(d); +- +- ASSERT(!hd->arch.vtd.pgd_maddr); ++ ASSERT(!ctx->arch.vtd.pgd_maddr); + + for_each_drhd_unit ( drhd ) + cleanup_domid_map(d->domain_id, drhd->iommu); +- +- XFREE(hd->arch.vtd.iommu_bitmap); +-} +- +-static void quarantine_teardown(struct pci_dev *pdev, +- const struct acpi_drhd_unit *drhd) +-{ +- struct domain_iommu *hd = dom_iommu(dom_io); +- +- ASSERT(pcidevs_locked()); +- +- if ( !pdev->arch.vtd.pgd_maddr ) +- return; +- +- ASSERT(page_list_empty(&hd->arch.pgtables.list)); +- page_list_move(&hd->arch.pgtables.list, &pdev->arch.pgtables_list); +- while ( iommu_free_pgtables(dom_io) == -ERESTART ) +- /* nothing */; +- pdev->arch.vtd.pgd_maddr = 0; +- +- if ( drhd ) +- cleanup_domid_map(pdev->arch.pseudo_domid, drhd->iommu); + } + + static int __must_check cf_check intel_iommu_map_page( + struct domain *d, dfn_t dfn, mfn_t mfn, unsigned int flags, +- unsigned int *flush_flags) ++ unsigned int *flush_flags, struct iommu_context *ctx) + { + struct domain_iommu *hd = dom_iommu(d); + struct dma_pte *page, *pte, old, new = {}; +@@ -2097,33 +1688,24 @@ static int __must_check cf_check intel_iommu_map_page( + ASSERT((hd->platform_ops->page_sizes >> 
IOMMUF_order(flags)) & + PAGE_SIZE_4K); + +- /* Do nothing if VT-d shares EPT page table */ +- if ( iommu_use_hap_pt(d) ) ++ if ( ctx->opaque ) + return 0; + +- /* Do nothing if hardware domain and iommu supports pass thru. */ +- if ( iommu_hwdom_passthrough && is_hardware_domain(d) ) +- return 0; +- +- spin_lock(&hd->arch.mapping_lock); +- + /* + * IOMMU mapping request can be safely ignored when the domain is dying. + * +- * hd->arch.mapping_lock guarantees that d->is_dying will be observed ++ * hd->lock guarantees that d->is_dying will be observed + * before any page tables are freed (see iommu_free_pgtables()) + */ + if ( d->is_dying ) + { +- spin_unlock(&hd->arch.mapping_lock); + return 0; + } + +- pg_maddr = addr_to_dma_page_maddr(d, dfn_to_daddr(dfn), level, flush_flags, ++ pg_maddr = addr_to_dma_page_maddr(d, ctx, dfn_to_daddr(dfn), level, flush_flags, + true); + if ( pg_maddr < PAGE_SIZE ) + { +- spin_unlock(&hd->arch.mapping_lock); + return -ENOMEM; + } + +@@ -2144,7 +1726,6 @@ static int __must_check cf_check intel_iommu_map_page( + + if ( !((old.val ^ new.val) & ~DMA_PTE_CONTIG_MASK) ) + { +- spin_unlock(&hd->arch.mapping_lock); + unmap_vtd_domain_page(page); + return 0; + } +@@ -2173,7 +1754,7 @@ static int __must_check cf_check intel_iommu_map_page( + new.val &= ~(LEVEL_MASK << level_to_offset_bits(level)); + dma_set_pte_superpage(new); + +- pg_maddr = addr_to_dma_page_maddr(d, dfn_to_daddr(dfn), ++level, ++ pg_maddr = addr_to_dma_page_maddr(d, ctx, dfn_to_daddr(dfn), ++level, + flush_flags, false); + BUG_ON(pg_maddr < PAGE_SIZE); + +@@ -2183,11 +1764,10 @@ static int __must_check cf_check intel_iommu_map_page( + iommu_sync_cache(pte, sizeof(*pte)); + + *flush_flags |= IOMMU_FLUSHF_modified | IOMMU_FLUSHF_all; +- iommu_queue_free_pgtable(hd, pg); ++ iommu_queue_free_pgtable(ctx, pg); + perfc_incr(iommu_pt_coalesces); + } + +- spin_unlock(&hd->arch.mapping_lock); + unmap_vtd_domain_page(page); + + *flush_flags |= IOMMU_FLUSHF_added; +@@ -2196,7 +1776,7 @@ static int __must_check cf_check intel_iommu_map_page( + *flush_flags |= IOMMU_FLUSHF_modified; + + if ( IOMMUF_order(flags) && !dma_pte_superpage(old) ) +- queue_free_pt(hd, maddr_to_mfn(dma_pte_addr(old)), ++ queue_free_pt(ctx, maddr_to_mfn(dma_pte_addr(old)), + IOMMUF_order(flags) / LEVEL_STRIDE); + } + +@@ -2204,7 +1784,8 @@ static int __must_check cf_check intel_iommu_map_page( + } + + static int __must_check cf_check intel_iommu_unmap_page( +- struct domain *d, dfn_t dfn, unsigned int order, unsigned int *flush_flags) ++ struct domain *d, dfn_t dfn, unsigned int order, unsigned int *flush_flags, ++ struct iommu_context *ctx) + { + struct domain_iommu *hd = dom_iommu(d); + daddr_t addr = dfn_to_daddr(dfn); +@@ -2218,29 +1799,19 @@ static int __must_check cf_check intel_iommu_unmap_page( + */ + ASSERT((hd->platform_ops->page_sizes >> order) & PAGE_SIZE_4K); + +- /* Do nothing if VT-d shares EPT page table */ +- if ( iommu_use_hap_pt(d) ) +- return 0; +- +- /* Do nothing if hardware domain and iommu supports pass thru. */ +- if ( iommu_hwdom_passthrough && is_hardware_domain(d) ) ++ if ( ctx->opaque ) + return 0; + +- spin_lock(&hd->arch.mapping_lock); + /* get target level pte */ +- pg_maddr = addr_to_dma_page_maddr(d, addr, level, flush_flags, false); ++ pg_maddr = addr_to_dma_page_maddr(d, ctx, addr, level, flush_flags, false); + if ( pg_maddr < PAGE_SIZE ) +- { +- spin_unlock(&hd->arch.mapping_lock); + return pg_maddr ? 
-ENOMEM : 0; +- } + + page = map_vtd_domain_page(pg_maddr); + pte = &page[address_level_offset(addr, level)]; + + if ( !dma_pte_present(*pte) ) + { +- spin_unlock(&hd->arch.mapping_lock); + unmap_vtd_domain_page(page); + return 0; + } +@@ -2258,7 +1829,7 @@ static int __must_check cf_check intel_iommu_unmap_page( + + unmap_vtd_domain_page(page); + +- pg_maddr = addr_to_dma_page_maddr(d, addr, level, flush_flags, false); ++ pg_maddr = addr_to_dma_page_maddr(d, ctx, addr, level, flush_flags, false); + BUG_ON(pg_maddr < PAGE_SIZE); + + page = map_vtd_domain_page(pg_maddr); +@@ -2267,42 +1838,31 @@ static int __must_check cf_check intel_iommu_unmap_page( + iommu_sync_cache(pte, sizeof(*pte)); + + *flush_flags |= IOMMU_FLUSHF_all; +- iommu_queue_free_pgtable(hd, pg); ++ iommu_queue_free_pgtable(ctx, pg); + perfc_incr(iommu_pt_coalesces); + } + +- spin_unlock(&hd->arch.mapping_lock); +- + unmap_vtd_domain_page(page); + + *flush_flags |= IOMMU_FLUSHF_modified; + + if ( order && !dma_pte_superpage(old) ) +- queue_free_pt(hd, maddr_to_mfn(dma_pte_addr(old)), ++ queue_free_pt(ctx, maddr_to_mfn(dma_pte_addr(old)), + order / LEVEL_STRIDE); + + return 0; + } + + static int cf_check intel_iommu_lookup_page( +- struct domain *d, dfn_t dfn, mfn_t *mfn, unsigned int *flags) ++ struct domain *d, dfn_t dfn, mfn_t *mfn, unsigned int *flags, ++ struct iommu_context *ctx) + { +- struct domain_iommu *hd = dom_iommu(d); + uint64_t val; + +- /* +- * If VT-d shares EPT page table or if the domain is the hardware +- * domain and iommu_passthrough is set then pass back the dfn. +- */ +- if ( iommu_use_hap_pt(d) || +- (iommu_hwdom_passthrough && is_hardware_domain(d)) ) ++ if ( ctx->opaque ) + return -EOPNOTSUPP; + +- spin_lock(&hd->arch.mapping_lock); +- +- val = addr_to_dma_page_maddr(d, dfn_to_daddr(dfn), 0, NULL, false); +- +- spin_unlock(&hd->arch.mapping_lock); ++ val = addr_to_dma_page_maddr(d, ctx, dfn_to_daddr(dfn), 0, NULL, false); + + if ( val < PAGE_SIZE ) + return -ENOENT; +@@ -2323,7 +1883,7 @@ static bool __init vtd_ept_page_compatible(const struct vtd_iommu *iommu) + + /* EPT is not initialised yet, so we must check the capability in + * the MSR explicitly rather than use cpu_has_vmx_ept_*() */ +- if ( rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, ept_cap) != 0 ) ++ if ( rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, ept_cap) != 0 ) + return false; + + return (ept_has_2mb(ept_cap) && opt_hap_2mb) <= +@@ -2332,44 +1892,6 @@ static bool __init vtd_ept_page_compatible(const struct vtd_iommu *iommu) + (cap_sps_1gb(vtd_cap) && iommu_superpages); + } + +-static int cf_check intel_iommu_add_device(u8 devfn, struct pci_dev *pdev) +-{ +- struct acpi_rmrr_unit *rmrr; +- u16 bdf; +- int ret, i; +- +- ASSERT(pcidevs_locked()); +- +- if ( !pdev->domain ) +- return -EINVAL; +- +- for_each_rmrr_device ( rmrr, bdf, i ) +- { +- if ( rmrr->segment == pdev->seg && bdf == PCI_BDF(pdev->bus, devfn) ) +- { +- /* +- * iommu_add_device() is only called for the hardware +- * domain (see xen/drivers/passthrough/pci.c:pci_add_device()). +- * Since RMRRs are always reserved in the e820 map for the hardware +- * domain, there shouldn't be a conflict. 
+- */ +- ret = iommu_identity_mapping(pdev->domain, p2m_access_rw, +- rmrr->base_address, rmrr->end_address, +- 0); +- if ( ret ) +- dprintk(XENLOG_ERR VTDPREFIX, "%pd: RMRR mapping failed\n", +- pdev->domain); +- } +- } +- +- ret = domain_context_mapping(pdev->domain, devfn, pdev); +- if ( ret ) +- dprintk(XENLOG_ERR VTDPREFIX, "%pd: context mapping failed\n", +- pdev->domain); +- +- return ret; +-} +- + static int cf_check intel_iommu_enable_device(struct pci_dev *pdev) + { + struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev); +@@ -2385,49 +1907,16 @@ static int cf_check intel_iommu_enable_device(struct pci_dev *pdev) + return ret >= 0 ? 0 : ret; + } + +-static int cf_check intel_iommu_remove_device(u8 devfn, struct pci_dev *pdev) +-{ +- const struct acpi_drhd_unit *drhd; +- struct acpi_rmrr_unit *rmrr; +- u16 bdf; +- unsigned int i; +- +- if ( !pdev->domain ) +- return -EINVAL; +- +- drhd = domain_context_unmap(pdev->domain, devfn, pdev); +- if ( IS_ERR(drhd) ) +- return PTR_ERR(drhd); +- +- for_each_rmrr_device ( rmrr, bdf, i ) +- { +- if ( rmrr->segment != pdev->seg || bdf != PCI_BDF(pdev->bus, devfn) ) +- continue; +- +- /* +- * Any flag is nothing to clear these mappings but here +- * its always safe and strict to set 0. +- */ +- iommu_identity_mapping(pdev->domain, p2m_access_x, rmrr->base_address, +- rmrr->end_address, 0); +- } +- +- quarantine_teardown(pdev, drhd); +- +- if ( drhd ) +- { +- iommu_free_domid(pdev->arch.pseudo_domid, +- drhd->iommu->pseudo_domid_map); +- pdev->arch.pseudo_domid = DOMID_INVALID; +- } +- +- return 0; +-} +- + static int __hwdom_init cf_check setup_hwdom_device( + u8 devfn, struct pci_dev *pdev) + { +- return domain_context_mapping(pdev->domain, devfn, pdev); ++ if (pdev->type == DEV_TYPE_PCI_HOST_BRIDGE || ++ pdev->type == DEV_TYPE_PCIe_BRIDGE || ++ pdev->type == DEV_TYPE_PCIe2PCI_BRIDGE || ++ pdev->type == DEV_TYPE_LEGACY_PCI_BRIDGE) ++ return 0; ++ ++ return iommu_attach_context(hardware_domain, pdev, 0); + } + + void clear_fault_bits(struct vtd_iommu *iommu) +@@ -2521,7 +2010,7 @@ static int __must_check init_vtd_hw(bool resume) + + /* + * Enable queue invalidation +- */ ++ */ + for_each_drhd_unit ( drhd ) + { + iommu = drhd->iommu; +@@ -2542,7 +2031,7 @@ static int __must_check init_vtd_hw(bool resume) + + /* + * Enable interrupt remapping +- */ ++ */ + if ( iommu_intremap != iommu_intremap_off ) + { + int apic; +@@ -2597,34 +2086,53 @@ static int __must_check init_vtd_hw(bool resume) + return iommu_flush_all(); + } + +-static void __hwdom_init setup_hwdom_rmrr(struct domain *d) +-{ +- struct acpi_rmrr_unit *rmrr; +- u16 bdf; +- int ret, i; ++static struct iommu_state { ++ uint32_t fectl; ++} *__read_mostly iommu_state; + +- pcidevs_lock(); +- for_each_rmrr_device ( rmrr, bdf, i ) ++static void arch_iommu_dump_domain_contexts(struct domain *d) ++{ ++ unsigned int i, iommu_no; ++ struct pci_dev *pdev; ++ struct iommu_context *ctx; ++ struct domain_iommu *hd = dom_iommu(d); ++ ++ printk("d%hu contexts\n", d->domain_id); ++ ++ for (i = 0; i < (1 + hd->other_contexts.count); ++i) + { +- /* +- * Here means we're add a device to the hardware domain. +- * Since RMRRs are always reserved in the e820 map for the hardware +- * domain, there shouldn't be a conflict. So its always safe and +- * strict to set 0. 
+- */ +- ret = iommu_identity_mapping(d, p2m_access_rw, rmrr->base_address, +- rmrr->end_address, 0); +- if ( ret ) +- dprintk(XENLOG_ERR VTDPREFIX, +- "IOMMU: mapping reserved region failed\n"); ++ if ( (ctx = iommu_get_context(d, i)) ) ++ { ++ printk(" Context %d (%"PRIx64")\n", i, ctx->arch.vtd.pgd_maddr); ++ ++ for (iommu_no = 0; iommu_no < nr_iommus; iommu_no++) ++ printk(" IOMMU %hu (used=%u; did=%hu)\n", iommu_no, ++ test_bit(iommu_no, ctx->arch.vtd.iommu_bitmap), ++ ctx->arch.vtd.didmap[iommu_no]); ++ ++ list_for_each_entry(pdev, &ctx->devices, context_list) ++ { ++ printk(" - %pp\n", &pdev->sbdf); ++ } ++ ++ iommu_put_context(ctx); ++ } + } +- pcidevs_unlock(); + } + +-static struct iommu_state { +- uint32_t fectl; +-} *__read_mostly iommu_state; ++static void arch_iommu_dump_contexts(unsigned char key) ++{ ++ struct domain *d; ++ ++ for_each_domain(d) ++ if (is_iommu_enabled(d)) { ++ struct domain_iommu *hd = dom_iommu(d); ++ printk("d%hu arena page usage: %d\n", d->domain_id, ++ atomic_read(&hd->arch.pt_arena.used_pages)); + ++ arch_iommu_dump_domain_contexts(d); ++ } ++} + static int __init cf_check vtd_setup(void) + { + struct acpi_drhd_unit *drhd; +@@ -2752,6 +2260,7 @@ static int __init cf_check vtd_setup(void) + iommu_ops.page_sizes |= large_sizes; + + register_keyhandler('V', vtd_dump_iommu_info, "dump iommu info", 1); ++ register_keyhandler('X', arch_iommu_dump_contexts, "dump iommu contexts", 1); + + return 0; + +@@ -2766,192 +2275,6 @@ static int __init cf_check vtd_setup(void) + return ret; + } + +-static int cf_check reassign_device_ownership( +- struct domain *source, +- struct domain *target, +- u8 devfn, struct pci_dev *pdev) +-{ +- int ret; +- +- if ( !QUARANTINE_SKIP(target, pdev->arch.vtd.pgd_maddr) ) +- { +- if ( !has_arch_pdevs(target) ) +- vmx_pi_hooks_assign(target); +- +-#ifdef CONFIG_PV +- /* +- * Devices assigned to untrusted domains (here assumed to be any domU) +- * can attempt to send arbitrary LAPIC/MSI messages. We are unprotected +- * by the root complex unless interrupt remapping is enabled. +- */ +- if ( !iommu_intremap && !is_hardware_domain(target) && +- !is_system_domain(target) ) +- untrusted_msi = true; +-#endif +- +- ret = domain_context_mapping(target, devfn, pdev); +- +- if ( !ret && pdev->devfn == devfn && +- !QUARANTINE_SKIP(source, pdev->arch.vtd.pgd_maddr) ) +- { +- const struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev); +- +- if ( drhd ) +- check_cleanup_domid_map(source, pdev, drhd->iommu); +- } +- } +- else +- { +- const struct acpi_drhd_unit *drhd; +- +- drhd = domain_context_unmap(source, devfn, pdev); +- ret = IS_ERR(drhd) ? PTR_ERR(drhd) : 0; +- } +- if ( ret ) +- { +- if ( !has_arch_pdevs(target) ) +- vmx_pi_hooks_deassign(target); +- return ret; +- } +- +- if ( devfn == pdev->devfn && pdev->domain != target ) +- { +- write_lock(&source->pci_lock); +- list_del(&pdev->domain_list); +- write_unlock(&source->pci_lock); +- +- pdev->domain = target; +- +- write_lock(&target->pci_lock); +- list_add(&pdev->domain_list, &target->pdev_list); +- write_unlock(&target->pci_lock); +- } +- +- if ( !has_arch_pdevs(source) ) +- vmx_pi_hooks_deassign(source); +- +- /* +- * If the device belongs to the hardware domain, and it has RMRR, don't +- * remove it from the hardware domain, because BIOS may use RMRR at +- * booting time. 
+- */ +- if ( !is_hardware_domain(source) ) +- { +- const struct acpi_rmrr_unit *rmrr; +- u16 bdf; +- unsigned int i; +- +- for_each_rmrr_device( rmrr, bdf, i ) +- if ( rmrr->segment == pdev->seg && +- bdf == PCI_BDF(pdev->bus, devfn) ) +- { +- /* +- * Any RMRR flag is always ignored when remove a device, +- * but its always safe and strict to set 0. +- */ +- ret = iommu_identity_mapping(source, p2m_access_x, +- rmrr->base_address, +- rmrr->end_address, 0); +- if ( ret && ret != -ENOENT ) +- return ret; +- } +- } +- +- return 0; +-} +- +-static int cf_check intel_iommu_assign_device( +- struct domain *d, u8 devfn, struct pci_dev *pdev, u32 flag) +-{ +- struct domain *s = pdev->domain; +- struct acpi_rmrr_unit *rmrr; +- int ret = 0, i; +- u16 bdf, seg; +- u8 bus; +- +- if ( list_empty(&acpi_drhd_units) ) +- return -ENODEV; +- +- seg = pdev->seg; +- bus = pdev->bus; +- /* +- * In rare cases one given rmrr is shared by multiple devices but +- * obviously this would put the security of a system at risk. So +- * we would prevent from this sort of device assignment. But this +- * can be permitted if user set +- * "pci = [ 'sbdf, rdm_policy=relaxed' ]" +- * +- * TODO: in the future we can introduce group device assignment +- * interface to make sure devices sharing RMRR are assigned to the +- * same domain together. +- */ +- for_each_rmrr_device( rmrr, bdf, i ) +- { +- if ( rmrr->segment == seg && bdf == PCI_BDF(bus, devfn) && +- rmrr->scope.devices_cnt > 1 ) +- { +- bool relaxed = flag & XEN_DOMCTL_DEV_RDM_RELAXED; +- +- printk(XENLOG_GUEST "%s" VTDPREFIX +- " It's %s to assign %pp" +- " with shared RMRR at %"PRIx64" for %pd.\n", +- relaxed ? XENLOG_WARNING : XENLOG_ERR, +- relaxed ? "risky" : "disallowed", +- &PCI_SBDF(seg, bus, devfn), rmrr->base_address, d); +- if ( !relaxed ) +- return -EPERM; +- } +- } +- +- if ( d == dom_io ) +- return reassign_device_ownership(s, d, devfn, pdev); +- +- /* Setup rmrr identity mapping */ +- for_each_rmrr_device( rmrr, bdf, i ) +- { +- if ( rmrr->segment == seg && bdf == PCI_BDF(bus, devfn) ) +- { +- ret = iommu_identity_mapping(d, p2m_access_rw, rmrr->base_address, +- rmrr->end_address, flag); +- if ( ret ) +- { +- printk(XENLOG_G_ERR VTDPREFIX +- "%pd: cannot map reserved region [%"PRIx64",%"PRIx64"]: %d\n", +- d, rmrr->base_address, rmrr->end_address, ret); +- break; +- } +- } +- } +- +- if ( !ret ) +- ret = reassign_device_ownership(s, d, devfn, pdev); +- +- /* See reassign_device_ownership() for the hwdom aspect. 
*/ +- if ( !ret || is_hardware_domain(d) ) +- return ret; +- +- for_each_rmrr_device( rmrr, bdf, i ) +- { +- if ( rmrr->segment == seg && bdf == PCI_BDF(bus, devfn) ) +- { +- int rc = iommu_identity_mapping(d, p2m_access_x, +- rmrr->base_address, +- rmrr->end_address, 0); +- +- if ( rc && rc != -ENOENT ) +- { +- printk(XENLOG_ERR VTDPREFIX +- "%pd: cannot unmap reserved region [%"PRIx64",%"PRIx64"]: %d\n", +- d, rmrr->base_address, rmrr->end_address, rc); +- domain_crash(d); +- break; +- } +- } +- } +- +- return ret; +-} +- + static int cf_check intel_iommu_group_id(u16 seg, u8 bus, u8 devfn) + { + u8 secbus; +@@ -3076,6 +2399,11 @@ static void vtd_dump_page_table_level(paddr_t pt_maddr, int level, paddr_t gpa, + if ( level < 1 ) + return; + ++ if (pt_maddr == 0) { ++ printk(" (empty)\n"); ++ return; ++ } ++ + pt_vaddr = map_vtd_domain_page(pt_maddr); + + next_level = level - 1; +@@ -3106,158 +2434,374 @@ static void vtd_dump_page_table_level(paddr_t pt_maddr, int level, paddr_t gpa, + + static void cf_check vtd_dump_page_tables(struct domain *d) + { +- const struct domain_iommu *hd = dom_iommu(d); ++ struct domain_iommu *hd = dom_iommu(d); ++ unsigned int i; + +- printk(VTDPREFIX" %pd table has %d levels\n", d, ++ printk(VTDPREFIX " %pd table has %d levels\n", d, + agaw_to_level(hd->arch.vtd.agaw)); +- vtd_dump_page_table_level(hd->arch.vtd.pgd_maddr, +- agaw_to_level(hd->arch.vtd.agaw), 0, 0); ++ ++ for (i = 1; i < (1 + hd->other_contexts.count); ++i) ++ { ++ struct iommu_context *ctx = iommu_get_context(d, i); ++ ++ printk(VTDPREFIX " %pd context %d: %s\n", d, i, ++ ctx ? "allocated" : "non-allocated"); ++ ++ if (ctx) ++ { ++ vtd_dump_page_table_level(ctx->arch.vtd.pgd_maddr, ++ agaw_to_level(hd->arch.vtd.agaw), 0, 0); ++ iommu_put_context(ctx); ++ } ++ } + } + +-static int fill_qpt(struct dma_pte *this, unsigned int level, +- struct page_info *pgs[6]) ++static int intel_iommu_context_init(struct domain *d, struct iommu_context *ctx, u32 flags) + { +- struct domain_iommu *hd = dom_iommu(dom_io); +- unsigned int i; +- int rc = 0; ++ struct acpi_drhd_unit *drhd; ++ ++ ctx->arch.vtd.didmap = xzalloc_array(u16, nr_iommus); ++ ++ if ( !ctx->arch.vtd.didmap ) ++ return -ENOMEM; + +- for ( i = 0; !rc && i < PTE_NUM; ++i ) ++ ctx->arch.vtd.iommu_bitmap = xzalloc_array(unsigned long, ++ BITS_TO_LONGS(nr_iommus)); ++ if ( !ctx->arch.vtd.iommu_bitmap ) ++ return -ENOMEM; ++ ++ ctx->arch.vtd.superpage_progress = 0; ++ ++ if ( flags & IOMMU_CONTEXT_INIT_default ) + { +- struct dma_pte *pte = &this[i], *next; ++ ctx->arch.vtd.pgd_maddr = 0; + +- if ( !dma_pte_present(*pte) ) ++ /* ++ * Context is considered "opaque" (non-managed) in these cases : ++ * - HAP is enabled, in this case, the pagetable is not managed by the ++ * IOMMU code, thus opaque ++ * - IOMMU is in passthrough which means that there is no actual pagetable ++ * ++ * If no-dma mode is specified, it's always non-opaque as the pagetable is ++ * always managed regardless of the rest. ++ */ ++ ctx->arch.hap_context = !iommu_hwdom_no_dma && (iommu_use_hap_pt(d) || iommu_hwdom_passthrough); ++ ++ ctx->opaque = ctx->arch.hap_context; ++ ++ /* Populate context DID map using domain id. */ ++ for_each_drhd_unit(drhd) + { +- if ( !pgs[level] ) +- { +- /* +- * The pgtable allocator is fine for the leaf page, as well as +- * page table pages, and the resulting allocations are always +- * zeroed. 
+- */ +- pgs[level] = iommu_alloc_pgtable(hd, 0); +- if ( !pgs[level] ) +- { +- rc = -ENOMEM; +- break; +- } +- +- if ( level ) +- { +- next = map_vtd_domain_page(page_to_maddr(pgs[level])); +- rc = fill_qpt(next, level - 1, pgs); +- unmap_vtd_domain_page(next); +- } +- } ++ ctx->arch.vtd.didmap[drhd->iommu->index] = ++ convert_domid(drhd->iommu, d->domain_id); ++ } ++ } ++ else ++ { ++ /* Populate context DID map using pseudo DIDs */ ++ for_each_drhd_unit(drhd) ++ { ++ ctx->arch.vtd.didmap[drhd->iommu->index] = ++ iommu_alloc_domid(drhd->iommu->pseudo_domid_map); ++ } ++ } ++ ++ if ( !ctx->opaque ) ++ /* Create initial context page */ ++ addr_to_dma_page_maddr(d, ctx, 0, min_pt_levels, NULL, true); ++ ++ return arch_iommu_context_init(d, ctx, flags); ++} ++ ++static int intel_iommu_cleanup_pte(uint64_t pte_maddr, bool preempt) ++{ ++ size_t i; ++ struct dma_pte *pte = map_vtd_domain_page(pte_maddr); ++ ++ for (i = 0; i < (1 << PAGETABLE_ORDER); ++i) ++ if ( dma_pte_present(pte[i]) ) ++ { ++ /* Remove the reference of the target mapping (if needed) */ ++ mfn_t mfn = maddr_to_mfn(dma_pte_addr(pte[i])); ++ ++ if ( mfn_valid(mfn) ) ++ put_page(mfn_to_page(mfn)); + +- dma_set_pte_addr(*pte, page_to_maddr(pgs[level])); +- dma_set_pte_readable(*pte); +- dma_set_pte_writable(*pte); ++ if ( preempt ) ++ dma_clear_pte(pte[i]); + } +- else if ( level && !dma_pte_superpage(*pte) ) ++ ++ unmap_vtd_domain_page(pte); ++ ++ return 0; ++} ++ ++/** ++ * Cleanup logic : ++ * Walk through the entire page table, progressively removing mappings if preempt. ++ * ++ * Return values : ++ * - Report preemption with -ERESTART. ++ * - Report empty pte/pgd with 0. ++ * ++ * When preempted during superpage operation, store state in vtd.superpage_progress. ++ */ ++ ++static int intel_iommu_cleanup_superpage(struct iommu_context *ctx, ++ unsigned int page_order, uint64_t pte_maddr, ++ bool preempt) ++{ ++ size_t i = 0, page_count = 1 << page_order; ++ struct page_info *page = maddr_to_page(pte_maddr); ++ ++ if ( preempt ) ++ i = ctx->arch.vtd.superpage_progress; ++ ++ for (; i < page_count; page++) ++ { ++ put_page(page); ++ ++ if ( preempt && (i & 0xff) && general_preempt_check() ) + { +- next = map_vtd_domain_page(dma_pte_addr(*pte)); +- rc = fill_qpt(next, level - 1, pgs); +- unmap_vtd_domain_page(next); ++ ctx->arch.vtd.superpage_progress = i + 1; ++ return -ERESTART; + } + } + +- return rc; ++ if ( preempt ) ++ ctx->arch.vtd.superpage_progress = 0; ++ ++ return 0; + } + +-static int cf_check intel_iommu_quarantine_init(struct pci_dev *pdev, +- bool scratch_page) ++static int intel_iommu_cleanup_mappings(struct iommu_context *ctx, ++ unsigned int nr_pt_levels, uint64_t pgd_maddr, ++ bool preempt) + { +- struct domain_iommu *hd = dom_iommu(dom_io); +- struct page_info *pg; +- unsigned int agaw = hd->arch.vtd.agaw; +- unsigned int level = agaw_to_level(agaw); +- const struct acpi_drhd_unit *drhd; +- const struct acpi_rmrr_unit *rmrr; +- unsigned int i, bdf; +- bool rmrr_found = false; ++ size_t i; + int rc; ++ struct dma_pte *pgd; + +- ASSERT(pcidevs_locked()); +- ASSERT(!hd->arch.vtd.pgd_maddr); +- ASSERT(page_list_empty(&hd->arch.pgtables.list)); ++ if ( ctx->opaque ) ++ /* don't touch opaque contexts */ ++ return 0; ++ ++ pgd = map_vtd_domain_page(pgd_maddr); + +- if ( pdev->arch.vtd.pgd_maddr ) ++ for (i = 0; i < (1 << PAGETABLE_ORDER); ++i) + { +- clear_domain_page(pdev->arch.leaf_mfn); +- return 0; ++ if ( dma_pte_present(pgd[i]) ) ++ { ++ uint64_t pte_maddr = dma_pte_addr(pgd[i]); ++ ++ if ( 
dma_pte_superpage(pgd[i]) ) ++ rc = intel_iommu_cleanup_superpage(ctx, nr_pt_levels * SUPERPAGE_ORDER, ++ pte_maddr, preempt); ++ else if ( nr_pt_levels > 2 ) ++ /* Next level is not PTE */ ++ rc = intel_iommu_cleanup_mappings(ctx, nr_pt_levels - 1, ++ pte_maddr, preempt); ++ else ++ rc = intel_iommu_cleanup_pte(pte_maddr, preempt); ++ ++ if ( preempt && !rc ) ++ /* Fold pgd (no more mappings in it) */ ++ dma_clear_pte(pgd[i]); ++ else if ( preempt && (rc == -ERESTART || general_preempt_check()) ) ++ { ++ unmap_vtd_domain_page(pgd); ++ return -ERESTART; ++ } ++ } + } + +- drhd = acpi_find_matched_drhd_unit(pdev); +- if ( !drhd ) +- return -ENODEV; ++ unmap_vtd_domain_page(pgd); + +- pg = iommu_alloc_pgtable(hd, 0); +- if ( !pg ) +- return -ENOMEM; ++ return 0; ++} + +- rc = context_set_domain_id(NULL, pdev->arch.pseudo_domid, drhd->iommu); ++static int intel_iommu_context_teardown(struct domain *d, struct iommu_context *ctx, u32 flags) ++{ ++ struct acpi_drhd_unit *drhd; ++ pcidevs_lock(); + +- /* Transiently install the root into DomIO, for iommu_identity_mapping(). */ +- hd->arch.vtd.pgd_maddr = page_to_maddr(pg); ++ // Cleanup mappings ++ if ( intel_iommu_cleanup_mappings(ctx, agaw_to_level(d->iommu.arch.vtd.agaw), ++ ctx->arch.vtd.pgd_maddr, ++ flags & IOMMUF_preempt) < 0 ) ++ { ++ pcidevs_unlock(); ++ return -ERESTART; ++ } + +- for_each_rmrr_device ( rmrr, bdf, i ) ++ if (ctx->arch.vtd.didmap) + { +- if ( rc ) +- break; ++ for_each_drhd_unit(drhd) ++ { ++ iommu_free_domid(ctx->arch.vtd.didmap[drhd->iommu->index], ++ drhd->iommu->pseudo_domid_map); ++ } ++ ++ xfree(ctx->arch.vtd.didmap); ++ } ++ ++ pcidevs_unlock(); ++ return arch_iommu_context_teardown(d, ctx, flags); ++} + +- if ( rmrr->segment == pdev->seg && bdf == pdev->sbdf.bdf ) ++static int intel_iommu_dev_rmrr(struct domain *d, struct pci_dev *pdev, ++ struct iommu_context *ctx, bool unmap) ++{ ++ struct acpi_rmrr_unit *rmrr; ++ u16 bdf; ++ int ret, i; ++ ++ for_each_rmrr_device(rmrr, bdf, i) ++ { ++ if ( PCI_SBDF(rmrr->segment, bdf).sbdf == pdev->sbdf.sbdf ) + { +- rmrr_found = true; +- +- rc = iommu_identity_mapping(dom_io, p2m_access_rw, +- rmrr->base_address, rmrr->end_address, +- 0); +- if ( rc ) +- printk(XENLOG_ERR VTDPREFIX +- "%pp: RMRR quarantine mapping failed\n", +- &pdev->sbdf); ++ ret = iommu_identity_mapping(d, ctx, ++ unmap ? 
p2m_access_x : p2m_access_rw, ++ rmrr->base_address, rmrr->end_address, ++ 0); ++ ++ if ( ret < 0 ) ++ return ret; + } + } + +- iommu_identity_map_teardown(dom_io); +- hd->arch.vtd.pgd_maddr = 0; +- pdev->arch.vtd.pgd_maddr = page_to_maddr(pg); ++ return 0; ++} ++ ++static int intel_iommu_attach(struct domain *d, struct pci_dev *pdev, ++ struct iommu_context *ctx) ++{ ++ int ret; ++ const struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev); ++ ++ if (!pdev || !drhd) ++ return -EINVAL; + +- if ( !rc && scratch_page ) ++ if ( !ctx->opaque || ctx->arch.hap_context ) + { +- struct dma_pte *root; +- struct page_info *pgs[6] = {}; ++ ret = intel_iommu_dev_rmrr(d, pdev, ctx, false); ++ ++ if ( ret ) ++ return ret; ++ } ++ ++ ret = apply_context(d, ctx, pdev, pdev->devfn); ++ ++ if ( ret ) ++ return ret; ++ ++ pci_vtd_quirk(pdev); + +- root = map_vtd_domain_page(pdev->arch.vtd.pgd_maddr); +- rc = fill_qpt(root, level - 1, pgs); +- unmap_vtd_domain_page(root); ++ return ret; ++} ++ ++static int intel_iommu_detach(struct domain *d, struct pci_dev *pdev, ++ struct iommu_context *prev_ctx) ++{ ++ int ret; ++ const struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev); ++ ++ if (!pdev || !drhd) ++ return -EINVAL; ++ ++ ret = unapply_context_single(d, drhd->iommu, pdev->bus, pdev->devfn); + +- pdev->arch.leaf_mfn = page_to_mfn(pgs[0]); ++ if ( ret ) ++ return ret; ++ ++ if ( !prev_ctx->opaque || prev_ctx->arch.hap_context ) ++ WARN_ON(intel_iommu_dev_rmrr(d, pdev, prev_ctx, true)); ++ ++ check_cleanup_domid_map(d, prev_ctx, NULL, drhd->iommu); ++ ++ return ret; ++} ++ ++static int intel_iommu_reattach(struct domain *d, struct pci_dev *pdev, ++ struct iommu_context *prev_ctx, ++ struct iommu_context *ctx) ++{ ++ int ret; ++ const struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev); ++ ++ if (!pdev || !drhd) ++ return -EINVAL; ++ ++ if ( !ctx->opaque || ctx->arch.hap_context ) ++ { ++ ret = intel_iommu_dev_rmrr(d, pdev, ctx, false); ++ ++ if ( ret ) ++ return ret; + } + +- page_list_move(&pdev->arch.pgtables_list, &hd->arch.pgtables.list); ++ ret = apply_context_single(d, ctx, drhd->iommu, pdev->bus, pdev->devfn); ++ ++ if ( ret ) ++ return ret; + +- if ( rc || (!scratch_page && !rmrr_found) ) +- quarantine_teardown(pdev, drhd); ++ if ( !prev_ctx->opaque || prev_ctx->arch.hap_context ) ++ WARN_ON(intel_iommu_dev_rmrr(d, pdev, prev_ctx, true)); + +- return rc; ++ /* We are overwriting an entry, cleanup previous domid if needed. 
*/ ++ check_cleanup_domid_map(d, prev_ctx, pdev, drhd->iommu); ++ ++ pci_vtd_quirk(pdev); ++ ++ return ret; ++} ++ ++static int intel_iommu_add_devfn(struct domain *d, struct pci_dev *pdev, ++ u16 devfn, struct iommu_context *ctx) ++{ ++ const struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev); ++ ++ if (!pdev || !drhd) ++ return -EINVAL; ++ ++ return apply_context(d, ctx, pdev, devfn); ++} ++ ++static int intel_iommu_remove_devfn(struct domain *d, struct pci_dev *pdev, ++ u16 devfn) ++{ ++ const struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev); ++ ++ if (!pdev || !drhd) ++ return -EINVAL; ++ ++ return unapply_context_single(d, drhd->iommu, pdev->bus, devfn); ++} ++ ++static uint64_t intel_iommu_get_max_iova(struct domain *d) ++{ ++ struct domain_iommu *hd = dom_iommu(d); ++ ++ return (1LLU << agaw_to_width(hd->arch.vtd.agaw)) - 1; + } + + static const struct iommu_ops __initconst_cf_clobber vtd_ops = { + .page_sizes = PAGE_SIZE_4K, + .init = intel_iommu_domain_init, + .hwdom_init = intel_iommu_hwdom_init, +- .quarantine_init = intel_iommu_quarantine_init, +- .add_device = intel_iommu_add_device, ++ .context_init = intel_iommu_context_init, ++ .context_teardown = intel_iommu_context_teardown, ++ .attach = intel_iommu_attach, ++ .detach = intel_iommu_detach, ++ .reattach = intel_iommu_reattach, ++ .add_devfn = intel_iommu_add_devfn, ++ .remove_devfn = intel_iommu_remove_devfn, + .enable_device = intel_iommu_enable_device, +- .remove_device = intel_iommu_remove_device, +- .assign_device = intel_iommu_assign_device, + .teardown = iommu_domain_teardown, + .clear_root_pgtable = iommu_clear_root_pgtable, + .map_page = intel_iommu_map_page, + .unmap_page = intel_iommu_unmap_page, + .lookup_page = intel_iommu_lookup_page, +- .reassign_device = reassign_device_ownership, + .get_device_group_id = intel_iommu_group_id, + .enable_x2apic = intel_iommu_enable_eim, + .disable_x2apic = intel_iommu_disable_eim, +@@ -3272,6 +2816,7 @@ static const struct iommu_ops __initconst_cf_clobber vtd_ops = { + .iotlb_flush = iommu_flush_iotlb, + .get_reserved_device_memory = intel_iommu_get_reserved_device_memory, + .dump_page_tables = vtd_dump_page_tables, ++ .get_max_iova = intel_iommu_get_max_iova, + }; + + const struct iommu_init_ops __initconstrel intel_iommu_init_ops = { +diff --git a/xen/drivers/passthrough/vtd/quirks.c b/xen/drivers/passthrough/vtd/quirks.c +index dc3dac749ce6..6bf19b4f6c0d 100644 +--- a/xen/drivers/passthrough/vtd/quirks.c ++++ b/xen/drivers/passthrough/vtd/quirks.c +@@ -408,9 +408,8 @@ void __init platform_quirks_init(void) + + static int __must_check map_me_phantom_function(struct domain *domain, + unsigned int dev, +- domid_t domid, +- paddr_t pgd_maddr, +- unsigned int mode) ++ unsigned int mode, ++ struct iommu_context *ctx) + { + struct acpi_drhd_unit *drhd; + struct pci_dev *pdev; +@@ -422,18 +421,17 @@ static int __must_check map_me_phantom_function(struct domain *domain, + + /* map or unmap ME phantom function */ + if ( !(mode & UNMAP_ME_PHANTOM_FUNC) ) +- rc = domain_context_mapping_one(domain, drhd->iommu, 0, +- PCI_DEVFN(dev, 7), NULL, +- domid, pgd_maddr, mode); ++ rc = apply_context_single(domain, ctx, drhd->iommu, 0, ++ PCI_DEVFN(dev, 7)); + else +- rc = domain_context_unmap_one(domain, drhd->iommu, 0, +- PCI_DEVFN(dev, 7)); ++ rc = unapply_context_single(domain, drhd->iommu, 0, PCI_DEVFN(dev, 7)); + + return rc; + } + + int me_wifi_quirk(struct domain *domain, uint8_t bus, uint8_t devfn, +- domid_t domid, paddr_t pgd_maddr, unsigned int mode) ++ 
domid_t domid, unsigned int mode, ++ struct iommu_context *ctx) + { + u32 id; + int rc = 0; +@@ -457,7 +455,7 @@ int me_wifi_quirk(struct domain *domain, uint8_t bus, uint8_t devfn, + case 0x423b8086: + case 0x423c8086: + case 0x423d8086: +- rc = map_me_phantom_function(domain, 3, domid, pgd_maddr, mode); ++ rc = map_me_phantom_function(domain, 3, mode, ctx); + break; + default: + break; +@@ -483,7 +481,7 @@ int me_wifi_quirk(struct domain *domain, uint8_t bus, uint8_t devfn, + case 0x42388086: /* Puma Peak */ + case 0x422b8086: + case 0x422c8086: +- rc = map_me_phantom_function(domain, 22, domid, pgd_maddr, mode); ++ rc = map_me_phantom_function(domain, 22, mode, ctx); + break; + default: + break; +diff --git a/xen/drivers/passthrough/x86/Makefile b/xen/drivers/passthrough/x86/Makefile +index 75b288533640..1614f3d2840b 100644 +--- a/xen/drivers/passthrough/x86/Makefile ++++ b/xen/drivers/passthrough/x86/Makefile +@@ -1,2 +1,3 @@ + obj-y += iommu.o ++obj-y += arena.o + obj-$(CONFIG_HVM) += hvm.o +diff --git a/xen/drivers/passthrough/x86/arena.c b/xen/drivers/passthrough/x86/arena.c +new file mode 100644 +index 000000000000..984bc4d643f1 +--- /dev/null ++++ b/xen/drivers/passthrough/x86/arena.c +@@ -0,0 +1,157 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/** ++ * Simple arena-based page allocator. ++ * ++ * Allocate a large block using alloc_domheam_pages and allocate single pages ++ * using iommu_arena_allocate_page and iommu_arena_free_page functions. ++ * ++ * Concurrent {allocate/free}_page is thread-safe ++ * iommu_arena_teardown during {allocate/free}_page is not thread-safe. ++ * ++ * Written by Teddy Astie ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++/* Maximum of scan tries if the bit found not available */ ++#define ARENA_TSL_MAX_TRIES 5 ++ ++int iommu_arena_initialize(struct iommu_arena *arena, struct domain *d, ++ unsigned int order, unsigned int memflags) ++{ ++ struct page_info *page; ++ ++ /* TODO: Maybe allocate differently ? 
*/ ++ page = alloc_domheap_pages(d, order, memflags); ++ ++ if ( !page ) ++ return -ENOMEM; ++ ++ arena->map = xzalloc_array(unsigned long, BITS_TO_LONGS(1LLU << order)); ++ arena->order = order; ++ arena->region_start = page_to_mfn(page); ++ ++ _atomic_set(&arena->used_pages, 0); ++ bitmap_zero(arena->map, iommu_arena_size(arena)); ++ ++ printk(XENLOG_DEBUG "IOMMU: Allocated arena (%llu pages, start=%"PRI_mfn")\n", ++ iommu_arena_size(arena), mfn_x(arena->region_start)); ++ return 0; ++} ++ ++int iommu_arena_teardown(struct iommu_arena *arena, bool check) ++{ ++ BUG_ON(mfn_x(arena->region_start) == 0); ++ ++ /* Check for allocations if check is specified */ ++ if ( check && (atomic_read(&arena->used_pages) > 0) ) ++ return -EBUSY; ++ ++ free_domheap_pages(mfn_to_page(arena->region_start), arena->order); ++ ++ arena->region_start = _mfn(0); ++ _atomic_set(&arena->used_pages, 0); ++ xfree(arena->map); ++ arena->map = NULL; ++ ++ return 0; ++} ++ ++struct page_info *iommu_arena_allocate_page(struct iommu_arena *arena) ++{ ++ unsigned int index; ++ unsigned int tsl_tries = 0; ++ ++ BUG_ON(mfn_x(arena->region_start) == 0); ++ ++ if ( atomic_read(&arena->used_pages) == iommu_arena_size(arena) ) ++ /* All pages used */ ++ return NULL; ++ ++ do ++ { ++ index = find_first_zero_bit(arena->map, iommu_arena_size(arena)); ++ ++ if ( index >= iommu_arena_size(arena) ) ++ /* No more free pages */ ++ return NULL; ++ ++ /* ++ * While there shouldn't be a lot of retries in practice, this loop ++ * *may* run indefinetly if the found bit is never free due to being ++ * overwriten by another CPU core right after. Add a safeguard for ++ * such very rare cases. ++ */ ++ tsl_tries++; ++ ++ if ( unlikely(tsl_tries == ARENA_TSL_MAX_TRIES) ) ++ { ++ printk(XENLOG_ERR "ARENA: Too many TSL retries !"); ++ return NULL; ++ } ++ ++ /* Make sure that the bit we found is still free */ ++ } while ( test_and_set_bit(index, arena->map) ); ++ ++ atomic_inc(&arena->used_pages); ++ ++ return mfn_to_page(mfn_add(arena->region_start, index)); ++} ++ ++bool iommu_arena_free_page(struct iommu_arena *arena, struct page_info *page) ++{ ++ unsigned long index; ++ mfn_t frame; ++ ++ if ( !page ) ++ { ++ printk(XENLOG_WARNING "IOMMU: Trying to free NULL page"); ++ WARN(); ++ return false; ++ } ++ ++ frame = page_to_mfn(page); ++ ++ /* Check if page belongs to our arena */ ++ if ( (mfn_x(frame) < mfn_x(arena->region_start)) ++ || (mfn_x(frame) >= (mfn_x(arena->region_start) + iommu_arena_size(arena))) ) ++ { ++ printk(XENLOG_WARNING ++ "IOMMU: Trying to free outside arena region [mfn=%"PRI_mfn"]", ++ mfn_x(frame)); ++ WARN(); ++ return false; ++ } ++ ++ index = mfn_x(frame) - mfn_x(arena->region_start); ++ ++ /* Sanity check in case of underflow. */ ++ ASSERT(index < iommu_arena_size(arena)); ++ ++ if ( !test_and_clear_bit(index, arena->map) ) ++ { ++ /* ++ * Bit was free during our arena_free_page, which means that ++ * either this page was never allocated, or we are in a double-free ++ * situation. ++ */ ++ printk(XENLOG_WARNING ++ "IOMMU: Freeing non-allocated region (double-free?) [mfn=%"PRI_mfn"]", ++ mfn_x(frame)); ++ WARN(); ++ return false; ++ } ++ ++ atomic_dec(&arena->used_pages); ++ ++ return true; ++} +\ No newline at end of file +diff --git a/xen/drivers/passthrough/x86/iommu.c b/xen/drivers/passthrough/x86/iommu.c +index 8b1e0596b84a..849f57c1ce21 100644 +--- a/xen/drivers/passthrough/x86/iommu.c ++++ b/xen/drivers/passthrough/x86/iommu.c +@@ -12,6 +12,12 @@ + * this program; If not, see . 
+ */ + ++#include ++#include ++#include ++#include ++#include ++#include + #include + #include + #include +@@ -28,6 +34,10 @@ + #include + #include + #include ++#include ++#include ++#include ++#include + + const struct iommu_init_ops *__initdata iommu_init_ops; + struct iommu_ops __ro_after_init iommu_ops; +@@ -183,19 +193,66 @@ void __hwdom_init arch_iommu_check_autotranslated_hwdom(struct domain *d) + panic("PVH hardware domain iommu must be set in 'strict' mode\n"); + } + +-int arch_iommu_domain_init(struct domain *d) ++int arch_iommu_context_init(struct domain *d, struct iommu_context *ctx, u32 flags) ++{ ++ INIT_PAGE_LIST_HEAD(&ctx->arch.pgtables); ++ INIT_PAGE_LIST_HEAD(&ctx->arch.free_queue); ++ INIT_LIST_HEAD(&ctx->arch.identity_maps); ++ ++ return 0; ++} ++ ++int arch_iommu_context_teardown(struct domain *d, struct iommu_context *ctx, u32 flags) ++{ ++ /* Cleanup all page tables */ ++ while ( iommu_free_pgtables(d, ctx) == -ERESTART ) ++ /* nothing */; ++ ++ return 0; ++} ++ ++int arch_iommu_flush_free_queue(struct domain *d, struct iommu_context *ctx) ++{ ++ struct page_info *pg; ++ struct domain_iommu *hd = dom_iommu(d); ++ ++ while ( (pg = page_list_remove_head(&ctx->arch.free_queue)) ) ++ iommu_arena_free_page(&hd->arch.pt_arena, pg); ++ ++ return 0; ++} ++ ++int arch_iommu_pviommu_init(struct domain *d, uint16_t nb_ctx, uint32_t arena_order) ++{ ++ struct domain_iommu *hd = dom_iommu(d); ++ ++ if ( arena_order == 0 ) ++ return 0; ++ ++ return iommu_arena_initialize(&hd->arch.pt_arena, NULL, arena_order, 0); ++} ++ ++int arch_iommu_pviommu_teardown(struct domain *d) + { + struct domain_iommu *hd = dom_iommu(d); + +- spin_lock_init(&hd->arch.mapping_lock); ++ if ( iommu_arena_teardown(&hd->arch.pt_arena, true) ) ++ { ++ printk(XENLOG_WARNING "IOMMU Arena used while being destroyed\n"); ++ WARN(); + +- INIT_PAGE_LIST_HEAD(&hd->arch.pgtables.list); +- spin_lock_init(&hd->arch.pgtables.lock); +- INIT_LIST_HEAD(&hd->arch.identity_maps); ++ /* Teardown anyway */ ++ iommu_arena_teardown(&hd->arch.pt_arena, false); ++ } + + return 0; + } + ++int arch_iommu_domain_init(struct domain *d) ++{ ++ return 0; ++} ++ + void arch_iommu_domain_destroy(struct domain *d) + { + /* +@@ -203,8 +260,9 @@ void arch_iommu_domain_destroy(struct domain *d) + * domain is destroyed. Note that arch_iommu_domain_destroy() is + * called unconditionally, so pgtables may be uninitialized. 
+ */ +- ASSERT(!dom_iommu(d)->platform_ops || +- page_list_empty(&dom_iommu(d)->arch.pgtables.list)); ++ struct domain_iommu *hd = dom_iommu(d); ++ ++ ASSERT(!hd->platform_ops); + } + + struct identity_map { +@@ -214,32 +272,104 @@ struct identity_map { + unsigned int count; + }; + +-int iommu_identity_mapping(struct domain *d, p2m_access_t p2ma, +- paddr_t base, paddr_t end, ++static int unmap_identity_region(struct domain *d, struct iommu_context *ctx, ++ unsigned int base_pfn, unsigned int end_pfn) ++{ ++ int ret = 0; ++ ++ if ( ctx->arch.hap_context ) ++ { ++ this_cpu(iommu_dont_flush_iotlb) = true; ++ while ( base_pfn < end_pfn ) ++ { ++ if ( p2m_remove_identity_entry(d, base_pfn) ) ++ ret = -ENXIO; ++ ++ base_pfn++; ++ } ++ this_cpu(iommu_dont_flush_iotlb) = false; ++ } ++ else ++ { ++ size_t page_count = end_pfn - base_pfn + 1; ++ unsigned int flush_flags; ++ ++ ret = iommu_unmap(d, _dfn(base_pfn), page_count, 0, &flush_flags, ++ ctx->id); ++ ++ if ( ret ) ++ return ret; ++ ++ ret = iommu_iotlb_flush(d, _dfn(base_pfn), page_count, ++ flush_flags, ctx->id); ++ } ++ ++ return ret; ++} ++ ++static int map_identity_region(struct domain *d, struct iommu_context *ctx, ++ unsigned int base_pfn, unsigned int end_pfn, ++ p2m_access_t p2ma, unsigned int flag) ++{ ++ int ret = 0; ++ unsigned int flush_flags = 0; ++ size_t page_count = end_pfn - base_pfn + 1; ++ ++ if ( ctx->arch.hap_context ) ++ { ++ this_cpu(iommu_dont_flush_iotlb) = true; ++ while ( base_pfn < end_pfn ) ++ { ++ ret = p2m_add_identity_entry(d, base_pfn, p2ma, flag); ++ ++ if ( ret ) ++ { ++ this_cpu(iommu_dont_flush_iotlb) = false; ++ return ret; ++ } ++ ++ base_pfn++; ++ } ++ this_cpu(iommu_dont_flush_iotlb) = false; ++ } ++ else ++ { ++ ret = iommu_map(d, _dfn(base_pfn), _mfn(base_pfn), page_count, ++ p2m_access_to_iommu_flags(p2ma), &flush_flags, ++ ctx->id); ++ ++ if ( ret ) ++ return ret; ++ } ++ ++ ret = iommu_iotlb_flush(d, _dfn(base_pfn), page_count, flush_flags, ++ ctx->id); ++ ++ return ret; ++} ++ ++/* p2m_access_x removes the mapping */ ++int iommu_identity_mapping(struct domain *d, struct iommu_context *ctx, ++ p2m_access_t p2ma, paddr_t base, paddr_t end, + unsigned int flag) + { + unsigned long base_pfn = base >> PAGE_SHIFT_4K; + unsigned long end_pfn = PAGE_ALIGN_4K(end) >> PAGE_SHIFT_4K; + struct identity_map *map; +- struct domain_iommu *hd = dom_iommu(d); ++ int ret = 0; + + ASSERT(pcidevs_locked()); + ASSERT(base < end); + +- /* +- * No need to acquire hd->arch.mapping_lock: Both insertion and removal +- * get done while holding pcidevs_lock. 
+- */ +- list_for_each_entry( map, &hd->arch.identity_maps, list ) ++ list_for_each_entry( map, &ctx->arch.identity_maps, list ) + { + if ( map->base == base && map->end == end ) + { +- int ret = 0; +- + if ( p2ma != p2m_access_x ) + { + if ( map->access != p2ma ) + return -EADDRINUSE; ++ + ++map->count; + return 0; + } +@@ -247,12 +377,9 @@ int iommu_identity_mapping(struct domain *d, p2m_access_t p2ma, + if ( --map->count ) + return 0; + +- while ( base_pfn < end_pfn ) +- { +- if ( clear_identity_p2m_entry(d, base_pfn) ) +- ret = -ENXIO; +- base_pfn++; +- } ++ printk("Unmapping [%"PRI_mfn"x:%"PRI_mfn"] for d%dc%d\n", base_pfn, end_pfn, ++ d->domain_id, ctx->id); ++ ret = unmap_identity_region(d, ctx, base_pfn, end_pfn); + + list_del(&map->list); + xfree(map); +@@ -271,47 +398,43 @@ int iommu_identity_mapping(struct domain *d, p2m_access_t p2ma, + if ( !map ) + return -ENOMEM; + +- map->base = base; +- map->end = end; +- map->access = p2ma; +- map->count = 1; +- +- /* +- * Insert into list ahead of mapping, so the range can be found when +- * trying to clean up. +- */ +- list_add_tail(&map->list, &hd->arch.identity_maps); ++ printk("Mapping [%"PRI_mfn"x:%"PRI_mfn"] for d%dc%d\n", base_pfn, end_pfn, ++ d->domain_id, ctx->id); ++ ret = map_identity_region(d, ctx, base_pfn, end_pfn, p2ma, flag); + +- for ( ; base_pfn < end_pfn; ++base_pfn ) ++ if ( ret ) + { +- int err = set_identity_p2m_entry(d, base_pfn, p2ma, flag); +- +- if ( !err ) +- continue; +- +- if ( (map->base >> PAGE_SHIFT_4K) == base_pfn ) +- { +- list_del(&map->list); +- xfree(map); +- } +- return err; ++ xfree(map); ++ return ret; + } + + return 0; + } + +-void iommu_identity_map_teardown(struct domain *d) ++void iommu_identity_map_teardown(struct domain *d, struct iommu_context *ctx) + { +- struct domain_iommu *hd = dom_iommu(d); + struct identity_map *map, *tmp; + +- list_for_each_entry_safe ( map, tmp, &hd->arch.identity_maps, list ) ++ list_for_each_entry_safe ( map, tmp, &ctx->arch.identity_maps, list ) + { + list_del(&map->list); + xfree(map); + } + } + ++bool iommu_identity_map_check(struct domain *d, struct iommu_context *ctx, ++ mfn_t mfn) ++{ ++ struct identity_map *map; ++ uint64_t addr = pfn_to_paddr(mfn_x(mfn)); ++ ++ list_for_each_entry ( map, &ctx->arch.identity_maps, list ) ++ if (addr >= map->base && addr < map->end) ++ return true; ++ ++ return false; ++} ++ + static int __hwdom_init cf_check map_subtract(unsigned long s, unsigned long e, + void *data) + { +@@ -369,7 +492,7 @@ static int __hwdom_init cf_check identity_map(unsigned long s, unsigned long e, + if ( iomem_access_permitted(d, s, s) ) + { + rc = iommu_map(d, _dfn(s), _mfn(s), 1, perms, +- &info->flush_flags); ++ &info->flush_flags, 0); + if ( rc < 0 ) + break; + /* Must map a frame at least, which is what we request for. 
*/ +@@ -379,7 +502,7 @@ static int __hwdom_init cf_check identity_map(unsigned long s, unsigned long e, + s++; + } + while ( (rc = iommu_map(d, _dfn(s), _mfn(s), e - s + 1, +- perms, &info->flush_flags)) > 0 ) ++ perms, &info->flush_flags, 0)) > 0 ) + { + s += rc; + process_pending_softirqs(); +@@ -408,6 +531,10 @@ void __hwdom_init arch_iommu_hwdom_init(struct domain *d) + if ( iommu_hwdom_reserved == -1 ) + iommu_hwdom_reserved = 1; + ++ if ( iommu_hwdom_no_dma ) ++ /* Skip special mappings with no-dma mode */ ++ return; ++ + if ( iommu_hwdom_inclusive ) + { + printk(XENLOG_WARNING +@@ -545,7 +672,6 @@ void __hwdom_init arch_iommu_hwdom_init(struct domain *d) + + void arch_pci_init_pdev(struct pci_dev *pdev) + { +- pdev->arch.pseudo_domid = DOMID_INVALID; + } + + unsigned long *__init iommu_init_domid(domid_t reserve) +@@ -576,8 +702,6 @@ domid_t iommu_alloc_domid(unsigned long *map) + static unsigned int start; + unsigned int idx = find_next_zero_bit(map, UINT16_MAX - DOMID_MASK, start); + +- ASSERT(pcidevs_locked()); +- + if ( idx >= UINT16_MAX - DOMID_MASK ) + idx = find_first_zero_bit(map, UINT16_MAX - DOMID_MASK); + if ( idx >= UINT16_MAX - DOMID_MASK ) +@@ -603,7 +727,7 @@ void iommu_free_domid(domid_t domid, unsigned long *map) + BUG(); + } + +-int iommu_free_pgtables(struct domain *d) ++int iommu_free_pgtables(struct domain *d, struct iommu_context *ctx) + { + struct domain_iommu *hd = dom_iommu(d); + struct page_info *pg; +@@ -612,18 +736,18 @@ int iommu_free_pgtables(struct domain *d) + if ( !is_iommu_enabled(d) ) + return 0; + +- /* After this barrier, no new IOMMU mappings can be inserted. */ +- spin_barrier(&hd->arch.mapping_lock); +- + /* + * Pages will be moved to the free list below. So we want to + * clear the root page-table to avoid any potential use after-free. 
+ */ +- iommu_vcall(hd->platform_ops, clear_root_pgtable, d); ++ iommu_vcall(hd->platform_ops, clear_root_pgtable, d, ctx); + +- while ( (pg = page_list_remove_head(&hd->arch.pgtables.list)) ) ++ while ( (pg = page_list_remove_head(&ctx->arch.pgtables)) ) + { +- free_domheap_page(pg); ++ if (ctx->id == 0) ++ free_domheap_page(pg); ++ else ++ iommu_arena_free_page(&hd->arch.pt_arena, pg); + + if ( !(++done & 0xff) && general_preempt_check() ) + return -ERESTART; +@@ -633,6 +757,7 @@ int iommu_free_pgtables(struct domain *d) + } + + struct page_info *iommu_alloc_pgtable(struct domain_iommu *hd, ++ struct iommu_context *ctx, + uint64_t contig_mask) + { + unsigned int memflags = 0; +@@ -644,7 +769,11 @@ struct page_info *iommu_alloc_pgtable(struct domain_iommu *hd, + memflags = MEMF_node(hd->node); + #endif + +- pg = alloc_domheap_page(NULL, memflags); ++ if (ctx->id == 0) ++ pg = alloc_domheap_page(NULL, memflags); ++ else ++ pg = iommu_arena_allocate_page(&hd->arch.pt_arena); ++ + if ( !pg ) + return NULL; + +@@ -677,9 +806,7 @@ struct page_info *iommu_alloc_pgtable(struct domain_iommu *hd, + + unmap_domain_page(p); + +- spin_lock(&hd->arch.pgtables.lock); +- page_list_add(pg, &hd->arch.pgtables.list); +- spin_unlock(&hd->arch.pgtables.lock); ++ page_list_add(pg, &ctx->arch.pgtables); + + return pg; + } +@@ -718,17 +845,20 @@ static void cf_check free_queued_pgtables(void *arg) + } + } + +-void iommu_queue_free_pgtable(struct domain_iommu *hd, struct page_info *pg) ++void iommu_queue_free_pgtable(struct iommu_context *ctx, struct page_info *pg) + { + unsigned int cpu = smp_processor_id(); + +- spin_lock(&hd->arch.pgtables.lock); +- page_list_del(pg, &hd->arch.pgtables.list); +- spin_unlock(&hd->arch.pgtables.lock); ++ page_list_del(pg, &ctx->arch.pgtables); + +- page_list_add_tail(pg, &per_cpu(free_pgt_list, cpu)); ++ if ( !ctx->id ) ++ { ++ page_list_add_tail(pg, &per_cpu(free_pgt_list, cpu)); + +- tasklet_schedule(&per_cpu(free_pgt_tasklet, cpu)); ++ tasklet_schedule(&per_cpu(free_pgt_tasklet, cpu)); ++ } ++ else ++ page_list_add_tail(pg, &ctx->arch.free_queue); + } + + static int cf_check cpu_callback( +-- +2.46.0 + diff --git a/0404-xen-public-Introduce-PV-IOMMU-hypercall-interface.patch b/0404-xen-public-Introduce-PV-IOMMU-hypercall-interface.patch new file mode 100644 index 0000000..0dce269 --- /dev/null +++ b/0404-xen-public-Introduce-PV-IOMMU-hypercall-interface.patch @@ -0,0 +1,965 @@ +From 64340057e5819e0755f87c6d9d7b4b954c9a8a93 Mon Sep 17 00:00:00 2001 +From: Teddy Astie +Date: Mon, 4 Nov 2024 14:28:39 +0000 +Subject: [PATCH 404/404] xen/public: Introduce PV-IOMMU hypercall interface + +Introduce a new pv interface to manage the underlying IOMMU and manage contexts +and devices. This interface allows creation of new contexts from Dom0 and +addition of IOMMU mappings using guest PoV. + +This interface doesn't allow creation of mapping to other domains. 
+ +Signed-off-by Teddy Astie +--- + xen/common/Makefile | 1 + + xen/common/pv-iommu.c | 539 ++++++++++++++++++++++++++++++++++ + xen/include/hypercall-defs.c | 6 + + xen/include/public/pv-iommu.h | 341 +++++++++++++++++++++ + xen/include/public/xen.h | 1 + + 5 files changed, 888 insertions(+) + create mode 100644 xen/common/pv-iommu.c + create mode 100644 xen/include/public/pv-iommu.h + +diff --git a/xen/common/Makefile b/xen/common/Makefile +index cba3b32733ba..0b6df5966056 100644 +--- a/xen/common/Makefile ++++ b/xen/common/Makefile +@@ -61,6 +61,7 @@ obj-y += wait.o + obj-bin-y += warning.init.o + obj-$(CONFIG_XENOPROF) += xenoprof.o + obj-y += xmalloc_tlsf.o ++obj-y += pv-iommu.o + + obj-bin-$(CONFIG_X86) += $(foreach n,decompress bunzip2 unxz unlzma lzo unlzo unlz4 unzstd earlycpio,$(n).init.o) + +diff --git a/xen/common/pv-iommu.c b/xen/common/pv-iommu.c +new file mode 100644 +index 000000000000..9c7d04b4c7e6 +--- /dev/null ++++ b/xen/common/pv-iommu.c +@@ -0,0 +1,539 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * xen/common/pv_iommu.c ++ * ++ * PV-IOMMU hypercall interface. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define PVIOMMU_PREFIX "[PV-IOMMU] " ++ ++static int get_paged_frame(struct domain *d, gfn_t gfn, mfn_t *mfn, ++ struct page_info **page, bool readonly) ++{ ++ int ret = 0; ++ p2m_type_t p2mt = p2m_invalid; ++ ++ #ifdef CONFIG_X86 ++ p2m_query_t query = P2M_ALLOC; ++ ++ if ( !readonly ) ++ query |= P2M_UNSHARE; ++ ++ *mfn = get_gfn_type(d, gfn_x(gfn), &p2mt, query); ++ #else ++ *mfn = p2m_lookup(d, gfn, &p2mt); ++ #endif ++ ++ if ( mfn_eq(*mfn, INVALID_MFN) ) ++ { ++ /* No mapping ? */ ++ printk(XENLOG_G_WARNING PVIOMMU_PREFIX ++ "Trying to map to non-backed page frame (gfn=%"PRI_gfn ++ " p2mt=%d d%d)\n", gfn_x(gfn), p2mt, d->domain_id); ++ ++ ret = -ENOENT; ++ } ++ else if ( p2m_is_any_ram(p2mt) && mfn_valid(*mfn) ) ++ { ++ *page = get_page_from_mfn(*mfn, d); ++ ret = 0; ++ } ++ else if ( p2m_is_mmio(p2mt) || ++ iomem_access_permitted(d, mfn_x(*mfn),mfn_x(*mfn)) ) ++ { ++ *page = NULL; ++ ret = 0; ++ } ++ else ++ { ++ printk(XENLOG_G_WARNING PVIOMMU_PREFIX ++ "Unexpected p2mt %d (d%d gfn=%"PRI_gfn" mfn=%"PRI_mfn")\n", ++ p2mt, d->domain_id, gfn_x(gfn), mfn_x(*mfn)); ++ ++ ret = -EPERM; ++ } ++ ++ put_gfn(d, gfn_x(gfn)); ++ return ret; ++} ++ ++static bool can_use_iommu_check(struct domain *d) ++{ ++ if ( !is_iommu_enabled(d) ) ++ { ++ printk(XENLOG_G_WARNING PVIOMMU_PREFIX ++ "IOMMU disabled for this domain\n"); ++ return false; ++ } ++ ++ if ( !dom_iommu(d)->allow_pv_iommu ) ++ { ++ printk(XENLOG_G_WARNING PVIOMMU_PREFIX ++ "PV-IOMMU disabled for this domain\n"); ++ return false; ++ } ++ ++ return true; ++} ++ ++static long capabilities_op(struct pv_iommu_capabilities *cap, struct domain *d) ++{ ++ cap->max_ctx_no = d->iommu.other_contexts.count; ++ cap->max_iova_addr = iommu_get_max_iova(d); ++ ++ cap->max_pasid = 0; /* TODO */ ++ cap->cap_flags = 0; ++ ++ if ( !dom_iommu(d)->no_dma ) ++ cap->cap_flags |= IOMMUCAP_default_identity; ++ ++ cap->pgsize_mask = PAGE_SIZE_4K; ++ ++ return 0; ++} ++ ++static long init_op(struct pv_iommu_init *init, struct domain *d) ++{ ++ if (init->max_ctx_no == UINT32_MAX) ++ return -E2BIG; ++ ++ return iommu_domain_pviommu_init(d, init->max_ctx_no + 1, init->arena_order); ++} ++ ++static long alloc_context_op(struct pv_iommu_alloc *alloc, struct domain *d) ++{ ++ u16 ctx_no = 0; ++ int status = 0; ++ ++ 
status = iommu_context_alloc(d, &ctx_no, 0); ++ ++ if ( status ) ++ return status; ++ ++ printk(XENLOG_G_INFO PVIOMMU_PREFIX ++ "Created IOMMU context %hu in d%d\n", ctx_no, d->domain_id); ++ ++ alloc->ctx_no = ctx_no; ++ return 0; ++} ++ ++static long free_context_op(struct pv_iommu_free *free, struct domain *d) ++{ ++ int flags = IOMMU_TEARDOWN_PREEMPT; ++ ++ if ( !free->ctx_no ) ++ return -EINVAL; ++ ++ if ( free->free_flags & IOMMU_FREE_reattach_default ) ++ flags |= IOMMU_TEARDOWN_REATTACH_DEFAULT; ++ ++ return iommu_context_free(d, free->ctx_no, flags); ++} ++ ++static long reattach_device_op(struct pv_iommu_reattach_device *reattach, ++ struct domain *d) ++{ ++ int ret; ++ device_t *pdev; ++ struct physdev_pci_device dev = reattach->dev; ++ ++ pcidevs_lock(); ++ pdev = pci_get_pdev(d, PCI_SBDF(dev.seg, dev.bus, dev.devfn)); ++ ++ if ( !pdev ) ++ { ++ pcidevs_unlock(); ++ return -ENOENT; ++ } ++ ++ ret = iommu_reattach_context(d, d, pdev, reattach->ctx_no); ++ ++ pcidevs_unlock(); ++ return ret; ++} ++ ++static long map_pages_op(struct pv_iommu_map_pages *map, struct domain *d) ++{ ++ struct iommu_context *ctx; ++ int ret = 0, flush_ret; ++ struct page_info *page = NULL; ++ mfn_t mfn, mfn_lookup; ++ unsigned int flags = 0, flush_flags = 0; ++ size_t i = 0; ++ dfn_t dfn0 = _dfn(map->dfn); /* original map->dfn */ ++ ++ if ( !map->ctx_no || !(ctx = iommu_get_context(d, map->ctx_no)) ) ++ return -EINVAL; ++ ++ if ( map->map_flags & IOMMU_MAP_readable ) ++ flags |= IOMMUF_readable; ++ ++ if ( map->map_flags & IOMMU_MAP_writeable ) ++ flags |= IOMMUF_writable; ++ ++ for (i = 0; i < map->nr_pages; i++) ++ { ++ gfn_t gfn = _gfn(map->gfn + i); ++ dfn_t dfn = _dfn(map->dfn + i); ++ ++#ifdef CONFIG_X86 ++ if ( iommu_identity_map_check(d, ctx, _mfn(map->dfn)) ) ++ { ++ ret = -EADDRNOTAVAIL; ++ break; ++ } ++#endif ++ ++ ret = get_paged_frame(d, gfn, &mfn, &page, 0); ++ ++ if ( ret ) ++ break; ++ ++ /* Check for conflict with existing mappings */ ++ if ( !iommu_lookup_page(d, dfn, &mfn_lookup, &flags, map->ctx_no) ) ++ { ++ if ( page ) ++ put_page(page); ++ ++ ret = -EADDRINUSE; ++ break; ++ } ++ ++ ret = iommu_map(d, dfn, mfn, 1, flags, &flush_flags, map->ctx_no); ++ ++ if ( ret ) ++ { ++ if ( page ) ++ put_page(page); ++ ++ break; ++ } ++ ++ map->mapped++; ++ ++ if ( (i & 0xff) && hypercall_preempt_check() ) ++ { ++ i++; ++ ++ map->gfn += i; ++ map->dfn += i; ++ map->nr_pages -= i; ++ ++ ret = -ERESTART; ++ break; ++ } ++ } ++ ++ flush_ret = iommu_iotlb_flush(d, dfn0, i, flush_flags, map->ctx_no); ++ ++ iommu_put_context(ctx); ++ ++ if ( flush_ret ) ++ printk(XENLOG_G_WARNING PVIOMMU_PREFIX ++ "Flush operation failed for d%dc%d (%d)\n", d->domain_id, ++ ctx->id, flush_ret); ++ ++ return ret; ++} ++ ++static long unmap_pages_op(struct pv_iommu_unmap_pages *unmap, struct domain *d) ++{ ++ struct iommu_context *ctx; ++ mfn_t mfn; ++ int ret = 0, flush_ret; ++ unsigned int flags, flush_flags = 0; ++ size_t i = 0; ++ dfn_t dfn0 = _dfn(unmap->dfn); /* original unmap->dfn */ ++ ++ if ( !unmap->ctx_no || !(ctx = iommu_get_context(d, unmap->ctx_no)) ) ++ return -EINVAL; ++ ++ for (i = 0; i < unmap->nr_pages; i++) ++ { ++ dfn_t dfn = _dfn(unmap->dfn + i); ++ ++#ifdef CONFIG_X86 ++ if ( iommu_identity_map_check(d, ctx, _mfn(unmap->dfn)) ) ++ { ++ ret = -EADDRNOTAVAIL; ++ break; ++ } ++#endif ++ ++ /* Check if there is a valid mapping for this domain */ ++ if ( iommu_lookup_page(d, dfn, &mfn, &flags, unmap->ctx_no) ) { ++ ret = -ENOENT; ++ break; ++ } ++ ++ ret = iommu_unmap(d, dfn, 1, 0, &flush_flags, 
unmap->ctx_no); ++ ++ if ( ret ) ++ break; ++ ++ unmap->unmapped++; ++ ++ /* Decrement reference counter (if needed) */ ++ if ( mfn_valid(mfn) ) ++ put_page(mfn_to_page(mfn)); ++ ++ if ( (i & 0xff) && hypercall_preempt_check() ) ++ { ++ i++; ++ ++ unmap->dfn += i; ++ unmap->nr_pages -= i; ++ ++ ret = -ERESTART; ++ break; ++ } ++ } ++ ++ flush_ret = iommu_iotlb_flush(d, dfn0, i, flush_flags, unmap->ctx_no); ++ ++ iommu_put_context(ctx); ++ ++ if ( flush_ret ) ++ printk(XENLOG_WARNING PVIOMMU_PREFIX ++ "Flush operation failed for d%dc%d (%d)\n", d->domain_id, ++ ctx->id, flush_ret); ++ ++ return ret; ++} ++ ++static long do_iommu_subop(int subop, XEN_GUEST_HANDLE_PARAM(void) arg, ++ struct domain *d, bool remote); ++ ++static long remote_cmd_op(struct pv_iommu_remote_cmd *remote_cmd, ++ struct domain *current_domain) ++{ ++ long ret = 0; ++ struct domain *d; ++ ++ /* TODO: use a better permission logic */ ++ if ( !is_hardware_domain(current_domain) ) ++ return -EPERM; ++ ++ d = get_domain_by_id(remote_cmd->domid); ++ ++ if ( !d ) ++ return -ENOENT; ++ ++ ret = do_iommu_subop(remote_cmd->subop, remote_cmd->arg, d, true); ++ ++ put_domain(d); ++ ++ return ret; ++} ++ ++static long do_iommu_subop(int subop, XEN_GUEST_HANDLE_PARAM(void) arg, ++ struct domain *d, bool remote) ++{ ++ long ret = 0; ++ ++ switch ( subop ) ++ { ++ case IOMMU_noop: ++ break; ++ ++ case IOMMU_query_capabilities: ++ { ++ struct pv_iommu_capabilities cap; ++ ++ ret = capabilities_op(&cap, d); ++ ++ if ( unlikely(copy_to_guest(arg, &cap, 1)) ) ++ ret = -EFAULT; ++ ++ break; ++ } ++ ++ case IOMMU_init: ++ { ++ struct pv_iommu_init init; ++ ++ if ( unlikely(copy_from_guest(&init, arg, 1)) ) ++ { ++ ret = -EFAULT; ++ break; ++ } ++ ++ ret = init_op(&init, d); ++ } ++ ++ case IOMMU_alloc_context: ++ { ++ struct pv_iommu_alloc alloc; ++ ++ if ( unlikely(copy_from_guest(&alloc, arg, 1)) ) ++ { ++ ret = -EFAULT; ++ break; ++ } ++ ++ ret = alloc_context_op(&alloc, d); ++ ++ if ( unlikely(copy_to_guest(arg, &alloc, 1)) ) ++ ret = -EFAULT; ++ ++ break; ++ } ++ ++ case IOMMU_free_context: ++ { ++ struct pv_iommu_free free; ++ ++ if ( unlikely(copy_from_guest(&free, arg, 1)) ) ++ { ++ ret = -EFAULT; ++ break; ++ } ++ ++ ret = free_context_op(&free, d); ++ break; ++ } ++ ++ case IOMMU_reattach_device: ++ { ++ struct pv_iommu_reattach_device reattach; ++ ++ if ( unlikely(copy_from_guest(&reattach, arg, 1)) ) ++ { ++ ret = -EFAULT; ++ break; ++ } ++ ++ ret = reattach_device_op(&reattach, d); ++ break; ++ } ++ ++ case IOMMU_map_pages: ++ { ++ struct pv_iommu_map_pages map; ++ ++ if ( unlikely(copy_from_guest(&map, arg, 1)) ) ++ { ++ ret = -EFAULT; ++ break; ++ } ++ ++ ret = map_pages_op(&map, d); ++ ++ if ( unlikely(copy_to_guest(arg, &map, 1)) ) ++ ret = -EFAULT; ++ ++ break; ++ } ++ ++ case IOMMU_unmap_pages: ++ { ++ struct pv_iommu_unmap_pages unmap; ++ ++ if ( unlikely(copy_from_guest(&unmap, arg, 1)) ) ++ { ++ ret = -EFAULT; ++ break; ++ } ++ ++ ret = unmap_pages_op(&unmap, d); ++ ++ if ( unlikely(copy_to_guest(arg, &unmap, 1)) ) ++ ret = -EFAULT; ++ ++ break; ++ } ++ ++ case IOMMU_remote_cmd: ++ { ++ struct pv_iommu_remote_cmd remote_cmd; ++ ++ if ( remote ) ++ { ++ /* Prevent remote_cmd from being called recursively */ ++ ret = -EINVAL; ++ break; ++ } ++ ++ if ( unlikely(copy_from_guest(&remote_cmd, arg, 1)) ) ++ { ++ ret = -EFAULT; ++ break; ++ } ++ ++ ret = remote_cmd_op(&remote_cmd, d); ++ break; ++ } ++ ++ /* ++ * TODO ++ */ ++ case IOMMU_alloc_nested: ++ { ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ case IOMMU_flush_nested: ++ 
{ ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ case IOMMU_attach_pasid: ++ { ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ case IOMMU_detach_pasid: ++ { ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ default: ++ return -EOPNOTSUPP; ++ } ++ ++ return ret; ++} ++ ++long do_iommu_op(unsigned int subop, XEN_GUEST_HANDLE_PARAM(void) arg) ++{ ++ long ret = 0; ++ ++ if ( !can_use_iommu_check(current->domain) ) ++ return -ENODEV; ++ ++ ret = do_iommu_subop(subop, arg, current->domain, false); ++ ++ if ( ret == -ERESTART ) ++ return hypercall_create_continuation(__HYPERVISOR_iommu_op, "ih", subop, arg); ++ ++ return ret; ++} ++ ++/* ++ * Local variables: ++ * mode: C ++ * c-file-style: "BSD" ++ * c-basic-offset: 4 ++ * tab-width: 4 ++ * indent-tabs-mode: nil ++ * End: ++ */ +diff --git a/xen/include/hypercall-defs.c b/xen/include/hypercall-defs.c +index 7720a29ade0b..78ca87b57fab 100644 +--- a/xen/include/hypercall-defs.c ++++ b/xen/include/hypercall-defs.c +@@ -209,6 +209,9 @@ hypfs_op(unsigned int cmd, const char *arg1, unsigned long arg2, void *arg3, uns + #ifdef CONFIG_X86 + xenpmu_op(unsigned int op, xen_pmu_params_t *arg) + #endif ++#ifdef CONFIG_HAS_PASSTHROUGH ++iommu_op(unsigned int subop, void *arg) ++#endif + + #ifdef CONFIG_PV + caller: pv64 +@@ -295,5 +298,8 @@ mca do do - - - + #ifndef CONFIG_PV_SHIM_EXCLUSIVE + paging_domctl_cont do do do do - + #endif ++#ifdef CONFIG_HAS_PASSTHROUGH ++iommu_op do do do do - ++#endif + + #endif /* !CPPCHECK */ +diff --git a/xen/include/public/pv-iommu.h b/xen/include/public/pv-iommu.h +new file mode 100644 +index 000000000000..c14b8435c980 +--- /dev/null ++++ b/xen/include/public/pv-iommu.h +@@ -0,0 +1,341 @@ ++/* SPDX-License-Identifier: MIT */ ++/** ++ * pv-iommu.h ++ * ++ * Paravirtualized IOMMU driver interface. ++ * ++ * Copyright (c) 2024 Teddy Astie ++ */ ++ ++#ifndef __XEN_PUBLIC_PV_IOMMU_H__ ++#define __XEN_PUBLIC_PV_IOMMU_H__ ++ ++#include "xen.h" ++#include "physdev.h" ++ ++#ifndef uint64_aligned_t ++#define uint64_aligned_t uint64_t ++#endif ++ ++#define IOMMU_DEFAULT_CONTEXT (0) ++ ++enum { ++ /* Basic cmd */ ++ IOMMU_noop = 0, ++ IOMMU_query_capabilities, ++ IOMMU_init, ++ IOMMU_alloc_context, ++ IOMMU_free_context, ++ IOMMU_reattach_device, ++ IOMMU_map_pages, ++ IOMMU_unmap_pages, ++ IOMMU_remote_cmd, ++ ++ /* Extended cmd */ ++ IOMMU_alloc_nested, /* if IOMMUCAP_nested */ ++ IOMMU_flush_nested, /* if IOMMUCAP_nested */ ++ IOMMU_attach_pasid, /* if IOMMUCAP_pasid */ ++ IOMMU_detach_pasid, /* if IOMMUCAP_pasid */ ++}; ++ ++/** ++ * Indicate if the default context is a identity mapping to domain memory. ++ * If not defined, default context blocks all DMA to domain memory. ++ */ ++#define IOMMUCAP_default_identity (1 << 0) ++ ++/** ++ * IOMMU_MAP_cache support. ++ */ ++#define IOMMUCAP_cache (1 << 1) ++ ++/** ++ * Support for IOMMU_alloc_nested. ++ */ ++#define IOMMUCAP_nested (1 << 2) ++ ++/** ++ * Support for IOMMU_attach_pasid and IOMMU_detach_pasid and pasid parameter in ++ * reattach_context. ++ */ ++#define IOMMUCAP_pasid (1 << 3) ++ ++/** ++ * Support for IOMMU_ALLOC_identity ++ */ ++#define IOMMUCAP_identity (1 << 4) ++ ++/** ++ * IOMMU_query_capabilities ++ * Query PV-IOMMU capabilities for this domain. ++ */ ++struct pv_iommu_capabilities { ++ /* ++ * OUT: Maximum device address (iova) that the guest can use for mappings. ++ */ ++ uint64_aligned_t max_iova_addr; ++ ++ /* OUT: IOMMU capabilities flags */ ++ uint32_t cap_flags; ++ ++ /* OUT: Mask of all supported page sizes. 
*/ ++ uint32_t pgsize_mask; ++ ++ /* OUT: Maximum pasid (if IOMMUCAP_pasid) */ ++ uint32_t max_pasid; ++ ++ /* OUT: Maximum number of IOMMU context this domain can use. */ ++ uint16_t max_ctx_no; ++}; ++typedef struct pv_iommu_capabilities pv_iommu_capabilities_t; ++DEFINE_XEN_GUEST_HANDLE(pv_iommu_capabilities_t); ++ ++/** ++ * IOMMU_init ++ * Initialize PV-IOMMU for this domain. ++ * ++ * Fails with -EACCESS if PV-IOMMU is already initialized. ++ */ ++struct pv_iommu_init { ++ /* IN: Maximum number of IOMMU context this domain can use. */ ++ uint32_t max_ctx_no; ++ ++ /* IN: Arena size in pages (in power of two) */ ++ uint32_t arena_order; ++}; ++typedef struct pv_iommu_init pv_iommu_init_t; ++DEFINE_XEN_GUEST_HANDLE(pv_iommu_init_t); ++ ++/** ++ * Create a 1:1 identity mapped context to domain memory ++ * (needs IOMMUCAP_identity). ++ */ ++#define IOMMU_ALLOC_identity (1 << 0) ++ ++/** ++ * IOMMU_alloc_context ++ * Allocate an IOMMU context. ++ * Fails with -ENOSPC if no context number is available. ++ */ ++struct pv_iommu_alloc { ++ /* OUT: allocated IOMMU context number */ ++ uint16_t ctx_no; ++ ++ /* IN: allocation flags */ ++ uint32_t alloc_flags; ++}; ++typedef struct pv_iommu_alloc pv_iommu_alloc_t; ++DEFINE_XEN_GUEST_HANDLE(pv_iommu_alloc_t); ++ ++/** ++ * Move all devices to default context before freeing the context. ++ */ ++#define IOMMU_FREE_reattach_default (1 << 0) ++ ++/** ++ * IOMMU_free_context ++ * Destroy a IOMMU context. ++ * ++ * If IOMMU_FREE_reattach_default is specified, move all context devices to ++ * default context before destroying this context. ++ * ++ * If there are devices in the context and IOMMU_FREE_reattach_default is not ++ * specified, fail with -EBUSY. ++ * ++ * The default context can't be destroyed. ++ */ ++struct pv_iommu_free { ++ /* IN: IOMMU context number to free */ ++ uint16_t ctx_no; ++ ++ /* IN: Free operation specific flags */ ++ uint32_t free_flags; ++}; ++typedef struct pv_iommu_free pv_iommu_free_t; ++DEFINE_XEN_GUEST_HANDLE(pv_iommu_free_t); ++ ++/* Device has read access */ ++#define IOMMU_MAP_readable (1 << 0) ++ ++/* Device has write access */ ++#define IOMMU_MAP_writeable (1 << 1) ++ ++/* Enforce DMA coherency */ ++#define IOMMU_MAP_cache (1 << 2) ++ ++/** ++ * IOMMU_map_pages ++ * Map pages on a IOMMU context. ++ * ++ * pgsize must be supported by pgsize_mask. ++ * Fails with -EINVAL if mapping on top of another mapping. ++ * Report actually mapped page count in mapped field (regardless of failure). ++ */ ++struct pv_iommu_map_pages { ++ /* IN: IOMMU context number */ ++ uint16_t ctx_no; ++ ++ /* IN: Guest frame number */ ++ uint64_aligned_t gfn; ++ ++ /* IN: Device frame number */ ++ uint64_aligned_t dfn; ++ ++ /* IN: Map flags */ ++ uint32_t map_flags; ++ ++ /* IN: Size of pages to map */ ++ uint32_t pgsize; ++ ++ /* IN: Number of pages to map */ ++ uint32_t nr_pages; ++ ++ /* OUT: Number of pages actually mapped */ ++ uint32_t mapped; ++}; ++typedef struct pv_iommu_map_pages pv_iommu_map_pages_t; ++DEFINE_XEN_GUEST_HANDLE(pv_iommu_map_pages_t); ++ ++/** ++ * IOMMU_unmap_pages ++ * Unmap pages on a IOMMU context. ++ * ++ * pgsize must be supported by pgsize_mask. ++ * Report actually unmapped page count in mapped field (regardless of failure). 
++ * Fails with -ENOENT when attempting to unmap a page without any mapping
++ */
++struct pv_iommu_unmap_pages {
++    /* IN: IOMMU context number */
++    uint16_t ctx_no;
++
++    /* IN: Device frame number */
++    uint64_aligned_t dfn;
++
++    /* IN: Size of pages to unmap */
++    uint32_t pgsize;
++
++    /* IN: Number of pages to unmap */
++    uint32_t nr_pages;
++
++    /* OUT: Number of pages actually unmapped */
++    uint32_t unmapped;
++};
++typedef struct pv_iommu_unmap_pages pv_iommu_unmap_pages_t;
++DEFINE_XEN_GUEST_HANDLE(pv_iommu_unmap_pages_t);
++
++/**
++ * IOMMU_reattach_device
++ * Reattach a device to another IOMMU context.
++ * Fails with -ENODEV if no such device exists.
++ */
++struct pv_iommu_reattach_device {
++    /* IN: Target IOMMU context number */
++    uint16_t ctx_no;
++
++    /* IN: Physical device to move */
++    struct physdev_pci_device dev;
++
++    /* IN: PASID of the device (if IOMMUCAP_pasid) */
++    uint32_t pasid;
++};
++typedef struct pv_iommu_reattach_device pv_iommu_reattach_device_t;
++DEFINE_XEN_GUEST_HANDLE(pv_iommu_reattach_device_t);
++
++
++/**
++ * IOMMU_remote_cmd
++ * Do a PV-IOMMU operation on another domain.
++ * The current domain needs to be allowed to act on the target domain,
++ * otherwise fails with -EPERM.
++ */
++struct pv_iommu_remote_cmd {
++    /* IN: Target domain to do the subop on */
++    uint16_t domid;
++
++    /* IN: Command to do on target domain. */
++    uint16_t subop;
++
++    /* INOUT: Command argument from current domain memory */
++    XEN_GUEST_HANDLE(void) arg;
++};
++typedef struct pv_iommu_remote_cmd pv_iommu_remote_cmd_t;
++DEFINE_XEN_GUEST_HANDLE(pv_iommu_remote_cmd_t);
++
++/**
++ * IOMMU_alloc_nested
++ * Create a nested IOMMU context (needs IOMMUCAP_nested).
++ *
++ * This context uses a platform-specific page table from domain address space
++ * specified in pgtable_gfn and uses it for nested translations.
++ *
++ * Explicit flushes need to be submitted with IOMMU_flush_nested on
++ * modification of the nested page table to ensure coherency between the
++ * IOTLB and the nested page table.
++ *
++ * This context can be destroyed using IOMMU_free_context.
++ * This context cannot be modified using map_pages or unmap_pages.
++ */
++struct pv_iommu_alloc_nested {
++    /* OUT: allocated IOMMU context number */
++    uint16_t ctx_no;
++
++    /* IN: guest frame number of the nested page table */
++    uint64_aligned_t pgtable_gfn;
++
++    /* IN: nested mode flags */
++    uint64_aligned_t nested_flags;
++};
++typedef struct pv_iommu_alloc_nested pv_iommu_alloc_nested_t;
++DEFINE_XEN_GUEST_HANDLE(pv_iommu_alloc_nested_t);
++
++/**
++ * IOMMU_flush_nested (needs IOMMUCAP_nested)
++ * Flush the IOTLB for nested translation.
++ */
++struct pv_iommu_flush_nested {
++    /* TODO */
++};
++typedef struct pv_iommu_flush_nested pv_iommu_flush_nested_t;
++DEFINE_XEN_GUEST_HANDLE(pv_iommu_flush_nested_t);
++
++/**
++ * IOMMU_attach_pasid (needs IOMMUCAP_pasid)
++ * Attach a new device-with-pasid to an IOMMU context.
++ * If a matching device-with-pasid already exists (globally),
++ * fail with -EEXIST.
++ * If pasid is 0, fails with -EINVAL.
++ * If the physical device doesn't exist in the domain, fail with -ENOENT.
++ */ ++struct pv_iommu_attach_pasid { ++ /* IN: IOMMU context to add the device-with-pasid in */ ++ uint16_t ctx_no; ++ ++ /* IN: Physical device */ ++ struct physdev_pci_device dev; ++ ++ /* IN: pasid of the device to attach */ ++ uint32_t pasid; ++}; ++typedef struct pv_iommu_attach_pasid pv_iommu_attach_pasid_t; ++DEFINE_XEN_GUEST_HANDLE(pv_iommu_attach_pasid_t); ++ ++/** ++ * IOMMU_detach_pasid (needs IOMMUCAP_pasid) ++ * detach a device-with-pasid. ++ * If the device-with-pasid doesn't exist or belong to the domain, ++ * fail with -ENOENT. ++ * If pasid is 0, fails with -EINVAL. ++ */ ++struct pv_iommu_detach_pasid { ++ /* IN: Physical device */ ++ struct physdev_pci_device dev; ++ ++ /* pasid of the device to detach */ ++ uint32_t pasid; ++}; ++typedef struct pv_iommu_detach_pasid pv_iommu_detach_pasid_t; ++DEFINE_XEN_GUEST_HANDLE(pv_iommu_detach_pasid_t); ++ ++/* long do_iommu_op(int subop, XEN_GUEST_HANDLE_PARAM(void) arg) */ ++ ++#endif +\ No newline at end of file +diff --git a/xen/include/public/xen.h b/xen/include/public/xen.h +index e051f989a5ca..d5bdedfee5ee 100644 +--- a/xen/include/public/xen.h ++++ b/xen/include/public/xen.h +@@ -118,6 +118,7 @@ DEFINE_XEN_GUEST_HANDLE(xen_ulong_t); + #define __HYPERVISOR_xenpmu_op 40 + #define __HYPERVISOR_dm_op 41 + #define __HYPERVISOR_hypfs_op 42 ++#define __HYPERVISOR_iommu_op 43 + + /* Architecture-specific hypercall definitions. */ + #define __HYPERVISOR_arch_0 48 +-- +2.46.0 + diff --git a/xen.spec.in b/xen.spec.in index 8252188..a1201c3 100644 --- a/xen.spec.in +++ b/xen.spec.in @@ -98,6 +98,12 @@ Patch0203: 0203-xen.efi.build.patch # Backports (300+) +Patch0400: 0400-docs-designs-Add-a-design-document-for-PV-IOMMU.patch +Patch0401: 0401-docs-designs-Add-a-design-document-for-IOMMU-subsyst.patch +Patch0402: 0402-IOMMU-Introduce-redesigned-IOMMU-subsystem.patch +Patch0403: 0403-VT-d-Port-IOMMU-driver-to-new-subsystem.patch +Patch0404: 0404-xen-public-Introduce-PV-IOMMU-hypercall-interface.patch + # Security fixes (500+) # Upstreamable patches (600+) From ea8ecfcc28b0d03d340124607eec25f47c85f95a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= Date: Wed, 8 Jan 2025 21:33:17 +0100 Subject: [PATCH 5/6] WIP Disable AMD IOMMU AMD IOMMU isn't ported to the new IOMMU API yet. --- config | 2 ++ 1 file changed, 2 insertions(+) diff --git a/config b/config index 4bebb45..90108c8 100644 --- a/config +++ b/config @@ -120,6 +120,8 @@ CONFIG_SERIAL_TX_BUFSIZE=16384 CONFIG_XHCI=y CONFIG_HAS_CPUFREQ=y CONFIG_HAS_PASSTHROUGH=y +# CONFIG_AMD_IOMMU is not set +CONFIG_INTEL_IOMMU=y # CONFIG_IOMMU_QUARANTINE_NONE is not set CONFIG_IOMMU_QUARANTINE_BASIC=y # CONFIG_IOMMU_QUARANTINE_SCRATCH_PAGE is not set From d8dcbccdf830e6f3133269fb648c4b37e4b58b7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= Date: Thu, 9 Jan 2025 01:51:58 +0100 Subject: [PATCH 6/6] Fix setting configure flags Move setting $CONFIG_EXTRA before its use for the configure call. This also makes --disable-pvshim option effective, so remove pvshim from the files list. 
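For reference, the intended %build ordering after this change is roughly the
following (a simplified sketch, not the literal spec contents; the configure
invocation is abbreviated here and the hunks below are authoritative):

  # BEGIN QUBES SPECIFIC PART -- now runs before configure
  %ifnarch armv7hl aarch64
  CONFIG_EXTRA="$CONFIG_EXTRA --disable-ipxe --disable-rombios"
  CONFIG_EXTRA="$CONFIG_EXTRA --disable-pvshim"
  %endif
  CONFIG_EXTRA="$CONFIG_EXTRA --with-system-qemu=/usr/bin/qemu-system-x86_64"
  export PATH="/usr/bin:$PATH"
  autoreconf -i
  # END QUBES SPECIFIC PART
  ./configure --prefix=%{_prefix} ... $CONFIG_EXTRA

Previously this block ran only after ./configure had already consumed
$CONFIG_EXTRA, so the extra flags (including --disable-pvshim) were silently
ignored.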
--- xen.spec.in | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/xen.spec.in b/xen.spec.in index a1201c3..0dc7a16 100644 --- a/xen.spec.in +++ b/xen.spec.in @@ -428,6 +428,18 @@ CONFIG_EXTRA="$CONFIG_EXTRA --with-system-seabios=/usr/share/seabios/bios-256k.b %else CONFIG_EXTRA="$CONFIG_EXTRA --disable-seabios" %endif + +# BEGIN QUBES SPECIFIC PART +%ifnarch armv7hl aarch64 +#CONFIG_EXTRA="$CONFIG_EXTRA --with-system-ipxe=/usr/share/ipxe" +CONFIG_EXTRA="$CONFIG_EXTRA --disable-ipxe --disable-rombios" +CONFIG_EXTRA="$CONFIG_EXTRA --disable-pvshim" +%endif +CONFIG_EXTRA="$CONFIG_EXTRA --with-system-qemu=/usr/bin/qemu-system-x86_64" +export PATH="/usr/bin:$PATH" +autoreconf -i +# END QUBES SPECIFIC PART + ./configure --prefix=%{_prefix} --libdir=%{_libdir} --libexecdir=%{_libexecdir} --with-system-qemu=/usr/bin/qemu-system-i386 --with-linux-backend-modules="xen-evtchn xen-gntdev xen-gntalloc xen-blkback xen-netback xen-pciback xen-scsiback xen-acpi-processor" --enable-systemd --disable-pygrub $CONFIG_EXTRA unset CFLAGS CXXFLAGS FFLAGS LDFLAGS export LDFLAGS="$LDFLAGS_SAVE" @@ -449,17 +461,6 @@ export CFLAGS=`echo $CFLAGS | sed -e 's/-mfloat-abi=hard//g' -e 's/-march=armv7- %endif unset CFLAGS CXXFLAGS FFLAGS LDFLAGS -# BEGIN QUBES SPECIFIC PART -%ifnarch armv7hl aarch64 -#CONFIG_EXTRA="$CONFIG_EXTRA --with-system-ipxe=/usr/share/ipxe" -CONFIG_EXTRA="$CONFIG_EXTRA --disable-ipxe --disable-rombios" -CONFIG_EXTRA="$CONFIG_EXTRA --disable-pvshim" -%endif -CONFIG_EXTRA="$CONFIG_EXTRA --with-system-qemu=/usr/bin/qemu-system-x86_64" -export PATH="/usr/bin:$PATH" -autoreconf -i -# END QUBES SPECIFIC PART - %make_build %{?ocaml_flags} prefix=/usr tools %if %build_docs make prefix=/usr docs @@ -897,10 +898,10 @@ fi %ifarch %{ix86} x86_64 %dir %{_libexecdir}/%{name}/boot %{_libexecdir}/xen/boot/hvmloader -%ifnarch %{ix86} -%{_libexecdir}/%{name}/boot/xen-shim -/usr/lib/debug%{_libexecdir}/xen/boot/xen-shim-syms -%endif +%dnl %ifnarch %{ix86} +%dnl %{_libexecdir}/%{name}/boot/xen-shim +%dnl /usr/lib/debug%{_libexecdir}/xen/boot/xen-shim-syms +%dnl %endif %if %build_stubdom %if %build_qemutrad %{_libexecdir}/xen/boot/ioemu-stubdom.gz