From 87ecf6eb717f4b5939fc0f7172516b3ddede8fe1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= Date: Wed, 8 Jan 2025 05:25:21 +0100 Subject: [PATCH 1/6] Revert "Mark the version as 4.19.1.0" This reverts commit 346e0bcfbffae725903369ca0821a28d87228c22. --- .qubesbuilder | 2 +- archlinux/PKGBUILD.in | 4 ++-- xen.spec.in | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.qubesbuilder b/.qubesbuilder index 36817db..dc441cf 100644 --- a/.qubesbuilder +++ b/.qubesbuilder @@ -13,6 +13,6 @@ source: # - git-url: https://gitlab.com/xen-project/xen.git - git-url: https://xenbits.xenproject.org/git-http/xen.git tag: RELEASE-@VERSION@ - git-basename: xen-@VERSION@.0 + git-basename: xen-@VERSION@ pubkeys: - xen.org-key.asc diff --git a/archlinux/PKGBUILD.in b/archlinux/PKGBUILD.in index c5cf0b9..ed1e286 100644 --- a/archlinux/PKGBUILD.in +++ b/archlinux/PKGBUILD.in @@ -2,7 +2,7 @@ _upstream_pkgver=@VERSION@ pkgname=qubes-vm-xen -pkgbase="xen-$_upstream_pkgver.0" +pkgbase="xen-$_upstream_pkgver" pkgver=${_upstream_pkgver/-/\~} pkgrel=@REL@ pkgdesc="Xen is a virtual machine monitor" @@ -23,7 +23,7 @@ _patches=( 1102-docs-xen-headers-use-alphabetical-sorting-for-incont.patch 1103-Strip-build-path-directories-in-tools-xen-and-xen-ar.patch ) -source=(xen-$_upstream_pkgver.0.tar.gz "${_patches[@]}") +source=(xen-$_upstream_pkgver.tar.gz "${_patches[@]}") md5sums=(SKIP SKIP SKIP SKIP SKIP SKIP SKIP SKIP) prepare() { diff --git a/xen.spec.in b/xen.spec.in index ad073f8..0d4e672 100644 --- a/xen.spec.in +++ b/xen.spec.in @@ -72,7 +72,7 @@ Release: %{?rctag}@REL@%{?dist} Epoch: 2001 License: GPLv2+ and LGPLv2+ and BSD URL: http://xen.org/ -Source0: https://downloads.xenproject.org/release/xen/%{upstream_version}/xen-%{upstream_version}.0.tar.gz +Source0: https://downloads.xenproject.org/release/xen/%{upstream_version}/xen-%{upstream_version}.tar.gz Source2: %{name}.logrotate # .config file for xen hypervisor Source3: config @@ -377,7 +377,7 @@ manage Xen virtual machines. %endif %prep -%autosetup -p1 -n %{name}-%{upstream_version}.0 +%autosetup -p1 -n %{name}-%{upstream_version} # copy xen hypervisor .config file to change settings cp -v %{SOURCE3} xen/.config From 3dbaa67c3169876ce44162c283e5beba41021e0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= Date: Wed, 8 Jan 2025 15:46:42 +0100 Subject: [PATCH 2/6] Update to Xen 4.20-unstable Take the current staging commit 25492368ea429fe3357748660c72456b9ba16528. Adjust patches, and drop those already included upstream. This invents 4.20.0-rc0 version, just to make the build scripts happy. 
--- .qubesbuilder | 3 +- 0203-xen.efi.build.patch | 6 +- ...xen-list-add-LIST_HEAD_RO_AFTER_INIT.patch | 34 -- ...or-marking-only-part-of-a-MMIO-page-.patch | 516 ------------------ ...-sub-page-ro-API-to-make-just-xhci-d.patch | 90 --- ...pport-for-SRSO_U-S_NO-and-SRSO_MSR_F.patch | 350 ------------ ...ent-early-exit-from-i8259-loop-detec.patch | 83 --- ...-t-use-EFI-s-GetTime-call-by-default.patch | 34 +- ...se-ACPI-for-CPUs-without-hardcoded-C.patch | 15 +- ...e-LZMA_BLOCK_SIZE-for-uncompressing-.patch | 62 --- ...ates-time-based-on-SOURCE_DATE_EPOCH.patch | 2 +- version | 2 +- xen.spec.in | 7 - 13 files changed, 31 insertions(+), 1173 deletions(-) delete mode 100644 0300-xen-list-add-LIST_HEAD_RO_AFTER_INIT.patch delete mode 100644 0301-x86-mm-add-API-for-marking-only-part-of-a-MMIO-page-.patch delete mode 100644 0302-drivers-char-Use-sub-page-ro-API-to-make-just-xhci-d.patch delete mode 100644 0305-x86-spec-ctrl-Support-for-SRSO_U-S_NO-and-SRSO_MSR_F.patch delete mode 100644 0306-x86-io-apic-prevent-early-exit-from-i8259-loop-detec.patch delete mode 100644 0630-tools-xg-increase-LZMA_BLOCK_SIZE-for-uncompressing-.patch diff --git a/.qubesbuilder b/.qubesbuilder index dc441cf..c38f532 100644 --- a/.qubesbuilder +++ b/.qubesbuilder @@ -12,7 +12,8 @@ source: # signature: https://downloads.xenproject.org/release/xen/@VERSION@/xen-@VERSION@.tar.gz.sig # - git-url: https://gitlab.com/xen-project/xen.git - git-url: https://xenbits.xenproject.org/git-http/xen.git - tag: RELEASE-@VERSION@ +# tag: RELEASE-@VERSION@ git-basename: xen-@VERSION@ + commit-id: 25492368ea429fe3357748660c72456b9ba16528 pubkeys: - xen.org-key.asc diff --git a/0203-xen.efi.build.patch b/0203-xen.efi.build.patch index 718e26b..76cc398 100644 --- a/0203-xen.efi.build.patch +++ b/0203-xen.efi.build.patch @@ -15,10 +15,10 @@ index 4f6c086988dd..0efc664bc919 100644 -c $(srctree)/$(efi-check).c -o $(efi-check).o,y) # Check if the linker supports PE. --EFI_LDFLAGS := $(patsubst -m%,-mi386pep,$(LDFLAGS)) --subsystem=10 -+#EFI_LDFLAGS := $(patsubst -m%,-mi386pep,$(LDFLAGS)) --subsystem=10 +-EFI_LDFLAGS := $(patsubst -m%,-mi386pep,$(LDFLAGS)) --subsystem=10 --enable-long-section-names ++#EFI_LDFLAGS := $(patsubst -m%,-mi386pep,$(LDFLAGS)) --subsystem=10 --enable-long-section-names +# use a reduced set of options from LDFLAGS -+EFI_LDFLAGS = --as-needed --build-id=sha1 -mi386pep --subsystem=10 ++EFI_LDFLAGS = --as-needed --build-id=sha1 -mi386pep --subsystem=10 --enable-long-section-names LD_PE_check_cmd = $(call ld-option,$(EFI_LDFLAGS) --image-base=0x100000000 -o $(efi-check).efi $(efi-check).o) XEN_BUILD_PE := $(LD_PE_check_cmd) diff --git a/0300-xen-list-add-LIST_HEAD_RO_AFTER_INIT.patch b/0300-xen-list-add-LIST_HEAD_RO_AFTER_INIT.patch deleted file mode 100644 index 5f26079..0000000 --- a/0300-xen-list-add-LIST_HEAD_RO_AFTER_INIT.patch +++ /dev/null @@ -1,34 +0,0 @@ -From 2e1e0504cc52901689d15517459163b4159c8110 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= - -Date: Tue, 23 Jul 2024 13:59:12 +0200 -Subject: [PATCH] xen/list: add LIST_HEAD_RO_AFTER_INIT -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Similar to LIST_HEAD_READ_MOSTLY. 
- -Signed-off-by: Marek Marczykowski-Górecki -Acked-by: Jan Beulich ---- - xen/include/xen/list.h | 3 +++ - 1 file changed, 3 insertions(+) - -diff --git a/xen/include/xen/list.h b/xen/include/xen/list.h -index 6506ac40893b..62169f46742e 100644 ---- a/xen/include/xen/list.h -+++ b/xen/include/xen/list.h -@@ -42,6 +42,9 @@ struct list_head { - #define LIST_HEAD_READ_MOSTLY(name) \ - struct list_head __read_mostly name = LIST_HEAD_INIT(name) - -+#define LIST_HEAD_RO_AFTER_INIT(name) \ -+ struct list_head __ro_after_init name = LIST_HEAD_INIT(name) -+ - static inline void INIT_LIST_HEAD(struct list_head *list) - { - list->next = list; --- -2.46.0 - diff --git a/0301-x86-mm-add-API-for-marking-only-part-of-a-MMIO-page-.patch b/0301-x86-mm-add-API-for-marking-only-part-of-a-MMIO-page-.patch deleted file mode 100644 index 282d5cf..0000000 --- a/0301-x86-mm-add-API-for-marking-only-part-of-a-MMIO-page-.patch +++ /dev/null @@ -1,516 +0,0 @@ -From 7439bbc83314e4d24a82afca4f6dbf1a1d002141 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= - -Date: Mon, 20 Mar 2023 21:19:25 +0100 -Subject: [PATCH] x86/mm: add API for marking only part of a MMIO page - read only -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -In some cases, only few registers on a page needs to be write-protected. -Examples include USB3 console (64 bytes worth of registers) or MSI-X's -PBA table (which doesn't need to span the whole table either), although -in the latter case the spec forbids placing other registers on the same -page. Current API allows only marking whole pages pages read-only, -which sometimes may cover other registers that guest may need to -write into. - -Currently, when a guest tries to write to an MMIO page on the -mmio_ro_ranges, it's either immediately crashed on EPT violation - if -that's HVM, or if PV, it gets #PF. In case of Linux PV, if access was -from userspace (like, /dev/mem), it will try to fixup by updating page -tables (that Xen again will force to read-only) and will hit that #PF -again (looping endlessly). Both behaviors are undesirable if guest could -actually be allowed the write. - -Introduce an API that allows marking part of a page read-only. Since -sub-page permissions are not a thing in page tables (they are in EPT, -but not granular enough), do this via emulation (or simply page fault -handler for PV) that handles writes that are supposed to be allowed. -The new subpage_mmio_ro_add() takes a start physical address and the -region size in bytes. Both start address and the size need to be 8-byte -aligned, as a practical simplification (allows using smaller bitmask, -and a smaller granularity isn't really necessary right now). -It will internally add relevant pages to mmio_ro_ranges, but if either -start or end address is not page-aligned, it additionally adds that page -to a list for sub-page R/O handling. The list holds a bitmask which -qwords are supposed to be read-only and an address where page is mapped -for write emulation - this mapping is done only on the first access. A -plain list is used instead of more efficient structure, because there -isn't supposed to be many pages needing this precise r/o control. - -The mechanism this API is plugged in is slightly different for PV and -HVM. For both paths, it's plugged into mmio_ro_emulated_write(). For PV, -it's already called for #PF on read-only MMIO page. 
For HVM however, EPT -violation on p2m_mmio_direct page results in a direct domain_crash() for -non hardware domains. To reach mmio_ro_emulated_write(), change how -write violations for p2m_mmio_direct are handled - specifically, check -if they relate to such partially protected page via -subpage_mmio_write_accept() and if so, call hvm_emulate_one_mmio() for -them too. This decodes what guest is trying write and finally calls -mmio_ro_emulated_write(). The EPT write violation is detected as -npfec.write_access and npfec.present both being true (similar to other -places), which may cover some other (future?) cases - if that happens, -emulator might get involved unnecessarily, but since it's limited to -pages marked with subpage_mmio_ro_add() only, the impact is minimal. -Both of those paths need an MFN to which guest tried to write (to check -which part of the page is supposed to be read-only, and where -the page is mapped for writes). This information currently isn't -available directly in mmio_ro_emulated_write(), but in both cases it is -already resolved somewhere higher in the call tree. Pass it down to -mmio_ro_emulated_write() via new mmio_ro_emulate_ctxt.mfn field. - -This may give a bit more access to the instruction emulator to HVM -guests (the change in hvm_hap_nested_page_fault()), but only for pages -explicitly marked with subpage_mmio_ro_add() - so, if the guest has a -passed through a device partially used by Xen. -As of the next patch, it applies only configuration explicitly -documented as not security supported. - -The subpage_mmio_ro_add() function cannot be called with overlapping -ranges, and on pages already added to mmio_ro_ranges separately. -Successful calls would result in correct handling, but error paths may -result in incorrect state (like pages removed from mmio_ro_ranges too -early). Debug build has asserts for relevant cases. - -Signed-off-by: Marek Marczykowski-Górecki ---- -Shadow mode is not tested, but I don't expect it to work differently than -HAP in areas related to this patch. 
- -Changes in v7: -- refuse misaligned start in release build too, to have release build - running what was tested in debug build -- simplify return from subpage_mmio_ro_add_page -Changes in v6: -- fix return type of subpage_mmio_find_page() -- change 'iter' pointer to 'new_entry' bool and move list_add() -- comment why different error handling for unaligned start / size -- code style -Changes in v5: -- use subpage_mmio_find_page helper, simplifying several functions -- use LIST_HEAD_RO_AFTER_INIT -- don't use subpage_ro_lock in __init -- drop #ifdef in mm.h -- return error on unaligned size in subpage_mmio_ro_add() instead of - extending the size (in release build) -Changes in v4: -- rename SUBPAGE_MMIO_RO_ALIGN to MMIO_RO_SUBPAGE_GRAN -- guard subpage_mmio_write_accept with CONFIG_HVM, as it's used only - there -- rename ro_qwords to ro_elems -- use unsigned arguments for subpage_mmio_ro_remove_page() -- use volatile for __iomem -- do not set mmio_ro_ctxt.mfn for mmcfg case -- comment where fields of mmio_ro_ctxt are used -- use bool for result of __test_and_set_bit -- do not open-code mfn_to_maddr() -- remove leftover RCU -- mention hvm_hap_nested_page_fault() explicitly in the commit message -Changes in v3: -- use unsigned int for loop iterators -- use __set_bit/__clear_bit when under spinlock -- avoid ioremap() under spinlock -- do not cast away const -- handle unaligned parameters in release build -- comment fixes -- remove RCU - the add functions are __init and actual usage is only - much later after domains are running -- add checks overlapping ranges in debug build and document the - limitations -- change subpage_mmio_ro_add() so the error path doesn't potentially - remove pages from mmio_ro_ranges -- move printing message to avoid one goto in - subpage_mmio_write_emulate() -Changes in v2: -- Simplify subpage_mmio_ro_add() parameters -- add to mmio_ro_ranges from within subpage_mmio_ro_add() -- use ioremap() instead of caller-provided fixmap -- use 8-bytes granularity (largest supported single write) and a bitmap - instead of a rangeset -- clarify commit message -- change how it's plugged in for HVM domain, to not change the behavior for - read-only parts (keep it hitting domain_crash(), instead of ignoring - write) -- remove unused subpage_mmio_ro_remove() ---- - xen/arch/x86/hvm/emulate.c | 2 +- - xen/arch/x86/hvm/hvm.c | 4 +- - xen/arch/x86/include/asm/mm.h | 23 +++ - xen/arch/x86/mm.c | 261 ++++++++++++++++++++++++++++++++ - xen/arch/x86/pv/ro-page-fault.c | 6 +- - 5 files changed, 291 insertions(+), 5 deletions(-) - -diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c -index 02e378365b40..7253a87032dd 100644 ---- a/xen/arch/x86/hvm/emulate.c -+++ b/xen/arch/x86/hvm/emulate.c -@@ -2734,7 +2734,7 @@ int hvm_emulate_one_mmio(unsigned long mfn, unsigned long gla) - .write = mmio_ro_emulated_write, - .validate = hvmemul_validate, - }; -- struct mmio_ro_emulate_ctxt mmio_ro_ctxt = { .cr2 = gla }; -+ struct mmio_ro_emulate_ctxt mmio_ro_ctxt = { .cr2 = gla, .mfn = _mfn(mfn) }; - struct hvm_emulate_ctxt ctxt; - const struct x86_emulate_ops *ops; - unsigned int seg, bdf; -diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c -index 7f4b627b1f5f..a108870558bf 100644 ---- a/xen/arch/x86/hvm/hvm.c -+++ b/xen/arch/x86/hvm/hvm.c -@@ -2016,8 +2016,8 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla, - goto out_put_gfn; - } - -- if ( (p2mt == p2m_mmio_direct) && is_hardware_domain(currd) && -- npfec.write_access && npfec.present && -+ if ( (p2mt == 
p2m_mmio_direct) && npfec.write_access && npfec.present && -+ (is_hardware_domain(currd) || subpage_mmio_write_accept(mfn, gla)) && - (hvm_emulate_one_mmio(mfn_x(mfn), gla) == X86EMUL_OKAY) ) - { - rc = 1; -diff --git a/xen/arch/x86/include/asm/mm.h b/xen/arch/x86/include/asm/mm.h -index 98b66edaca5e..a457f0d2b1b3 100644 ---- a/xen/arch/x86/include/asm/mm.h -+++ b/xen/arch/x86/include/asm/mm.h -@@ -522,9 +522,32 @@ extern struct rangeset *mmio_ro_ranges; - void memguard_guard_stack(void *p); - void memguard_unguard_stack(void *p); - -+/* -+ * Add more precise r/o marking for a MMIO page. Range specified here -+ * will still be R/O, but the rest of the page (not marked as R/O via another -+ * call) will have writes passed through. -+ * The start address and the size must be aligned to MMIO_RO_SUBPAGE_GRAN. -+ * -+ * This API cannot be used for overlapping ranges, nor for pages already added -+ * to mmio_ro_ranges separately. -+ * -+ * Since there is currently no subpage_mmio_ro_remove(), relevant device should -+ * not be hot-unplugged. -+ * -+ * Return values: -+ * - negative: error -+ * - 0: success -+ */ -+#define MMIO_RO_SUBPAGE_GRAN 8 -+int subpage_mmio_ro_add(paddr_t start, size_t size); -+bool subpage_mmio_write_accept(mfn_t mfn, unsigned long gla); -+ - struct mmio_ro_emulate_ctxt { - unsigned long cr2; -+ /* Used only for mmcfg case */ - unsigned int seg, bdf; -+ /* Used only for non-mmcfg case */ -+ mfn_t mfn; - }; - - int cf_check mmio_ro_emulated_write( -diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c -index 95795567f2a5..cfd487d06474 100644 ---- a/xen/arch/x86/mm.c -+++ b/xen/arch/x86/mm.c -@@ -152,6 +152,18 @@ struct rangeset *__read_mostly mmio_ro_ranges; - struct rangeset *__read_mostly mmio_ro_ranges; - - static uint32_t base_disallow_mask; -+ -+/* Handling sub-page read-only MMIO regions */ -+struct subpage_ro_range { -+ struct list_head list; -+ mfn_t mfn; -+ void __iomem *mapped; -+ DECLARE_BITMAP(ro_elems, PAGE_SIZE / MMIO_RO_SUBPAGE_GRAN); -+}; -+ -+static LIST_HEAD_RO_AFTER_INIT(subpage_ro_ranges); -+static DEFINE_SPINLOCK(subpage_ro_lock); -+ - /* Global bit is allowed to be set on L1 PTEs. Intended for user mappings. */ - #define L1_DISALLOW_MASK ((base_disallow_mask | _PAGE_GNTTAB) & ~_PAGE_GLOBAL) - -@@ -4912,6 +4923,253 @@ long arch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg) - return rc; - } - -+static struct subpage_ro_range *subpage_mmio_find_page(mfn_t mfn) -+{ -+ struct subpage_ro_range *entry; -+ -+ list_for_each_entry(entry, &subpage_ro_ranges, list) -+ if ( mfn_eq(entry->mfn, mfn) ) -+ return entry; -+ -+ return NULL; -+} -+ -+/* -+ * Mark part of the page as R/O. 
-+ * Returns: -+ * - 0 on success - first range in the page -+ * - 1 on success - subsequent range in the page -+ * - <0 on error -+ */ -+static int __init subpage_mmio_ro_add_page( -+ mfn_t mfn, -+ unsigned int offset_s, -+ unsigned int offset_e) -+{ -+ struct subpage_ro_range *entry = NULL; -+ bool new_entry = false; -+ unsigned int i; -+ -+ entry = subpage_mmio_find_page(mfn); -+ if ( !entry ) -+ { -+ entry = xzalloc(struct subpage_ro_range); -+ if ( !entry ) -+ return -ENOMEM; -+ entry->mfn = mfn; -+ list_add(&entry->list, &subpage_ro_ranges); -+ new_entry = true; -+ } -+ -+ for ( i = offset_s; i <= offset_e; i += MMIO_RO_SUBPAGE_GRAN ) -+ { -+ bool oldbit = __test_and_set_bit(i / MMIO_RO_SUBPAGE_GRAN, -+ entry->ro_elems); -+ ASSERT(!oldbit); -+ } -+ -+ return !new_entry; -+} -+ -+static void __init subpage_mmio_ro_remove_page( -+ mfn_t mfn, -+ unsigned int offset_s, -+ unsigned int offset_e) -+{ -+ struct subpage_ro_range *entry = NULL; -+ unsigned int i; -+ -+ entry = subpage_mmio_find_page(mfn); -+ if ( !entry ) -+ return; -+ -+ for ( i = offset_s; i <= offset_e; i += MMIO_RO_SUBPAGE_GRAN ) -+ __clear_bit(i / MMIO_RO_SUBPAGE_GRAN, entry->ro_elems); -+ -+ if ( !bitmap_empty(entry->ro_elems, PAGE_SIZE / MMIO_RO_SUBPAGE_GRAN) ) -+ return; -+ -+ list_del(&entry->list); -+ if ( entry->mapped ) -+ iounmap(entry->mapped); -+ xfree(entry); -+} -+ -+int __init subpage_mmio_ro_add( -+ paddr_t start, -+ size_t size) -+{ -+ mfn_t mfn_start = maddr_to_mfn(start); -+ paddr_t end = start + size - 1; -+ mfn_t mfn_end = maddr_to_mfn(end); -+ unsigned int offset_end = 0; -+ int rc; -+ bool subpage_start, subpage_end; -+ -+ ASSERT(IS_ALIGNED(start, MMIO_RO_SUBPAGE_GRAN)); -+ ASSERT(IS_ALIGNED(size, MMIO_RO_SUBPAGE_GRAN)); -+ if ( !IS_ALIGNED(start, MMIO_RO_SUBPAGE_GRAN) || -+ !IS_ALIGNED(size, MMIO_RO_SUBPAGE_GRAN) ) -+ return -EINVAL; -+ -+ if ( !size ) -+ return 0; -+ -+ if ( mfn_eq(mfn_start, mfn_end) ) -+ { -+ /* Both starting and ending parts handled at once */ -+ subpage_start = PAGE_OFFSET(start) || PAGE_OFFSET(end) != PAGE_SIZE - 1; -+ subpage_end = false; -+ } -+ else -+ { -+ subpage_start = PAGE_OFFSET(start); -+ subpage_end = PAGE_OFFSET(end) != PAGE_SIZE - 1; -+ } -+ -+ if ( subpage_start ) -+ { -+ offset_end = mfn_eq(mfn_start, mfn_end) ? 
-+ PAGE_OFFSET(end) : -+ (PAGE_SIZE - 1); -+ rc = subpage_mmio_ro_add_page(mfn_start, -+ PAGE_OFFSET(start), -+ offset_end); -+ if ( rc < 0 ) -+ goto err_unlock; -+ /* Check if not marking R/W part of a page intended to be fully R/O */ -+ ASSERT(rc || !rangeset_contains_singleton(mmio_ro_ranges, -+ mfn_x(mfn_start))); -+ } -+ -+ if ( subpage_end ) -+ { -+ rc = subpage_mmio_ro_add_page(mfn_end, 0, PAGE_OFFSET(end)); -+ if ( rc < 0 ) -+ goto err_unlock_remove; -+ /* Check if not marking R/W part of a page intended to be fully R/O */ -+ ASSERT(rc || !rangeset_contains_singleton(mmio_ro_ranges, -+ mfn_x(mfn_end))); -+ } -+ -+ rc = rangeset_add_range(mmio_ro_ranges, mfn_x(mfn_start), mfn_x(mfn_end)); -+ if ( rc ) -+ goto err_remove; -+ -+ return 0; -+ -+ err_remove: -+ if ( subpage_end ) -+ subpage_mmio_ro_remove_page(mfn_end, 0, PAGE_OFFSET(end)); -+ err_unlock_remove: -+ if ( subpage_start ) -+ subpage_mmio_ro_remove_page(mfn_start, PAGE_OFFSET(start), offset_end); -+ err_unlock: -+ return rc; -+} -+ -+static void __iomem *subpage_mmio_map_page( -+ struct subpage_ro_range *entry) -+{ -+ void __iomem *mapped_page; -+ -+ if ( entry->mapped ) -+ return entry->mapped; -+ -+ mapped_page = ioremap(mfn_to_maddr(entry->mfn), PAGE_SIZE); -+ -+ spin_lock(&subpage_ro_lock); -+ /* Re-check under the lock */ -+ if ( entry->mapped ) -+ { -+ spin_unlock(&subpage_ro_lock); -+ if ( mapped_page ) -+ iounmap(mapped_page); -+ return entry->mapped; -+ } -+ -+ entry->mapped = mapped_page; -+ spin_unlock(&subpage_ro_lock); -+ return entry->mapped; -+} -+ -+static void subpage_mmio_write_emulate( -+ mfn_t mfn, -+ unsigned int offset, -+ const void *data, -+ unsigned int len) -+{ -+ struct subpage_ro_range *entry; -+ volatile void __iomem *addr; -+ -+ entry = subpage_mmio_find_page(mfn); -+ if ( !entry ) -+ /* Do not print message for pages without any writable parts. */ -+ return; -+ -+ if ( test_bit(offset / MMIO_RO_SUBPAGE_GRAN, entry->ro_elems) ) -+ { -+ write_ignored: -+ gprintk(XENLOG_WARNING, -+ "ignoring write to R/O MMIO 0x%"PRI_mfn"%03x len %u\n", -+ mfn_x(mfn), offset, len); -+ return; -+ } -+ -+ addr = subpage_mmio_map_page(entry); -+ if ( !addr ) -+ { -+ gprintk(XENLOG_ERR, -+ "Failed to map page for MMIO write at 0x%"PRI_mfn"%03x\n", -+ mfn_x(mfn), offset); -+ return; -+ } -+ -+ switch ( len ) -+ { -+ case 1: -+ writeb(*(const uint8_t*)data, addr); -+ break; -+ case 2: -+ writew(*(const uint16_t*)data, addr); -+ break; -+ case 4: -+ writel(*(const uint32_t*)data, addr); -+ break; -+ case 8: -+ writeq(*(const uint64_t*)data, addr); -+ break; -+ default: -+ /* mmio_ro_emulated_write() already validated the size */ -+ ASSERT_UNREACHABLE(); -+ goto write_ignored; -+ } -+} -+ -+#ifdef CONFIG_HVM -+bool subpage_mmio_write_accept(mfn_t mfn, unsigned long gla) -+{ -+ unsigned int offset = PAGE_OFFSET(gla); -+ const struct subpage_ro_range *entry; -+ -+ entry = subpage_mmio_find_page(mfn); -+ if ( !entry ) -+ return false; -+ -+ if ( !test_bit(offset / MMIO_RO_SUBPAGE_GRAN, entry->ro_elems) ) -+ { -+ /* -+ * We don't know the write size at this point yet, so it could be -+ * an unaligned write, but accept it here anyway and deal with it -+ * later. 
-+ */ -+ return true; -+ } -+ -+ return false; -+} -+#endif -+ - int cf_check mmio_ro_emulated_write( - enum x86_segment seg, - unsigned long offset, -@@ -4930,6 +5188,9 @@ int cf_check mmio_ro_emulated_write( - return X86EMUL_UNHANDLEABLE; - } - -+ subpage_mmio_write_emulate(mmio_ro_ctxt->mfn, PAGE_OFFSET(offset), -+ p_data, bytes); -+ - return X86EMUL_OKAY; - } - -diff --git a/xen/arch/x86/pv/ro-page-fault.c b/xen/arch/x86/pv/ro-page-fault.c -index cad28ef928ad..2ea1a6ad489c 100644 ---- a/xen/arch/x86/pv/ro-page-fault.c -+++ b/xen/arch/x86/pv/ro-page-fault.c -@@ -333,8 +333,10 @@ static int mmio_ro_do_page_fault(struct x86_emulate_ctxt *ctxt, - ctxt->data = &mmio_ro_ctxt; - if ( pci_ro_mmcfg_decode(mfn_x(mfn), &mmio_ro_ctxt.seg, &mmio_ro_ctxt.bdf) ) - return x86_emulate(ctxt, &mmcfg_intercept_ops); -- else -- return x86_emulate(ctxt, &mmio_ro_emulate_ops); -+ -+ mmio_ro_ctxt.mfn = mfn; -+ -+ return x86_emulate(ctxt, &mmio_ro_emulate_ops); - } - - int pv_ro_page_fault(unsigned long addr, struct cpu_user_regs *regs) --- -2.46.0 - diff --git a/0302-drivers-char-Use-sub-page-ro-API-to-make-just-xhci-d.patch b/0302-drivers-char-Use-sub-page-ro-API-to-make-just-xhci-d.patch deleted file mode 100644 index a37b131..0000000 --- a/0302-drivers-char-Use-sub-page-ro-API-to-make-just-xhci-d.patch +++ /dev/null @@ -1,90 +0,0 @@ -From 278c3f5336a02f6c3235772271e364f9d50c6034 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= - -Date: Fri, 24 Mar 2023 18:24:41 +0100 -Subject: [PATCH] drivers/char: Use sub-page ro API to make just xhci - dbc cap RO -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Not the whole page, which may contain other registers too. The XHCI -specification describes DbC as designed to be controlled by a different -driver, but does not mandate placing registers on a separate page. In fact -on Tiger Lake and newer (at least), this page do contain other registers -that Linux tries to use. And with share=yes, a domU would use them too. -Without this patch, PV dom0 would fail to initialize the controller, -while HVM would be killed on EPT violation. - -With `share=yes`, this patch gives domU more access to the emulator -(although a HVM with any emulated device already has plenty of it). This -configuration is already documented as unsafe with untrusted guests and -not security supported. - -Signed-off-by: Marek Marczykowski-Górecki -Reviewed-by: Jan Beulich ---- -Changes in v4: -- restore mmio_ro_ranges in the fallback case -- set XHCI_SHARE_NONE in the fallback case -Changes in v3: -- indentation fix -- remove stale comment -- fallback to pci_ro_device() if subpage_mmio_ro_add() fails -- extend commit message -Changes in v2: - - adjust for simplified subpage_mmio_ro_add() API ---- - xen/drivers/char/xhci-dbc.c | 36 ++++++++++++++++++++++-------------- - 1 file changed, 22 insertions(+), 14 deletions(-) - -diff --git a/xen/drivers/char/xhci-dbc.c b/xen/drivers/char/xhci-dbc.c -index 8e2037f1a5f7..c45e4b6825cc 100644 ---- a/xen/drivers/char/xhci-dbc.c -+++ b/xen/drivers/char/xhci-dbc.c -@@ -1216,20 +1216,28 @@ static void __init cf_check dbc_uart_init_postirq(struct serial_port *port) - break; - } - #ifdef CONFIG_X86 -- /* -- * This marks the whole page as R/O, which may include other registers -- * unrelated to DbC. Xen needs only DbC area protected, but it seems -- * Linux's XHCI driver (as of 5.18) works without writting to the whole -- * page, so keep it simple. 
-- */ -- if ( rangeset_add_range(mmio_ro_ranges, -- PFN_DOWN((uart->dbc.bar_val & PCI_BASE_ADDRESS_MEM_MASK) + -- uart->dbc.xhc_dbc_offset), -- PFN_UP((uart->dbc.bar_val & PCI_BASE_ADDRESS_MEM_MASK) + -- uart->dbc.xhc_dbc_offset + -- sizeof(*uart->dbc.dbc_reg)) - 1) ) -- printk(XENLOG_INFO -- "Error while adding MMIO range of device to mmio_ro_ranges\n"); -+ if ( subpage_mmio_ro_add( -+ (uart->dbc.bar_val & PCI_BASE_ADDRESS_MEM_MASK) + -+ uart->dbc.xhc_dbc_offset, -+ sizeof(*uart->dbc.dbc_reg)) ) -+ { -+ printk(XENLOG_WARNING -+ "Error while marking MMIO range of XHCI console as R/O, " -+ "making the whole device R/O (share=no)\n"); -+ uart->dbc.share = XHCI_SHARE_NONE; -+ if ( pci_ro_device(0, uart->dbc.sbdf.bus, uart->dbc.sbdf.devfn) ) -+ printk(XENLOG_WARNING -+ "Failed to mark read-only %pp used for XHCI console\n", -+ &uart->dbc.sbdf); -+ if ( rangeset_add_range(mmio_ro_ranges, -+ PFN_DOWN((uart->dbc.bar_val & PCI_BASE_ADDRESS_MEM_MASK) + -+ uart->dbc.xhc_dbc_offset), -+ PFN_UP((uart->dbc.bar_val & PCI_BASE_ADDRESS_MEM_MASK) + -+ uart->dbc.xhc_dbc_offset + -+ sizeof(*uart->dbc.dbc_reg)) - 1) ) -+ printk(XENLOG_INFO -+ "Error while adding MMIO range of device to mmio_ro_ranges\n"); -+ } - #endif - } - --- -2.46.0 - diff --git a/0305-x86-spec-ctrl-Support-for-SRSO_U-S_NO-and-SRSO_MSR_F.patch b/0305-x86-spec-ctrl-Support-for-SRSO_U-S_NO-and-SRSO_MSR_F.patch deleted file mode 100644 index 4e697ee..0000000 --- a/0305-x86-spec-ctrl-Support-for-SRSO_U-S_NO-and-SRSO_MSR_F.patch +++ /dev/null @@ -1,350 +0,0 @@ -From db40e7b40bb68470684d6bef2c0318c448df34d8 Mon Sep 17 00:00:00 2001 -From: Andrew Cooper -Date: Fri, 20 Dec 2024 19:34:24 +0000 -Subject: [PATCH 305/306] x86/spec-ctrl: Support for SRSO_U/S_NO and - SRSO_MSR_FIX -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -AMD have updated the SRSO whitepaper[1] with further information. These -features exist on AMD Zen5 CPUs and are necessary for Xen to use. - -The two features are in principle unrelated: - - * SRSO_U/S_NO is an enumeration saying that SRSO attacks can't cross the - User(CPL3) / Supervisor(CPL<3) boundary. i.e. Xen don't need to use - IBPB-on-entry for PV64. PV32 guests are explicitly unsupported for - speculative issues, and excluded from consideration for simplicity. - - * SRSO_MSR_FIX is an enumeration identifying that the BP_SPEC_REDUCE bit is - available in MSR_BP_CFG. When set, SRSO attacks can't cross the host/guest - boundary. i.e. Xen don't need to use IBPB-on-entry for HVM. - -Extend ibpb_calculations() to account for these when calculating -opt_ibpb_entry_{pv,hvm} defaults. Add a `bp-spec-reduce=` option to -control the use of BP_SPEC_REDUCE, with it active by default. - -Because MSR_BP_CFG is core-scoped with a race condition updating it, repurpose -amd_check_erratum_1485() into amd_check_bp_cfg() and calculate all updates at -once. - -Xen also needs to to advertise SRSO_U/S_NO to guests to allow the guest kernel -to skip SRSO mitigations too: - - * This is trivial for HVM guests. It is also is accurate for PV32 guests - too, but we have already excluded them from consideration, and do so again - here to simplify the policy logic. - - * As written, SRSO_U/S_NO does not help for the PV64 user->kernel boundary. 
- However, after discussing with AMD, an implementation detail of having - BP_SPEC_REDUCE active causes the PV64 user->kernel boundary to have the - property described by SRSO_U/S_NO, so we can advertise SRSO_U/S_NO to - guests when the BP_SPEC_REDUCE precondition is met. - -Finally, fix a typo in the SRSO_NO's comment. - -[1] https://www.amd.com/content/dam/amd/en/documents/corporate/cr/speculative-return-stack-overflow-whitepaper.pdf - -Signed-off-by: Andrew Cooper -Reviewed-by: Roger Pau Monné ---- - docs/misc/xen-command-line.pandoc | 9 +++- - xen/arch/x86/cpu-policy.c | 21 +++++++++ - xen/arch/x86/cpu/amd.c | 29 +++++++++--- - xen/arch/x86/include/asm/msr-index.h | 1 + - xen/arch/x86/include/asm/spec_ctrl.h | 1 + - xen/arch/x86/spec_ctrl.c | 51 ++++++++++++++++----- - xen/include/public/arch-x86/cpufeatureset.h | 4 +- - 7 files changed, 96 insertions(+), 20 deletions(-) - -diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc -index 2096ae5841de..1944847172d7 100644 ---- a/docs/misc/xen-command-line.pandoc -+++ b/docs/misc/xen-command-line.pandoc -@@ -2392,7 +2392,7 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`). - > {ibrs,ibpb,ssbd,psfd, - > eager-fpu,l1d-flush,branch-harden,srb-lock, - > unpriv-mmio,gds-mit,div-scrub,lock-harden, --> bhi-dis-s}= ]` -+> bhi-dis-s,bp-spec-reduce}= ]` - - Controls for speculative execution sidechannel mitigations. By default, Xen - will pick the most appropriate mitigations based on compiled in support, -@@ -2541,6 +2541,13 @@ boolean can be used to force or prevent Xen from using speculation barriers to - protect lock critical regions. This mitigation won't be engaged by default, - and needs to be explicitly enabled on the command line. - -+On hardware supporting SRSO_MSR_FIX, the `bp-spec-reduce=` option can be used -+to force or prevent Xen from using MSR_BP_CFG.BP_SPEC_REDUCE to mitigate the -+SRSO (Speculative Return Stack Overflow) vulnerability. Xen will use -+bp-spec-reduce when available, as it is preferable to using `ibpb-entry=hvm` -+to mitigate SRSO for HVM guests, and because it is a prerequisite to advertise -+SRSO_U/S_NO to PV guests. -+ - ### sync_console - > `= ` - -diff --git a/xen/arch/x86/cpu-policy.c b/xen/arch/x86/cpu-policy.c -index 304dc20cfab8..1722f5f90392 100644 ---- a/xen/arch/x86/cpu-policy.c -+++ b/xen/arch/x86/cpu-policy.c -@@ -14,6 +14,7 @@ - #include - #include - #include -+#include - #include - - struct cpu_policy __read_mostly raw_cpu_policy; -@@ -605,6 +606,26 @@ static void __init calculate_pv_max_policy(void) - __clear_bit(X86_FEATURE_IBRS, fs); - } - -+ /* -+ * SRSO_U/S_NO means that the CPU is not vulnerable to SRSO attacks across -+ * the User (CPL3)/Supervisor (CPL<3) boundary. -+ * -+ * PV32 guests are unsupported for speculative issues, and excluded from -+ * consideration for simplicity. -+ * -+ * The PV64 user/kernel boundary is CPL3 on both sides, so SRSO_U/S_NO -+ * won't convey the meaning that a PV kernel expects. -+ * -+ * After discussions with AMD, an implementation detail of having -+ * BP_SPEC_REDUCE active causes the PV64 user/kernel boundary to have a -+ * property compatible with the meaning of SRSO_U/S_NO. -+ * -+ * If BP_SPEC_REDUCE isn't active, remove SRSO_U/S_NO from the PV max -+ * policy, which will cause it to filter out of PV default too. 
-+ */ -+ if ( !boot_cpu_has(X86_FEATURE_SRSO_MSR_FIX) || !opt_bp_spec_reduce ) -+ __clear_bit(X86_FEATURE_SRSO_US_NO, fs); -+ - guest_common_max_feature_adjustments(fs); - guest_common_feature_adjustments(fs); - -diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c -index ab92333673b9..c448997be551 100644 ---- a/xen/arch/x86/cpu/amd.c -+++ b/xen/arch/x86/cpu/amd.c -@@ -1009,16 +1009,33 @@ static void cf_check fam17_disable_c6(void *arg) - wrmsrl(MSR_AMD_CSTATE_CFG, val & mask); - } - --static void amd_check_erratum_1485(void) -+static void amd_check_bp_cfg(void) - { -- uint64_t val, chickenbit = (1 << 5); -+ uint64_t val, new = 0; - -- if (cpu_has_hypervisor || boot_cpu_data.x86 != 0x19 || !is_zen4_uarch()) -+ /* -+ * AMD Erratum #1485. Set bit 5, as instructed. -+ */ -+ if (!cpu_has_hypervisor && boot_cpu_data.x86 == 0x19 && is_zen4_uarch()) -+ new |= (1 << 5); -+ -+ /* -+ * On hardware supporting SRSO_MSR_FIX, activate BP_SPEC_REDUCE by -+ * default. This lets us do two things: -+ * -+ * 1) Avoid IBPB-on-entry to mitigate SRSO attacks from HVM guests. -+ * 2) Advertise SRSO_US_NO to PV guests. -+ */ -+ if (boot_cpu_has(X86_FEATURE_SRSO_MSR_FIX) && opt_bp_spec_reduce) -+ new |= BP_CFG_SPEC_REDUCE; -+ -+ /* Avoid reading BP_CFG if we don't intend to change anything. */ -+ if (!new) - return; - - rdmsrl(MSR_AMD64_BP_CFG, val); - -- if (val & chickenbit) -+ if ((val & new) == new) - return; - - /* -@@ -1027,7 +1044,7 @@ static void amd_check_erratum_1485(void) - * same time before the chickenbit is set. It's benign because the - * value being written is the same on both. - */ -- wrmsrl(MSR_AMD64_BP_CFG, val | chickenbit); -+ wrmsrl(MSR_AMD64_BP_CFG, val | new); - } - - static void cf_check init_amd(struct cpuinfo_x86 *c) -@@ -1297,7 +1314,7 @@ static void cf_check init_amd(struct cpuinfo_x86 *c) - disable_c1_ramping(); - - amd_check_zenbleed(); -- amd_check_erratum_1485(); -+ amd_check_bp_cfg(); - - if (fam17_c6_disabled) - fam17_disable_c6(NULL); -diff --git a/xen/arch/x86/include/asm/msr-index.h b/xen/arch/x86/include/asm/msr-index.h -index 17dd857af802..b324356fd550 100644 ---- a/xen/arch/x86/include/asm/msr-index.h -+++ b/xen/arch/x86/include/asm/msr-index.h -@@ -412,6 +412,7 @@ - #define AMD64_DE_CFG_LFENCE_SERIALISE (_AC(1, ULL) << 1) - #define MSR_AMD64_EX_CFG 0xc001102cU - #define MSR_AMD64_BP_CFG 0xc001102eU -+#define BP_CFG_SPEC_REDUCE (_AC(1, ULL) << 4) - #define MSR_AMD64_DE_CFG2 0xc00110e3U - - #define MSR_AMD64_DR0_ADDRESS_MASK 0xc0011027U -diff --git a/xen/arch/x86/include/asm/spec_ctrl.h b/xen/arch/x86/include/asm/spec_ctrl.h -index 72347ef2b959..077225418956 100644 ---- a/xen/arch/x86/include/asm/spec_ctrl.h -+++ b/xen/arch/x86/include/asm/spec_ctrl.h -@@ -90,6 +90,7 @@ extern int8_t opt_xpti_hwdom, opt_xpti_domu; - - extern bool cpu_has_bug_l1tf; - extern int8_t opt_pv_l1tf_hwdom, opt_pv_l1tf_domu; -+extern bool opt_bp_spec_reduce; - - /* - * The L1D address mask, which might be wider than reported in CPUID, and the -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 40f6ae017010..35351044f901 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -83,6 +83,7 @@ static bool __initdata opt_unpriv_mmio; - static bool __ro_after_init opt_verw_mmio; - static int8_t __initdata opt_gds_mit = -1; - static int8_t __initdata opt_div_scrub = -1; -+bool __ro_after_init opt_bp_spec_reduce = true; - - static int __init cf_check parse_spec_ctrl(const char *s) - { -@@ -143,6 +144,7 @@ static int __init cf_check parse_spec_ctrl(const char 
*s) - opt_unpriv_mmio = false; - opt_gds_mit = 0; - opt_div_scrub = 0; -+ opt_bp_spec_reduce = false; - } - else if ( val > 0 ) - rc = -EINVAL; -@@ -363,6 +365,8 @@ static int __init cf_check parse_spec_ctrl(const char *s) - opt_gds_mit = val; - else if ( (val = parse_boolean("div-scrub", s, ss)) >= 0 ) - opt_div_scrub = val; -+ else if ( (val = parse_boolean("bp-spec-reduce", s, ss)) >= 0 ) -+ opt_bp_spec_reduce = val; - else - rc = -EINVAL; - -@@ -505,7 +509,7 @@ static void __init print_details(enum ind_thunk thunk) - * Hardware read-only information, stating immunity to certain issues, or - * suggestions of which mitigation to use. - */ -- printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", -+ printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", - (caps & ARCH_CAPS_RDCL_NO) ? " RDCL_NO" : "", - (caps & ARCH_CAPS_EIBRS) ? " EIBRS" : "", - (caps & ARCH_CAPS_RSBA) ? " RSBA" : "", -@@ -529,10 +533,11 @@ static void __init print_details(enum ind_thunk thunk) - (e8b & cpufeat_mask(X86_FEATURE_BTC_NO)) ? " BTC_NO" : "", - (e8b & cpufeat_mask(X86_FEATURE_IBPB_RET)) ? " IBPB_RET" : "", - (e21a & cpufeat_mask(X86_FEATURE_IBPB_BRTYPE)) ? " IBPB_BRTYPE" : "", -- (e21a & cpufeat_mask(X86_FEATURE_SRSO_NO)) ? " SRSO_NO" : ""); -+ (e21a & cpufeat_mask(X86_FEATURE_SRSO_NO)) ? " SRSO_NO" : "", -+ (e21a & cpufeat_mask(X86_FEATURE_SRSO_US_NO)) ? " SRSO_US_NO" : ""); - - /* Hardware features which need driving to mitigate issues. */ -- printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", -+ printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", - (e8b & cpufeat_mask(X86_FEATURE_IBPB)) || - (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBPB" : "", - (e8b & cpufeat_mask(X86_FEATURE_IBRS)) || -@@ -551,7 +556,8 @@ static void __init print_details(enum ind_thunk thunk) - (caps & ARCH_CAPS_FB_CLEAR_CTRL) ? " FB_CLEAR_CTRL" : "", - (caps & ARCH_CAPS_GDS_CTRL) ? " GDS_CTRL" : "", - (caps & ARCH_CAPS_RFDS_CLEAR) ? " RFDS_CLEAR" : "", -- (e21a & cpufeat_mask(X86_FEATURE_SBPB)) ? " SBPB" : ""); -+ (e21a & cpufeat_mask(X86_FEATURE_SBPB)) ? " SBPB" : "", -+ (e21a & cpufeat_mask(X86_FEATURE_SRSO_MSR_FIX)) ? " SRSO_MSR_FIX" : ""); - - /* Compiled-in support which pertains to mitigations. */ - if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) || -@@ -1120,7 +1126,7 @@ static void __init div_calculations(bool hw_smt_enabled) - - static void __init ibpb_calculations(void) - { -- bool def_ibpb_entry = false; -+ bool def_ibpb_entry_pv = false, def_ibpb_entry_hvm = false; - - /* Check we have hardware IBPB support before using it... */ - if ( !boot_cpu_has(X86_FEATURE_IBRSB) && !boot_cpu_has(X86_FEATURE_IBPB) ) -@@ -1145,22 +1151,43 @@ static void __init ibpb_calculations(void) - * Confusion. Mitigate with IBPB-on-entry. - */ - if ( !boot_cpu_has(X86_FEATURE_BTC_NO) ) -- def_ibpb_entry = true; -+ def_ibpb_entry_pv = def_ibpb_entry_hvm = true; - - /* -- * Further to BTC, Zen3/4 CPUs suffer from Speculative Return Stack -- * Overflow in most configurations. Mitigate with IBPB-on-entry if we -- * have the microcode that makes this an effective option. -+ * In addition to BTC, Zen3 and later CPUs suffer from Speculative -+ * Return Stack Overflow in most configurations. If we have microcode -+ * that makes IBPB-on-entry an effective mitigation, see about using -+ * it. 
- */ - if ( !boot_cpu_has(X86_FEATURE_SRSO_NO) && - boot_cpu_has(X86_FEATURE_IBPB_BRTYPE) ) -- def_ibpb_entry = true; -+ { -+ /* -+ * SRSO_U/S_NO is a subset of SRSO_NO, identifying that SRSO isn't -+ * possible across the User (CPL3) / Supervisor (CPL<3) boundary. -+ * -+ * Ignoring PV32 (not security supported for speculative issues), -+ * this means we only need to use IBPB-on-entry for PV guests on -+ * hardware which doesn't enumerate SRSO_US_NO. -+ */ -+ if ( !boot_cpu_has(X86_FEATURE_SRSO_US_NO) ) -+ def_ibpb_entry_pv = true; -+ -+ /* -+ * SRSO_MSR_FIX enumerates that we can use MSR_BP_CFG.SPEC_REDUCE -+ * to mitigate SRSO across the host/guest boundary. We only need -+ * to use IBPB-on-entry for HVM guests if we haven't enabled this -+ * control. -+ */ -+ if ( !boot_cpu_has(X86_FEATURE_SRSO_MSR_FIX) || !opt_bp_spec_reduce ) -+ def_ibpb_entry_hvm = true; -+ } - } - - if ( opt_ibpb_entry_pv == -1 ) -- opt_ibpb_entry_pv = IS_ENABLED(CONFIG_PV) && def_ibpb_entry; -+ opt_ibpb_entry_pv = IS_ENABLED(CONFIG_PV) && def_ibpb_entry_pv; - if ( opt_ibpb_entry_hvm == -1 ) -- opt_ibpb_entry_hvm = IS_ENABLED(CONFIG_HVM) && def_ibpb_entry; -+ opt_ibpb_entry_hvm = IS_ENABLED(CONFIG_HVM) && def_ibpb_entry_hvm; - - if ( opt_ibpb_entry_pv ) - { -diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h -index d9eba5e9a714..9c98e4992861 100644 ---- a/xen/include/public/arch-x86/cpufeatureset.h -+++ b/xen/include/public/arch-x86/cpufeatureset.h -@@ -312,7 +312,9 @@ XEN_CPUFEATURE(FSRSC, 11*32+19) /*A Fast Short REP SCASB */ - XEN_CPUFEATURE(AMD_PREFETCHI, 11*32+20) /*A PREFETCHIT{0,1} Instructions */ - XEN_CPUFEATURE(SBPB, 11*32+27) /*A Selective Branch Predictor Barrier */ - XEN_CPUFEATURE(IBPB_BRTYPE, 11*32+28) /*A IBPB flushes Branch Type predictions too */ --XEN_CPUFEATURE(SRSO_NO, 11*32+29) /*A Hardware not vulenrable to Speculative Return Stack Overflow */ -+XEN_CPUFEATURE(SRSO_NO, 11*32+29) /*A Hardware not vulnerable to Speculative Return Stack Overflow */ -+XEN_CPUFEATURE(SRSO_US_NO, 11*32+30) /*A! Hardware not vulnerable to SRSO across the User/Supervisor boundary */ -+XEN_CPUFEATURE(SRSO_MSR_FIX, 11*32+31) /* MSR_BP_CFG.BP_SPEC_REDUCE available */ - - /* Intel-defined CPU features, CPUID level 0x00000007:1.ebx, word 12 */ - XEN_CPUFEATURE(INTEL_PPIN, 12*32+ 0) /* Protected Processor Inventory Number */ --- -2.46.0 - diff --git a/0306-x86-io-apic-prevent-early-exit-from-i8259-loop-detec.patch b/0306-x86-io-apic-prevent-early-exit-from-i8259-loop-detec.patch deleted file mode 100644 index 33000ba..0000000 --- a/0306-x86-io-apic-prevent-early-exit-from-i8259-loop-detec.patch +++ /dev/null @@ -1,83 +0,0 @@ -From d4c0b38763f75845693855d1ac419af94866eece Mon Sep 17 00:00:00 2001 -From: Roger Pau Monne -Date: Mon, 16 Dec 2024 19:33:29 +0100 -Subject: [PATCH 306/306] x86/io-apic: prevent early exit from i8259 loop - detection -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Avoid exiting early from the loop when a pin that could be connected to the -i8259 is found, as such early exit would leave the EOI handler translation -array only partially allocated and/or initialized. - -Otherwise on systems with multiple IO-APICs and an unmasked ExtINT pin on -any IO-APIC that's no the last one the following NULL pointer dereference -triggers: - -(XEN) Enabling APIC mode. 
Using 2 I/O APICs -(XEN) ----[ Xen-4.20-unstable x86_64 debug=y Not tainted ]---- -(XEN) CPU: 0 -(XEN) RIP: e008:[] __ioapic_write_entry+0x83/0x95 -[...] -(XEN) Xen call trace: -(XEN) [] R __ioapic_write_entry+0x83/0x95 -(XEN) [] F amd_iommu_ioapic_update_ire+0x1ea/0x273 -(XEN) [] F iommu_update_ire_from_apic+0xa/0xc -(XEN) [] F __ioapic_write_entry+0x93/0x95 -(XEN) [] F arch/x86/io_apic.c#clear_IO_APIC_pin+0x7c/0x10e -(XEN) [] F arch/x86/io_apic.c#clear_IO_APIC+0x2d/0x61 -(XEN) [] F enable_IO_APIC+0x2e3/0x34f -(XEN) [] F smp_prepare_cpus+0x254/0x27a -(XEN) [] F __start_xen+0x1ce1/0x23ae -(XEN) [] F __high_start+0x8e/0x90 -(XEN) -(XEN) Pagetable walk from 0000000000000000: -(XEN) L4[0x000] = 000000007dbfd063 ffffffffffffffff -(XEN) L3[0x000] = 000000007dbfa063 ffffffffffffffff -(XEN) L2[0x000] = 000000007dbcc063 ffffffffffffffff -(XEN) L1[0x000] = 0000000000000000 ffffffffffffffff -(XEN) -(XEN) **************************************** -(XEN) Panic on CPU 0: -(XEN) FATAL PAGE FAULT -(XEN) [error_code=0002] -(XEN) Faulting linear address: 0000000000000000 -(XEN) **************************************** -(XEN) -(XEN) Reboot in five seconds... - -Reported-by: Sergii Dmytruk -Fixes: 86001b3970fe ('x86/io-apic: fix directed EOI when using AMD-Vi interrupt remapping') -Signed-off-by: Roger Pau Monné -Reviewed-by: Jan Beulich -(cherry picked from commit f38fd27c4ceadf7ec4e82e82d0731b6ea415c51e) ---- - xen/arch/x86/io_apic.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/xen/arch/x86/io_apic.c b/xen/arch/x86/io_apic.c -index d2a313c4ac72..f8b2aad9cba5 100644 ---- a/xen/arch/x86/io_apic.c -+++ b/xen/arch/x86/io_apic.c -@@ -1307,14 +1307,14 @@ void __init enable_IO_APIC(void) - /* If the interrupt line is enabled and in ExtInt mode - * I have found the pin where the i8259 is connected. - */ -- if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { -+ if ( ioapic_i8259.pin == -1 && entry.mask == 0 && -+ entry.delivery_mode == dest_ExtINT ) -+ { - ioapic_i8259.apic = apic; - ioapic_i8259.pin = pin; -- goto found_i8259; - } - } - } -- found_i8259: - /* Look to see what if the MP table has reported the ExtINT */ - /* If we could not find the appropriate pin by looking at the ioapic - * the i8259 probably is not connected the ioapic but give the --- -2.46.0 - diff --git a/0604-x86-time-Don-t-use-EFI-s-GetTime-call-by-default.patch b/0604-x86-time-Don-t-use-EFI-s-GetTime-call-by-default.patch index 15ee9ff..ff1dc3f 100644 --- a/0604-x86-time-Don-t-use-EFI-s-GetTime-call-by-default.patch +++ b/0604-x86-time-Don-t-use-EFI-s-GetTime-call-by-default.patch @@ -19,34 +19,30 @@ diff --git a/xen/arch/x86/time.c b/xen/arch/x86/time.c index a97d78484105..45f6baf6270b 100644 --- a/xen/arch/x86/time.c +++ b/xen/arch/x86/time.c -@@ -1245,20 +1245,25 @@ static void __get_cmos_time(struct rtc_time *rtc) - rtc->year += 100; +@@ -1552,6 +1552,9 @@ static const char *__init wallclock_type + return ""; } - + +/* EFI's GetTime() is frequently broken so don't use it by default. 
*/ +#undef USE_EFI_GET_TIME + - static unsigned long get_cmos_time(void) + static void __init probe_wallclock(void) { -- unsigned long res, flags; -+ unsigned long flags; - struct rtc_time rtc; - unsigned int seconds = 60; - static bool __read_mostly cmos_rtc_probe; - boolean_param("cmos-rtc-probe", cmos_rtc_probe); - + ASSERT(wallclock_source == WALLCLOCK_UNSET); +@@ -1561,11 +1564,13 @@ static void __init probe_wallclock(void) + wallclock_source = WALLCLOCK_XEN; + return; + } +#ifdef USE_EFI_GET_TIME - if ( efi_enabled(EFI_RS) ) + if ( efi_enabled(EFI_RS) && efi_get_time() ) { -- res = efi_get_time(); -+ unsigned long res = efi_get_time(); - if ( res ) - return res; + wallclock_source = WALLCLOCK_EFI; + return; } +#endif - - if ( likely(!(acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC)) ) - cmos_rtc_probe = false; + if ( cmos_rtc_probe() ) + { + wallclock_source = WALLCLOCK_CMOS; -- 2.44.0 diff --git a/0628-x86-mwait-idle-Use-ACPI-for-CPUs-without-hardcoded-C.patch b/0628-x86-mwait-idle-Use-ACPI-for-CPUs-without-hardcoded-C.patch index b108088..5bcf416 100644 --- a/0628-x86-mwait-idle-Use-ACPI-for-CPUs-without-hardcoded-C.patch +++ b/0628-x86-mwait-idle-Use-ACPI-for-CPUs-without-hardcoded-C.patch @@ -312,15 +312,18 @@ diff --git a/xen/arch/x86/include/asm/cpuidle.h b/xen/arch/x86/include/asm/cpuid index 707b3e948d45..3f5cd40fd596 100644 --- a/xen/arch/x86/include/asm/cpuidle.h +++ b/xen/arch/x86/include/asm/cpuidle.h -@@ -15,7 +15,7 @@ extern void (*lapic_timer_on)(void); - +@@ -15,9 +15,9 @@ extern void (*lapic_timer_on)(void); extern uint64_t (*cpuidle_get_tick)(void); + #ifdef CONFIG_INTEL -int mwait_idle_init(struct notifier_block *nfb); -+int mwait_idle_init(struct notifier_block *nfb, bool); - int cpuidle_init_cpu(unsigned int cpu); - void cf_check default_dead_idle(void); - void cf_check acpi_dead_idle(void); ++int mwait_idle_init(struct notifier_block *nfb, bool from_acpi); + #else +-static inline int mwait_idle_init(struct notifier_block *nfb) ++static inline int mwait_idle_init(struct notifier_block *nfb, bool from_acpi) + { + return -ENODEV; + } -- 2.44.0 diff --git a/0630-tools-xg-increase-LZMA_BLOCK_SIZE-for-uncompressing-.patch b/0630-tools-xg-increase-LZMA_BLOCK_SIZE-for-uncompressing-.patch deleted file mode 100644 index 18aa820..0000000 --- a/0630-tools-xg-increase-LZMA_BLOCK_SIZE-for-uncompressing-.patch +++ /dev/null @@ -1,62 +0,0 @@ -From b3262b7069a51e460a9f044eec4fc5e2e5758db2 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= - -Date: Tue, 8 Oct 2024 23:24:31 +0200 -Subject: [PATCH] tools/xg: increase LZMA_BLOCK_SIZE for uncompressing the - kernel -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Linux 6.12-rc2 fails to decompress with the current 128MiB, contrary to -the code comment. It results in a failure like this: - - domainbuilder: detail: xc_dom_kernel_file: filename="/var/lib/qubes/vm-kernels/6.12-rc2-1.1.fc37/vmlinuz" - domainbuilder: detail: xc_dom_malloc_filemap : 12104 kB - domainbuilder: detail: xc_dom_module_file: filename="/var/lib/qubes/vm-kernels/6.12-rc2-1.1.fc37/initramfs" - domainbuilder: detail: xc_dom_malloc_filemap : 7711 kB - domainbuilder: detail: xc_dom_boot_xen_init: ver 4.19, caps xen-3.0-x86_64 hvm-3.0-x86_32 hvm-3.0-x86_32p hvm-3.0-x86_64 - domainbuilder: detail: xc_dom_parse_image: called - domainbuilder: detail: xc_dom_find_loader: trying multiboot-binary loader ... 
- domainbuilder: detail: loader probe failed - domainbuilder: detail: xc_dom_find_loader: trying HVM-generic loader ... - domainbuilder: detail: loader probe failed - domainbuilder: detail: xc_dom_find_loader: trying Linux bzImage loader ... - domainbuilder: detail: _xc_try_lzma_decode: XZ decompression error: Memory usage limit reached - xc: error: panic: xg_dom_bzimageloader.c:761: xc_dom_probe_bzimage_kernel unable to XZ decompress kernel: Invalid kernel - domainbuilder: detail: loader probe failed - domainbuilder: detail: xc_dom_find_loader: trying ELF-generic loader ... - domainbuilder: detail: loader probe failed - xc: error: panic: xg_dom_core.c:689: xc_dom_find_loader: no loader found: Invalid kernel - libxl: error: libxl_dom.c:566:libxl__build_dom: xc_dom_parse_image failed - -The important part: XZ decompression error: Memory usage limit reached - -This looks to be related to the following change in Linux: -8653c909922743bceb4800e5cc26087208c9e0e6 ("xz: use 128 MiB dictionary and force single-threaded mode") - -Fix this by increasing the block size to 256MiB. And remove the -misleading comment (from lack of better ideas). - -Signed-off-by: Marek Marczykowski-Górecki ---- - tools/libs/guest/xg_dom_bzimageloader.c | 3 +-- - 1 file changed, 1 insertion(+), 2 deletions(-) - -diff --git a/tools/libs/guest/xg_dom_bzimageloader.c b/tools/libs/guest/xg_dom_bzimageloader.c -index c6ee6d83e7c6..1fb4e5a1f728 100644 ---- a/tools/libs/guest/xg_dom_bzimageloader.c -+++ b/tools/libs/guest/xg_dom_bzimageloader.c -@@ -272,8 +272,7 @@ static int _xc_try_lzma_decode( - return retval; - } - --/* 128 Mb is the minimum size (half-way) documented to work for all inputs. */ --#define LZMA_BLOCK_SIZE (128*1024*1024) -+#define LZMA_BLOCK_SIZE (256*1024*1024) - - static int xc_try_xz_decode( - struct xc_dom_image *dom, void **blob, size_t *size) --- -2.46.0 - diff --git a/1100-Define-build-dates-time-based-on-SOURCE_DATE_EPOCH.patch b/1100-Define-build-dates-time-based-on-SOURCE_DATE_EPOCH.patch index 9fdac44..9cfe865 100644 --- a/1100-Define-build-dates-time-based-on-SOURCE_DATE_EPOCH.patch +++ b/1100-Define-build-dates-time-based-on-SOURCE_DATE_EPOCH.patch @@ -55,7 +55,7 @@ index 84cba171cd6b..d94df1cd88b6 100644 -SMBIOS_REL_DATE ?= $(shell date +%m/%d/%Y) +SMBIOS_REL_DATE ?= $(shell date $(DATE_EPOCH_OPTS) +%m/%d/%Y) - CFLAGS += $(CFLAGS_xeninclude) -fno-pic + CFLAGS += $(CFLAGS_xeninclude) -fno-pic -mregparm=3 diff --git a/tools/firmware/vgabios/Makefile b/tools/firmware/vgabios/Makefile index 3284812fdec8..4350ef402127 100644 diff --git a/version b/version index 1fc0e81..554b652 100644 --- a/version +++ b/version @@ -1 +1 @@ -4.19.1 +4.20.0-rc0 diff --git a/xen.spec.in b/xen.spec.in index 0d4e672..e50a8d0 100644 --- a/xen.spec.in +++ b/xen.spec.in @@ -97,11 +97,6 @@ Patch0202: 0202-Add-xen.cfg-options-for-mapbs-and-noexitboot.patch Patch0203: 0203-xen.efi.build.patch # Backports (300+) -Patch0300: 0300-xen-list-add-LIST_HEAD_RO_AFTER_INIT.patch -Patch0301: 0301-x86-mm-add-API-for-marking-only-part-of-a-MMIO-page-.patch -Patch0302: 0302-drivers-char-Use-sub-page-ro-API-to-make-just-xhci-d.patch -Patch0305: 0305-x86-spec-ctrl-Support-for-SRSO_U-S_NO-and-SRSO_MSR_F.patch -Patch0306: 0306-x86-io-apic-prevent-early-exit-from-i8259-loop-detec.patch # Security fixes (500+) @@ -141,8 +136,6 @@ Patch0627: 0627-x86-msr-Allow-hardware-domain-to-read-package-C-stat.patch Patch0628: 0628-x86-mwait-idle-Use-ACPI-for-CPUs-without-hardcoded-C.patch Patch0629: 0629-libxl_pci-Pass-power_mgmt-via-QMP.patch -Patch0630: 
0630-tools-xg-increase-LZMA_BLOCK_SIZE-for-uncompressing-.patch - # Qubes specific patches Patch1000: 1000-Do-not-access-network-during-the-build.patch Patch1001: 1001-hotplug-store-block-params-for-cleanup.patch From aeb98de6b91abd026f86c49f179dc0445c2f189c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= Date: Wed, 8 Jan 2025 15:48:28 +0100 Subject: [PATCH 3/6] Fix build with Fedora-provided cflags See patch description for details. This solves similar problem as https://github.com/TrenchBoot/xen/pull/19/commits/11aee1127ff14acb08d066a3a123760ddcbbe8c3 --- ...text-gap-diff-to-work-with-64-bytes-.patch | 44 +++++++++++++++++++ xen.spec.in | 2 + 2 files changed, 46 insertions(+) create mode 100644 0630-x86-boot-adjust-text-gap-diff-to-work-with-64-bytes-.patch diff --git a/0630-x86-boot-adjust-text-gap-diff-to-work-with-64-bytes-.patch b/0630-x86-boot-adjust-text-gap-diff-to-work-with-64-bytes-.patch new file mode 100644 index 0000000..bf4fd4c --- /dev/null +++ b/0630-x86-boot-adjust-text-gap-diff-to-work-with-64-bytes-.patch @@ -0,0 +1,44 @@ +From 77d5991de867a1b2d694147958d77c51a2b989ae Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= + +Date: Wed, 8 Jan 2025 15:37:13 +0100 +Subject: [PATCH] x86/boot: adjust text gap/diff to work with 64-bytes + alignment too +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Xen compiled with -mtune=generic has .text alignment set to 64-bytes. +Setting text_diff to non-64-bytes-aligned number breaks stuff: + + Traceback (most recent call last): + File "/builddir/build/BUILD/xen-4.20.0-build/xen-4.20.0-rc0/xen/./tools/combine_two_binaries.py", line 96, in + raise Exception('File sizes do not match') + Exception: File sizes do not match: 70160 != 4080 + 66048 + +Adjust the numbers as suggested by Frediano to work with 64-bytes and +even 128-bytes alignment. + +Signed-off-by: Marek Marczykowski-Górecki +--- + xen/arch/x86/boot/Makefile | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/xen/arch/x86/boot/Makefile b/xen/arch/x86/boot/Makefile +index d45787665907..80c32163fbbd 100644 +--- a/xen/arch/x86/boot/Makefile ++++ b/xen/arch/x86/boot/Makefile +@@ -40,8 +40,8 @@ LD32 := $(LD) $(subst x86_64,i386,$(LDFLAGS_DIRECT)) + # are affected by both text_diff and text_gap. Ensure the sum of gap and diff + # is greater than 2^16 so that any 16bit relocations if present in the object + # file turns into a build-time error. 
+-text_gap := 0x010200 +-text_diff := 0x408020 ++text_gap := 0x010240 ++text_diff := 0x608040 + + $(obj)/build32.base.lds: AFLAGS-y += -DGAP=$(text_gap) -DTEXT_DIFF=$(text_diff) + $(obj)/build32.offset.lds: AFLAGS-y += -DGAP=$(text_gap) -DTEXT_DIFF=$(text_diff) -DAPPLY_OFFSET +-- +2.46.0 + diff --git a/xen.spec.in b/xen.spec.in index e50a8d0..8252188 100644 --- a/xen.spec.in +++ b/xen.spec.in @@ -136,6 +136,8 @@ Patch0627: 0627-x86-msr-Allow-hardware-domain-to-read-package-C-stat.patch Patch0628: 0628-x86-mwait-idle-Use-ACPI-for-CPUs-without-hardcoded-C.patch Patch0629: 0629-libxl_pci-Pass-power_mgmt-via-QMP.patch +Patch0630: 0630-x86-boot-adjust-text-gap-diff-to-work-with-64-bytes-.patch + # Qubes specific patches Patch1000: 1000-Do-not-access-network-during-the-build.patch Patch1001: 1001-hotplug-store-block-params-for-cleanup.patch From 03d4c203a2a40bfa12835a27699560efec1eea48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= Date: Wed, 8 Jan 2025 21:09:36 +0100 Subject: [PATCH 4/6] Apply PV-IOMMU series --- ...s-Add-a-design-document-for-PV-IOMMU.patch | 140 + ...-a-design-document-for-IOMMU-subsyst.patch | 437 +++ ...Introduce-redesigned-IOMMU-subsystem.patch | 1904 +++++++++++ ...d-Port-IOMMU-driver-to-new-subsystem.patch | 2965 +++++++++++++++++ ...troduce-PV-IOMMU-hypercall-interface.patch | 965 ++++++ xen.spec.in | 6 + 6 files changed, 6417 insertions(+) create mode 100644 0400-docs-designs-Add-a-design-document-for-PV-IOMMU.patch create mode 100644 0401-docs-designs-Add-a-design-document-for-IOMMU-subsyst.patch create mode 100644 0402-IOMMU-Introduce-redesigned-IOMMU-subsystem.patch create mode 100644 0403-VT-d-Port-IOMMU-driver-to-new-subsystem.patch create mode 100644 0404-xen-public-Introduce-PV-IOMMU-hypercall-interface.patch diff --git a/0400-docs-designs-Add-a-design-document-for-PV-IOMMU.patch b/0400-docs-designs-Add-a-design-document-for-PV-IOMMU.patch new file mode 100644 index 0000000..3ca961b --- /dev/null +++ b/0400-docs-designs-Add-a-design-document-for-PV-IOMMU.patch @@ -0,0 +1,140 @@ +From 0b9a21e9fba41427921031f97346ead3f6b2a8d6 Mon Sep 17 00:00:00 2001 +From: Teddy Astie +Date: Mon, 4 Nov 2024 14:28:38 +0000 +Subject: [PATCH 400/404] docs/designs: Add a design document for PV-IOMMU + +Some operating systems want to use IOMMU to implement various features (e.g +VFIO) or DMA protection. +This patch introduce a proposal for IOMMU paravirtualization for Dom0. + +Signed-off-by Teddy Astie +--- + docs/designs/pv-iommu.md | 116 +++++++++++++++++++++++++++++++++++++++ + 1 file changed, 116 insertions(+) + create mode 100644 docs/designs/pv-iommu.md + +diff --git a/docs/designs/pv-iommu.md b/docs/designs/pv-iommu.md +new file mode 100644 +index 000000000000..7df9fa0b9489 +--- /dev/null ++++ b/docs/designs/pv-iommu.md +@@ -0,0 +1,116 @@ ++# IOMMU paravirtualization for Dom0 ++ ++Status: Experimental ++ ++# Background ++ ++By default, Xen only uses the IOMMU for itself, either to make device adress ++space coherent with guest adress space (x86 HVM/PVH) or to prevent devices ++from doing DMA outside it's expected memory regions including the hypervisor ++(x86 PV). ++ ++A limitation is that guests (especially privildged ones) may want to use ++IOMMU hardware in order to implement features such as DMA protection and ++VFIO [1] as IOMMU functionality is not available outside of the hypervisor ++currently. 
++
++[1] VFIO - "Virtual Function I/O" - https://www.kernel.org/doc/html/latest/driver-api/vfio.html
++
++# Design
++
++The operating system may want access to various IOMMU features such as
++context management and DMA remapping. We can create a new hypercall that gives
++the guest access to a paravirtualized IOMMU interface.
++
++This feature is only meant to be available for Dom0: DomUs have emulated
++devices that are not real hardware and can't be managed on the Xen side, so we
++can't rely on the hardware IOMMU to enforce DMA remapping for them.
++
++This interface is exposed under the `iommu_op` hypercall.
++
++In addition, Xen domains are modified in order to allow several IOMMU contexts
++to exist, including a default one that implements the default behavior (e.g.
++hardware-assisted paging) and can't be modified by the guest. DomUs cannot have
++extra contexts, and therefore act as if they only have the default context.
++
++Each IOMMU context within a Xen domain is identified using a domain-specific
++context number that is used in the Xen IOMMU subsystem and the hypercall
++interface.
++
++The number of IOMMU contexts a domain can use is specified by either the
++toolstack or the domain itself.
++
++# IOMMU operations
++
++## Initialize PV-IOMMU
++
++Initialize PV-IOMMU for the domain.
++It can only be called once.
++
++## Alloc context
++
++Create a new IOMMU context for the guest and return the context number to the
++guest.
++Fail if the IOMMU context limit of the guest is reached.
++
++A flag can be specified to create an identity mapping.
++
++## Free context
++
++Destroy an IOMMU context created previously.
++It is not possible to free the default context.
++
++Reattach the context's devices to the default context if requested by the
++guest.
++
++Fail if there is a device in the context and the reattach-to-default flag is
++not specified.
++
++## Reattach device
++
++Reattach a device to another IOMMU context (including the default one).
++The target IOMMU context number must be valid and the context allocated.
++
++The guest needs to specify the PCI SBDF of a device it has access to.
++
++## Map/unmap page
++
++Map/unmap a page on a context.
++The guest needs to specify a gfn and target dfn to map.
++
++Refuse to create the mapping if one already exists for the same dfn.
++
++## Lookup page
++
++Get the gfn mapped by a specific dfn.
++
++## Remote command
++
++Make a PV-IOMMU operation on behalf of another domain.
++Especially useful for implementing IOMMU emulation (e.g. using QEMU)
++or initializing PV-IOMMU with enforced limits.
++
++# Implementation considerations
++
++## Hypercall batching
++
++In order to avoid unneeded hypercalls and IOMMU flushes, it is advisable to
++be able to batch some critical IOMMU operations (e.g. mapping/unmapping
++multiple pages).
++
++## Hardware without IOMMU support
++
++The operating system needs to be aware of the PV-IOMMU capability, and of
++whether it is able to create contexts. Some operating systems may critically
++fail if they are unable to create a new IOMMU context, which is expected to
++happen when no IOMMU hardware is available.
++
++The hypercall interface needs a way to advertise the ability to create and
++manage IOMMU contexts, including the number of contexts the guest is able to
++use. Using this information, Dom0 may decide whether or not to use the
++PV-IOMMU interface.
++
++## Page pool for contexts
++
++In order to prevent a buggy Dom0 from unexpectedly starving the hypervisor of
++memory, a bounded, preallocated page pool can be used (see the sketch below).
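++
++A minimal sketch of the idea, using illustrative names only (the series itself
++sizes a per-domain page arena at PV-IOMMU initialization time):
++
++```c
++/* Sketch only: helper names are illustrative, not the actual implementation. */
++struct ctx_page_pool {
++    struct page_list_head free_pages; /* filled once, at PV-IOMMU init */
++    unsigned long nr_free;
++};
++
++/* Reserve 2^order pages up front, when PV-IOMMU is initialized. */
++static int ctx_pool_init(struct ctx_page_pool *pool, unsigned int order)
++{
++    unsigned long i;
++
++    INIT_PAGE_LIST_HEAD(&pool->free_pages);
++
++    for ( i = 0; i < (1UL << order); i++ )
++    {
++        struct page_info *pg = alloc_domheap_page(NULL, 0);
++
++        if ( !pg )
++            return -ENOMEM;
++
++        page_list_add_tail(pg, &pool->free_pages);
++        pool->nr_free++;
++    }
++
++    return 0;
++}
++
++/* Page-table pages for non-default contexts never come from the heap. */
++static struct page_info *ctx_pool_alloc(struct ctx_page_pool *pool)
++{
++    struct page_info *pg = page_list_remove_head(&pool->free_pages);
++
++    if ( pg )
++        pool->nr_free--;
++
++    /* NULL surfaces as -ENOMEM to the guest; Xen's own memory is untouched. */
++    return pg;
++}
++```
++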
We can preallocate the pages the contexts will use and make ++map/unmap use these pages instead of allocating them dynamically. ++ +-- +2.46.0 + diff --git a/0401-docs-designs-Add-a-design-document-for-IOMMU-subsyst.patch b/0401-docs-designs-Add-a-design-document-for-IOMMU-subsyst.patch new file mode 100644 index 0000000..d4acbab --- /dev/null +++ b/0401-docs-designs-Add-a-design-document-for-IOMMU-subsyst.patch @@ -0,0 +1,437 @@ +From e68760dd296108259247af8ad218200af830c324 Mon Sep 17 00:00:00 2001 +From: Teddy Astie +Date: Mon, 4 Nov 2024 14:28:38 +0000 +Subject: [PATCH 401/404] docs/designs: Add a design document for IOMMU + subsystem redesign + +Current IOMMU subsystem has some limitations that make PV-IOMMU practically impossible. +One of them is the assumtion that each domain is bound to a single "IOMMU domain", which +also causes complications with quarantine implementation. + +Moreover, current IOMMU subsystem is not entirely well-defined, for instance, the behavior +of map_page between ARM SMMUv3 and x86 VT-d/AMD-Vi greatly differs. On ARM, it can modifies +the domain page table while on x86, it may be forbidden (e.g using HAP with PVH), or only +modifying the devices PoV (e.g using PV). + +The goal of this redesign is to define more explicitely the behavior and interface of the +IOMMU subsystem while allowing PV-IOMMU to be effectively implemented. + +Signed-off-by Teddy Astie +--- + docs/designs/iommu-contexts.md | 403 +++++++++++++++++++++++++++++++++ + 1 file changed, 403 insertions(+) + create mode 100644 docs/designs/iommu-contexts.md + +diff --git a/docs/designs/iommu-contexts.md b/docs/designs/iommu-contexts.md +new file mode 100644 +index 000000000000..9d6fb9554953 +--- /dev/null ++++ b/docs/designs/iommu-contexts.md +@@ -0,0 +1,403 @@ ++# IOMMU context management in Xen ++ ++Status: Experimental ++Revision: 0 ++ ++# Background ++ ++The design for *IOMMU paravirtualization for Dom0* [1] explains that some guests may ++want to access to IOMMU features. In order to implement this in Xen, several adjustments ++needs to be made to the IOMMU subsystem. ++ ++This "hardware IOMMU domain" is currently implemented on a per-domain basis such as each ++domain actually has a specific *hardware IOMMU domain*, this design aims to allow a ++single Xen domain to manage several "IOMMU context", and allow some domains (e.g Dom0 ++[1]) to modify their IOMMU contexts. ++ ++In addition to this, quarantine feature can be refactored into using IOMMU contexts ++to reduce the complexity of platform-specific implementations and ensuring more ++consistency across platforms. ++ ++# IOMMU context ++ ++We define a "IOMMU context" as being a *hardware IOMMU domain*, but named as a context ++to avoid confusion with Xen domains. ++It represents some hardware-specific data structure that contains mappings from a device ++frame-number to a machine frame-number (e.g using a pagetable) that can be applied to ++a device using IOMMU hardware. ++ ++This structure is bound to a Xen domain, but a Xen domain may have several IOMMU context. ++These contexts may be modifiable using the interface as defined in [1] aside some ++specific cases (e.g modifying default context). ++ ++This is implemented in Xen as a new structure that will hold context-specific ++data. 
++ ++```c ++struct iommu_context { ++ u16 id; /* Context id (0 means default context) */ ++ struct list_head devices; ++ ++ struct arch_iommu_context arch; ++ ++ bool opaque; /* context can't be modified nor accessed (e.g HAP) */ ++}; ++``` ++ ++A context is identified by a number that is domain-specific and may be used by IOMMU ++users such as PV-IOMMU by the guest. ++ ++struct arch_iommu_context is splited from struct arch_iommu ++ ++```c ++struct arch_iommu_context ++{ ++ spinlock_t pgtables_lock; ++ struct page_list_head pgtables; ++ ++ union { ++ /* Intel VT-d */ ++ struct { ++ uint64_t pgd_maddr; /* io page directory machine address */ ++ domid_t *didmap; /* per-iommu DID */ ++ unsigned long *iommu_bitmap; /* bitmap of iommu(s) that the context uses */ ++ } vtd; ++ /* AMD IOMMU */ ++ struct { ++ struct page_info *root_table; ++ } amd; ++ }; ++}; ++ ++struct arch_iommu ++{ ++ spinlock_t mapping_lock; /* io page table lock */ ++ struct { ++ struct page_list_head list; ++ spinlock_t lock; ++ } pgtables; ++ ++ struct list_head identity_maps; ++ ++ union { ++ /* Intel VT-d */ ++ struct { ++ /* no more context-specific values */ ++ unsigned int agaw; /* adjusted guest address width, 0 is level 2 30-bit */ ++ } vtd; ++ /* AMD IOMMU */ ++ struct { ++ unsigned int paging_mode; ++ struct guest_iommu *g_iommu; ++ } amd; ++ }; ++}; ++``` ++ ++IOMMU context information is now carried by iommu_context rather than being integrated to ++struct arch_iommu. ++ ++# Xen domain IOMMU structure ++ ++`struct domain_iommu` is modified to allow multiples context within a single Xen domain ++to exist : ++ ++```c ++struct iommu_context_list { ++ uint16_t count; /* Context count excluding default context */ ++ ++ /* if count > 0 */ ++ ++ uint64_t *bitmap; /* bitmap of context allocation */ ++ struct iommu_context *map; /* Map of contexts */ ++}; ++ ++struct domain_iommu { ++ /* ... */ ++ ++ struct iommu_context default_ctx; ++ struct iommu_context_list other_contexts; ++ ++ /* ... */ ++} ++``` ++ ++default_ctx is a special context with id=0 that holds the page table mapping the entire ++domain, which basically preserve the previous behavior. All devices are expected to be ++bound to this context during initialization. ++ ++Along with this default context that always exist, we use a pool of contexts that has a ++fixed size at domain initialization, where contexts can be allocated (if possible), and ++have a id matching their position in the map (considering that id != 0). ++These contexts may be used by IOMMU contexts users such as PV-IOMMU or quarantine domain ++(DomIO). ++ ++# Platform independent context management interface ++ ++A new platform independant interface is introduced in Xen hypervisor to allow ++IOMMU contexts users to create and manage contexts within domains. ++ ++```c ++/* Direct context access functions (not supposed to be used directly) */ ++struct iommu_context *iommu_get_context(struct domain *d, u16 ctx_no); ++void iommu_put_context(struct iommu_context *ctx); ++ ++/* Flag for default context initialization */ ++#define IOMMU_CONTEXT_INIT_default (1 << 0) ++ ++/* Flag for quarantine contexts (scratch page, DMA Abort mode, ...) 
*/ ++#define IOMMU_CONTEXT_INIT_quarantine (1 << 1) ++ ++int iommu_context_init(struct domain *d, struct iommu_context *ctx, u16 ctx_no, u32 flags); ++ ++/* Flag to specify that devices will need to be reattached to default domain */ ++#define IOMMU_TEARDOWN_REATTACH_DEFAULT (1 << 0) ++ ++/* ++ * Flag to specify that the context needs to be destroyed preemptively ++ * (multiple calls to iommu_context_teardown will be required) ++ */ ++#define IOMMU_TEARDOWN_PREEMPT (1 << 1) ++ ++int iommu_context_teardown(struct domain *d, struct iommu_context *ctx, u32 flags); ++ ++/* Allocate a new context, uses CONTEXT_INIT flags */ ++int iommu_context_alloc(struct domain *d, u16 *ctx_no, u32 flags); ++ ++/* Free a context, uses CONTEXT_TEARDOWN flags */ ++int iommu_context_free(struct domain *d, u16 ctx_no, u32 flags); ++ ++/* Move a device from one context to another, including between different domains. */ ++int iommu_reattach_context(struct domain *prev_dom, struct domain *next_dom, ++ device_t *dev, u16 ctx_no); ++ ++/* Add a device to a context for first initialization */ ++int iommu_attach_context(struct domain *d, device_t *dev, u16 ctx_no); ++ ++/* Remove a device from a context, effectively removing it from the IOMMU. */ ++int iommu_detach_context(struct domain *d, device_t *dev); ++``` ++ ++This interface will use a new interface with drivers to implement these features. ++ ++Some existing functions will have a new parameter to specify on what context to do the operation. ++- iommu_map (iommu_legacy_map untouched) ++- iommu_unmap (iommu_legacy_unmap untouched) ++- iommu_lookup_page ++- iommu_iotlb_flush ++ ++These functions will modify the iommu_context structure to accomodate with the ++operations applied, these functions will be used to replace some operations previously ++made in the IOMMU driver. ++ ++# IOMMU platform_ops interface changes ++ ++The IOMMU driver needs to expose a way to create and manage IOMMU contexts, the approach ++taken here is to modify the interface to allow specifying a IOMMU context on operations, ++and at the same time, simplifying the interface by relying more on iommu ++platform-independent code. ++ ++Added functions in iommu_ops ++ ++```c ++/* Initialize a context (creating page tables, allocating hardware, structures, ...) */ ++int (*context_init)(struct domain *d, struct iommu_context *ctx, ++ u32 flags); ++/* Destroy a context, assumes no device is bound to the context. */ ++int (*context_teardown)(struct domain *d, struct iommu_context *ctx, ++ u32 flags); ++/* Put a device in a context (assumes the device is not attached to another context) */ ++int (*attach)(struct domain *d, device_t *dev, ++ struct iommu_context *ctx); ++/* Remove a device from a context, and from the IOMMU. */ ++int (*detach)(struct domain *d, device_t *dev, ++ struct iommu_context *prev_ctx); ++/* Move the device from a context to another, including if the new context is in ++ another domain. d corresponds to the target domain. */ ++int (*reattach)(struct domain *d, device_t *dev, ++ struct iommu_context *prev_ctx, ++ struct iommu_context *ctx); ++ ++#ifdef CONFIG_HAS_PCI ++/* Specific interface for phantom function devices. */ ++int (*add_devfn)(struct domain *d, struct pci_dev *pdev, u16 devfn, ++ struct iommu_context *ctx); ++int (*remove_devfn)(struct domain *d, struct pci_dev *pdev, u16 devfn, ++ struct iommu_context *ctx); ++#endif ++ ++/* Changes in existing to use a specified iommu_context. 
*/ ++int __must_check (*map_page)(struct domain *d, dfn_t dfn, mfn_t mfn, ++ unsigned int flags, ++ unsigned int *flush_flags, ++ struct iommu_context *ctx); ++int __must_check (*unmap_page)(struct domain *d, dfn_t dfn, ++ unsigned int order, ++ unsigned int *flush_flags, ++ struct iommu_context *ctx); ++int __must_check (*lookup_page)(struct domain *d, dfn_t dfn, mfn_t *mfn, ++ unsigned int *flags, ++ struct iommu_context *ctx); ++ ++int __must_check (*iotlb_flush)(struct domain *d, ++ struct iommu_context *ctx, dfn_t dfn, ++ unsigned long page_count, ++ unsigned int flush_flags); ++ ++void (*clear_root_pgtable)(struct domain *d, struct iommu_context *ctx); ++``` ++ ++These functions are redundant with existing functions, therefore, the following functions ++are replaced with new equivalents : ++- quarantine_init : platform-independent code and IOMMU_CONTEXT_INIT_quarantine flag ++- add_device : attach and add_devfn (phantom) ++- assign_device : attach and add_devfn (phantom) ++- remove_device : detach and remove_devfn (phantom) ++- reassign_device : reattach ++ ++Some functionnal differences with previous functions, the following should be handled ++by platform-independent/arch-specific code instead of IOMMU driver : ++- identity mappings (unity mappings and rmrr) ++- device list in context and domain ++- domain of a device ++- quarantine ++ ++The idea behind this is to implement IOMMU context features while simplifying IOMMU ++drivers implementations and ensuring more consistency between IOMMU drivers. ++ ++## Phantom function handling ++ ++PCI devices may use additionnal devfn to do DMA operations, in order to support such ++devices, an interface is added to map specific device functions without implying that ++the device is mapped to a new context (that may cause duplicates in Xen data structures). ++ ++Functions add_devfn and remove_devfn allows to map a iommu context on specific devfn ++for a pci device, without altering platform-independent data structures. ++ ++It is important for the reattach operation to care about these devices, in order ++to prevent devices from being partially reattached to the new context (see XSA-449 [2]) ++by using a all-or-nothing approach for reattaching such devices. ++ ++# Quarantine refactoring using IOMMU contexts ++ ++The quarantine mecanism can be entirely reimplemented using IOMMU context, making ++it simpler, more consistent between platforms, ++ ++Quarantine is currently only supported with x86 platforms and works by creating a ++single *hardware IOMMU domain* per quarantined device. All the quarantine logic is ++the implemented in a platform-specific fashion while actually implementing the same ++concepts : ++ ++The *hardware IOMMU context* data structures for quarantine are currently stored in ++the device structure itself (using arch_pci_dev) and IOMMU driver needs to care about ++whether we are dealing with quarantine operations or regular operations (often dealt ++using macros such as QUARANTINE_SKIP or DEVICE_PGTABLE). ++ ++The page table that will apply on the quarantined device is created reserved device ++regions, and adding mappings to a scratch page if enabled (quarantine=scratch-page). ++ ++A new approach we can use is allowing the quarantine domain (DomIO) to manage IOMMU ++contexts, and implement all the quarantine logic using IOMMU contexts. ++ ++That way, the quarantine implementation can be platform-independent, thus have a more ++consistent implementation between platforms. 
It will also allows quarantine to work ++with other IOMMU implementations without having to implement platform-specific behavior. ++Moreover, quarantine operations can be implemented using regular context operations ++instead of relying on driver-specific code. ++ ++Quarantine implementation can be summarised as ++ ++```c ++int iommu_quarantine_dev_init(device_t *dev) ++{ ++ int ret; ++ u16 ctx_no; ++ ++ if ( !iommu_quarantine ) ++ return -EINVAL; ++ ++ ret = iommu_context_alloc(dom_io, &ctx_no, IOMMU_CONTEXT_INIT_quarantine); ++ ++ if ( ret ) ++ return ret; ++ ++ /** TODO: Setup scratch page, mappings... */ ++ ++ ret = iommu_reattach_context(dev->domain, dom_io, dev, ctx_no); ++ ++ if ( ret ) ++ { ++ ASSERT(!iommu_context_free(dom_io, ctx_no, 0)); ++ return ret; ++ } ++ ++ return ret; ++} ++``` ++ ++# Platform-specific considerations ++ ++## Reference counters on target pages ++ ++When mapping a guest page onto a IOMMU context, we need to make sure that ++this page is not reused for something else while being actually referenced ++by a IOMMU context. One way of doing it is incrementing the reference counter ++of each target page we map (excluding reserved regions), and decrementing it ++when the mapping isn't used anymore. ++ ++One consideration to have is when destroying the context while having existing ++mappings in it. We can walk through the entire page table and decrement the ++reference counter of all mappings. All of that assumes that there is no reserved ++region mapped (which should be the case as a requirement of teardown, or as a ++consequence of REATTACH_DEFAULT flag). ++ ++Another consideration is that the "cleanup mappings" operation may take a lot ++of time depending on the complexity of the page table. Making the teardown operation preemptable can allow the hypercall to be preempted if needed also preventing a malicious ++guest from stalling a CPU in a teardown operation with a specially crafted IOMMU ++context (e.g with several 1G superpages). ++ ++## Limit the amount of pages IOMMU contexts can use ++ ++In order to prevent a (eventually malicious) guest from causing too much allocations ++in Xen, we can enforce limits on the memory the IOMMU subsystem can use for IOMMU context. ++A possible implementation can be to preallocate a reasonably large chunk of memory ++and split it into pages for use by the IOMMU subsystem only for non-default IOMMU ++contexts (e.g PV-IOMMU interface), if this limitation is overcome, some operations ++may fail from the guest side. These limitations shouldn't impact "usual" operations ++of the IOMMU subsystem (e.g default context initialization). ++ ++## x86 Architecture ++ ++TODO ++ ++### Intel VT-d ++ ++VT-d uses DID to tag the *IOMMU domain* applied to a device and assumes that all entries ++with the same DID uses the same page table (i.e same IOMMU context). ++Under certain circonstances (e.g DRHD with DID limit below 16-bits), the *DID* is ++transparently converted into a DRHD-specific DID using a map managed internally. ++ ++The current implementation of the code reuses the Xen domain_id as DID. ++However, by using multiples IOMMU contexts per domain, we can't use the domain_id for ++contexts (otherwise, different page tables will be mapped with the same DID). 
++The following strategy is used : ++- on the default context, reuse the domain_id (the default context is unique per domain) ++- on non-default context, use a id allocated in the pseudo_domid map, (actually used by ++quarantine) which is a DID outside of Xen domain_id range ++ ++### AMD-Vi ++ ++TODO ++ ++## Device-tree platforms ++ ++### SMMU and SMMUv3 ++ ++TODO ++ ++* * * ++ ++[1] See pv-iommu.md ++ ++[2] pci: phantom functions assigned to incorrect contexts ++https://xenbits.xen.org/xsa/advisory-449.html +\ No newline at end of file +-- +2.46.0 + diff --git a/0402-IOMMU-Introduce-redesigned-IOMMU-subsystem.patch b/0402-IOMMU-Introduce-redesigned-IOMMU-subsystem.patch new file mode 100644 index 0000000..1f55a5a --- /dev/null +++ b/0402-IOMMU-Introduce-redesigned-IOMMU-subsystem.patch @@ -0,0 +1,1904 @@ +From 5a6c3e24142e3b2eda546e1e1e277b9a5efa374e Mon Sep 17 00:00:00 2001 +From: Teddy Astie +Date: Mon, 4 Nov 2024 14:28:40 +0000 +Subject: [PATCH 402/404] IOMMU: Introduce redesigned IOMMU subsystem + +Based on docs/designs/iommu-contexts.md, implement the redesigned IOMMU subsystem. + +Signed-off-by Teddy Astie +--- + xen/arch/x86/domain.c | 2 +- + xen/arch/x86/mm/p2m-ept.c | 2 +- + xen/arch/x86/pv/dom0_build.c | 4 +- + xen/arch/x86/tboot.c | 4 +- + xen/common/memory.c | 4 +- + xen/drivers/passthrough/Makefile | 3 + + xen/drivers/passthrough/context.c | 711 +++++++++++++++++++++++++++ + xen/drivers/passthrough/iommu.c | 396 ++++++--------- + xen/drivers/passthrough/pci.c | 117 +---- + xen/drivers/passthrough/quarantine.c | 49 ++ + xen/include/xen/iommu.h | 117 ++++- + xen/include/xen/pci.h | 3 + + 12 files changed, 1032 insertions(+), 380 deletions(-) + create mode 100644 xen/drivers/passthrough/context.c + create mode 100644 xen/drivers/passthrough/quarantine.c + +diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c +index 78a13e6812c9..9b1946cbc0a1 100644 +--- a/xen/arch/x86/domain.c ++++ b/xen/arch/x86/domain.c +@@ -2392,7 +2392,7 @@ int domain_relinquish_resources(struct domain *d) + + PROGRESS(iommu_pagetables): + +- ret = iommu_free_pgtables(d); ++ ret = iommu_free_pgtables(d, iommu_default_context(d)); + if ( ret ) + return ret; + +diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c +index 21728397f9ac..5ddeefb82675 100644 +--- a/xen/arch/x86/mm/p2m-ept.c ++++ b/xen/arch/x86/mm/p2m-ept.c +@@ -974,7 +974,7 @@ out: + rc = iommu_iotlb_flush(d, _dfn(gfn), 1ul << order, + (iommu_flags ? IOMMU_FLUSHF_added : 0) | + (vtd_pte_present ? IOMMU_FLUSHF_modified +- : 0)); ++ : 0), 0); + else if ( need_iommu_pt_sync(d) ) + rc = iommu_flags ? + iommu_legacy_map(d, _dfn(gfn), mfn, 1ul << order, iommu_flags) : +diff --git a/xen/arch/x86/pv/dom0_build.c b/xen/arch/x86/pv/dom0_build.c +index f54d1da5c6f4..345e6bec083f 100644 +--- a/xen/arch/x86/pv/dom0_build.c ++++ b/xen/arch/x86/pv/dom0_build.c +@@ -77,7 +77,7 @@ static __init void mark_pv_pt_pages_rdonly(struct domain *d, + * iommu_memory_setup() ended up mapping them. + */ + if ( need_iommu_pt_sync(d) && +- iommu_unmap(d, _dfn(mfn_x(page_to_mfn(page))), 1, 0, flush_flags) ) ++ iommu_unmap(d, _dfn(mfn_x(page_to_mfn(page))), 1, 0, flush_flags, 0) ) + BUG(); + + /* Read-only mapping + PGC_allocated + page-table page. 
*/ +@@ -128,7 +128,7 @@ static void __init iommu_memory_setup(struct domain *d, const char *what, + + while ( (rc = iommu_map(d, _dfn(mfn_x(mfn)), mfn, nr, + IOMMUF_readable | IOMMUF_writable | IOMMUF_preempt, +- flush_flags)) > 0 ) ++ flush_flags, 0)) > 0 ) + { + mfn = mfn_add(mfn, rc); + nr -= rc; +diff --git a/xen/arch/x86/tboot.c b/xen/arch/x86/tboot.c +index d5db60d335e3..25a5a6641261 100644 +--- a/xen/arch/x86/tboot.c ++++ b/xen/arch/x86/tboot.c +@@ -218,9 +218,9 @@ static void tboot_gen_domain_integrity(const uint8_t key[TB_KEY_SIZE], + + if ( is_iommu_enabled(d) && is_vtd ) + { +- const struct domain_iommu *dio = dom_iommu(d); ++ struct domain_iommu *dio = dom_iommu(d); + +- update_iommu_mac(&ctx, dio->arch.vtd.pgd_maddr, ++ update_iommu_mac(&ctx, iommu_default_context(d)->arch.vtd.pgd_maddr, + agaw_to_level(dio->arch.vtd.agaw)); + } + } +diff --git a/xen/common/memory.c b/xen/common/memory.c +index a6f2f6d1b348..acf305bcd0fd 100644 +--- a/xen/common/memory.c ++++ b/xen/common/memory.c +@@ -926,7 +926,7 @@ int xenmem_add_to_physmap(struct domain *d, struct xen_add_to_physmap *xatp, + this_cpu(iommu_dont_flush_iotlb) = 0; + + ret = iommu_iotlb_flush(d, _dfn(xatp->idx - done), done, +- IOMMU_FLUSHF_modified); ++ IOMMU_FLUSHF_modified, 0); + if ( unlikely(ret) && rc >= 0 ) + rc = ret; + +@@ -940,7 +940,7 @@ int xenmem_add_to_physmap(struct domain *d, struct xen_add_to_physmap *xatp, + put_page(pages[i]); + + ret = iommu_iotlb_flush(d, _dfn(xatp->gpfn - done), done, +- IOMMU_FLUSHF_added | IOMMU_FLUSHF_modified); ++ IOMMU_FLUSHF_added | IOMMU_FLUSHF_modified, 0); + if ( unlikely(ret) && rc >= 0 ) + rc = ret; + } +diff --git a/xen/drivers/passthrough/Makefile b/xen/drivers/passthrough/Makefile +index a1621540b78d..69327080abe6 100644 +--- a/xen/drivers/passthrough/Makefile ++++ b/xen/drivers/passthrough/Makefile +@@ -4,6 +4,9 @@ obj-$(CONFIG_X86) += x86/ + obj-$(CONFIG_ARM) += arm/ + + obj-y += iommu.o ++obj-y += context.o ++obj-y += quarantine.o ++ + obj-$(CONFIG_HAS_PCI) += pci.o + obj-$(CONFIG_HAS_DEVICE_TREE) += device_tree.o + obj-$(CONFIG_HAS_PCI) += ats.o +diff --git a/xen/drivers/passthrough/context.c b/xen/drivers/passthrough/context.c +new file mode 100644 +index 000000000000..edf660b617a9 +--- /dev/null ++++ b/xen/drivers/passthrough/context.c +@@ -0,0 +1,711 @@ ++/* ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms and conditions of the GNU General Public License, ++ * version 2, as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ * more details. ++ * ++ * You should have received a copy of the GNU General Public License along with ++ * this program; If not, see . ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++bool iommu_check_context(struct domain *d, u16 ctx_no) { ++ struct domain_iommu *hd = dom_iommu(d); ++ ++ if (ctx_no == 0) ++ return 1; /* Default context always exist. 
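++ (non-default contexts are tracked in the other_contexts allocation bitmap)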
*/ ++ ++ if ((ctx_no - 1) >= hd->other_contexts.count) ++ return 0; /* out of bounds */ ++ ++ return test_bit(ctx_no - 1, hd->other_contexts.bitmap); ++} ++ ++struct iommu_context *iommu_get_context(struct domain *d, u16 ctx_no) { ++ struct domain_iommu *hd = dom_iommu(d); ++ struct iommu_context *ctx; ++ ++ if ( !iommu_check_context(d, ctx_no) ) ++ return NULL; ++ ++ if (ctx_no == 0) ++ ctx = &hd->default_ctx; ++ else ++ ctx = &hd->other_contexts.map[ctx_no - 1]; ++ ++ rspin_lock(&ctx->lock); ++ /* Check if the context is still valid at this point */ ++ if ( unlikely(!iommu_check_context(d, ctx_no)) ) ++ { ++ /* Context has been destroyed in between */ ++ rspin_unlock(&ctx->lock); ++ return NULL; ++ } ++ ++ return ctx; ++} ++ ++void iommu_put_context(struct iommu_context *ctx) ++{ ++ rspin_unlock(&ctx->lock); ++} ++ ++static unsigned int mapping_order(const struct domain_iommu *hd, ++ dfn_t dfn, mfn_t mfn, unsigned long nr) ++{ ++ unsigned long res = dfn_x(dfn) | mfn_x(mfn); ++ unsigned long sizes = hd->platform_ops->page_sizes; ++ unsigned int bit = ffsl(sizes) - 1, order = 0; ++ ++ ASSERT(bit == PAGE_SHIFT); ++ ++ while ( (sizes = (sizes >> bit) & ~1) ) ++ { ++ unsigned long mask; ++ ++ bit = ffsl(sizes) - 1; ++ mask = (1UL << bit) - 1; ++ if ( nr <= mask || (res & mask) ) ++ break; ++ order += bit; ++ nr >>= bit; ++ res >>= bit; ++ } ++ ++ return order; ++} ++ ++static long _iommu_map(struct domain *d, dfn_t dfn0, mfn_t mfn0, ++ unsigned long page_count, unsigned int flags, ++ unsigned int *flush_flags, struct iommu_context *ctx) ++{ ++ struct domain_iommu *hd = dom_iommu(d); ++ unsigned long i; ++ unsigned int order, j = 0; ++ int rc = 0; ++ ++ if ( !is_iommu_enabled(d) ) ++ return 0; ++ ++ ASSERT(!IOMMUF_order(flags)); ++ ++ for ( i = 0; i < page_count; i += 1UL << order ) ++ { ++ dfn_t dfn = dfn_add(dfn0, i); ++ mfn_t mfn = mfn_add(mfn0, i); ++ ++ order = mapping_order(hd, dfn, mfn, page_count - i); ++ ++ if ( (flags & IOMMUF_preempt) && ++ ((!(++j & 0xfff) && general_preempt_check()) || ++ i > LONG_MAX - (1UL << order)) ) ++ return i; ++ ++ rc = iommu_call(hd->platform_ops, map_page, d, dfn, mfn, ++ flags | IOMMUF_order(order), flush_flags, ctx); ++ ++ if ( likely(!rc) ) ++ continue; ++ ++ if ( !d->is_shutting_down && printk_ratelimit() ) ++ printk(XENLOG_ERR ++ "d%d: IOMMU mapping dfn %"PRI_dfn" to mfn %"PRI_mfn" failed: %d\n", ++ d->domain_id, dfn_x(dfn), mfn_x(mfn), rc); ++ ++ /* while statement to satisfy __must_check */ ++ while ( iommu_unmap(d, dfn0, i, 0, flush_flags, ctx->id) ) ++ break; ++ ++ if ( !ctx->id && !is_hardware_domain(d) ) ++ domain_crash(d); ++ ++ break; ++ } ++ ++ /* ++ * Something went wrong so, if we were dealing with more than a single ++ * page, flush everything and clear flush flags. 
++ */ ++ if ( page_count > 1 && unlikely(rc) && ++ !iommu_iotlb_flush_all(d, *flush_flags) ) ++ *flush_flags = 0; ++ ++ return rc; ++} ++ ++long iommu_map(struct domain *d, dfn_t dfn0, mfn_t mfn0, ++ unsigned long page_count, unsigned int flags, ++ unsigned int *flush_flags, u16 ctx_no) ++{ ++ struct iommu_context *ctx; ++ long ret; ++ ++ if ( !(ctx = iommu_get_context(d, ctx_no)) ) ++ return -ENOENT; ++ ++ ret = _iommu_map(d, dfn0, mfn0, page_count, flags, flush_flags, ctx); ++ ++ iommu_put_context(ctx); ++ ++ return ret; ++} ++ ++int iommu_legacy_map(struct domain *d, dfn_t dfn, mfn_t mfn, ++ unsigned long page_count, unsigned int flags) ++{ ++ struct iommu_context *ctx; ++ unsigned int flush_flags = 0; ++ int rc = 0; ++ ++ ASSERT(!(flags & IOMMUF_preempt)); ++ ++ if ( dom_iommu(d)->no_dma ) ++ return 0; ++ ++ ctx = iommu_get_context(d, 0); ++ ++ if ( !ctx->opaque ) ++ { ++ rc = iommu_map(d, dfn, mfn, page_count, flags, &flush_flags, 0); ++ ++ if ( !this_cpu(iommu_dont_flush_iotlb) && !rc ) ++ rc = iommu_iotlb_flush(d, dfn, page_count, flush_flags, 0); ++ } ++ ++ iommu_put_context(ctx); ++ ++ return rc; ++} ++ ++static long _iommu_unmap(struct domain *d, dfn_t dfn0, unsigned long page_count, ++ unsigned int flags, unsigned int *flush_flags, ++ struct iommu_context *ctx) ++{ ++ struct domain_iommu *hd = dom_iommu(d); ++ unsigned long i; ++ unsigned int order, j = 0; ++ int rc = 0; ++ ++ if ( !is_iommu_enabled(d) ) ++ return 0; ++ ++ ASSERT(!(flags & ~IOMMUF_preempt)); ++ ++ for ( i = 0; i < page_count; i += 1UL << order ) ++ { ++ dfn_t dfn = dfn_add(dfn0, i); ++ int err; ++ ++ order = mapping_order(hd, dfn, _mfn(0), page_count - i); ++ ++ if ( (flags & IOMMUF_preempt) && ++ ((!(++j & 0xfff) && general_preempt_check()) || ++ i > LONG_MAX - (1UL << order)) ) ++ return i; ++ ++ err = iommu_call(hd->platform_ops, unmap_page, d, dfn, ++ flags | IOMMUF_order(order), flush_flags, ++ ctx); ++ ++ if ( likely(!err) ) ++ continue; ++ ++ if ( !d->is_shutting_down && printk_ratelimit() ) ++ printk(XENLOG_ERR ++ "d%d: IOMMU unmapping dfn %"PRI_dfn" failed: %d\n", ++ d->domain_id, dfn_x(dfn), err); ++ ++ if ( !rc ) ++ rc = err; ++ ++ if ( !ctx->id && !is_hardware_domain(d) ) ++ { ++ domain_crash(d); ++ break; ++ } ++ } ++ ++ /* ++ * Something went wrong so, if we were dealing with more than a single ++ * page, flush everything and clear flush flags. 
++ */ ++ if ( page_count > 1 && unlikely(rc) && ++ !iommu_iotlb_flush_all(d, *flush_flags) ) ++ *flush_flags = 0; ++ ++ return rc; ++} ++ ++long iommu_unmap(struct domain *d, dfn_t dfn0, unsigned long page_count, ++ unsigned int flags, unsigned int *flush_flags, ++ u16 ctx_no) ++{ ++ struct iommu_context *ctx; ++ long ret; ++ ++ if ( !(ctx = iommu_get_context(d, ctx_no)) ) ++ return -ENOENT; ++ ++ ret = _iommu_unmap(d, dfn0, page_count, flags, flush_flags, ctx); ++ ++ iommu_put_context(ctx); ++ ++ return ret; ++} ++ ++int iommu_legacy_unmap(struct domain *d, dfn_t dfn, unsigned long page_count) ++{ ++ unsigned int flush_flags = 0; ++ struct iommu_context *ctx; ++ int rc; ++ ++ if ( dom_iommu(d)->no_dma ) ++ return 0; ++ ++ ctx = iommu_get_context(d, 0); ++ ++ if ( ctx->opaque ) ++ return 0; ++ ++ rc = iommu_unmap(d, dfn, page_count, 0, &flush_flags, 0); ++ ++ if ( !this_cpu(iommu_dont_flush_iotlb) && !rc ) ++ rc = iommu_iotlb_flush(d, dfn, page_count, flush_flags, 0); ++ ++ iommu_put_context(ctx); ++ ++ return rc; ++} ++ ++int iommu_lookup_page(struct domain *d, dfn_t dfn, mfn_t *mfn, ++ unsigned int *flags, u16 ctx_no) ++{ ++ struct domain_iommu *hd = dom_iommu(d); ++ struct iommu_context *ctx; ++ int ret = 0; ++ ++ if ( !is_iommu_enabled(d) || !hd->platform_ops->lookup_page ) ++ return -EOPNOTSUPP; ++ ++ if ( !(ctx = iommu_get_context(d, ctx_no)) ) ++ return -ENOENT; ++ ++ ret = iommu_call(hd->platform_ops, lookup_page, d, dfn, mfn, flags, ctx); ++ ++ iommu_put_context(ctx); ++ return ret; ++} ++ ++int iommu_iotlb_flush(struct domain *d, dfn_t dfn, unsigned long page_count, ++ unsigned int flush_flags, u16 ctx_no) ++{ ++ struct domain_iommu *hd = dom_iommu(d); ++ struct iommu_context *ctx; ++ int rc; ++ ++ if ( !is_iommu_enabled(d) || !hd->platform_ops->iotlb_flush || ++ !page_count || !flush_flags ) ++ return 0; ++ ++ if ( dfn_eq(dfn, INVALID_DFN) ) ++ return -EINVAL; ++ ++ if ( !(ctx = iommu_get_context(d, ctx_no)) ) ++ return -ENOENT; ++ ++ rc = iommu_call(hd->platform_ops, iotlb_flush, d, ctx, dfn, page_count, ++ flush_flags); ++ if ( unlikely(rc) ) ++ { ++ if ( !d->is_shutting_down && printk_ratelimit() ) ++ printk(XENLOG_ERR ++ "d%d: IOMMU IOTLB flush failed: %d, dfn %"PRI_dfn", page count %lu flags %x\n", ++ d->domain_id, rc, dfn_x(dfn), page_count, flush_flags); ++ ++ if ( !ctx->id && !is_hardware_domain(d) ) ++ domain_crash(d); ++ } ++ ++ iommu_put_context(ctx); ++ ++ return rc; ++} ++ ++int iommu_context_init(struct domain *d, struct iommu_context *ctx, u16 ctx_no, ++ u32 flags) ++{ ++ if ( !dom_iommu(d)->platform_ops->context_init ) ++ return -ENOSYS; ++ ++ INIT_LIST_HEAD(&ctx->devices); ++ ctx->id = ctx_no; ++ ctx->dying = false; ++ ctx->opaque = false; /* assume opaque by default */ ++ ++ return iommu_call(dom_iommu(d)->platform_ops, context_init, d, ctx, flags); ++} ++ ++int iommu_context_alloc(struct domain *d, u16 *ctx_no, u32 flags) ++{ ++ unsigned int i; ++ int ret; ++ struct domain_iommu *hd = dom_iommu(d); ++ struct iommu_context *ctx; ++ ++ do { ++ i = find_first_zero_bit(hd->other_contexts.bitmap, hd->other_contexts.count); ++ ++ if ( i >= hd->other_contexts.count ) ++ return -ENOSPC; ++ ++ ctx = &hd->other_contexts.map[i]; ++ ++ /* Try to lock the mutex, can fail on concurrent accesses */ ++ if ( !rspin_trylock(&ctx->lock) ) ++ continue; ++ ++ /* We can now set it as used, we keep the lock for initialization. 
*/ ++ set_bit(i, hd->other_contexts.bitmap); ++ } while (0); ++ ++ *ctx_no = i + 1; ++ ++ ret = iommu_context_init(d, ctx, *ctx_no, flags); ++ ++ if ( ret ) ++ clear_bit(*ctx_no, hd->other_contexts.bitmap); ++ ++ iommu_put_context(ctx); ++ return ret; ++} ++ ++/** ++ * Attach dev phantom functions to ctx, override any existing ++ * mapped context. ++ */ ++static int iommu_reattach_phantom(struct domain *d, device_t *dev, ++ struct iommu_context *ctx) ++{ ++ int ret = 0; ++ uint8_t devfn = dev->devfn; ++ struct domain_iommu *hd = dom_iommu(d); ++ ++ while ( dev->phantom_stride ) ++ { ++ devfn += dev->phantom_stride; ++ ++ if ( PCI_SLOT(devfn) != PCI_SLOT(dev->devfn) ) ++ break; ++ ++ ret = iommu_call(hd->platform_ops, add_devfn, d, dev, devfn, ctx); ++ ++ if ( ret ) ++ break; ++ } ++ ++ return ret; ++} ++ ++/** ++ * Detach all device phantom functions. ++ */ ++static int iommu_detach_phantom(struct domain *d, device_t *dev) ++{ ++ int ret = 0; ++ uint8_t devfn = dev->devfn; ++ struct domain_iommu *hd = dom_iommu(d); ++ ++ while ( dev->phantom_stride ) ++ { ++ devfn += dev->phantom_stride; ++ ++ if ( PCI_SLOT(devfn) != PCI_SLOT(dev->devfn) ) ++ break; ++ ++ ret = iommu_call(hd->platform_ops, remove_devfn, d, dev, devfn); ++ ++ if ( ret ) ++ break; ++ } ++ ++ return ret; ++} ++ ++int iommu_attach_context(struct domain *d, device_t *dev, u16 ctx_no) ++{ ++ struct iommu_context *ctx = NULL; ++ int ret, rc; ++ ++ if ( !(ctx = iommu_get_context(d, ctx_no)) ) ++ { ++ ret = -ENOENT; ++ goto unlock; ++ } ++ ++ pcidevs_lock(); ++ ++ if ( ctx->dying ) ++ { ++ ret = -EINVAL; ++ goto unlock; ++ } ++ ++ ret = iommu_call(dom_iommu(d)->platform_ops, attach, d, dev, ctx); ++ ++ if ( ret ) ++ goto unlock; ++ ++ /* See iommu_reattach_context() */ ++ rc = iommu_reattach_phantom(d, dev, ctx); ++ ++ if ( rc ) ++ { ++ printk(XENLOG_ERR "IOMMU: Unable to attach %pp phantom functions\n", ++ &dev->sbdf); ++ ++ if( iommu_call(dom_iommu(d)->platform_ops, detach, d, dev, ctx) ++ || iommu_detach_phantom(d, dev) ) ++ { ++ printk(XENLOG_ERR "IOMMU: Improperly detached %pp\n", &dev->sbdf); ++ WARN(); ++ } ++ ++ ret = -EIO; ++ goto unlock; ++ } ++ ++ dev->context = ctx_no; ++ list_add(&dev->context_list, &ctx->devices); ++ ++unlock: ++ pcidevs_unlock(); ++ ++ if ( ctx ) ++ iommu_put_context(ctx); ++ ++ return ret; ++} ++ ++int iommu_detach_context(struct domain *d, device_t *dev) ++{ ++ struct iommu_context *ctx; ++ int ret, rc; ++ ++ if ( !dev->domain ) ++ { ++ printk(XENLOG_WARNING "IOMMU: Trying to detach a non-attached device\n"); ++ WARN(); ++ return 0; ++ } ++ ++ /* Make sure device is actually in the domain. */ ++ ASSERT(d == dev->domain); ++ ++ pcidevs_lock(); ++ ++ ctx = iommu_get_context(d, dev->context); ++ ASSERT(ctx); /* device is using an invalid context ? ++ dev->context invalid ? 
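++ (an attached device must always reference a live context via dev->context)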
*/ ++ ++ ret = iommu_call(dom_iommu(d)->platform_ops, detach, d, dev, ctx); ++ ++ if ( ret ) ++ goto unlock; ++ ++ rc = iommu_detach_phantom(d, dev); ++ ++ if ( rc ) ++ printk(XENLOG_WARNING "IOMMU: " ++ "Improperly detached device functions (%d)\n", rc); ++ ++ list_del(&dev->context_list); ++ ++unlock: ++ pcidevs_unlock(); ++ iommu_put_context(ctx); ++ return ret; ++} ++ ++int iommu_reattach_context(struct domain *prev_dom, struct domain *next_dom, ++ device_t *dev, u16 ctx_no) ++{ ++ u16 prev_ctx_no; ++ device_t *ctx_dev; ++ struct domain_iommu *prev_hd, *next_hd; ++ struct iommu_context *prev_ctx = NULL, *next_ctx = NULL; ++ int ret, rc; ++ bool same_domain; ++ ++ /* Make sure we actually are doing something meaningful */ ++ BUG_ON(!prev_dom && !next_dom); ++ ++ /// TODO: Do such cases exists ? ++ // /* Platform ops must match */ ++ // if (dom_iommu(prev_dom)->platform_ops != dom_iommu(next_dom)->platform_ops) ++ // return -EINVAL; ++ ++ if ( !prev_dom ) ++ return iommu_attach_context(next_dom, dev, ctx_no); ++ ++ if ( !next_dom ) ++ return iommu_detach_context(prev_dom, dev); ++ ++ prev_hd = dom_iommu(prev_dom); ++ next_hd = dom_iommu(next_dom); ++ ++ pcidevs_lock(); ++ ++ same_domain = prev_dom == next_dom; ++ ++ prev_ctx_no = dev->context; ++ ++ if ( !same_domain && (ctx_no == prev_ctx_no) ) ++ { ++ printk(XENLOG_DEBUG ++ "IOMMU: Reattaching %pp to same IOMMU context c%hu\n", ++ &dev, ctx_no); ++ ret = 0; ++ goto unlock; ++ } ++ ++ if ( !(prev_ctx = iommu_get_context(prev_dom, prev_ctx_no)) ) ++ { ++ ret = -ENOENT; ++ goto unlock; ++ } ++ ++ if ( !(next_ctx = iommu_get_context(next_dom, ctx_no)) ) ++ { ++ ret = -ENOENT; ++ goto unlock; ++ } ++ ++ if ( next_ctx->dying ) ++ { ++ ret = -EINVAL; ++ goto unlock; ++ } ++ ++ ret = iommu_call(prev_hd->platform_ops, reattach, next_dom, dev, prev_ctx, ++ next_ctx); ++ ++ if ( ret ) ++ goto unlock; ++ ++ /* ++ * We need to do special handling for phantom devices as they ++ * also use some other PCI functions behind the scenes. ++ */ ++ rc = iommu_reattach_phantom(next_dom, dev, next_ctx); ++ ++ if ( rc ) ++ { ++ /** ++ * Device is being partially reattached (we have primary function and ++ * maybe some phantom functions attached to next_ctx, some others to prev_ctx), ++ * some functions of the device will be attached to next_ctx. ++ */ ++ printk(XENLOG_WARNING "IOMMU: " ++ "Device %pp improperly reattached due to phantom function" ++ " reattach failure between %dd%dc and %dd%dc (%d)\n", dev, ++ prev_dom->domain_id, prev_ctx->id, next_dom->domain_id, ++ next_dom->domain_id, rc); ++ ++ /* Try reattaching to previous context, reverting into a consistent state. */ ++ if ( iommu_call(prev_hd->platform_ops, reattach, prev_dom, dev, next_ctx, ++ prev_ctx) || iommu_reattach_phantom(prev_dom, dev, prev_ctx) ) ++ { ++ printk(XENLOG_ERR "Unable to reattach %pp back to %dd%dc\n", ++ &dev->sbdf, prev_dom->domain_id, prev_ctx->id); ++ ++ if ( !is_hardware_domain(prev_dom) ) ++ domain_crash(prev_dom); ++ ++ if ( prev_dom != next_dom && !is_hardware_domain(next_dom) ) ++ domain_crash(next_dom); ++ ++ rc = -EIO; ++ } ++ ++ ret = rc; ++ goto unlock; ++ } ++ ++ /* Remove device from previous context, and add it to new one. 
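++ (pcidevs_lock and both context locks are held at this point)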
*/ ++ list_for_each_entry(ctx_dev, &prev_ctx->devices, context_list) ++ { ++ if ( ctx_dev == dev ) ++ { ++ list_del(&ctx_dev->context_list); ++ list_add(&ctx_dev->context_list, &next_ctx->devices); ++ break; ++ } ++ } ++ ++ if (!ret) ++ dev->context = ctx_no; /* update device context*/ ++ ++unlock: ++ pcidevs_unlock(); ++ ++ if ( prev_ctx ) ++ iommu_put_context(prev_ctx); ++ ++ if ( next_ctx ) ++ iommu_put_context(next_ctx); ++ ++ return ret; ++} ++ ++int iommu_context_teardown(struct domain *d, struct iommu_context *ctx, u32 flags) ++{ ++ struct domain_iommu *hd = dom_iommu(d); ++ ++ if ( !hd->platform_ops->context_teardown ) ++ return -ENOSYS; ++ ++ ctx->dying = true; ++ ++ /* first reattach devices back to default context if needed */ ++ if ( flags & IOMMU_TEARDOWN_REATTACH_DEFAULT ) ++ { ++ struct pci_dev *device; ++ list_for_each_entry(device, &ctx->devices, context_list) ++ iommu_reattach_context(d, d, device, 0); ++ } ++ else if (!list_empty(&ctx->devices)) ++ return -EBUSY; /* there is a device in context */ ++ ++ return iommu_call(hd->platform_ops, context_teardown, d, ctx, flags); ++} ++ ++int iommu_context_free(struct domain *d, u16 ctx_no, u32 flags) ++{ ++ int ret; ++ struct domain_iommu *hd = dom_iommu(d); ++ struct iommu_context *ctx; ++ ++ if ( ctx_no == 0 ) ++ return -EINVAL; ++ ++ if ( !(ctx = iommu_get_context(d, ctx_no)) ) ++ return -ENOENT; ++ ++ ret = iommu_context_teardown(d, ctx, flags); ++ ++ if ( !ret ) ++ clear_bit(ctx_no - 1, hd->other_contexts.bitmap); ++ ++ iommu_put_context(ctx); ++ return ret; ++} +diff --git a/xen/drivers/passthrough/iommu.c b/xen/drivers/passthrough/iommu.c +index 9e74a1fc72fa..e109ebe40470 100644 +--- a/xen/drivers/passthrough/iommu.c ++++ b/xen/drivers/passthrough/iommu.c +@@ -12,15 +12,18 @@ + * this program; If not, see . 
+ */ + ++#include ++#include ++#include ++#include + #include ++#include + #include +-#include +-#include +-#include + #include +-#include + #include +-#include ++#include ++#include ++#include + + #ifdef CONFIG_X86 + #include +@@ -35,26 +38,11 @@ bool __read_mostly force_iommu; + bool __read_mostly iommu_verbose; + static bool __read_mostly iommu_crash_disable; + +-#define IOMMU_quarantine_none 0 /* aka false */ +-#define IOMMU_quarantine_basic 1 /* aka true */ +-#define IOMMU_quarantine_scratch_page 2 +-#ifdef CONFIG_HAS_PCI +-uint8_t __read_mostly iommu_quarantine = +-# if defined(CONFIG_IOMMU_QUARANTINE_NONE) +- IOMMU_quarantine_none; +-# elif defined(CONFIG_IOMMU_QUARANTINE_BASIC) +- IOMMU_quarantine_basic; +-# elif defined(CONFIG_IOMMU_QUARANTINE_SCRATCH_PAGE) +- IOMMU_quarantine_scratch_page; +-# endif +-#else +-# define iommu_quarantine IOMMU_quarantine_none +-#endif /* CONFIG_HAS_PCI */ +- + static bool __hwdom_initdata iommu_hwdom_none; + bool __hwdom_initdata iommu_hwdom_strict; + bool __read_mostly iommu_hwdom_passthrough; + bool __hwdom_initdata iommu_hwdom_inclusive; ++bool __read_mostly iommu_hwdom_no_dma = false; + int8_t __hwdom_initdata iommu_hwdom_reserved = -1; + + #ifndef iommu_hap_pt_share +@@ -172,6 +160,8 @@ static int __init cf_check parse_dom0_iommu_param(const char *s) + iommu_hwdom_reserved = val; + else if ( !cmdline_strcmp(s, "none") ) + iommu_hwdom_none = true; ++ else if ( (val = parse_boolean("dma", s, ss)) >= 0 ) ++ iommu_hwdom_no_dma = !val; + else + rc = -EINVAL; + +@@ -193,6 +183,98 @@ static void __hwdom_init check_hwdom_reqs(struct domain *d) + arch_iommu_check_autotranslated_hwdom(d); + } + ++int iommu_domain_pviommu_init(struct domain *d, uint16_t nb_ctx, uint32_t arena_order) ++{ ++ struct domain_iommu *hd = dom_iommu(d); ++ int rc; ++ ++ BUG_ON(nb_ctx == 0); /* sanity check (prevent underflow) */ ++ ++ /* ++ * hd->other_contexts.count is always reported as 0 during initialization ++ * preventing misuse of partially initialized IOMMU contexts. ++ */ ++ ++ if ( atomic_cmpxchg(&hd->other_contexts.initialized, 0, 1) == 1 ) ++ return -EACCES; ++ ++ if ( (nb_ctx - 1) > 0 ) { ++ /* Initialize context bitmap */ ++ size_t i; ++ ++ hd->other_contexts.bitmap = xzalloc_array(unsigned long, ++ BITS_TO_LONGS(nb_ctx - 1)); ++ ++ if (!hd->other_contexts.bitmap) ++ { ++ rc = -ENOMEM; ++ goto cleanup; ++ } ++ ++ hd->other_contexts.map = xzalloc_array(struct iommu_context, nb_ctx - 1); ++ ++ if (!hd->other_contexts.map) ++ { ++ rc = -ENOMEM; ++ goto cleanup; ++ } ++ ++ for (i = 0; i < (nb_ctx - 1); i++) ++ rspin_lock_init(&hd->other_contexts.map[i].lock); ++ } ++ ++ rc = arch_iommu_pviommu_init(d, nb_ctx, arena_order); ++ ++ if ( rc ) ++ goto cleanup; ++ ++ /* Make sure initialization is complete before making it visible to other CPUs. */ ++ smp_wmb(); ++ ++ hd->other_contexts.count = nb_ctx - 1; ++ ++ printk(XENLOG_INFO "Dom%d uses %lu IOMMU contexts (%llu pages arena)\n", ++ d->domain_id, (unsigned long)nb_ctx, 1llu << arena_order); ++ ++ return 0; ++ ++cleanup: ++ /* TODO: Reset hd->other_contexts.initialized */ ++ if ( hd->other_contexts.bitmap ) ++ { ++ xfree(hd->other_contexts.bitmap); ++ hd->other_contexts.bitmap = NULL; ++ } ++ ++ if ( hd->other_contexts.map ) ++ { ++ xfree(hd->other_contexts.map); ++ hd->other_contexts.bitmap = NULL; ++ } ++ ++ return rc; ++} ++ ++int iommu_domain_pviommu_teardown(struct domain *d) ++{ ++ struct domain_iommu *hd = dom_iommu(d); ++ int i; ++ /* FIXME: Potential race condition with remote_op ? 
*/ ++ ++ for (i = 0; i < hd->other_contexts.count; i++) ++ WARN_ON(iommu_context_free(d, i, IOMMU_TEARDOWN_REATTACH_DEFAULT) != ENOENT); ++ ++ hd->other_contexts.count = 0; ++ ++ if ( hd->other_contexts.bitmap ) ++ xfree(hd->other_contexts.bitmap); ++ ++ if ( hd->other_contexts.map ) ++ xfree(hd->other_contexts.map); ++ ++ return 0; ++} ++ + int iommu_domain_init(struct domain *d, unsigned int opts) + { + struct domain_iommu *hd = dom_iommu(d); +@@ -208,6 +290,8 @@ int iommu_domain_init(struct domain *d, unsigned int opts) + hd->node = NUMA_NO_NODE; + #endif + ++ rspin_lock_init(&hd->default_ctx.lock); ++ + ret = arch_iommu_domain_init(d); + if ( ret ) + return ret; +@@ -236,6 +320,23 @@ int iommu_domain_init(struct domain *d, unsigned int opts) + + ASSERT(!(hd->need_sync && hd->hap_pt_share)); + ++ if ( hd->no_dma ) ++ { ++ /* No-DMA mode is exclusive with HAP and sync_pt. */ ++ hd->hap_pt_share = false; ++ hd->need_sync = false; ++ } ++ ++ hd->allow_pv_iommu = true; ++ ++ iommu_context_init(d, &hd->default_ctx, 0, IOMMU_CONTEXT_INIT_default); ++ ++ rwlock_init(&hd->other_contexts.lock); ++ hd->other_contexts.initialized = (atomic_t)ATOMIC_INIT(0); ++ hd->other_contexts.count = 0; ++ hd->other_contexts.bitmap = NULL; ++ hd->other_contexts.map = NULL; ++ + return 0; + } + +@@ -249,13 +350,12 @@ static void cf_check iommu_dump_page_tables(unsigned char key) + + for_each_domain(d) + { +- if ( is_hardware_domain(d) || !is_iommu_enabled(d) ) ++ if ( !is_iommu_enabled(d) ) + continue; + + if ( iommu_use_hap_pt(d) ) + { + printk("%pd sharing page tables\n", d); +- continue; + } + + iommu_vcall(dom_iommu(d)->platform_ops, dump_page_tables, d); +@@ -274,10 +374,13 @@ void __hwdom_init iommu_hwdom_init(struct domain *d) + iommu_vcall(hd->platform_ops, hwdom_init, d); + } + +-static void iommu_teardown(struct domain *d) ++void iommu_domain_destroy(struct domain *d) + { + struct domain_iommu *hd = dom_iommu(d); + ++ if ( !is_iommu_enabled(d) ) ++ return; ++ + /* + * During early domain creation failure, we may reach here with the + * ops not yet initialized. 
+@@ -286,222 +389,9 @@ static void iommu_teardown(struct domain *d) + return; + + iommu_vcall(hd->platform_ops, teardown, d); +-} +- +-void iommu_domain_destroy(struct domain *d) +-{ +- if ( !is_iommu_enabled(d) ) +- return; +- +- iommu_teardown(d); + + arch_iommu_domain_destroy(d); +-} +- +-static unsigned int mapping_order(const struct domain_iommu *hd, +- dfn_t dfn, mfn_t mfn, unsigned long nr) +-{ +- unsigned long res = dfn_x(dfn) | mfn_x(mfn); +- unsigned long sizes = hd->platform_ops->page_sizes; +- unsigned int bit = ffsl(sizes) - 1, order = 0; +- +- ASSERT(bit == PAGE_SHIFT); +- +- while ( (sizes = (sizes >> bit) & ~1) ) +- { +- unsigned long mask; +- +- bit = ffsl(sizes) - 1; +- mask = (1UL << bit) - 1; +- if ( nr <= mask || (res & mask) ) +- break; +- order += bit; +- nr >>= bit; +- res >>= bit; +- } +- +- return order; +-} +- +-long iommu_map(struct domain *d, dfn_t dfn0, mfn_t mfn0, +- unsigned long page_count, unsigned int flags, +- unsigned int *flush_flags) +-{ +- const struct domain_iommu *hd = dom_iommu(d); +- unsigned long i; +- unsigned int order, j = 0; +- int rc = 0; +- +- if ( !is_iommu_enabled(d) ) +- return 0; +- +- ASSERT(!IOMMUF_order(flags)); +- +- for ( i = 0; i < page_count; i += 1UL << order ) +- { +- dfn_t dfn = dfn_add(dfn0, i); +- mfn_t mfn = mfn_add(mfn0, i); +- +- order = mapping_order(hd, dfn, mfn, page_count - i); +- +- if ( (flags & IOMMUF_preempt) && +- ((!(++j & 0xfff) && general_preempt_check()) || +- i > LONG_MAX - (1UL << order)) ) +- return i; +- +- rc = iommu_call(hd->platform_ops, map_page, d, dfn, mfn, +- flags | IOMMUF_order(order), flush_flags); +- +- if ( likely(!rc) ) +- continue; +- +- if ( !d->is_shutting_down && printk_ratelimit() ) +- printk(XENLOG_ERR +- "d%d: IOMMU mapping dfn %"PRI_dfn" to mfn %"PRI_mfn" failed: %d\n", +- d->domain_id, dfn_x(dfn), mfn_x(mfn), rc); +- +- /* while statement to satisfy __must_check */ +- while ( iommu_unmap(d, dfn0, i, 0, flush_flags) ) +- break; +- +- if ( !is_hardware_domain(d) ) +- domain_crash(d); +- +- break; +- } +- +- /* +- * Something went wrong so, if we were dealing with more than a single +- * page, flush everything and clear flush flags. 
+- */ +- if ( page_count > 1 && unlikely(rc) && +- !iommu_iotlb_flush_all(d, *flush_flags) ) +- *flush_flags = 0; +- +- return rc; +-} +- +-int iommu_legacy_map(struct domain *d, dfn_t dfn, mfn_t mfn, +- unsigned long page_count, unsigned int flags) +-{ +- unsigned int flush_flags = 0; +- int rc; +- +- ASSERT(!(flags & IOMMUF_preempt)); +- rc = iommu_map(d, dfn, mfn, page_count, flags, &flush_flags); +- +- if ( !this_cpu(iommu_dont_flush_iotlb) && !rc ) +- rc = iommu_iotlb_flush(d, dfn, page_count, flush_flags); +- +- return rc; +-} +- +-long iommu_unmap(struct domain *d, dfn_t dfn0, unsigned long page_count, +- unsigned int flags, unsigned int *flush_flags) +-{ +- const struct domain_iommu *hd = dom_iommu(d); +- unsigned long i; +- unsigned int order, j = 0; +- int rc = 0; +- +- if ( !is_iommu_enabled(d) ) +- return 0; +- +- ASSERT(!(flags & ~IOMMUF_preempt)); +- +- for ( i = 0; i < page_count; i += 1UL << order ) +- { +- dfn_t dfn = dfn_add(dfn0, i); +- int err; +- +- order = mapping_order(hd, dfn, _mfn(0), page_count - i); +- +- if ( (flags & IOMMUF_preempt) && +- ((!(++j & 0xfff) && general_preempt_check()) || +- i > LONG_MAX - (1UL << order)) ) +- return i; +- +- err = iommu_call(hd->platform_ops, unmap_page, d, dfn, +- flags | IOMMUF_order(order), flush_flags); +- +- if ( likely(!err) ) +- continue; +- +- if ( !d->is_shutting_down && printk_ratelimit() ) +- printk(XENLOG_ERR +- "d%d: IOMMU unmapping dfn %"PRI_dfn" failed: %d\n", +- d->domain_id, dfn_x(dfn), err); +- +- if ( !rc ) +- rc = err; +- +- if ( !is_hardware_domain(d) ) +- { +- domain_crash(d); +- break; +- } +- } +- +- /* +- * Something went wrong so, if we were dealing with more than a single +- * page, flush everything and clear flush flags. +- */ +- if ( page_count > 1 && unlikely(rc) && +- !iommu_iotlb_flush_all(d, *flush_flags) ) +- *flush_flags = 0; +- +- return rc; +-} +- +-int iommu_legacy_unmap(struct domain *d, dfn_t dfn, unsigned long page_count) +-{ +- unsigned int flush_flags = 0; +- int rc = iommu_unmap(d, dfn, page_count, 0, &flush_flags); +- +- if ( !this_cpu(iommu_dont_flush_iotlb) && !rc ) +- rc = iommu_iotlb_flush(d, dfn, page_count, flush_flags); +- +- return rc; +-} +- +-int iommu_lookup_page(struct domain *d, dfn_t dfn, mfn_t *mfn, +- unsigned int *flags) +-{ +- const struct domain_iommu *hd = dom_iommu(d); +- +- if ( !is_iommu_enabled(d) || !hd->platform_ops->lookup_page ) +- return -EOPNOTSUPP; +- +- return iommu_call(hd->platform_ops, lookup_page, d, dfn, mfn, flags); +-} +- +-int iommu_iotlb_flush(struct domain *d, dfn_t dfn, unsigned long page_count, +- unsigned int flush_flags) +-{ +- const struct domain_iommu *hd = dom_iommu(d); +- int rc; +- +- if ( !is_iommu_enabled(d) || !hd->platform_ops->iotlb_flush || +- !page_count || !flush_flags ) +- return 0; +- +- if ( dfn_eq(dfn, INVALID_DFN) ) +- return -EINVAL; +- +- rc = iommu_call(hd->platform_ops, iotlb_flush, d, dfn, page_count, +- flush_flags); +- if ( unlikely(rc) ) +- { +- if ( !d->is_shutting_down && printk_ratelimit() ) +- printk(XENLOG_ERR +- "d%d: IOMMU IOTLB flush failed: %d, dfn %"PRI_dfn", page count %lu flags %x\n", +- d->domain_id, rc, dfn_x(dfn), page_count, flush_flags); +- +- if ( !is_hardware_domain(d) ) +- domain_crash(d); +- } +- +- return rc; ++ iommu_domain_pviommu_teardown(d); + } + + int iommu_iotlb_flush_all(struct domain *d, unsigned int flush_flags) +@@ -513,7 +403,7 @@ int iommu_iotlb_flush_all(struct domain *d, unsigned int flush_flags) + !flush_flags ) + return 0; + +- rc = iommu_call(hd->platform_ops, iotlb_flush, d, 
INVALID_DFN, 0, ++ rc = iommu_call(hd->platform_ops, iotlb_flush, d, NULL, INVALID_DFN, 0, + flush_flags | IOMMU_FLUSHF_all); + if ( unlikely(rc) ) + { +@@ -529,24 +419,6 @@ int iommu_iotlb_flush_all(struct domain *d, unsigned int flush_flags) + return rc; + } + +-int iommu_quarantine_dev_init(device_t *dev) +-{ +- const struct domain_iommu *hd = dom_iommu(dom_io); +- +- if ( !iommu_quarantine || !hd->platform_ops->quarantine_init ) +- return 0; +- +- return iommu_call(hd->platform_ops, quarantine_init, +- dev, iommu_quarantine == IOMMU_quarantine_scratch_page); +-} +- +-static int __init iommu_quarantine_init(void) +-{ +- dom_io->options |= XEN_DOMCTL_CDF_iommu; +- +- return iommu_domain_init(dom_io, 0); +-} +- + int __init iommu_setup(void) + { + int rc = -ENODEV; +@@ -682,6 +554,16 @@ bool iommu_has_feature(struct domain *d, enum iommu_feature feature) + return is_iommu_enabled(d) && test_bit(feature, dom_iommu(d)->features); + } + ++uint64_t iommu_get_max_iova(struct domain *d) ++{ ++ struct domain_iommu *hd = dom_iommu(d); ++ ++ if ( !hd->platform_ops->get_max_iova ) ++ return 0; ++ ++ return iommu_call(hd->platform_ops, get_max_iova, d); ++} ++ + #define MAX_EXTRA_RESERVED_RANGES 20 + struct extra_reserved_range { + unsigned long start; +diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c +index 777c6b1a7fdc..49d014d90fd5 100644 +--- a/xen/drivers/passthrough/pci.c ++++ b/xen/drivers/passthrough/pci.c +@@ -1,6 +1,6 @@ + /* + * Copyright (C) 2008, Netronome Systems, Inc. +- * ++ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. +@@ -289,14 +289,14 @@ static void apply_quirks(struct pci_dev *pdev) + * Device [8086:2fc0] + * Erratum HSE43 + * CONFIG_TDP_NOMINAL CSR Implemented at Incorrect Offset +- * http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v3-spec-update.html ++ * http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v3-spec-update.html + */ + { PCI_VENDOR_ID_INTEL, 0x2fc0 }, + /* + * Devices [8086:6f60,6fa0,6fc0] + * Errata BDF2 / BDX2 + * PCI BARs in the Home Agent Will Return Non-Zero Values During Enumeration +- * http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v4-spec-update.html ++ * http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v4-spec-update.html + */ + { PCI_VENDOR_ID_INTEL, 0x6f60 }, + { PCI_VENDOR_ID_INTEL, 0x6fa0 }, +@@ -911,8 +911,8 @@ static int deassign_device(struct domain *d, uint16_t seg, uint8_t bus, + devfn += pdev->phantom_stride; + if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) ) + break; +- ret = iommu_call(hd->platform_ops, reassign_device, d, target, devfn, +- pci_to_dev(pdev)); ++ ret = iommu_call(hd->platform_ops, add_devfn, d, pci_to_dev(pdev), devfn, ++ &target->iommu.default_ctx); + if ( ret ) + goto out; + } +@@ -921,9 +921,8 @@ static int deassign_device(struct domain *d, uint16_t seg, uint8_t bus, + vpci_deassign_device(pdev); + write_unlock(&d->pci_lock); + +- devfn = pdev->devfn; +- ret = iommu_call(hd->platform_ops, reassign_device, d, target, devfn, +- pci_to_dev(pdev)); ++ ret = iommu_reattach_context(pdev->domain, target, pci_to_dev(pdev), 0); ++ + if ( ret ) + goto out; + +@@ -931,6 +930,7 @@ static int deassign_device(struct domain *d, uint16_t seg, uint8_t bus, + pdev->quarantine = false; + + pdev->fault.count = 0; ++ pdev->domain = target; + + write_lock(&target->pci_lock); + /* Re-assign back to 
hardware_domain */ +@@ -1180,25 +1180,18 @@ struct setup_hwdom { + static void __hwdom_init setup_one_hwdom_device(const struct setup_hwdom *ctxt, + struct pci_dev *pdev) + { +- u8 devfn = pdev->devfn; + int err; + +- do { +- err = ctxt->handler(devfn, pdev); +- if ( err ) +- { +- printk(XENLOG_ERR "setup %pp for d%d failed (%d)\n", +- &pdev->sbdf, ctxt->d->domain_id, err); +- if ( devfn == pdev->devfn ) +- return; +- } +- devfn += pdev->phantom_stride; +- } while ( devfn != pdev->devfn && +- PCI_SLOT(devfn) == PCI_SLOT(pdev->devfn) ); ++ err = ctxt->handler(pdev->devfn, pdev); ++ ++ if ( err ) ++ goto done; + + write_lock(&ctxt->d->pci_lock); + err = vpci_assign_device(pdev); + write_unlock(&ctxt->d->pci_lock); ++ ++done: + if ( err ) + printk(XENLOG_ERR "setup of vPCI for d%d failed: %d\n", + ctxt->d->domain_id, err); +@@ -1370,12 +1363,7 @@ static int cf_check _dump_pci_devices(struct pci_seg *pseg, void *arg) + list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list ) + { + printk("%pp - ", &pdev->sbdf); +-#ifdef CONFIG_X86 +- if ( pdev->domain == dom_io ) +- printk("DomIO:%x", pdev->arch.pseudo_domid); +- else +-#endif +- printk("%pd", pdev->domain); ++ printk("%pd", pdev->domain); + printk(" - node %-3d", (pdev->node != NUMA_NO_NODE) ? pdev->node : -1); + pdev_dump_msi(pdev); + printk("\n"); +@@ -1402,8 +1390,6 @@ __initcall(setup_dump_pcidevs); + static int iommu_add_device(struct pci_dev *pdev) + { + const struct domain_iommu *hd; +- int rc; +- unsigned int devfn = pdev->devfn; + + if ( !pdev->domain ) + return -EINVAL; +@@ -1414,20 +1400,7 @@ static int iommu_add_device(struct pci_dev *pdev) + if ( !is_iommu_enabled(pdev->domain) ) + return 0; + +- rc = iommu_call(hd->platform_ops, add_device, devfn, pci_to_dev(pdev)); +- if ( rc || !pdev->phantom_stride ) +- return rc; +- +- for ( ; ; ) +- { +- devfn += pdev->phantom_stride; +- if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) ) +- return 0; +- rc = iommu_call(hd->platform_ops, add_device, devfn, pci_to_dev(pdev)); +- if ( rc ) +- printk(XENLOG_WARNING "IOMMU: add %pp failed (%d)\n", +- &PCI_SBDF(pdev->seg, pdev->bus, devfn), rc); +- } ++ return iommu_attach_context(pdev->domain, pci_to_dev(pdev), 0); + } + + static int iommu_enable_device(struct pci_dev *pdev) +@@ -1449,36 +1422,13 @@ static int iommu_enable_device(struct pci_dev *pdev) + + static int iommu_remove_device(struct pci_dev *pdev) + { +- const struct domain_iommu *hd; +- u8 devfn; +- + if ( !pdev->domain ) + return -EINVAL; + +- hd = dom_iommu(pdev->domain); + if ( !is_iommu_enabled(pdev->domain) ) + return 0; + +- for ( devfn = pdev->devfn ; pdev->phantom_stride; ) +- { +- int rc; +- +- devfn += pdev->phantom_stride; +- if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) ) +- break; +- rc = iommu_call(hd->platform_ops, remove_device, devfn, +- pci_to_dev(pdev)); +- if ( !rc ) +- continue; +- +- printk(XENLOG_ERR "IOMMU: remove %pp failed (%d)\n", +- &PCI_SBDF(pdev->seg, pdev->bus, devfn), rc); +- return rc; +- } +- +- devfn = pdev->devfn; +- +- return iommu_call(hd->platform_ops, remove_device, devfn, pci_to_dev(pdev)); ++ return iommu_detach_context(pdev->domain, pdev); + } + + static int device_assigned(u16 seg, u8 bus, u8 devfn) +@@ -1506,7 +1456,6 @@ static int device_assigned(u16 seg, u8 bus, u8 devfn) + /* Caller should hold the pcidevs_lock */ + static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) + { +- const struct domain_iommu *hd = dom_iommu(d); + struct pci_dev *pdev; + int rc = 0; + +@@ -1544,17 +1493,7 @@ static int 
assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) + + pdev->fault.count = 0; + +- rc = iommu_call(hd->platform_ops, assign_device, d, devfn, pci_to_dev(pdev), +- flag); +- +- while ( pdev->phantom_stride && !rc ) +- { +- devfn += pdev->phantom_stride; +- if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) ) +- break; +- rc = iommu_call(hd->platform_ops, assign_device, d, devfn, +- pci_to_dev(pdev), flag); +- } ++ rc = iommu_reattach_context(pdev->domain, d, pci_to_dev(pdev), 0); + + if ( rc ) + goto done; +@@ -1564,27 +1503,9 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) + write_unlock(&d->pci_lock); + + done: +- if ( rc ) +- { +- printk(XENLOG_G_WARNING "%pd: assign %s(%pp) failed (%d)\n", +- d, devfn != pdev->devfn ? "phantom function " : "", +- &PCI_SBDF(seg, bus, devfn), rc); + +- if ( devfn != pdev->devfn && deassign_device(d, seg, bus, pdev->devfn) ) +- { +- /* +- * Device with phantom functions that failed to both assign and +- * rollback. Mark the device as broken and crash the target domain, +- * as the state of the functions at this point is unknown and Xen +- * has no way to assert consistent context assignment among them. +- */ +- pdev->broken = true; +- if ( !is_hardware_domain(d) && d != dom_io ) +- domain_crash(d); +- } +- } + /* The device is assigned to dom_io so mark it as quarantined */ +- else if ( d == dom_io ) ++ if ( !rc && d == dom_io ) + pdev->quarantine = true; + + return rc; +diff --git a/xen/drivers/passthrough/quarantine.c b/xen/drivers/passthrough/quarantine.c +new file mode 100644 +index 000000000000..b58f136ad81b +--- /dev/null ++++ b/xen/drivers/passthrough/quarantine.c +@@ -0,0 +1,49 @@ ++#include ++#include ++#include ++ ++#ifdef CONFIG_HAS_PCI ++uint8_t __read_mostly iommu_quarantine = ++# if defined(CONFIG_IOMMU_QUARANTINE_NONE) ++ IOMMU_quarantine_none; ++# elif defined(CONFIG_IOMMU_QUARANTINE_BASIC) ++ IOMMU_quarantine_basic; ++# elif defined(CONFIG_IOMMU_QUARANTINE_SCRATCH_PAGE) ++ IOMMU_quarantine_scratch_page; ++# endif ++#else ++# define iommu_quarantine IOMMU_quarantine_none ++#endif /* CONFIG_HAS_PCI */ ++ ++int iommu_quarantine_dev_init(device_t *dev) ++{ ++ int ret; ++ u16 ctx_no; ++ ++ if ( !iommu_quarantine ) ++ return 0; ++ ++ ret = iommu_context_alloc(dom_io, &ctx_no, IOMMU_CONTEXT_INIT_quarantine); ++ ++ if ( ret ) ++ return ret; ++ ++ /** TODO: Setup scratch page, mappings... */ ++ ++ ret = iommu_reattach_context(dev->domain, dom_io, dev, ctx_no); ++ ++ if ( ret ) ++ { ++ ASSERT(!iommu_context_free(dom_io, ctx_no, 0)); ++ return ret; ++ } ++ ++ return ret; ++} ++ ++int __init iommu_quarantine_init(void) ++{ ++ dom_io->options |= XEN_DOMCTL_CDF_iommu; ++ ++ return iommu_domain_init(dom_io, 0); ++} +diff --git a/xen/include/xen/iommu.h b/xen/include/xen/iommu.h +index b928c67e1995..f74f3b107578 100644 +--- a/xen/include/xen/iommu.h ++++ b/xen/include/xen/iommu.h +@@ -52,7 +52,11 @@ static inline bool dfn_eq(dfn_t x, dfn_t y) + #ifdef CONFIG_HAS_PASSTHROUGH + extern bool iommu_enable, iommu_enabled; + extern bool force_iommu, iommu_verbose; ++ + /* Boolean except for the specific purposes of drivers/passthrough/iommu.c. 
*/ ++#define IOMMU_quarantine_none 0 /* aka false */ ++#define IOMMU_quarantine_basic 1 /* aka true */ ++#define IOMMU_quarantine_scratch_page 2 + extern uint8_t iommu_quarantine; + #else + #define iommu_enabled false +@@ -106,6 +110,7 @@ extern bool iommu_debug; + extern bool amd_iommu_perdev_intremap; + + extern bool iommu_hwdom_strict, iommu_hwdom_passthrough, iommu_hwdom_inclusive; ++extern bool iommu_hwdom_no_dma; + extern int8_t iommu_hwdom_reserved; + + extern unsigned int iommu_dev_iotlb_timeout; +@@ -161,11 +166,10 @@ enum + */ + long __must_check iommu_map(struct domain *d, dfn_t dfn0, mfn_t mfn0, + unsigned long page_count, unsigned int flags, +- unsigned int *flush_flags); ++ unsigned int *flush_flags, u16 ctx_no); + long __must_check iommu_unmap(struct domain *d, dfn_t dfn0, + unsigned long page_count, unsigned int flags, +- unsigned int *flush_flags); +- ++ unsigned int *flush_flags, u16 ctx_no); + int __must_check iommu_legacy_map(struct domain *d, dfn_t dfn, mfn_t mfn, + unsigned long page_count, + unsigned int flags); +@@ -173,11 +177,12 @@ int __must_check iommu_legacy_unmap(struct domain *d, dfn_t dfn, + unsigned long page_count); + + int __must_check iommu_lookup_page(struct domain *d, dfn_t dfn, mfn_t *mfn, +- unsigned int *flags); ++ unsigned int *flags, u16 ctx_no); + + int __must_check iommu_iotlb_flush(struct domain *d, dfn_t dfn, + unsigned long page_count, +- unsigned int flush_flags); ++ unsigned int flush_flags, ++ u16 ctx_no); + int __must_check iommu_iotlb_flush_all(struct domain *d, + unsigned int flush_flags); + +@@ -250,20 +255,30 @@ struct page_info; + */ + typedef int iommu_grdm_t(xen_pfn_t start, xen_ulong_t nr, u32 id, void *ctxt); + ++struct iommu_context; ++ + struct iommu_ops { + unsigned long page_sizes; + int (*init)(struct domain *d); + void (*hwdom_init)(struct domain *d); +- int (*quarantine_init)(device_t *dev, bool scratch_page); +- int (*add_device)(uint8_t devfn, device_t *dev); ++ int (*context_init)(struct domain *d, struct iommu_context *ctx, ++ u32 flags); ++ int (*context_teardown)(struct domain *d, struct iommu_context *ctx, ++ u32 flags); ++ int (*attach)(struct domain *d, device_t *dev, ++ struct iommu_context *ctx); ++ int (*detach)(struct domain *d, device_t *dev, ++ struct iommu_context *prev_ctx); ++ int (*reattach)(struct domain *d, device_t *dev, ++ struct iommu_context *prev_ctx, ++ struct iommu_context *ctx); ++ + int (*enable_device)(device_t *dev); +- int (*remove_device)(uint8_t devfn, device_t *dev); +- int (*assign_device)(struct domain *d, uint8_t devfn, device_t *dev, +- uint32_t flag); +- int (*reassign_device)(struct domain *s, struct domain *t, +- uint8_t devfn, device_t *dev); + #ifdef CONFIG_HAS_PCI + int (*get_device_group_id)(uint16_t seg, uint8_t bus, uint8_t devfn); ++ int (*add_devfn)(struct domain *d, struct pci_dev *pdev, u16 devfn, ++ struct iommu_context *ctx); ++ int (*remove_devfn)(struct domain *d, struct pci_dev *pdev, u16 devfn); + #endif /* HAS_PCI */ + + void (*teardown)(struct domain *d); +@@ -274,12 +289,15 @@ struct iommu_ops { + */ + int __must_check (*map_page)(struct domain *d, dfn_t dfn, mfn_t mfn, + unsigned int flags, +- unsigned int *flush_flags); ++ unsigned int *flush_flags, ++ struct iommu_context *ctx); + int __must_check (*unmap_page)(struct domain *d, dfn_t dfn, + unsigned int order, +- unsigned int *flush_flags); ++ unsigned int *flush_flags, ++ struct iommu_context *ctx); + int __must_check (*lookup_page)(struct domain *d, dfn_t dfn, mfn_t *mfn, +- unsigned int *flags); ++ 
unsigned int *flags, ++ struct iommu_context *ctx); + + #ifdef CONFIG_X86 + int (*enable_x2apic)(void); +@@ -292,14 +310,15 @@ struct iommu_ops { + int (*setup_hpet_msi)(struct msi_desc *msi_desc); + + void (*adjust_irq_affinities)(void); +- void (*clear_root_pgtable)(struct domain *d); ++ void (*clear_root_pgtable)(struct domain *d, struct iommu_context *ctx); + int (*update_ire_from_msi)(struct msi_desc *msi_desc, struct msi_msg *msg); + #endif /* CONFIG_X86 */ + + int __must_check (*suspend)(void); + void (*resume)(void); + void (*crash_shutdown)(void); +- int __must_check (*iotlb_flush)(struct domain *d, dfn_t dfn, ++ int __must_check (*iotlb_flush)(struct domain *d, ++ struct iommu_context *ctx, dfn_t dfn, + unsigned long page_count, + unsigned int flush_flags); + int (*get_reserved_device_memory)(iommu_grdm_t *func, void *ctxt); +@@ -314,6 +333,8 @@ struct iommu_ops { + */ + int (*dt_xlate)(device_t *dev, const struct dt_phandle_args *args); + #endif ++ ++ uint64_t (*get_max_iova)(struct domain *d); + }; + + /* +@@ -343,11 +364,39 @@ extern int iommu_get_extra_reserved_device_memory(iommu_grdm_t *func, + # define iommu_vcall iommu_call + #endif + ++struct iommu_context { ++ u16 id; /* Context id (0 means default context) */ ++ rspinlock_t lock; /* context lock */ ++ ++ struct list_head devices; ++ ++ struct arch_iommu_context arch; ++ ++ bool opaque; /* context can't be modified nor accessed (e.g HAP) */ ++ bool dying; /* the context is tearing down */ ++}; ++ ++struct iommu_context_list { ++ atomic_t initialized; /* has/is context list being initialized ? */ ++ rwlock_t lock; /* prevent concurrent destruction and access of contexts */ ++ uint16_t count; /* Context count excluding default context */ ++ ++ /* if count > 0 */ ++ ++ uint64_t *bitmap; /* bitmap of context allocation */ ++ struct iommu_context *map; /* Map of contexts */ ++}; ++ ++ + struct domain_iommu { ++ + #ifdef CONFIG_HAS_PASSTHROUGH + struct arch_iommu arch; + #endif + ++ struct iommu_context default_ctx; ++ struct iommu_context_list other_contexts; ++ + /* iommu_ops */ + const struct iommu_ops *platform_ops; + +@@ -365,6 +414,12 @@ struct domain_iommu { + /* SAF-2-safe enum constant in arithmetic operation */ + DECLARE_BITMAP(features, IOMMU_FEAT_count); + ++ /* Do the IOMMU block all DMA on default context (implies !has_pt_share) ? */ ++ bool no_dma; ++ ++ /* Is the domain allowed to use PV-IOMMU ? */ ++ bool allow_pv_iommu; ++ + /* Does the guest share HAP mapping with the IOMMU? */ + bool hap_pt_share; + +@@ -380,6 +435,7 @@ struct domain_iommu { + #define dom_iommu(d) (&(d)->iommu) + #define iommu_set_feature(d, f) set_bit(f, dom_iommu(d)->features) + #define iommu_clear_feature(d, f) clear_bit(f, dom_iommu(d)->features) ++#define iommu_default_context(d) (&dom_iommu(d)->default_ctx) /* does not lock ! */ + + /* Are we using the domain P2M table as its IOMMU pagetable? 
*/ + #define iommu_use_hap_pt(d) (IS_ENABLED(CONFIG_HVM) && \ +@@ -401,10 +457,14 @@ static inline int iommu_do_domctl(struct xen_domctl *domctl, struct domain *d, + } + #endif + ++int iommu_domain_pviommu_init(struct domain *d, uint16_t nb_ctx, uint32_t arena_order); ++ + int __must_check iommu_suspend(void); + void iommu_resume(void); + void iommu_crash_shutdown(void); + int iommu_get_reserved_device_memory(iommu_grdm_t *func, void *ctxt); ++ ++int __init iommu_quarantine_init(void); + int iommu_quarantine_dev_init(device_t *dev); + + #ifdef CONFIG_HAS_PCI +@@ -414,6 +474,27 @@ int iommu_do_pci_domctl(struct xen_domctl *domctl, struct domain *d, + + void iommu_dev_iotlb_flush_timeout(struct domain *d, struct pci_dev *pdev); + ++uint64_t iommu_get_max_iova(struct domain *d); ++ ++struct iommu_context *iommu_get_context(struct domain *d, u16 ctx_no); ++void iommu_put_context(struct iommu_context *ctx); ++ ++#define IOMMU_CONTEXT_INIT_default (1 << 0) ++#define IOMMU_CONTEXT_INIT_quarantine (1 << 1) ++int iommu_context_init(struct domain *d, struct iommu_context *ctx, u16 ctx_no, u32 flags); ++ ++#define IOMMU_TEARDOWN_REATTACH_DEFAULT (1 << 0) ++#define IOMMU_TEARDOWN_PREEMPT (1 << 1) ++int iommu_context_teardown(struct domain *d, struct iommu_context *ctx, u32 flags); ++ ++int iommu_context_alloc(struct domain *d, u16 *ctx_no, u32 flags); ++int iommu_context_free(struct domain *d, u16 ctx_no, u32 flags); ++ ++int iommu_reattach_context(struct domain *prev_dom, struct domain *next_dom, ++ device_t *dev, u16 ctx_no); ++int iommu_attach_context(struct domain *d, device_t *dev, u16 ctx_no); ++int iommu_detach_context(struct domain *d, device_t *dev); ++ + /* + * The purpose of the iommu_dont_flush_iotlb optional cpu flag is to + * avoid unecessary iotlb_flush in the low level IOMMU code. +@@ -429,6 +510,8 @@ DECLARE_PER_CPU(bool, iommu_dont_flush_iotlb); + extern struct spinlock iommu_pt_cleanup_lock; + extern struct page_list_head iommu_pt_cleanup_list; + ++int arch_iommu_pviommu_init(struct domain *d, uint16_t nb_ctx, uint32_t arena_order); ++int arch_iommu_pviommu_teardown(struct domain *d); + bool arch_iommu_use_permitted(const struct domain *d); + + #ifdef CONFIG_X86 +diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h +index f784e9116059..a421ead1a423 100644 +--- a/xen/include/xen/pci.h ++++ b/xen/include/xen/pci.h +@@ -97,6 +97,7 @@ struct pci_dev_info { + struct pci_dev { + struct list_head alldevs_list; + struct list_head domain_list; ++ struct list_head context_list; + + struct list_head msi_list; + +@@ -104,6 +105,8 @@ struct pci_dev { + + struct domain *domain; + ++ uint16_t context; /* IOMMU context number of domain */ ++ + const union { + struct { + uint8_t devfn; +-- +2.46.0 + diff --git a/0403-VT-d-Port-IOMMU-driver-to-new-subsystem.patch b/0403-VT-d-Port-IOMMU-driver-to-new-subsystem.patch new file mode 100644 index 0000000..0001ef4 --- /dev/null +++ b/0403-VT-d-Port-IOMMU-driver-to-new-subsystem.patch @@ -0,0 +1,2965 @@ +From 074b5fd3767e8dc53cfd9506bb0a62b32869f6da Mon Sep 17 00:00:00 2001 +From: Teddy Astie +Date: Mon, 4 Nov 2024 14:28:40 +0000 +Subject: [PATCH 403/404] VT-d: Port IOMMU driver to new subsystem + +Port the driver with guidances specified in iommu-contexts.md. + +Add a arena-based allocator for allocating a fixed chunk of memory and +split it into 4k pages for use by the IOMMU contexts. This chunk size +is configurable with X86_ARENA_ORDER and dom0-iommu=arena-order=N. 
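+
+A rough usage sketch of the arena API declared in asm/arena.h below ("d",
+"order" and the error handling here are placeholders, not code taken from
+this patch):
+
+    struct iommu_arena arena;
+    struct page_info *pg;
+
+    /* Carve a 2^order page chunk out of the domheap, owned by domain d. */
+    if ( iommu_arena_initialize(&arena, d, order, 0) )
+        return -ENOMEM;
+
+    /* Hand out single 4k pages for per-context IOMMU page tables... */
+    pg = iommu_arena_allocate_page(&arena);
+
+    /* ... and give them back when the context is torn down. */
+    iommu_arena_free_page(&arena, pg);
+
+    /* Release the chunk; with check=true this returns -EBUSY while pages
+       are still allocated. */
+    iommu_arena_teardown(&arena, true);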
+ +Signed-off-by Teddy Astie +--- + xen/arch/x86/include/asm/arena.h | 54 + + xen/arch/x86/include/asm/iommu.h | 58 +- + xen/arch/x86/include/asm/pci.h | 17 - + xen/drivers/passthrough/vtd/Makefile | 2 +- + xen/drivers/passthrough/vtd/extern.h | 14 +- + xen/drivers/passthrough/vtd/iommu.c | 1479 +++++++++----------------- + xen/drivers/passthrough/vtd/quirks.c | 20 +- + xen/drivers/passthrough/x86/Makefile | 1 + + xen/drivers/passthrough/x86/arena.c | 157 +++ + xen/drivers/passthrough/x86/iommu.c | 270 +++-- + 10 files changed, 984 insertions(+), 1088 deletions(-) + create mode 100644 xen/arch/x86/include/asm/arena.h + create mode 100644 xen/drivers/passthrough/x86/arena.c + +diff --git a/xen/arch/x86/include/asm/arena.h b/xen/arch/x86/include/asm/arena.h +new file mode 100644 +index 000000000000..7555b100e0b8 +--- /dev/null ++++ b/xen/arch/x86/include/asm/arena.h +@@ -0,0 +1,54 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/** ++ * Simple arena-based page allocator. ++ */ ++ ++#ifndef __XEN_IOMMU_ARENA_H__ ++#define __XEN_IOMMU_ARENA_H__ ++ ++#include "xen/domain.h" ++#include "xen/atomic.h" ++#include "xen/mm-frame.h" ++#include "xen/types.h" ++ ++/** ++ * struct page_arena: Page arena structure ++ */ ++struct iommu_arena { ++ /* mfn of the first page of the memory region */ ++ mfn_t region_start; ++ /* bitmap of allocations */ ++ unsigned long *map; ++ ++ /* Order of the arena */ ++ unsigned int order; ++ ++ /* Used page count */ ++ atomic_t used_pages; ++}; ++ ++/** ++ * Initialize a arena using domheap allocator. ++ * @param [out] arena Arena to allocate ++ * @param [in] domain domain that has ownership of arena pages ++ * @param [in] order order of the arena (power of two of the size) ++ * @param [in] memflags Flags for domheap_alloc_pages() ++ * @return -ENOMEM on arena allocation error, 0 otherwise ++ */ ++int iommu_arena_initialize(struct iommu_arena *arena, struct domain *domain, ++ unsigned int order, unsigned int memflags); ++ ++/** ++ * Teardown a arena. ++ * @param [out] arena arena to allocate ++ * @param [in] check check for existing allocations ++ * @return -EBUSY if check is specified ++ */ ++int iommu_arena_teardown(struct iommu_arena *arena, bool check); ++ ++struct page_info *iommu_arena_allocate_page(struct iommu_arena *arena); ++bool iommu_arena_free_page(struct iommu_arena *arena, struct page_info *page); ++ ++#define iommu_arena_size(arena) (1LLU << (arena)->order) ++ ++#endif +diff --git a/xen/arch/x86/include/asm/iommu.h b/xen/arch/x86/include/asm/iommu.h +index 8dc464fbd3ca..533bb8d77742 100644 +--- a/xen/arch/x86/include/asm/iommu.h ++++ b/xen/arch/x86/include/asm/iommu.h +@@ -2,14 +2,18 @@ + #ifndef __ARCH_X86_IOMMU_H__ + #define __ARCH_X86_IOMMU_H__ + ++#include + #include + #include + #include + #include ++#include + #include + #include + #include + ++#include "arena.h" ++ + #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48 + + struct g2m_ioport { +@@ -31,27 +35,45 @@ typedef uint64_t daddr_t; + #define dfn_to_daddr(dfn) __dfn_to_daddr(dfn_x(dfn)) + #define daddr_to_dfn(daddr) _dfn(__daddr_to_dfn(daddr)) + +-struct arch_iommu ++struct arch_iommu_context + { +- spinlock_t mapping_lock; /* io page table lock */ +- struct { +- struct page_list_head list; +- spinlock_t lock; +- } pgtables; +- ++ struct page_list_head pgtables; + struct list_head identity_maps; + ++ /* Queue for freeing pages */ ++ struct page_list_head free_queue; ++ ++ /* Is this context reusing domain P2M ? 
*/ ++ bool hap_context; ++ + union { + /* Intel VT-d */ + struct { + uint64_t pgd_maddr; /* io page directory machine address */ ++ domid_t *didmap; /* per-iommu DID */ ++ unsigned long *iommu_bitmap; /* bitmap of iommu(s) that the context uses */ ++ uint32_t superpage_progress; /* superpage progress during teardown */ ++ } vtd; ++ /* AMD IOMMU */ ++ struct { ++ struct page_info *root_table; ++ } amd; ++ }; ++}; ++ ++struct arch_iommu ++{ ++ struct iommu_arena pt_arena; /* allocator for non-default contexts */ ++ ++ union { ++ /* Intel VT-d */ ++ struct { + unsigned int agaw; /* adjusted guest address width, 0 is level 2 30-bit */ +- unsigned long *iommu_bitmap; /* bitmap of iommu(s) that the domain uses */ + } vtd; + /* AMD IOMMU */ + struct { + unsigned int paging_mode; +- struct page_info *root_table; ++ struct guest_iommu *g_iommu; + } amd; + }; + }; +@@ -109,10 +131,13 @@ static inline void iommu_disable_x2apic(void) + iommu_vcall(&iommu_ops, disable_x2apic); + } + +-int iommu_identity_mapping(struct domain *d, p2m_access_t p2ma, +- paddr_t base, paddr_t end, ++int iommu_identity_mapping(struct domain *d, struct iommu_context *ctx, ++ p2m_access_t p2ma, paddr_t base, paddr_t end, + unsigned int flag); +-void iommu_identity_map_teardown(struct domain *d); ++void iommu_identity_map_teardown(struct domain *d, struct iommu_context *ctx); ++bool iommu_identity_map_check(struct domain *d, struct iommu_context *ctx, ++ mfn_t mfn); ++ + + extern bool untrusted_msi; + +@@ -128,14 +153,19 @@ unsigned long *iommu_init_domid(domid_t reserve); + domid_t iommu_alloc_domid(unsigned long *map); + void iommu_free_domid(domid_t domid, unsigned long *map); + +-int __must_check iommu_free_pgtables(struct domain *d); ++struct iommu_context; ++int __must_check iommu_free_pgtables(struct domain *d, struct iommu_context *ctx); + struct domain_iommu; + struct page_info *__must_check iommu_alloc_pgtable(struct domain_iommu *hd, ++ struct iommu_context *ctx, + uint64_t contig_mask); +-void iommu_queue_free_pgtable(struct domain_iommu *hd, struct page_info *pg); ++void iommu_queue_free_pgtable(struct iommu_context *ctx, struct page_info *pg); + + /* Check [start, end] unity map range for correctness. */ + bool iommu_unity_region_ok(const char *prefix, mfn_t start, mfn_t end); ++int arch_iommu_context_init(struct domain *d, struct iommu_context *ctx, u32 flags); ++int arch_iommu_context_teardown(struct domain *d, struct iommu_context *ctx, u32 flags); ++int arch_iommu_flush_free_queue(struct domain *d, struct iommu_context *ctx); + + #endif /* !__ARCH_X86_IOMMU_H__ */ + /* +diff --git a/xen/arch/x86/include/asm/pci.h b/xen/arch/x86/include/asm/pci.h +index fd5480d67d43..214c1a0948a8 100644 +--- a/xen/arch/x86/include/asm/pci.h ++++ b/xen/arch/x86/include/asm/pci.h +@@ -15,23 +15,6 @@ + + struct arch_pci_dev { + vmask_t used_vectors; +- /* +- * These fields are (de)initialized under pcidevs-lock. Other uses of +- * them don't race (de)initialization and hence don't strictly need any +- * locking. +- */ +- union { +- /* Subset of struct arch_iommu's fields, to be used in dom_io. 
*/ +- struct { +- uint64_t pgd_maddr; +- } vtd; +- struct { +- struct page_info *root_table; +- } amd; +- }; +- domid_t pseudo_domid; +- mfn_t leaf_mfn; +- struct page_list_head pgtables_list; + }; + + int pci_conf_write_intercept(unsigned int seg, unsigned int bdf, +diff --git a/xen/drivers/passthrough/vtd/Makefile b/xen/drivers/passthrough/vtd/Makefile +index fde7555fac07..81e1f46179b9 100644 +--- a/xen/drivers/passthrough/vtd/Makefile ++++ b/xen/drivers/passthrough/vtd/Makefile +@@ -5,4 +5,4 @@ obj-y += dmar.o + obj-y += utils.o + obj-y += qinval.o + obj-y += intremap.o +-obj-y += quirks.o ++obj-y += quirks.o +\ No newline at end of file +diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h +index c16583c951d5..6cdde26efb53 100644 +--- a/xen/drivers/passthrough/vtd/extern.h ++++ b/xen/drivers/passthrough/vtd/extern.h +@@ -80,12 +80,10 @@ uint64_t alloc_pgtable_maddr(unsigned long npages, nodeid_t node); + void free_pgtable_maddr(u64 maddr); + void *map_vtd_domain_page(u64 maddr); + void unmap_vtd_domain_page(const void *va); +-int domain_context_mapping_one(struct domain *domain, struct vtd_iommu *iommu, +- uint8_t bus, uint8_t devfn, +- const struct pci_dev *pdev, domid_t domid, +- paddr_t pgd_maddr, unsigned int mode); +-int domain_context_unmap_one(struct domain *domain, struct vtd_iommu *iommu, +- uint8_t bus, uint8_t devfn); ++int apply_context_single(struct domain *domain, struct iommu_context *ctx, ++ struct vtd_iommu *iommu, uint8_t bus, uint8_t devfn); ++int unapply_context_single(struct domain *domain, struct vtd_iommu *iommu, ++ uint8_t bus, uint8_t devfn); + int cf_check intel_iommu_get_reserved_device_memory( + iommu_grdm_t *func, void *ctxt); + +@@ -106,8 +104,8 @@ void platform_quirks_init(void); + void vtd_ops_preamble_quirk(struct vtd_iommu *iommu); + void vtd_ops_postamble_quirk(struct vtd_iommu *iommu); + int __must_check me_wifi_quirk(struct domain *domain, uint8_t bus, +- uint8_t devfn, domid_t domid, paddr_t pgd_maddr, +- unsigned int mode); ++ uint8_t devfn, domid_t domid, ++ unsigned int mode, struct iommu_context *ctx); + void pci_vtd_quirk(const struct pci_dev *); + void quirk_iommu_caps(struct vtd_iommu *iommu); + +diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c +index 27a4d1640189..4e803735c318 100644 +--- a/xen/drivers/passthrough/vtd/iommu.c ++++ b/xen/drivers/passthrough/vtd/iommu.c +@@ -20,6 +20,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -30,15 +31,22 @@ + #include + #include + #include ++#include ++#include + #include ++#include ++#include ++#include + + #include + #include + #include + #include +-#include + #include + #include ++#include ++#include ++#include + + #include "iommu.h" + #include "dmar.h" +@@ -49,14 +57,6 @@ + #define CONTIG_MASK DMA_PTE_CONTIG_MASK + #include + +-/* dom_io is used as a sentinel for quarantined devices */ +-#define QUARANTINE_SKIP(d, pgd_maddr) ((d) == dom_io && !(pgd_maddr)) +-#define DEVICE_DOMID(d, pdev) ((d) != dom_io ? (d)->domain_id \ +- : (pdev)->arch.pseudo_domid) +-#define DEVICE_PGTABLE(d, pdev) ((d) != dom_io \ +- ? 
dom_iommu(d)->arch.vtd.pgd_maddr \ +- : (pdev)->arch.vtd.pgd_maddr) +- + bool __read_mostly iommu_igfx = true; + bool __read_mostly iommu_qinval = true; + #ifndef iommu_snoop +@@ -69,7 +69,6 @@ static unsigned int __ro_after_init min_pt_levels = UINT_MAX; + static struct tasklet vtd_fault_tasklet; + + static int cf_check setup_hwdom_device(u8 devfn, struct pci_dev *); +-static void setup_hwdom_rmrr(struct domain *d); + + static bool domid_mapping(const struct vtd_iommu *iommu) + { +@@ -209,26 +208,14 @@ static bool any_pdev_behind_iommu(const struct domain *d, + * clear iommu in iommu_bitmap and clear domain_id in domid_bitmap. + */ + static void check_cleanup_domid_map(const struct domain *d, ++ const struct iommu_context *ctx, + const struct pci_dev *exclude, + struct vtd_iommu *iommu) + { +- bool found; +- +- if ( d == dom_io ) +- return; +- +- found = any_pdev_behind_iommu(d, exclude, iommu); +- /* +- * Hidden devices are associated with DomXEN but usable by the hardware +- * domain. Hence they need considering here as well. +- */ +- if ( !found && is_hardware_domain(d) ) +- found = any_pdev_behind_iommu(dom_xen, exclude, iommu); +- +- if ( !found ) ++ if ( !any_pdev_behind_iommu(d, exclude, iommu) ) + { +- clear_bit(iommu->index, dom_iommu(d)->arch.vtd.iommu_bitmap); +- cleanup_domid_map(d->domain_id, iommu); ++ clear_bit(iommu->index, ctx->arch.vtd.iommu_bitmap); ++ cleanup_domid_map(ctx->arch.vtd.didmap[iommu->index], iommu); + } + } + +@@ -315,8 +302,9 @@ static u64 bus_to_context_maddr(struct vtd_iommu *iommu, u8 bus) + * PTE for the requested address, + * - for target == 0 the full PTE contents below PADDR_BITS limit. + */ +-static uint64_t addr_to_dma_page_maddr(struct domain *domain, daddr_t addr, +- unsigned int target, ++static uint64_t addr_to_dma_page_maddr(struct domain *domain, ++ struct iommu_context *ctx, ++ daddr_t addr, unsigned int target, + unsigned int *flush_flags, bool alloc) + { + struct domain_iommu *hd = dom_iommu(domain); +@@ -326,10 +314,9 @@ static uint64_t addr_to_dma_page_maddr(struct domain *domain, daddr_t addr, + u64 pte_maddr = 0; + + addr &= (((u64)1) << addr_width) - 1; +- ASSERT(spin_is_locked(&hd->arch.mapping_lock)); + ASSERT(target || !alloc); + +- if ( !hd->arch.vtd.pgd_maddr ) ++ if ( !ctx->arch.vtd.pgd_maddr ) + { + struct page_info *pg; + +@@ -337,13 +324,13 @@ static uint64_t addr_to_dma_page_maddr(struct domain *domain, daddr_t addr, + goto out; + + pte_maddr = level; +- if ( !(pg = iommu_alloc_pgtable(hd, 0)) ) ++ if ( !(pg = iommu_alloc_pgtable(hd, ctx, 0)) ) + goto out; + +- hd->arch.vtd.pgd_maddr = page_to_maddr(pg); ++ ctx->arch.vtd.pgd_maddr = page_to_maddr(pg); + } + +- pte_maddr = hd->arch.vtd.pgd_maddr; ++ pte_maddr = ctx->arch.vtd.pgd_maddr; + parent = map_vtd_domain_page(pte_maddr); + while ( level > target ) + { +@@ -379,7 +366,7 @@ static uint64_t addr_to_dma_page_maddr(struct domain *domain, daddr_t addr, + } + + pte_maddr = level - 1; +- pg = iommu_alloc_pgtable(hd, DMA_PTE_CONTIG_MASK); ++ pg = iommu_alloc_pgtable(hd, ctx, DMA_PTE_CONTIG_MASK); + if ( !pg ) + break; + +@@ -431,38 +418,25 @@ static uint64_t addr_to_dma_page_maddr(struct domain *domain, daddr_t addr, + return pte_maddr; + } + +-static paddr_t domain_pgd_maddr(struct domain *d, paddr_t pgd_maddr, +- unsigned int nr_pt_levels) ++static paddr_t get_context_pgd(struct domain *d, struct iommu_context *ctx, ++ unsigned int nr_pt_levels) + { +- struct domain_iommu *hd = dom_iommu(d); + unsigned int agaw; ++ paddr_t pgd_maddr = ctx->arch.vtd.pgd_maddr; + +- 
ASSERT(spin_is_locked(&hd->arch.mapping_lock)); +- +- if ( pgd_maddr ) +- /* nothing */; +- else if ( iommu_use_hap_pt(d) ) ++ if ( !ctx->arch.vtd.pgd_maddr ) + { +- pagetable_t pgt = p2m_get_pagetable(p2m_get_hostp2m(d)); ++ /* ++ * Ensure we have pagetables allocated down to the smallest ++ * level the loop below may need to run to. ++ */ ++ addr_to_dma_page_maddr(d, ctx, 0, min_pt_levels, NULL, true); + +- pgd_maddr = pagetable_get_paddr(pgt); ++ if ( !ctx->arch.vtd.pgd_maddr ) ++ return 0; + } +- else +- { +- if ( !hd->arch.vtd.pgd_maddr ) +- { +- /* +- * Ensure we have pagetables allocated down to the smallest +- * level the loop below may need to run to. +- */ +- addr_to_dma_page_maddr(d, 0, min_pt_levels, NULL, true); +- +- if ( !hd->arch.vtd.pgd_maddr ) +- return 0; +- } + +- pgd_maddr = hd->arch.vtd.pgd_maddr; +- } ++ pgd_maddr = ctx->arch.vtd.pgd_maddr; + + /* Skip top level(s) of page tables for less-than-maximum level DRHDs. */ + for ( agaw = level_to_agaw(4); +@@ -730,28 +704,18 @@ static int __must_check iommu_flush_all(void) + return rc; + } + +-static int __must_check cf_check iommu_flush_iotlb(struct domain *d, dfn_t dfn, ++static int __must_check cf_check iommu_flush_iotlb(struct domain *d, ++ struct iommu_context *ctx, ++ dfn_t dfn, + unsigned long page_count, + unsigned int flush_flags) + { +- struct domain_iommu *hd = dom_iommu(d); + struct acpi_drhd_unit *drhd; + struct vtd_iommu *iommu; + bool flush_dev_iotlb; + int iommu_domid; + int ret = 0; + +- if ( flush_flags & IOMMU_FLUSHF_all ) +- { +- dfn = INVALID_DFN; +- page_count = 0; +- } +- else +- { +- ASSERT(page_count && !dfn_eq(dfn, INVALID_DFN)); +- ASSERT(flush_flags); +- } +- + /* + * No need pcideves_lock here because we have flush + * when assign/deassign device +@@ -762,13 +726,20 @@ static int __must_check cf_check iommu_flush_iotlb(struct domain *d, dfn_t dfn, + + iommu = drhd->iommu; + +- if ( !test_bit(iommu->index, hd->arch.vtd.iommu_bitmap) ) +- continue; ++ if ( ctx ) ++ { ++ if ( !test_bit(iommu->index, ctx->arch.vtd.iommu_bitmap) ) ++ continue; ++ ++ iommu_domid = get_iommu_did(ctx->arch.vtd.didmap[iommu->index], iommu, true); ++ ++ if ( iommu_domid == -1 ) ++ continue; ++ } ++ else ++ iommu_domid = 0; + + flush_dev_iotlb = !!find_ats_dev_drhd(iommu); +- iommu_domid = get_iommu_did(d->domain_id, iommu, !d->is_dying); +- if ( iommu_domid == -1 ) +- continue; + + if ( !page_count || (page_count & (page_count - 1)) || + dfn_eq(dfn, INVALID_DFN) || !IS_ALIGNED(dfn_x(dfn), page_count) ) +@@ -787,10 +758,13 @@ static int __must_check cf_check iommu_flush_iotlb(struct domain *d, dfn_t dfn, + ret = rc; + } + ++ if ( !ret && ctx ) ++ arch_iommu_flush_free_queue(d, ctx); ++ + return ret; + } + +-static void queue_free_pt(struct domain_iommu *hd, mfn_t mfn, unsigned int level) ++static void queue_free_pt(struct iommu_context *ctx, mfn_t mfn, unsigned int level) + { + if ( level > 1 ) + { +@@ -799,13 +773,13 @@ static void queue_free_pt(struct domain_iommu *hd, mfn_t mfn, unsigned int level + + for ( i = 0; i < PTE_NUM; ++i ) + if ( dma_pte_present(pt[i]) && !dma_pte_superpage(pt[i]) ) +- queue_free_pt(hd, maddr_to_mfn(dma_pte_addr(pt[i])), ++ queue_free_pt(ctx, maddr_to_mfn(dma_pte_addr(pt[i])), + level - 1); + + unmap_domain_page(pt); + } + +- iommu_queue_free_pgtable(hd, mfn_to_page(mfn)); ++ iommu_queue_free_pgtable(ctx, mfn_to_page(mfn)); + } + + static int iommu_set_root_entry(struct vtd_iommu *iommu) +@@ -1436,11 +1410,6 @@ static int cf_check intel_iommu_domain_init(struct domain *d) + { + struct 
domain_iommu *hd = dom_iommu(d); + +- hd->arch.vtd.iommu_bitmap = xzalloc_array(unsigned long, +- BITS_TO_LONGS(nr_iommus)); +- if ( !hd->arch.vtd.iommu_bitmap ) +- return -ENOMEM; +- + hd->arch.vtd.agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH); + + return 0; +@@ -1451,7 +1420,7 @@ static void __hwdom_init cf_check intel_iommu_hwdom_init(struct domain *d) + struct acpi_drhd_unit *drhd; + + setup_hwdom_pci_devices(d, setup_hwdom_device); +- setup_hwdom_rmrr(d); ++ + /* Make sure workarounds are applied before enabling the IOMMU(s). */ + arch_iommu_hwdom_init(d); + +@@ -1468,32 +1437,22 @@ static void __hwdom_init cf_check intel_iommu_hwdom_init(struct domain *d) + } + } + +-/* +- * This function returns +- * - a negative errno value upon error, +- * - zero upon success when previously the entry was non-present, or this isn't +- * the "main" request for a device (pdev == NULL), or for no-op quarantining +- * assignments, +- * - positive (one) upon success when previously the entry was present and this +- * is the "main" request for a device (pdev != NULL). ++/** ++ * Apply a context on a device. ++ * @param domain Domain of the context ++ * @param iommu IOMMU hardware to use (must match device iommu) ++ * @param ctx IOMMU context to apply ++ * @param devfn PCI device function (may be different to pdev) + */ +-int domain_context_mapping_one( +- struct domain *domain, +- struct vtd_iommu *iommu, +- uint8_t bus, uint8_t devfn, const struct pci_dev *pdev, +- domid_t domid, paddr_t pgd_maddr, unsigned int mode) ++int apply_context_single(struct domain *domain, struct iommu_context *ctx, ++ struct vtd_iommu *iommu, uint8_t bus, uint8_t devfn) + { +- struct domain_iommu *hd = dom_iommu(domain); + struct context_entry *context, *context_entries, lctxt; +- __uint128_t old; ++ __uint128_t res, old; + uint64_t maddr; +- uint16_t seg = iommu->drhd->segment, prev_did = 0; +- struct domain *prev_dom = NULL; ++ uint16_t seg = iommu->drhd->segment, prev_did = 0, did; + int rc, ret; +- bool flush_dev_iotlb; +- +- if ( QUARANTINE_SKIP(domain, pgd_maddr) ) +- return 0; ++ bool flush_dev_iotlb, overwrite_entry = false; + + ASSERT(pcidevs_locked()); + spin_lock(&iommu->lock); +@@ -1502,28 +1461,15 @@ int domain_context_mapping_one( + context = &context_entries[devfn]; + old = (lctxt = *context).full; + +- if ( context_present(lctxt) ) +- { +- domid_t domid; ++ did = ctx->arch.vtd.didmap[iommu->index]; + ++ if ( context_present(*context) ) ++ { + prev_did = context_domain_id(lctxt); +- domid = did_to_domain_id(iommu, prev_did); +- if ( domid < DOMID_FIRST_RESERVED ) +- prev_dom = rcu_lock_domain_by_id(domid); +- else if ( pdev ? 
domid == pdev->arch.pseudo_domid : domid > DOMID_MASK ) +- prev_dom = rcu_lock_domain(dom_io); +- if ( !prev_dom ) +- { +- spin_unlock(&iommu->lock); +- unmap_vtd_domain_page(context_entries); +- dprintk(XENLOG_DEBUG VTDPREFIX, +- "no domain for did %u (nr_dom %u)\n", +- prev_did, cap_ndoms(iommu->cap)); +- return -ESRCH; +- } ++ overwrite_entry = true; + } + +- if ( iommu_hwdom_passthrough && is_hardware_domain(domain) ) ++ if ( iommu_hwdom_passthrough && is_hardware_domain(domain) && !ctx->id ) + { + context_set_translation_type(lctxt, CONTEXT_TT_PASS_THRU); + } +@@ -1531,16 +1477,10 @@ int domain_context_mapping_one( + { + paddr_t root; + +- spin_lock(&hd->arch.mapping_lock); +- +- root = domain_pgd_maddr(domain, pgd_maddr, iommu->nr_pt_levels); ++ root = get_context_pgd(domain, ctx, iommu->nr_pt_levels); + if ( !root ) + { +- spin_unlock(&hd->arch.mapping_lock); +- spin_unlock(&iommu->lock); + unmap_vtd_domain_page(context_entries); +- if ( prev_dom ) +- rcu_unlock_domain(prev_dom); + return -ENOMEM; + } + +@@ -1549,98 +1489,39 @@ int domain_context_mapping_one( + context_set_translation_type(lctxt, CONTEXT_TT_DEV_IOTLB); + else + context_set_translation_type(lctxt, CONTEXT_TT_MULTI_LEVEL); +- +- spin_unlock(&hd->arch.mapping_lock); + } + +- rc = context_set_domain_id(&lctxt, domid, iommu); ++ rc = context_set_domain_id(&lctxt, did, iommu); + if ( rc ) +- { +- unlock: +- spin_unlock(&iommu->lock); +- unmap_vtd_domain_page(context_entries); +- if ( prev_dom ) +- rcu_unlock_domain(prev_dom); +- return rc; +- } +- +- if ( !prev_dom ) +- { +- context_set_address_width(lctxt, level_to_agaw(iommu->nr_pt_levels)); +- context_set_fault_enable(lctxt); +- context_set_present(lctxt); +- } +- else if ( prev_dom == domain ) +- { +- ASSERT(lctxt.full == context->full); +- rc = !!pdev; + goto unlock; +- } +- else +- { +- ASSERT(context_address_width(lctxt) == +- level_to_agaw(iommu->nr_pt_levels)); +- ASSERT(!context_fault_disable(lctxt)); +- } +- +- if ( cpu_has_cx16 ) +- { +- __uint128_t res = cmpxchg16b(context, &old, &lctxt.full); + +- /* +- * Hardware does not update the context entry behind our backs, +- * so the return value should match "old". +- */ +- if ( res != old ) +- { +- if ( pdev ) +- check_cleanup_domid_map(domain, pdev, iommu); +- printk(XENLOG_ERR +- "%pp: unexpected context entry %016lx_%016lx (expected %016lx_%016lx)\n", +- &PCI_SBDF(seg, bus, devfn), +- (uint64_t)(res >> 64), (uint64_t)res, +- (uint64_t)(old >> 64), (uint64_t)old); +- rc = -EILSEQ; +- goto unlock; +- } +- } +- else if ( !prev_dom || !(mode & MAP_WITH_RMRR) ) +- { +- context_clear_present(*context); +- iommu_sync_cache(context, sizeof(*context)); ++ context_set_address_width(lctxt, level_to_agaw(iommu->nr_pt_levels)); ++ context_set_fault_enable(lctxt); ++ context_set_present(lctxt); + +- write_atomic(&context->hi, lctxt.hi); +- /* No barrier should be needed between these two. */ +- write_atomic(&context->lo, lctxt.lo); +- } +- else /* Best effort, updating DID last. */ +- { +- /* +- * By non-atomically updating the context entry's DID field last, +- * during a short window in time TLB entries with the old domain ID +- * but the new page tables may be inserted. This could affect I/O +- * of other devices using this same (old) domain ID. Such updating +- * therefore is not a problem if this was the only device associated +- * with the old domain ID. Diverting I/O of any of a dying domain's +- * devices to the quarantine page tables is intended anyway. 
+- */ +- if ( !(mode & (MAP_OWNER_DYING | MAP_SINGLE_DEVICE)) ) +- printk(XENLOG_WARNING VTDPREFIX +- " %pp: reassignment may cause %pd data corruption\n", +- &PCI_SBDF(seg, bus, devfn), prev_dom); ++ res = cmpxchg16b(context, &old, &lctxt.full); + +- write_atomic(&context->lo, lctxt.lo); +- /* No barrier should be needed between these two. */ +- write_atomic(&context->hi, lctxt.hi); ++ /* ++ * Hardware does not update the context entry behind our backs, ++ * so the return value should match "old". ++ */ ++ if ( res != old ) ++ { ++ printk(XENLOG_ERR ++ "%pp: unexpected context entry %016lx_%016lx (expected %016lx_%016lx)\n", ++ &PCI_SBDF(seg, bus, devfn), ++ (uint64_t)(res >> 64), (uint64_t)res, ++ (uint64_t)(old >> 64), (uint64_t)old); ++ rc = -EILSEQ; ++ goto unlock; + } + + iommu_sync_cache(context, sizeof(struct context_entry)); +- spin_unlock(&iommu->lock); + + rc = iommu_flush_context_device(iommu, prev_did, PCI_BDF(bus, devfn), +- DMA_CCMD_MASK_NOBIT, !prev_dom); ++ DMA_CCMD_MASK_NOBIT, !overwrite_entry); + flush_dev_iotlb = !!find_ats_dev_drhd(iommu); +- ret = iommu_flush_iotlb_dsi(iommu, prev_did, !prev_dom, flush_dev_iotlb); ++ ret = iommu_flush_iotlb_dsi(iommu, prev_did, !overwrite_entry, flush_dev_iotlb); + + /* + * The current logic for returns: +@@ -1656,230 +1537,55 @@ int domain_context_mapping_one( + if ( rc > 0 ) + rc = 0; + +- set_bit(iommu->index, hd->arch.vtd.iommu_bitmap); ++ set_bit(iommu->index, ctx->arch.vtd.iommu_bitmap); + + unmap_vtd_domain_page(context_entries); ++ spin_unlock(&iommu->lock); + + if ( !seg && !rc ) +- rc = me_wifi_quirk(domain, bus, devfn, domid, pgd_maddr, mode); +- +- if ( rc && !(mode & MAP_ERROR_RECOVERY) ) +- { +- if ( !prev_dom || +- /* +- * Unmapping here means DEV_TYPE_PCI devices with RMRRs (if such +- * exist) would cause problems if such a region was actually +- * accessed. +- */ +- (prev_dom == dom_io && !pdev) ) +- ret = domain_context_unmap_one(domain, iommu, bus, devfn); +- else +- ret = domain_context_mapping_one(prev_dom, iommu, bus, devfn, pdev, +- DEVICE_DOMID(prev_dom, pdev), +- DEVICE_PGTABLE(prev_dom, pdev), +- (mode & MAP_WITH_RMRR) | +- MAP_ERROR_RECOVERY) < 0; +- +- if ( !ret && pdev && pdev->devfn == devfn ) +- check_cleanup_domid_map(domain, pdev, iommu); +- } ++ rc = me_wifi_quirk(domain, bus, devfn, did, 0, ctx); + +- if ( prev_dom ) +- rcu_unlock_domain(prev_dom); ++ return rc; + +- return rc ?: pdev && prev_dom; ++ unlock: ++ unmap_vtd_domain_page(context_entries); ++ spin_unlock(&iommu->lock); ++ return rc; + } + +-static const struct acpi_drhd_unit *domain_context_unmap( +- struct domain *d, uint8_t devfn, struct pci_dev *pdev); +- +-static int domain_context_mapping(struct domain *domain, u8 devfn, +- struct pci_dev *pdev) ++int apply_context(struct domain *d, struct iommu_context *ctx, ++ struct pci_dev *pdev, u8 devfn) + { + const struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev); +- const struct acpi_rmrr_unit *rmrr; +- paddr_t pgd_maddr = DEVICE_PGTABLE(domain, pdev); +- domid_t orig_domid = pdev->arch.pseudo_domid; + int ret = 0; +- unsigned int i, mode = 0; +- uint16_t seg = pdev->seg, bdf; +- uint8_t bus = pdev->bus, secbus; +- +- /* +- * Generally we assume only devices from one node to get assigned to a +- * given guest. But even if not, by replacing the prior value here we +- * guarantee that at least some basic allocations for the device being +- * added will get done against its node. 
Any further allocations for +- * this or other devices may be penalized then, but some would also be +- * if we left other than NUMA_NO_NODE untouched here. +- */ +- if ( drhd && drhd->iommu->node != NUMA_NO_NODE ) +- dom_iommu(domain)->node = drhd->iommu->node; +- +- ASSERT(pcidevs_locked()); +- +- for_each_rmrr_device( rmrr, bdf, i ) +- { +- if ( rmrr->segment != pdev->seg || bdf != pdev->sbdf.bdf ) +- continue; + +- mode |= MAP_WITH_RMRR; +- break; +- } ++ if ( !drhd ) ++ return -EINVAL; + +- if ( domain != pdev->domain && pdev->domain != dom_io ) ++ if ( pdev->type == DEV_TYPE_PCI_HOST_BRIDGE || ++ pdev->type == DEV_TYPE_PCIe_BRIDGE || ++ pdev->type == DEV_TYPE_PCIe2PCI_BRIDGE || ++ pdev->type == DEV_TYPE_LEGACY_PCI_BRIDGE ) + { +- if ( pdev->domain->is_dying ) +- mode |= MAP_OWNER_DYING; +- else if ( drhd && +- !any_pdev_behind_iommu(pdev->domain, pdev, drhd->iommu) && +- !pdev->phantom_stride ) +- mode |= MAP_SINGLE_DEVICE; ++ printk(XENLOG_WARNING VTDPREFIX " Ignoring apply_context on PCI bridge\n"); ++ return 0; + } + +- switch ( pdev->type ) +- { +- bool prev_present; +- +- case DEV_TYPE_PCI_HOST_BRIDGE: +- if ( iommu_debug ) +- printk(VTDPREFIX "%pd:Hostbridge: skip %pp map\n", +- domain, &PCI_SBDF(seg, bus, devfn)); +- if ( !is_hardware_domain(domain) ) +- return -EPERM; +- break; +- +- case DEV_TYPE_PCIe_BRIDGE: +- case DEV_TYPE_PCIe2PCI_BRIDGE: +- case DEV_TYPE_LEGACY_PCI_BRIDGE: +- break; +- +- case DEV_TYPE_PCIe_ENDPOINT: +- if ( !drhd ) +- return -ENODEV; +- +- if ( iommu_quarantine && orig_domid == DOMID_INVALID ) +- { +- pdev->arch.pseudo_domid = +- iommu_alloc_domid(drhd->iommu->pseudo_domid_map); +- if ( pdev->arch.pseudo_domid == DOMID_INVALID ) +- return -ENOSPC; +- } +- +- if ( iommu_debug ) +- printk(VTDPREFIX "%pd:PCIe: map %pp\n", +- domain, &PCI_SBDF(seg, bus, devfn)); +- ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, pdev, +- DEVICE_DOMID(domain, pdev), pgd_maddr, +- mode); +- if ( ret > 0 ) +- ret = 0; +- if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 ) +- enable_ats_device(pdev, &drhd->iommu->ats_devices); +- +- break; +- +- case DEV_TYPE_PCI: +- if ( !drhd ) +- return -ENODEV; +- +- if ( iommu_quarantine && orig_domid == DOMID_INVALID ) +- { +- pdev->arch.pseudo_domid = +- iommu_alloc_domid(drhd->iommu->pseudo_domid_map); +- if ( pdev->arch.pseudo_domid == DOMID_INVALID ) +- return -ENOSPC; +- } +- +- if ( iommu_debug ) +- printk(VTDPREFIX "%pd:PCI: map %pp\n", +- domain, &PCI_SBDF(seg, bus, devfn)); +- +- ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, +- pdev, DEVICE_DOMID(domain, pdev), +- pgd_maddr, mode); +- if ( ret < 0 ) +- break; +- prev_present = ret; +- +- if ( (ret = find_upstream_bridge(seg, &bus, &devfn, &secbus)) < 1 ) +- { +- if ( !ret ) +- break; +- ret = -ENXIO; +- } +- /* +- * Strictly speaking if the device is the only one behind this bridge +- * and the only one with this (secbus,0,0) tuple, it could be allowed +- * to be re-assigned regardless of RMRR presence. But let's deal with +- * that case only if it is actually found in the wild. Note that +- * dealing with this just here would still not render the operation +- * secure. +- */ +- else if ( prev_present && (mode & MAP_WITH_RMRR) && +- domain != pdev->domain ) +- ret = -EOPNOTSUPP; +- +- /* +- * Mapping a bridge should, if anything, pass the struct pci_dev of +- * that bridge. Since bridges don't normally get assigned to guests, +- * their owner would be the wrong one. Pass NULL instead. 
+- */ +- if ( ret >= 0 ) +- ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, +- NULL, DEVICE_DOMID(domain, pdev), +- pgd_maddr, mode); +- +- /* +- * Devices behind PCIe-to-PCI/PCIx bridge may generate different +- * requester-id. It may originate from devfn=0 on the secondary bus +- * behind the bridge. Map that id as well if we didn't already. +- * +- * Somewhat similar as for bridges, we don't want to pass a struct +- * pci_dev here - there may not even exist one for this (secbus,0,0) +- * tuple. If there is one, without properly working device groups it +- * may again not have the correct owner. +- */ +- if ( !ret && pdev_type(seg, bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE && +- (secbus != pdev->bus || pdev->devfn != 0) ) +- ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0, +- NULL, DEVICE_DOMID(domain, pdev), +- pgd_maddr, mode); +- +- if ( ret ) +- { +- if ( !prev_present ) +- domain_context_unmap(domain, devfn, pdev); +- else if ( pdev->domain != domain ) /* Avoid infinite recursion. */ +- domain_context_mapping(pdev->domain, devfn, pdev); +- } ++ ASSERT(pcidevs_locked()); + +- break; ++ ret = apply_context_single(d, ctx, drhd->iommu, pdev->bus, devfn); + +- default: +- dprintk(XENLOG_ERR VTDPREFIX, "%pd:unknown(%u): %pp\n", +- domain, pdev->type, &PCI_SBDF(seg, bus, devfn)); +- ret = -EINVAL; +- break; +- } ++ if ( !ret && ats_device(pdev, drhd) > 0 ) ++ enable_ats_device(pdev, &drhd->iommu->ats_devices); + + if ( !ret && devfn == pdev->devfn ) + pci_vtd_quirk(pdev); + +- if ( ret && drhd && orig_domid == DOMID_INVALID ) +- { +- iommu_free_domid(pdev->arch.pseudo_domid, +- drhd->iommu->pseudo_domid_map); +- pdev->arch.pseudo_domid = DOMID_INVALID; +- } +- + return ret; + } + +-int domain_context_unmap_one( +- struct domain *domain, +- struct vtd_iommu *iommu, +- uint8_t bus, uint8_t devfn) ++int unapply_context_single(struct domain *domain, struct vtd_iommu *iommu, ++ uint8_t bus, uint8_t devfn) + { + struct context_entry *context, *context_entries; + u64 maddr; +@@ -1931,8 +1637,8 @@ int domain_context_unmap_one( + unmap_vtd_domain_page(context_entries); + + if ( !iommu->drhd->segment && !rc ) +- rc = me_wifi_quirk(domain, bus, devfn, DOMID_INVALID, 0, +- UNMAP_ME_PHANTOM_FUNC); ++ rc = me_wifi_quirk(domain, bus, devfn, DOMID_INVALID, UNMAP_ME_PHANTOM_FUNC, ++ NULL); + + if ( rc && !is_hardware_domain(domain) && domain != dom_io ) + { +@@ -1950,143 +1656,28 @@ int domain_context_unmap_one( + return rc; + } + +-static const struct acpi_drhd_unit *domain_context_unmap( +- struct domain *domain, +- uint8_t devfn, +- struct pci_dev *pdev) +-{ +- const struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev); +- struct vtd_iommu *iommu = drhd ? drhd->iommu : NULL; +- int ret; +- uint16_t seg = pdev->seg; +- uint8_t bus = pdev->bus, tmp_bus, tmp_devfn, secbus; +- +- switch ( pdev->type ) +- { +- case DEV_TYPE_PCI_HOST_BRIDGE: +- if ( iommu_debug ) +- printk(VTDPREFIX "%pd:Hostbridge: skip %pp unmap\n", +- domain, &PCI_SBDF(seg, bus, devfn)); +- return ERR_PTR(is_hardware_domain(domain) ? 
0 : -EPERM); +- +- case DEV_TYPE_PCIe_BRIDGE: +- case DEV_TYPE_PCIe2PCI_BRIDGE: +- case DEV_TYPE_LEGACY_PCI_BRIDGE: +- return ERR_PTR(0); +- +- case DEV_TYPE_PCIe_ENDPOINT: +- if ( !iommu ) +- return ERR_PTR(-ENODEV); +- +- if ( iommu_debug ) +- printk(VTDPREFIX "%pd:PCIe: unmap %pp\n", +- domain, &PCI_SBDF(seg, bus, devfn)); +- ret = domain_context_unmap_one(domain, iommu, bus, devfn); +- if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 ) +- disable_ats_device(pdev); +- +- break; +- +- case DEV_TYPE_PCI: +- if ( !iommu ) +- return ERR_PTR(-ENODEV); +- +- if ( iommu_debug ) +- printk(VTDPREFIX "%pd:PCI: unmap %pp\n", +- domain, &PCI_SBDF(seg, bus, devfn)); +- ret = domain_context_unmap_one(domain, iommu, bus, devfn); +- if ( ret ) +- break; +- +- tmp_bus = bus; +- tmp_devfn = devfn; +- if ( (ret = find_upstream_bridge(seg, &tmp_bus, &tmp_devfn, +- &secbus)) < 1 ) +- { +- if ( ret ) +- { +- ret = -ENXIO; +- if ( !domain->is_dying && +- !is_hardware_domain(domain) && domain != dom_io ) +- { +- domain_crash(domain); +- /* Make upper layers continue in a best effort manner. */ +- ret = 0; +- } +- } +- break; +- } +- +- ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn); +- /* PCIe to PCI/PCIx bridge */ +- if ( !ret && pdev_type(seg, tmp_bus, tmp_devfn) == DEV_TYPE_PCIe2PCI_BRIDGE ) +- ret = domain_context_unmap_one(domain, iommu, secbus, 0); +- +- break; +- +- default: +- dprintk(XENLOG_ERR VTDPREFIX, "%pd:unknown(%u): %pp\n", +- domain, pdev->type, &PCI_SBDF(seg, bus, devfn)); +- return ERR_PTR(-EINVAL); +- } +- +- if ( !ret && pdev->devfn == devfn && +- !QUARANTINE_SKIP(domain, pdev->arch.vtd.pgd_maddr) ) +- check_cleanup_domid_map(domain, pdev, iommu); +- +- return drhd; +-} +- +-static void cf_check iommu_clear_root_pgtable(struct domain *d) ++static void cf_check iommu_clear_root_pgtable(struct domain *d, struct iommu_context *ctx) + { +- struct domain_iommu *hd = dom_iommu(d); +- +- spin_lock(&hd->arch.mapping_lock); +- hd->arch.vtd.pgd_maddr = 0; +- spin_unlock(&hd->arch.mapping_lock); ++ ctx->arch.vtd.pgd_maddr = 0; + } + + static void cf_check iommu_domain_teardown(struct domain *d) + { +- struct domain_iommu *hd = dom_iommu(d); ++ struct iommu_context *ctx = iommu_default_context(d); + const struct acpi_drhd_unit *drhd; + + if ( list_empty(&acpi_drhd_units) ) + return; + +- iommu_identity_map_teardown(d); +- +- ASSERT(!hd->arch.vtd.pgd_maddr); ++ ASSERT(!ctx->arch.vtd.pgd_maddr); + + for_each_drhd_unit ( drhd ) + cleanup_domid_map(d->domain_id, drhd->iommu); +- +- XFREE(hd->arch.vtd.iommu_bitmap); +-} +- +-static void quarantine_teardown(struct pci_dev *pdev, +- const struct acpi_drhd_unit *drhd) +-{ +- struct domain_iommu *hd = dom_iommu(dom_io); +- +- ASSERT(pcidevs_locked()); +- +- if ( !pdev->arch.vtd.pgd_maddr ) +- return; +- +- ASSERT(page_list_empty(&hd->arch.pgtables.list)); +- page_list_move(&hd->arch.pgtables.list, &pdev->arch.pgtables_list); +- while ( iommu_free_pgtables(dom_io) == -ERESTART ) +- /* nothing */; +- pdev->arch.vtd.pgd_maddr = 0; +- +- if ( drhd ) +- cleanup_domid_map(pdev->arch.pseudo_domid, drhd->iommu); + } + + static int __must_check cf_check intel_iommu_map_page( + struct domain *d, dfn_t dfn, mfn_t mfn, unsigned int flags, +- unsigned int *flush_flags) ++ unsigned int *flush_flags, struct iommu_context *ctx) + { + struct domain_iommu *hd = dom_iommu(d); + struct dma_pte *page, *pte, old, new = {}; +@@ -2097,33 +1688,24 @@ static int __must_check cf_check intel_iommu_map_page( + ASSERT((hd->platform_ops->page_sizes >> 
IOMMUF_order(flags)) & + PAGE_SIZE_4K); + +- /* Do nothing if VT-d shares EPT page table */ +- if ( iommu_use_hap_pt(d) ) ++ if ( ctx->opaque ) + return 0; + +- /* Do nothing if hardware domain and iommu supports pass thru. */ +- if ( iommu_hwdom_passthrough && is_hardware_domain(d) ) +- return 0; +- +- spin_lock(&hd->arch.mapping_lock); +- + /* + * IOMMU mapping request can be safely ignored when the domain is dying. + * +- * hd->arch.mapping_lock guarantees that d->is_dying will be observed ++ * hd->lock guarantees that d->is_dying will be observed + * before any page tables are freed (see iommu_free_pgtables()) + */ + if ( d->is_dying ) + { +- spin_unlock(&hd->arch.mapping_lock); + return 0; + } + +- pg_maddr = addr_to_dma_page_maddr(d, dfn_to_daddr(dfn), level, flush_flags, ++ pg_maddr = addr_to_dma_page_maddr(d, ctx, dfn_to_daddr(dfn), level, flush_flags, + true); + if ( pg_maddr < PAGE_SIZE ) + { +- spin_unlock(&hd->arch.mapping_lock); + return -ENOMEM; + } + +@@ -2144,7 +1726,6 @@ static int __must_check cf_check intel_iommu_map_page( + + if ( !((old.val ^ new.val) & ~DMA_PTE_CONTIG_MASK) ) + { +- spin_unlock(&hd->arch.mapping_lock); + unmap_vtd_domain_page(page); + return 0; + } +@@ -2173,7 +1754,7 @@ static int __must_check cf_check intel_iommu_map_page( + new.val &= ~(LEVEL_MASK << level_to_offset_bits(level)); + dma_set_pte_superpage(new); + +- pg_maddr = addr_to_dma_page_maddr(d, dfn_to_daddr(dfn), ++level, ++ pg_maddr = addr_to_dma_page_maddr(d, ctx, dfn_to_daddr(dfn), ++level, + flush_flags, false); + BUG_ON(pg_maddr < PAGE_SIZE); + +@@ -2183,11 +1764,10 @@ static int __must_check cf_check intel_iommu_map_page( + iommu_sync_cache(pte, sizeof(*pte)); + + *flush_flags |= IOMMU_FLUSHF_modified | IOMMU_FLUSHF_all; +- iommu_queue_free_pgtable(hd, pg); ++ iommu_queue_free_pgtable(ctx, pg); + perfc_incr(iommu_pt_coalesces); + } + +- spin_unlock(&hd->arch.mapping_lock); + unmap_vtd_domain_page(page); + + *flush_flags |= IOMMU_FLUSHF_added; +@@ -2196,7 +1776,7 @@ static int __must_check cf_check intel_iommu_map_page( + *flush_flags |= IOMMU_FLUSHF_modified; + + if ( IOMMUF_order(flags) && !dma_pte_superpage(old) ) +- queue_free_pt(hd, maddr_to_mfn(dma_pte_addr(old)), ++ queue_free_pt(ctx, maddr_to_mfn(dma_pte_addr(old)), + IOMMUF_order(flags) / LEVEL_STRIDE); + } + +@@ -2204,7 +1784,8 @@ static int __must_check cf_check intel_iommu_map_page( + } + + static int __must_check cf_check intel_iommu_unmap_page( +- struct domain *d, dfn_t dfn, unsigned int order, unsigned int *flush_flags) ++ struct domain *d, dfn_t dfn, unsigned int order, unsigned int *flush_flags, ++ struct iommu_context *ctx) + { + struct domain_iommu *hd = dom_iommu(d); + daddr_t addr = dfn_to_daddr(dfn); +@@ -2218,29 +1799,19 @@ static int __must_check cf_check intel_iommu_unmap_page( + */ + ASSERT((hd->platform_ops->page_sizes >> order) & PAGE_SIZE_4K); + +- /* Do nothing if VT-d shares EPT page table */ +- if ( iommu_use_hap_pt(d) ) +- return 0; +- +- /* Do nothing if hardware domain and iommu supports pass thru. */ +- if ( iommu_hwdom_passthrough && is_hardware_domain(d) ) ++ if ( ctx->opaque ) + return 0; + +- spin_lock(&hd->arch.mapping_lock); + /* get target level pte */ +- pg_maddr = addr_to_dma_page_maddr(d, addr, level, flush_flags, false); ++ pg_maddr = addr_to_dma_page_maddr(d, ctx, addr, level, flush_flags, false); + if ( pg_maddr < PAGE_SIZE ) +- { +- spin_unlock(&hd->arch.mapping_lock); + return pg_maddr ? 
-ENOMEM : 0; +- } + + page = map_vtd_domain_page(pg_maddr); + pte = &page[address_level_offset(addr, level)]; + + if ( !dma_pte_present(*pte) ) + { +- spin_unlock(&hd->arch.mapping_lock); + unmap_vtd_domain_page(page); + return 0; + } +@@ -2258,7 +1829,7 @@ static int __must_check cf_check intel_iommu_unmap_page( + + unmap_vtd_domain_page(page); + +- pg_maddr = addr_to_dma_page_maddr(d, addr, level, flush_flags, false); ++ pg_maddr = addr_to_dma_page_maddr(d, ctx, addr, level, flush_flags, false); + BUG_ON(pg_maddr < PAGE_SIZE); + + page = map_vtd_domain_page(pg_maddr); +@@ -2267,42 +1838,31 @@ static int __must_check cf_check intel_iommu_unmap_page( + iommu_sync_cache(pte, sizeof(*pte)); + + *flush_flags |= IOMMU_FLUSHF_all; +- iommu_queue_free_pgtable(hd, pg); ++ iommu_queue_free_pgtable(ctx, pg); + perfc_incr(iommu_pt_coalesces); + } + +- spin_unlock(&hd->arch.mapping_lock); +- + unmap_vtd_domain_page(page); + + *flush_flags |= IOMMU_FLUSHF_modified; + + if ( order && !dma_pte_superpage(old) ) +- queue_free_pt(hd, maddr_to_mfn(dma_pte_addr(old)), ++ queue_free_pt(ctx, maddr_to_mfn(dma_pte_addr(old)), + order / LEVEL_STRIDE); + + return 0; + } + + static int cf_check intel_iommu_lookup_page( +- struct domain *d, dfn_t dfn, mfn_t *mfn, unsigned int *flags) ++ struct domain *d, dfn_t dfn, mfn_t *mfn, unsigned int *flags, ++ struct iommu_context *ctx) + { +- struct domain_iommu *hd = dom_iommu(d); + uint64_t val; + +- /* +- * If VT-d shares EPT page table or if the domain is the hardware +- * domain and iommu_passthrough is set then pass back the dfn. +- */ +- if ( iommu_use_hap_pt(d) || +- (iommu_hwdom_passthrough && is_hardware_domain(d)) ) ++ if ( ctx->opaque ) + return -EOPNOTSUPP; + +- spin_lock(&hd->arch.mapping_lock); +- +- val = addr_to_dma_page_maddr(d, dfn_to_daddr(dfn), 0, NULL, false); +- +- spin_unlock(&hd->arch.mapping_lock); ++ val = addr_to_dma_page_maddr(d, ctx, dfn_to_daddr(dfn), 0, NULL, false); + + if ( val < PAGE_SIZE ) + return -ENOENT; +@@ -2323,7 +1883,7 @@ static bool __init vtd_ept_page_compatible(const struct vtd_iommu *iommu) + + /* EPT is not initialised yet, so we must check the capability in + * the MSR explicitly rather than use cpu_has_vmx_ept_*() */ +- if ( rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, ept_cap) != 0 ) ++ if ( rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, ept_cap) != 0 ) + return false; + + return (ept_has_2mb(ept_cap) && opt_hap_2mb) <= +@@ -2332,44 +1892,6 @@ static bool __init vtd_ept_page_compatible(const struct vtd_iommu *iommu) + (cap_sps_1gb(vtd_cap) && iommu_superpages); + } + +-static int cf_check intel_iommu_add_device(u8 devfn, struct pci_dev *pdev) +-{ +- struct acpi_rmrr_unit *rmrr; +- u16 bdf; +- int ret, i; +- +- ASSERT(pcidevs_locked()); +- +- if ( !pdev->domain ) +- return -EINVAL; +- +- for_each_rmrr_device ( rmrr, bdf, i ) +- { +- if ( rmrr->segment == pdev->seg && bdf == PCI_BDF(pdev->bus, devfn) ) +- { +- /* +- * iommu_add_device() is only called for the hardware +- * domain (see xen/drivers/passthrough/pci.c:pci_add_device()). +- * Since RMRRs are always reserved in the e820 map for the hardware +- * domain, there shouldn't be a conflict. 
+- */ +- ret = iommu_identity_mapping(pdev->domain, p2m_access_rw, +- rmrr->base_address, rmrr->end_address, +- 0); +- if ( ret ) +- dprintk(XENLOG_ERR VTDPREFIX, "%pd: RMRR mapping failed\n", +- pdev->domain); +- } +- } +- +- ret = domain_context_mapping(pdev->domain, devfn, pdev); +- if ( ret ) +- dprintk(XENLOG_ERR VTDPREFIX, "%pd: context mapping failed\n", +- pdev->domain); +- +- return ret; +-} +- + static int cf_check intel_iommu_enable_device(struct pci_dev *pdev) + { + struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev); +@@ -2385,49 +1907,16 @@ static int cf_check intel_iommu_enable_device(struct pci_dev *pdev) + return ret >= 0 ? 0 : ret; + } + +-static int cf_check intel_iommu_remove_device(u8 devfn, struct pci_dev *pdev) +-{ +- const struct acpi_drhd_unit *drhd; +- struct acpi_rmrr_unit *rmrr; +- u16 bdf; +- unsigned int i; +- +- if ( !pdev->domain ) +- return -EINVAL; +- +- drhd = domain_context_unmap(pdev->domain, devfn, pdev); +- if ( IS_ERR(drhd) ) +- return PTR_ERR(drhd); +- +- for_each_rmrr_device ( rmrr, bdf, i ) +- { +- if ( rmrr->segment != pdev->seg || bdf != PCI_BDF(pdev->bus, devfn) ) +- continue; +- +- /* +- * Any flag is nothing to clear these mappings but here +- * its always safe and strict to set 0. +- */ +- iommu_identity_mapping(pdev->domain, p2m_access_x, rmrr->base_address, +- rmrr->end_address, 0); +- } +- +- quarantine_teardown(pdev, drhd); +- +- if ( drhd ) +- { +- iommu_free_domid(pdev->arch.pseudo_domid, +- drhd->iommu->pseudo_domid_map); +- pdev->arch.pseudo_domid = DOMID_INVALID; +- } +- +- return 0; +-} +- + static int __hwdom_init cf_check setup_hwdom_device( + u8 devfn, struct pci_dev *pdev) + { +- return domain_context_mapping(pdev->domain, devfn, pdev); ++ if (pdev->type == DEV_TYPE_PCI_HOST_BRIDGE || ++ pdev->type == DEV_TYPE_PCIe_BRIDGE || ++ pdev->type == DEV_TYPE_PCIe2PCI_BRIDGE || ++ pdev->type == DEV_TYPE_LEGACY_PCI_BRIDGE) ++ return 0; ++ ++ return iommu_attach_context(hardware_domain, pdev, 0); + } + + void clear_fault_bits(struct vtd_iommu *iommu) +@@ -2521,7 +2010,7 @@ static int __must_check init_vtd_hw(bool resume) + + /* + * Enable queue invalidation +- */ ++ */ + for_each_drhd_unit ( drhd ) + { + iommu = drhd->iommu; +@@ -2542,7 +2031,7 @@ static int __must_check init_vtd_hw(bool resume) + + /* + * Enable interrupt remapping +- */ ++ */ + if ( iommu_intremap != iommu_intremap_off ) + { + int apic; +@@ -2597,34 +2086,53 @@ static int __must_check init_vtd_hw(bool resume) + return iommu_flush_all(); + } + +-static void __hwdom_init setup_hwdom_rmrr(struct domain *d) +-{ +- struct acpi_rmrr_unit *rmrr; +- u16 bdf; +- int ret, i; ++static struct iommu_state { ++ uint32_t fectl; ++} *__read_mostly iommu_state; + +- pcidevs_lock(); +- for_each_rmrr_device ( rmrr, bdf, i ) ++static void arch_iommu_dump_domain_contexts(struct domain *d) ++{ ++ unsigned int i, iommu_no; ++ struct pci_dev *pdev; ++ struct iommu_context *ctx; ++ struct domain_iommu *hd = dom_iommu(d); ++ ++ printk("d%hu contexts\n", d->domain_id); ++ ++ for (i = 0; i < (1 + hd->other_contexts.count); ++i) + { +- /* +- * Here means we're add a device to the hardware domain. +- * Since RMRRs are always reserved in the e820 map for the hardware +- * domain, there shouldn't be a conflict. So its always safe and +- * strict to set 0. 
+- */ +- ret = iommu_identity_mapping(d, p2m_access_rw, rmrr->base_address, +- rmrr->end_address, 0); +- if ( ret ) +- dprintk(XENLOG_ERR VTDPREFIX, +- "IOMMU: mapping reserved region failed\n"); ++ if ( (ctx = iommu_get_context(d, i)) ) ++ { ++ printk(" Context %d (%"PRIx64")\n", i, ctx->arch.vtd.pgd_maddr); ++ ++ for (iommu_no = 0; iommu_no < nr_iommus; iommu_no++) ++ printk(" IOMMU %hu (used=%u; did=%hu)\n", iommu_no, ++ test_bit(iommu_no, ctx->arch.vtd.iommu_bitmap), ++ ctx->arch.vtd.didmap[iommu_no]); ++ ++ list_for_each_entry(pdev, &ctx->devices, context_list) ++ { ++ printk(" - %pp\n", &pdev->sbdf); ++ } ++ ++ iommu_put_context(ctx); ++ } + } +- pcidevs_unlock(); + } + +-static struct iommu_state { +- uint32_t fectl; +-} *__read_mostly iommu_state; ++static void arch_iommu_dump_contexts(unsigned char key) ++{ ++ struct domain *d; ++ ++ for_each_domain(d) ++ if (is_iommu_enabled(d)) { ++ struct domain_iommu *hd = dom_iommu(d); ++ printk("d%hu arena page usage: %d\n", d->domain_id, ++ atomic_read(&hd->arch.pt_arena.used_pages)); + ++ arch_iommu_dump_domain_contexts(d); ++ } ++} + static int __init cf_check vtd_setup(void) + { + struct acpi_drhd_unit *drhd; +@@ -2752,6 +2260,7 @@ static int __init cf_check vtd_setup(void) + iommu_ops.page_sizes |= large_sizes; + + register_keyhandler('V', vtd_dump_iommu_info, "dump iommu info", 1); ++ register_keyhandler('X', arch_iommu_dump_contexts, "dump iommu contexts", 1); + + return 0; + +@@ -2766,192 +2275,6 @@ static int __init cf_check vtd_setup(void) + return ret; + } + +-static int cf_check reassign_device_ownership( +- struct domain *source, +- struct domain *target, +- u8 devfn, struct pci_dev *pdev) +-{ +- int ret; +- +- if ( !QUARANTINE_SKIP(target, pdev->arch.vtd.pgd_maddr) ) +- { +- if ( !has_arch_pdevs(target) ) +- vmx_pi_hooks_assign(target); +- +-#ifdef CONFIG_PV +- /* +- * Devices assigned to untrusted domains (here assumed to be any domU) +- * can attempt to send arbitrary LAPIC/MSI messages. We are unprotected +- * by the root complex unless interrupt remapping is enabled. +- */ +- if ( !iommu_intremap && !is_hardware_domain(target) && +- !is_system_domain(target) ) +- untrusted_msi = true; +-#endif +- +- ret = domain_context_mapping(target, devfn, pdev); +- +- if ( !ret && pdev->devfn == devfn && +- !QUARANTINE_SKIP(source, pdev->arch.vtd.pgd_maddr) ) +- { +- const struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev); +- +- if ( drhd ) +- check_cleanup_domid_map(source, pdev, drhd->iommu); +- } +- } +- else +- { +- const struct acpi_drhd_unit *drhd; +- +- drhd = domain_context_unmap(source, devfn, pdev); +- ret = IS_ERR(drhd) ? PTR_ERR(drhd) : 0; +- } +- if ( ret ) +- { +- if ( !has_arch_pdevs(target) ) +- vmx_pi_hooks_deassign(target); +- return ret; +- } +- +- if ( devfn == pdev->devfn && pdev->domain != target ) +- { +- write_lock(&source->pci_lock); +- list_del(&pdev->domain_list); +- write_unlock(&source->pci_lock); +- +- pdev->domain = target; +- +- write_lock(&target->pci_lock); +- list_add(&pdev->domain_list, &target->pdev_list); +- write_unlock(&target->pci_lock); +- } +- +- if ( !has_arch_pdevs(source) ) +- vmx_pi_hooks_deassign(source); +- +- /* +- * If the device belongs to the hardware domain, and it has RMRR, don't +- * remove it from the hardware domain, because BIOS may use RMRR at +- * booting time. 
+- */ +- if ( !is_hardware_domain(source) ) +- { +- const struct acpi_rmrr_unit *rmrr; +- u16 bdf; +- unsigned int i; +- +- for_each_rmrr_device( rmrr, bdf, i ) +- if ( rmrr->segment == pdev->seg && +- bdf == PCI_BDF(pdev->bus, devfn) ) +- { +- /* +- * Any RMRR flag is always ignored when remove a device, +- * but its always safe and strict to set 0. +- */ +- ret = iommu_identity_mapping(source, p2m_access_x, +- rmrr->base_address, +- rmrr->end_address, 0); +- if ( ret && ret != -ENOENT ) +- return ret; +- } +- } +- +- return 0; +-} +- +-static int cf_check intel_iommu_assign_device( +- struct domain *d, u8 devfn, struct pci_dev *pdev, u32 flag) +-{ +- struct domain *s = pdev->domain; +- struct acpi_rmrr_unit *rmrr; +- int ret = 0, i; +- u16 bdf, seg; +- u8 bus; +- +- if ( list_empty(&acpi_drhd_units) ) +- return -ENODEV; +- +- seg = pdev->seg; +- bus = pdev->bus; +- /* +- * In rare cases one given rmrr is shared by multiple devices but +- * obviously this would put the security of a system at risk. So +- * we would prevent from this sort of device assignment. But this +- * can be permitted if user set +- * "pci = [ 'sbdf, rdm_policy=relaxed' ]" +- * +- * TODO: in the future we can introduce group device assignment +- * interface to make sure devices sharing RMRR are assigned to the +- * same domain together. +- */ +- for_each_rmrr_device( rmrr, bdf, i ) +- { +- if ( rmrr->segment == seg && bdf == PCI_BDF(bus, devfn) && +- rmrr->scope.devices_cnt > 1 ) +- { +- bool relaxed = flag & XEN_DOMCTL_DEV_RDM_RELAXED; +- +- printk(XENLOG_GUEST "%s" VTDPREFIX +- " It's %s to assign %pp" +- " with shared RMRR at %"PRIx64" for %pd.\n", +- relaxed ? XENLOG_WARNING : XENLOG_ERR, +- relaxed ? "risky" : "disallowed", +- &PCI_SBDF(seg, bus, devfn), rmrr->base_address, d); +- if ( !relaxed ) +- return -EPERM; +- } +- } +- +- if ( d == dom_io ) +- return reassign_device_ownership(s, d, devfn, pdev); +- +- /* Setup rmrr identity mapping */ +- for_each_rmrr_device( rmrr, bdf, i ) +- { +- if ( rmrr->segment == seg && bdf == PCI_BDF(bus, devfn) ) +- { +- ret = iommu_identity_mapping(d, p2m_access_rw, rmrr->base_address, +- rmrr->end_address, flag); +- if ( ret ) +- { +- printk(XENLOG_G_ERR VTDPREFIX +- "%pd: cannot map reserved region [%"PRIx64",%"PRIx64"]: %d\n", +- d, rmrr->base_address, rmrr->end_address, ret); +- break; +- } +- } +- } +- +- if ( !ret ) +- ret = reassign_device_ownership(s, d, devfn, pdev); +- +- /* See reassign_device_ownership() for the hwdom aspect. 
*/ +- if ( !ret || is_hardware_domain(d) ) +- return ret; +- +- for_each_rmrr_device( rmrr, bdf, i ) +- { +- if ( rmrr->segment == seg && bdf == PCI_BDF(bus, devfn) ) +- { +- int rc = iommu_identity_mapping(d, p2m_access_x, +- rmrr->base_address, +- rmrr->end_address, 0); +- +- if ( rc && rc != -ENOENT ) +- { +- printk(XENLOG_ERR VTDPREFIX +- "%pd: cannot unmap reserved region [%"PRIx64",%"PRIx64"]: %d\n", +- d, rmrr->base_address, rmrr->end_address, rc); +- domain_crash(d); +- break; +- } +- } +- } +- +- return ret; +-} +- + static int cf_check intel_iommu_group_id(u16 seg, u8 bus, u8 devfn) + { + u8 secbus; +@@ -3076,6 +2399,11 @@ static void vtd_dump_page_table_level(paddr_t pt_maddr, int level, paddr_t gpa, + if ( level < 1 ) + return; + ++ if (pt_maddr == 0) { ++ printk(" (empty)\n"); ++ return; ++ } ++ + pt_vaddr = map_vtd_domain_page(pt_maddr); + + next_level = level - 1; +@@ -3106,158 +2434,374 @@ static void vtd_dump_page_table_level(paddr_t pt_maddr, int level, paddr_t gpa, + + static void cf_check vtd_dump_page_tables(struct domain *d) + { +- const struct domain_iommu *hd = dom_iommu(d); ++ struct domain_iommu *hd = dom_iommu(d); ++ unsigned int i; + +- printk(VTDPREFIX" %pd table has %d levels\n", d, ++ printk(VTDPREFIX " %pd table has %d levels\n", d, + agaw_to_level(hd->arch.vtd.agaw)); +- vtd_dump_page_table_level(hd->arch.vtd.pgd_maddr, +- agaw_to_level(hd->arch.vtd.agaw), 0, 0); ++ ++ for (i = 1; i < (1 + hd->other_contexts.count); ++i) ++ { ++ struct iommu_context *ctx = iommu_get_context(d, i); ++ ++ printk(VTDPREFIX " %pd context %d: %s\n", d, i, ++ ctx ? "allocated" : "non-allocated"); ++ ++ if (ctx) ++ { ++ vtd_dump_page_table_level(ctx->arch.vtd.pgd_maddr, ++ agaw_to_level(hd->arch.vtd.agaw), 0, 0); ++ iommu_put_context(ctx); ++ } ++ } + } + +-static int fill_qpt(struct dma_pte *this, unsigned int level, +- struct page_info *pgs[6]) ++static int intel_iommu_context_init(struct domain *d, struct iommu_context *ctx, u32 flags) + { +- struct domain_iommu *hd = dom_iommu(dom_io); +- unsigned int i; +- int rc = 0; ++ struct acpi_drhd_unit *drhd; ++ ++ ctx->arch.vtd.didmap = xzalloc_array(u16, nr_iommus); ++ ++ if ( !ctx->arch.vtd.didmap ) ++ return -ENOMEM; + +- for ( i = 0; !rc && i < PTE_NUM; ++i ) ++ ctx->arch.vtd.iommu_bitmap = xzalloc_array(unsigned long, ++ BITS_TO_LONGS(nr_iommus)); ++ if ( !ctx->arch.vtd.iommu_bitmap ) ++ return -ENOMEM; ++ ++ ctx->arch.vtd.superpage_progress = 0; ++ ++ if ( flags & IOMMU_CONTEXT_INIT_default ) + { +- struct dma_pte *pte = &this[i], *next; ++ ctx->arch.vtd.pgd_maddr = 0; + +- if ( !dma_pte_present(*pte) ) ++ /* ++ * Context is considered "opaque" (non-managed) in these cases : ++ * - HAP is enabled, in this case, the pagetable is not managed by the ++ * IOMMU code, thus opaque ++ * - IOMMU is in passthrough which means that there is no actual pagetable ++ * ++ * If no-dma mode is specified, it's always non-opaque as the pagetable is ++ * always managed regardless of the rest. ++ */ ++ ctx->arch.hap_context = !iommu_hwdom_no_dma && (iommu_use_hap_pt(d) || iommu_hwdom_passthrough); ++ ++ ctx->opaque = ctx->arch.hap_context; ++ ++ /* Populate context DID map using domain id. */ ++ for_each_drhd_unit(drhd) + { +- if ( !pgs[level] ) +- { +- /* +- * The pgtable allocator is fine for the leaf page, as well as +- * page table pages, and the resulting allocations are always +- * zeroed. 
+- */ +- pgs[level] = iommu_alloc_pgtable(hd, 0); +- if ( !pgs[level] ) +- { +- rc = -ENOMEM; +- break; +- } +- +- if ( level ) +- { +- next = map_vtd_domain_page(page_to_maddr(pgs[level])); +- rc = fill_qpt(next, level - 1, pgs); +- unmap_vtd_domain_page(next); +- } +- } ++ ctx->arch.vtd.didmap[drhd->iommu->index] = ++ convert_domid(drhd->iommu, d->domain_id); ++ } ++ } ++ else ++ { ++ /* Populate context DID map using pseudo DIDs */ ++ for_each_drhd_unit(drhd) ++ { ++ ctx->arch.vtd.didmap[drhd->iommu->index] = ++ iommu_alloc_domid(drhd->iommu->pseudo_domid_map); ++ } ++ } ++ ++ if ( !ctx->opaque ) ++ /* Create initial context page */ ++ addr_to_dma_page_maddr(d, ctx, 0, min_pt_levels, NULL, true); ++ ++ return arch_iommu_context_init(d, ctx, flags); ++} ++ ++static int intel_iommu_cleanup_pte(uint64_t pte_maddr, bool preempt) ++{ ++ size_t i; ++ struct dma_pte *pte = map_vtd_domain_page(pte_maddr); ++ ++ for (i = 0; i < (1 << PAGETABLE_ORDER); ++i) ++ if ( dma_pte_present(pte[i]) ) ++ { ++ /* Remove the reference of the target mapping (if needed) */ ++ mfn_t mfn = maddr_to_mfn(dma_pte_addr(pte[i])); ++ ++ if ( mfn_valid(mfn) ) ++ put_page(mfn_to_page(mfn)); + +- dma_set_pte_addr(*pte, page_to_maddr(pgs[level])); +- dma_set_pte_readable(*pte); +- dma_set_pte_writable(*pte); ++ if ( preempt ) ++ dma_clear_pte(pte[i]); + } +- else if ( level && !dma_pte_superpage(*pte) ) ++ ++ unmap_vtd_domain_page(pte); ++ ++ return 0; ++} ++ ++/** ++ * Cleanup logic : ++ * Walk through the entire page table, progressively removing mappings if preempt. ++ * ++ * Return values : ++ * - Report preemption with -ERESTART. ++ * - Report empty pte/pgd with 0. ++ * ++ * When preempted during superpage operation, store state in vtd.superpage_progress. ++ */ ++ ++static int intel_iommu_cleanup_superpage(struct iommu_context *ctx, ++ unsigned int page_order, uint64_t pte_maddr, ++ bool preempt) ++{ ++ size_t i = 0, page_count = 1 << page_order; ++ struct page_info *page = maddr_to_page(pte_maddr); ++ ++ if ( preempt ) ++ i = ctx->arch.vtd.superpage_progress; ++ ++ for (; i < page_count; page++) ++ { ++ put_page(page); ++ ++ if ( preempt && (i & 0xff) && general_preempt_check() ) + { +- next = map_vtd_domain_page(dma_pte_addr(*pte)); +- rc = fill_qpt(next, level - 1, pgs); +- unmap_vtd_domain_page(next); ++ ctx->arch.vtd.superpage_progress = i + 1; ++ return -ERESTART; + } + } + +- return rc; ++ if ( preempt ) ++ ctx->arch.vtd.superpage_progress = 0; ++ ++ return 0; + } + +-static int cf_check intel_iommu_quarantine_init(struct pci_dev *pdev, +- bool scratch_page) ++static int intel_iommu_cleanup_mappings(struct iommu_context *ctx, ++ unsigned int nr_pt_levels, uint64_t pgd_maddr, ++ bool preempt) + { +- struct domain_iommu *hd = dom_iommu(dom_io); +- struct page_info *pg; +- unsigned int agaw = hd->arch.vtd.agaw; +- unsigned int level = agaw_to_level(agaw); +- const struct acpi_drhd_unit *drhd; +- const struct acpi_rmrr_unit *rmrr; +- unsigned int i, bdf; +- bool rmrr_found = false; ++ size_t i; + int rc; ++ struct dma_pte *pgd; + +- ASSERT(pcidevs_locked()); +- ASSERT(!hd->arch.vtd.pgd_maddr); +- ASSERT(page_list_empty(&hd->arch.pgtables.list)); ++ if ( ctx->opaque ) ++ /* don't touch opaque contexts */ ++ return 0; ++ ++ pgd = map_vtd_domain_page(pgd_maddr); + +- if ( pdev->arch.vtd.pgd_maddr ) ++ for (i = 0; i < (1 << PAGETABLE_ORDER); ++i) + { +- clear_domain_page(pdev->arch.leaf_mfn); +- return 0; ++ if ( dma_pte_present(pgd[i]) ) ++ { ++ uint64_t pte_maddr = dma_pte_addr(pgd[i]); ++ ++ if ( 
dma_pte_superpage(pgd[i]) ) ++ rc = intel_iommu_cleanup_superpage(ctx, nr_pt_levels * SUPERPAGE_ORDER, ++ pte_maddr, preempt); ++ else if ( nr_pt_levels > 2 ) ++ /* Next level is not PTE */ ++ rc = intel_iommu_cleanup_mappings(ctx, nr_pt_levels - 1, ++ pte_maddr, preempt); ++ else ++ rc = intel_iommu_cleanup_pte(pte_maddr, preempt); ++ ++ if ( preempt && !rc ) ++ /* Fold pgd (no more mappings in it) */ ++ dma_clear_pte(pgd[i]); ++ else if ( preempt && (rc == -ERESTART || general_preempt_check()) ) ++ { ++ unmap_vtd_domain_page(pgd); ++ return -ERESTART; ++ } ++ } + } + +- drhd = acpi_find_matched_drhd_unit(pdev); +- if ( !drhd ) +- return -ENODEV; ++ unmap_vtd_domain_page(pgd); + +- pg = iommu_alloc_pgtable(hd, 0); +- if ( !pg ) +- return -ENOMEM; ++ return 0; ++} + +- rc = context_set_domain_id(NULL, pdev->arch.pseudo_domid, drhd->iommu); ++static int intel_iommu_context_teardown(struct domain *d, struct iommu_context *ctx, u32 flags) ++{ ++ struct acpi_drhd_unit *drhd; ++ pcidevs_lock(); + +- /* Transiently install the root into DomIO, for iommu_identity_mapping(). */ +- hd->arch.vtd.pgd_maddr = page_to_maddr(pg); ++ // Cleanup mappings ++ if ( intel_iommu_cleanup_mappings(ctx, agaw_to_level(d->iommu.arch.vtd.agaw), ++ ctx->arch.vtd.pgd_maddr, ++ flags & IOMMUF_preempt) < 0 ) ++ { ++ pcidevs_unlock(); ++ return -ERESTART; ++ } + +- for_each_rmrr_device ( rmrr, bdf, i ) ++ if (ctx->arch.vtd.didmap) + { +- if ( rc ) +- break; ++ for_each_drhd_unit(drhd) ++ { ++ iommu_free_domid(ctx->arch.vtd.didmap[drhd->iommu->index], ++ drhd->iommu->pseudo_domid_map); ++ } ++ ++ xfree(ctx->arch.vtd.didmap); ++ } ++ ++ pcidevs_unlock(); ++ return arch_iommu_context_teardown(d, ctx, flags); ++} + +- if ( rmrr->segment == pdev->seg && bdf == pdev->sbdf.bdf ) ++static int intel_iommu_dev_rmrr(struct domain *d, struct pci_dev *pdev, ++ struct iommu_context *ctx, bool unmap) ++{ ++ struct acpi_rmrr_unit *rmrr; ++ u16 bdf; ++ int ret, i; ++ ++ for_each_rmrr_device(rmrr, bdf, i) ++ { ++ if ( PCI_SBDF(rmrr->segment, bdf).sbdf == pdev->sbdf.sbdf ) + { +- rmrr_found = true; +- +- rc = iommu_identity_mapping(dom_io, p2m_access_rw, +- rmrr->base_address, rmrr->end_address, +- 0); +- if ( rc ) +- printk(XENLOG_ERR VTDPREFIX +- "%pp: RMRR quarantine mapping failed\n", +- &pdev->sbdf); ++ ret = iommu_identity_mapping(d, ctx, ++ unmap ? 
p2m_access_x : p2m_access_rw, ++ rmrr->base_address, rmrr->end_address, ++ 0); ++ ++ if ( ret < 0 ) ++ return ret; + } + } + +- iommu_identity_map_teardown(dom_io); +- hd->arch.vtd.pgd_maddr = 0; +- pdev->arch.vtd.pgd_maddr = page_to_maddr(pg); ++ return 0; ++} ++ ++static int intel_iommu_attach(struct domain *d, struct pci_dev *pdev, ++ struct iommu_context *ctx) ++{ ++ int ret; ++ const struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev); ++ ++ if (!pdev || !drhd) ++ return -EINVAL; + +- if ( !rc && scratch_page ) ++ if ( !ctx->opaque || ctx->arch.hap_context ) + { +- struct dma_pte *root; +- struct page_info *pgs[6] = {}; ++ ret = intel_iommu_dev_rmrr(d, pdev, ctx, false); ++ ++ if ( ret ) ++ return ret; ++ } ++ ++ ret = apply_context(d, ctx, pdev, pdev->devfn); ++ ++ if ( ret ) ++ return ret; ++ ++ pci_vtd_quirk(pdev); + +- root = map_vtd_domain_page(pdev->arch.vtd.pgd_maddr); +- rc = fill_qpt(root, level - 1, pgs); +- unmap_vtd_domain_page(root); ++ return ret; ++} ++ ++static int intel_iommu_detach(struct domain *d, struct pci_dev *pdev, ++ struct iommu_context *prev_ctx) ++{ ++ int ret; ++ const struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev); ++ ++ if (!pdev || !drhd) ++ return -EINVAL; ++ ++ ret = unapply_context_single(d, drhd->iommu, pdev->bus, pdev->devfn); + +- pdev->arch.leaf_mfn = page_to_mfn(pgs[0]); ++ if ( ret ) ++ return ret; ++ ++ if ( !prev_ctx->opaque || prev_ctx->arch.hap_context ) ++ WARN_ON(intel_iommu_dev_rmrr(d, pdev, prev_ctx, true)); ++ ++ check_cleanup_domid_map(d, prev_ctx, NULL, drhd->iommu); ++ ++ return ret; ++} ++ ++static int intel_iommu_reattach(struct domain *d, struct pci_dev *pdev, ++ struct iommu_context *prev_ctx, ++ struct iommu_context *ctx) ++{ ++ int ret; ++ const struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev); ++ ++ if (!pdev || !drhd) ++ return -EINVAL; ++ ++ if ( !ctx->opaque || ctx->arch.hap_context ) ++ { ++ ret = intel_iommu_dev_rmrr(d, pdev, ctx, false); ++ ++ if ( ret ) ++ return ret; + } + +- page_list_move(&pdev->arch.pgtables_list, &hd->arch.pgtables.list); ++ ret = apply_context_single(d, ctx, drhd->iommu, pdev->bus, pdev->devfn); ++ ++ if ( ret ) ++ return ret; + +- if ( rc || (!scratch_page && !rmrr_found) ) +- quarantine_teardown(pdev, drhd); ++ if ( !prev_ctx->opaque || prev_ctx->arch.hap_context ) ++ WARN_ON(intel_iommu_dev_rmrr(d, pdev, prev_ctx, true)); + +- return rc; ++ /* We are overwriting an entry, cleanup previous domid if needed. 
*/ ++ check_cleanup_domid_map(d, prev_ctx, pdev, drhd->iommu); ++ ++ pci_vtd_quirk(pdev); ++ ++ return ret; ++} ++ ++static int intel_iommu_add_devfn(struct domain *d, struct pci_dev *pdev, ++ u16 devfn, struct iommu_context *ctx) ++{ ++ const struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev); ++ ++ if (!pdev || !drhd) ++ return -EINVAL; ++ ++ return apply_context(d, ctx, pdev, devfn); ++} ++ ++static int intel_iommu_remove_devfn(struct domain *d, struct pci_dev *pdev, ++ u16 devfn) ++{ ++ const struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev); ++ ++ if (!pdev || !drhd) ++ return -EINVAL; ++ ++ return unapply_context_single(d, drhd->iommu, pdev->bus, devfn); ++} ++ ++static uint64_t intel_iommu_get_max_iova(struct domain *d) ++{ ++ struct domain_iommu *hd = dom_iommu(d); ++ ++ return (1LLU << agaw_to_width(hd->arch.vtd.agaw)) - 1; + } + + static const struct iommu_ops __initconst_cf_clobber vtd_ops = { + .page_sizes = PAGE_SIZE_4K, + .init = intel_iommu_domain_init, + .hwdom_init = intel_iommu_hwdom_init, +- .quarantine_init = intel_iommu_quarantine_init, +- .add_device = intel_iommu_add_device, ++ .context_init = intel_iommu_context_init, ++ .context_teardown = intel_iommu_context_teardown, ++ .attach = intel_iommu_attach, ++ .detach = intel_iommu_detach, ++ .reattach = intel_iommu_reattach, ++ .add_devfn = intel_iommu_add_devfn, ++ .remove_devfn = intel_iommu_remove_devfn, + .enable_device = intel_iommu_enable_device, +- .remove_device = intel_iommu_remove_device, +- .assign_device = intel_iommu_assign_device, + .teardown = iommu_domain_teardown, + .clear_root_pgtable = iommu_clear_root_pgtable, + .map_page = intel_iommu_map_page, + .unmap_page = intel_iommu_unmap_page, + .lookup_page = intel_iommu_lookup_page, +- .reassign_device = reassign_device_ownership, + .get_device_group_id = intel_iommu_group_id, + .enable_x2apic = intel_iommu_enable_eim, + .disable_x2apic = intel_iommu_disable_eim, +@@ -3272,6 +2816,7 @@ static const struct iommu_ops __initconst_cf_clobber vtd_ops = { + .iotlb_flush = iommu_flush_iotlb, + .get_reserved_device_memory = intel_iommu_get_reserved_device_memory, + .dump_page_tables = vtd_dump_page_tables, ++ .get_max_iova = intel_iommu_get_max_iova, + }; + + const struct iommu_init_ops __initconstrel intel_iommu_init_ops = { +diff --git a/xen/drivers/passthrough/vtd/quirks.c b/xen/drivers/passthrough/vtd/quirks.c +index dc3dac749ce6..6bf19b4f6c0d 100644 +--- a/xen/drivers/passthrough/vtd/quirks.c ++++ b/xen/drivers/passthrough/vtd/quirks.c +@@ -408,9 +408,8 @@ void __init platform_quirks_init(void) + + static int __must_check map_me_phantom_function(struct domain *domain, + unsigned int dev, +- domid_t domid, +- paddr_t pgd_maddr, +- unsigned int mode) ++ unsigned int mode, ++ struct iommu_context *ctx) + { + struct acpi_drhd_unit *drhd; + struct pci_dev *pdev; +@@ -422,18 +421,17 @@ static int __must_check map_me_phantom_function(struct domain *domain, + + /* map or unmap ME phantom function */ + if ( !(mode & UNMAP_ME_PHANTOM_FUNC) ) +- rc = domain_context_mapping_one(domain, drhd->iommu, 0, +- PCI_DEVFN(dev, 7), NULL, +- domid, pgd_maddr, mode); ++ rc = apply_context_single(domain, ctx, drhd->iommu, 0, ++ PCI_DEVFN(dev, 7)); + else +- rc = domain_context_unmap_one(domain, drhd->iommu, 0, +- PCI_DEVFN(dev, 7)); ++ rc = unapply_context_single(domain, drhd->iommu, 0, PCI_DEVFN(dev, 7)); + + return rc; + } + + int me_wifi_quirk(struct domain *domain, uint8_t bus, uint8_t devfn, +- domid_t domid, paddr_t pgd_maddr, unsigned int mode) ++ 
domid_t domid, unsigned int mode, ++ struct iommu_context *ctx) + { + u32 id; + int rc = 0; +@@ -457,7 +455,7 @@ int me_wifi_quirk(struct domain *domain, uint8_t bus, uint8_t devfn, + case 0x423b8086: + case 0x423c8086: + case 0x423d8086: +- rc = map_me_phantom_function(domain, 3, domid, pgd_maddr, mode); ++ rc = map_me_phantom_function(domain, 3, mode, ctx); + break; + default: + break; +@@ -483,7 +481,7 @@ int me_wifi_quirk(struct domain *domain, uint8_t bus, uint8_t devfn, + case 0x42388086: /* Puma Peak */ + case 0x422b8086: + case 0x422c8086: +- rc = map_me_phantom_function(domain, 22, domid, pgd_maddr, mode); ++ rc = map_me_phantom_function(domain, 22, mode, ctx); + break; + default: + break; +diff --git a/xen/drivers/passthrough/x86/Makefile b/xen/drivers/passthrough/x86/Makefile +index 75b288533640..1614f3d2840b 100644 +--- a/xen/drivers/passthrough/x86/Makefile ++++ b/xen/drivers/passthrough/x86/Makefile +@@ -1,2 +1,3 @@ + obj-y += iommu.o ++obj-y += arena.o + obj-$(CONFIG_HVM) += hvm.o +diff --git a/xen/drivers/passthrough/x86/arena.c b/xen/drivers/passthrough/x86/arena.c +new file mode 100644 +index 000000000000..984bc4d643f1 +--- /dev/null ++++ b/xen/drivers/passthrough/x86/arena.c +@@ -0,0 +1,157 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/** ++ * Simple arena-based page allocator. ++ * ++ * Allocate a large block using alloc_domheam_pages and allocate single pages ++ * using iommu_arena_allocate_page and iommu_arena_free_page functions. ++ * ++ * Concurrent {allocate/free}_page is thread-safe ++ * iommu_arena_teardown during {allocate/free}_page is not thread-safe. ++ * ++ * Written by Teddy Astie ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++/* Maximum of scan tries if the bit found not available */ ++#define ARENA_TSL_MAX_TRIES 5 ++ ++int iommu_arena_initialize(struct iommu_arena *arena, struct domain *d, ++ unsigned int order, unsigned int memflags) ++{ ++ struct page_info *page; ++ ++ /* TODO: Maybe allocate differently ? 
*/ ++ page = alloc_domheap_pages(d, order, memflags); ++ ++ if ( !page ) ++ return -ENOMEM; ++ ++ arena->map = xzalloc_array(unsigned long, BITS_TO_LONGS(1LLU << order)); ++ arena->order = order; ++ arena->region_start = page_to_mfn(page); ++ ++ _atomic_set(&arena->used_pages, 0); ++ bitmap_zero(arena->map, iommu_arena_size(arena)); ++ ++ printk(XENLOG_DEBUG "IOMMU: Allocated arena (%llu pages, start=%"PRI_mfn")\n", ++ iommu_arena_size(arena), mfn_x(arena->region_start)); ++ return 0; ++} ++ ++int iommu_arena_teardown(struct iommu_arena *arena, bool check) ++{ ++ BUG_ON(mfn_x(arena->region_start) == 0); ++ ++ /* Check for allocations if check is specified */ ++ if ( check && (atomic_read(&arena->used_pages) > 0) ) ++ return -EBUSY; ++ ++ free_domheap_pages(mfn_to_page(arena->region_start), arena->order); ++ ++ arena->region_start = _mfn(0); ++ _atomic_set(&arena->used_pages, 0); ++ xfree(arena->map); ++ arena->map = NULL; ++ ++ return 0; ++} ++ ++struct page_info *iommu_arena_allocate_page(struct iommu_arena *arena) ++{ ++ unsigned int index; ++ unsigned int tsl_tries = 0; ++ ++ BUG_ON(mfn_x(arena->region_start) == 0); ++ ++ if ( atomic_read(&arena->used_pages) == iommu_arena_size(arena) ) ++ /* All pages used */ ++ return NULL; ++ ++ do ++ { ++ index = find_first_zero_bit(arena->map, iommu_arena_size(arena)); ++ ++ if ( index >= iommu_arena_size(arena) ) ++ /* No more free pages */ ++ return NULL; ++ ++ /* ++ * While there shouldn't be a lot of retries in practice, this loop ++ * *may* run indefinetly if the found bit is never free due to being ++ * overwriten by another CPU core right after. Add a safeguard for ++ * such very rare cases. ++ */ ++ tsl_tries++; ++ ++ if ( unlikely(tsl_tries == ARENA_TSL_MAX_TRIES) ) ++ { ++ printk(XENLOG_ERR "ARENA: Too many TSL retries !"); ++ return NULL; ++ } ++ ++ /* Make sure that the bit we found is still free */ ++ } while ( test_and_set_bit(index, arena->map) ); ++ ++ atomic_inc(&arena->used_pages); ++ ++ return mfn_to_page(mfn_add(arena->region_start, index)); ++} ++ ++bool iommu_arena_free_page(struct iommu_arena *arena, struct page_info *page) ++{ ++ unsigned long index; ++ mfn_t frame; ++ ++ if ( !page ) ++ { ++ printk(XENLOG_WARNING "IOMMU: Trying to free NULL page"); ++ WARN(); ++ return false; ++ } ++ ++ frame = page_to_mfn(page); ++ ++ /* Check if page belongs to our arena */ ++ if ( (mfn_x(frame) < mfn_x(arena->region_start)) ++ || (mfn_x(frame) >= (mfn_x(arena->region_start) + iommu_arena_size(arena))) ) ++ { ++ printk(XENLOG_WARNING ++ "IOMMU: Trying to free outside arena region [mfn=%"PRI_mfn"]", ++ mfn_x(frame)); ++ WARN(); ++ return false; ++ } ++ ++ index = mfn_x(frame) - mfn_x(arena->region_start); ++ ++ /* Sanity check in case of underflow. */ ++ ASSERT(index < iommu_arena_size(arena)); ++ ++ if ( !test_and_clear_bit(index, arena->map) ) ++ { ++ /* ++ * Bit was free during our arena_free_page, which means that ++ * either this page was never allocated, or we are in a double-free ++ * situation. ++ */ ++ printk(XENLOG_WARNING ++ "IOMMU: Freeing non-allocated region (double-free?) [mfn=%"PRI_mfn"]", ++ mfn_x(frame)); ++ WARN(); ++ return false; ++ } ++ ++ atomic_dec(&arena->used_pages); ++ ++ return true; ++} +\ No newline at end of file +diff --git a/xen/drivers/passthrough/x86/iommu.c b/xen/drivers/passthrough/x86/iommu.c +index 8b1e0596b84a..849f57c1ce21 100644 +--- a/xen/drivers/passthrough/x86/iommu.c ++++ b/xen/drivers/passthrough/x86/iommu.c +@@ -12,6 +12,12 @@ + * this program; If not, see . 
+ */ + ++#include ++#include ++#include ++#include ++#include ++#include + #include + #include + #include +@@ -28,6 +34,10 @@ + #include + #include + #include ++#include ++#include ++#include ++#include + + const struct iommu_init_ops *__initdata iommu_init_ops; + struct iommu_ops __ro_after_init iommu_ops; +@@ -183,19 +193,66 @@ void __hwdom_init arch_iommu_check_autotranslated_hwdom(struct domain *d) + panic("PVH hardware domain iommu must be set in 'strict' mode\n"); + } + +-int arch_iommu_domain_init(struct domain *d) ++int arch_iommu_context_init(struct domain *d, struct iommu_context *ctx, u32 flags) ++{ ++ INIT_PAGE_LIST_HEAD(&ctx->arch.pgtables); ++ INIT_PAGE_LIST_HEAD(&ctx->arch.free_queue); ++ INIT_LIST_HEAD(&ctx->arch.identity_maps); ++ ++ return 0; ++} ++ ++int arch_iommu_context_teardown(struct domain *d, struct iommu_context *ctx, u32 flags) ++{ ++ /* Cleanup all page tables */ ++ while ( iommu_free_pgtables(d, ctx) == -ERESTART ) ++ /* nothing */; ++ ++ return 0; ++} ++ ++int arch_iommu_flush_free_queue(struct domain *d, struct iommu_context *ctx) ++{ ++ struct page_info *pg; ++ struct domain_iommu *hd = dom_iommu(d); ++ ++ while ( (pg = page_list_remove_head(&ctx->arch.free_queue)) ) ++ iommu_arena_free_page(&hd->arch.pt_arena, pg); ++ ++ return 0; ++} ++ ++int arch_iommu_pviommu_init(struct domain *d, uint16_t nb_ctx, uint32_t arena_order) ++{ ++ struct domain_iommu *hd = dom_iommu(d); ++ ++ if ( arena_order == 0 ) ++ return 0; ++ ++ return iommu_arena_initialize(&hd->arch.pt_arena, NULL, arena_order, 0); ++} ++ ++int arch_iommu_pviommu_teardown(struct domain *d) + { + struct domain_iommu *hd = dom_iommu(d); + +- spin_lock_init(&hd->arch.mapping_lock); ++ if ( iommu_arena_teardown(&hd->arch.pt_arena, true) ) ++ { ++ printk(XENLOG_WARNING "IOMMU Arena used while being destroyed\n"); ++ WARN(); + +- INIT_PAGE_LIST_HEAD(&hd->arch.pgtables.list); +- spin_lock_init(&hd->arch.pgtables.lock); +- INIT_LIST_HEAD(&hd->arch.identity_maps); ++ /* Teardown anyway */ ++ iommu_arena_teardown(&hd->arch.pt_arena, false); ++ } + + return 0; + } + ++int arch_iommu_domain_init(struct domain *d) ++{ ++ return 0; ++} ++ + void arch_iommu_domain_destroy(struct domain *d) + { + /* +@@ -203,8 +260,9 @@ void arch_iommu_domain_destroy(struct domain *d) + * domain is destroyed. Note that arch_iommu_domain_destroy() is + * called unconditionally, so pgtables may be uninitialized. 
+ */ +- ASSERT(!dom_iommu(d)->platform_ops || +- page_list_empty(&dom_iommu(d)->arch.pgtables.list)); ++ struct domain_iommu *hd = dom_iommu(d); ++ ++ ASSERT(!hd->platform_ops); + } + + struct identity_map { +@@ -214,32 +272,104 @@ struct identity_map { + unsigned int count; + }; + +-int iommu_identity_mapping(struct domain *d, p2m_access_t p2ma, +- paddr_t base, paddr_t end, ++static int unmap_identity_region(struct domain *d, struct iommu_context *ctx, ++ unsigned int base_pfn, unsigned int end_pfn) ++{ ++ int ret = 0; ++ ++ if ( ctx->arch.hap_context ) ++ { ++ this_cpu(iommu_dont_flush_iotlb) = true; ++ while ( base_pfn < end_pfn ) ++ { ++ if ( p2m_remove_identity_entry(d, base_pfn) ) ++ ret = -ENXIO; ++ ++ base_pfn++; ++ } ++ this_cpu(iommu_dont_flush_iotlb) = false; ++ } ++ else ++ { ++ size_t page_count = end_pfn - base_pfn + 1; ++ unsigned int flush_flags; ++ ++ ret = iommu_unmap(d, _dfn(base_pfn), page_count, 0, &flush_flags, ++ ctx->id); ++ ++ if ( ret ) ++ return ret; ++ ++ ret = iommu_iotlb_flush(d, _dfn(base_pfn), page_count, ++ flush_flags, ctx->id); ++ } ++ ++ return ret; ++} ++ ++static int map_identity_region(struct domain *d, struct iommu_context *ctx, ++ unsigned int base_pfn, unsigned int end_pfn, ++ p2m_access_t p2ma, unsigned int flag) ++{ ++ int ret = 0; ++ unsigned int flush_flags = 0; ++ size_t page_count = end_pfn - base_pfn + 1; ++ ++ if ( ctx->arch.hap_context ) ++ { ++ this_cpu(iommu_dont_flush_iotlb) = true; ++ while ( base_pfn < end_pfn ) ++ { ++ ret = p2m_add_identity_entry(d, base_pfn, p2ma, flag); ++ ++ if ( ret ) ++ { ++ this_cpu(iommu_dont_flush_iotlb) = false; ++ return ret; ++ } ++ ++ base_pfn++; ++ } ++ this_cpu(iommu_dont_flush_iotlb) = false; ++ } ++ else ++ { ++ ret = iommu_map(d, _dfn(base_pfn), _mfn(base_pfn), page_count, ++ p2m_access_to_iommu_flags(p2ma), &flush_flags, ++ ctx->id); ++ ++ if ( ret ) ++ return ret; ++ } ++ ++ ret = iommu_iotlb_flush(d, _dfn(base_pfn), page_count, flush_flags, ++ ctx->id); ++ ++ return ret; ++} ++ ++/* p2m_access_x removes the mapping */ ++int iommu_identity_mapping(struct domain *d, struct iommu_context *ctx, ++ p2m_access_t p2ma, paddr_t base, paddr_t end, + unsigned int flag) + { + unsigned long base_pfn = base >> PAGE_SHIFT_4K; + unsigned long end_pfn = PAGE_ALIGN_4K(end) >> PAGE_SHIFT_4K; + struct identity_map *map; +- struct domain_iommu *hd = dom_iommu(d); ++ int ret = 0; + + ASSERT(pcidevs_locked()); + ASSERT(base < end); + +- /* +- * No need to acquire hd->arch.mapping_lock: Both insertion and removal +- * get done while holding pcidevs_lock. 
+- */ +- list_for_each_entry( map, &hd->arch.identity_maps, list ) ++ list_for_each_entry( map, &ctx->arch.identity_maps, list ) + { + if ( map->base == base && map->end == end ) + { +- int ret = 0; +- + if ( p2ma != p2m_access_x ) + { + if ( map->access != p2ma ) + return -EADDRINUSE; ++ + ++map->count; + return 0; + } +@@ -247,12 +377,9 @@ int iommu_identity_mapping(struct domain *d, p2m_access_t p2ma, + if ( --map->count ) + return 0; + +- while ( base_pfn < end_pfn ) +- { +- if ( clear_identity_p2m_entry(d, base_pfn) ) +- ret = -ENXIO; +- base_pfn++; +- } ++ printk("Unmapping [%"PRI_mfn"x:%"PRI_mfn"] for d%dc%d\n", base_pfn, end_pfn, ++ d->domain_id, ctx->id); ++ ret = unmap_identity_region(d, ctx, base_pfn, end_pfn); + + list_del(&map->list); + xfree(map); +@@ -271,47 +398,43 @@ int iommu_identity_mapping(struct domain *d, p2m_access_t p2ma, + if ( !map ) + return -ENOMEM; + +- map->base = base; +- map->end = end; +- map->access = p2ma; +- map->count = 1; +- +- /* +- * Insert into list ahead of mapping, so the range can be found when +- * trying to clean up. +- */ +- list_add_tail(&map->list, &hd->arch.identity_maps); ++ printk("Mapping [%"PRI_mfn"x:%"PRI_mfn"] for d%dc%d\n", base_pfn, end_pfn, ++ d->domain_id, ctx->id); ++ ret = map_identity_region(d, ctx, base_pfn, end_pfn, p2ma, flag); + +- for ( ; base_pfn < end_pfn; ++base_pfn ) ++ if ( ret ) + { +- int err = set_identity_p2m_entry(d, base_pfn, p2ma, flag); +- +- if ( !err ) +- continue; +- +- if ( (map->base >> PAGE_SHIFT_4K) == base_pfn ) +- { +- list_del(&map->list); +- xfree(map); +- } +- return err; ++ xfree(map); ++ return ret; + } + + return 0; + } + +-void iommu_identity_map_teardown(struct domain *d) ++void iommu_identity_map_teardown(struct domain *d, struct iommu_context *ctx) + { +- struct domain_iommu *hd = dom_iommu(d); + struct identity_map *map, *tmp; + +- list_for_each_entry_safe ( map, tmp, &hd->arch.identity_maps, list ) ++ list_for_each_entry_safe ( map, tmp, &ctx->arch.identity_maps, list ) + { + list_del(&map->list); + xfree(map); + } + } + ++bool iommu_identity_map_check(struct domain *d, struct iommu_context *ctx, ++ mfn_t mfn) ++{ ++ struct identity_map *map; ++ uint64_t addr = pfn_to_paddr(mfn_x(mfn)); ++ ++ list_for_each_entry ( map, &ctx->arch.identity_maps, list ) ++ if (addr >= map->base && addr < map->end) ++ return true; ++ ++ return false; ++} ++ + static int __hwdom_init cf_check map_subtract(unsigned long s, unsigned long e, + void *data) + { +@@ -369,7 +492,7 @@ static int __hwdom_init cf_check identity_map(unsigned long s, unsigned long e, + if ( iomem_access_permitted(d, s, s) ) + { + rc = iommu_map(d, _dfn(s), _mfn(s), 1, perms, +- &info->flush_flags); ++ &info->flush_flags, 0); + if ( rc < 0 ) + break; + /* Must map a frame at least, which is what we request for. 
*/ +@@ -379,7 +502,7 @@ static int __hwdom_init cf_check identity_map(unsigned long s, unsigned long e, + s++; + } + while ( (rc = iommu_map(d, _dfn(s), _mfn(s), e - s + 1, +- perms, &info->flush_flags)) > 0 ) ++ perms, &info->flush_flags, 0)) > 0 ) + { + s += rc; + process_pending_softirqs(); +@@ -408,6 +531,10 @@ void __hwdom_init arch_iommu_hwdom_init(struct domain *d) + if ( iommu_hwdom_reserved == -1 ) + iommu_hwdom_reserved = 1; + ++ if ( iommu_hwdom_no_dma ) ++ /* Skip special mappings with no-dma mode */ ++ return; ++ + if ( iommu_hwdom_inclusive ) + { + printk(XENLOG_WARNING +@@ -545,7 +672,6 @@ void __hwdom_init arch_iommu_hwdom_init(struct domain *d) + + void arch_pci_init_pdev(struct pci_dev *pdev) + { +- pdev->arch.pseudo_domid = DOMID_INVALID; + } + + unsigned long *__init iommu_init_domid(domid_t reserve) +@@ -576,8 +702,6 @@ domid_t iommu_alloc_domid(unsigned long *map) + static unsigned int start; + unsigned int idx = find_next_zero_bit(map, UINT16_MAX - DOMID_MASK, start); + +- ASSERT(pcidevs_locked()); +- + if ( idx >= UINT16_MAX - DOMID_MASK ) + idx = find_first_zero_bit(map, UINT16_MAX - DOMID_MASK); + if ( idx >= UINT16_MAX - DOMID_MASK ) +@@ -603,7 +727,7 @@ void iommu_free_domid(domid_t domid, unsigned long *map) + BUG(); + } + +-int iommu_free_pgtables(struct domain *d) ++int iommu_free_pgtables(struct domain *d, struct iommu_context *ctx) + { + struct domain_iommu *hd = dom_iommu(d); + struct page_info *pg; +@@ -612,18 +736,18 @@ int iommu_free_pgtables(struct domain *d) + if ( !is_iommu_enabled(d) ) + return 0; + +- /* After this barrier, no new IOMMU mappings can be inserted. */ +- spin_barrier(&hd->arch.mapping_lock); +- + /* + * Pages will be moved to the free list below. So we want to + * clear the root page-table to avoid any potential use after-free. 
+ */ +- iommu_vcall(hd->platform_ops, clear_root_pgtable, d); ++ iommu_vcall(hd->platform_ops, clear_root_pgtable, d, ctx); + +- while ( (pg = page_list_remove_head(&hd->arch.pgtables.list)) ) ++ while ( (pg = page_list_remove_head(&ctx->arch.pgtables)) ) + { +- free_domheap_page(pg); ++ if (ctx->id == 0) ++ free_domheap_page(pg); ++ else ++ iommu_arena_free_page(&hd->arch.pt_arena, pg); + + if ( !(++done & 0xff) && general_preempt_check() ) + return -ERESTART; +@@ -633,6 +757,7 @@ int iommu_free_pgtables(struct domain *d) + } + + struct page_info *iommu_alloc_pgtable(struct domain_iommu *hd, ++ struct iommu_context *ctx, + uint64_t contig_mask) + { + unsigned int memflags = 0; +@@ -644,7 +769,11 @@ struct page_info *iommu_alloc_pgtable(struct domain_iommu *hd, + memflags = MEMF_node(hd->node); + #endif + +- pg = alloc_domheap_page(NULL, memflags); ++ if (ctx->id == 0) ++ pg = alloc_domheap_page(NULL, memflags); ++ else ++ pg = iommu_arena_allocate_page(&hd->arch.pt_arena); ++ + if ( !pg ) + return NULL; + +@@ -677,9 +806,7 @@ struct page_info *iommu_alloc_pgtable(struct domain_iommu *hd, + + unmap_domain_page(p); + +- spin_lock(&hd->arch.pgtables.lock); +- page_list_add(pg, &hd->arch.pgtables.list); +- spin_unlock(&hd->arch.pgtables.lock); ++ page_list_add(pg, &ctx->arch.pgtables); + + return pg; + } +@@ -718,17 +845,20 @@ static void cf_check free_queued_pgtables(void *arg) + } + } + +-void iommu_queue_free_pgtable(struct domain_iommu *hd, struct page_info *pg) ++void iommu_queue_free_pgtable(struct iommu_context *ctx, struct page_info *pg) + { + unsigned int cpu = smp_processor_id(); + +- spin_lock(&hd->arch.pgtables.lock); +- page_list_del(pg, &hd->arch.pgtables.list); +- spin_unlock(&hd->arch.pgtables.lock); ++ page_list_del(pg, &ctx->arch.pgtables); + +- page_list_add_tail(pg, &per_cpu(free_pgt_list, cpu)); ++ if ( !ctx->id ) ++ { ++ page_list_add_tail(pg, &per_cpu(free_pgt_list, cpu)); + +- tasklet_schedule(&per_cpu(free_pgt_tasklet, cpu)); ++ tasklet_schedule(&per_cpu(free_pgt_tasklet, cpu)); ++ } ++ else ++ page_list_add_tail(pg, &ctx->arch.free_queue); + } + + static int cf_check cpu_callback( +-- +2.46.0 + diff --git a/0404-xen-public-Introduce-PV-IOMMU-hypercall-interface.patch b/0404-xen-public-Introduce-PV-IOMMU-hypercall-interface.patch new file mode 100644 index 0000000..0dce269 --- /dev/null +++ b/0404-xen-public-Introduce-PV-IOMMU-hypercall-interface.patch @@ -0,0 +1,965 @@ +From 64340057e5819e0755f87c6d9d7b4b954c9a8a93 Mon Sep 17 00:00:00 2001 +From: Teddy Astie +Date: Mon, 4 Nov 2024 14:28:39 +0000 +Subject: [PATCH 404/404] xen/public: Introduce PV-IOMMU hypercall interface + +Introduce a new pv interface to manage the underlying IOMMU and manage contexts +and devices. This interface allows creation of new contexts from Dom0 and +addition of IOMMU mappings using guest PoV. + +This interface doesn't allow creation of mapping to other domains. 
+ +Signed-off-by Teddy Astie +--- + xen/common/Makefile | 1 + + xen/common/pv-iommu.c | 539 ++++++++++++++++++++++++++++++++++ + xen/include/hypercall-defs.c | 6 + + xen/include/public/pv-iommu.h | 341 +++++++++++++++++++++ + xen/include/public/xen.h | 1 + + 5 files changed, 888 insertions(+) + create mode 100644 xen/common/pv-iommu.c + create mode 100644 xen/include/public/pv-iommu.h + +diff --git a/xen/common/Makefile b/xen/common/Makefile +index cba3b32733ba..0b6df5966056 100644 +--- a/xen/common/Makefile ++++ b/xen/common/Makefile +@@ -61,6 +61,7 @@ obj-y += wait.o + obj-bin-y += warning.init.o + obj-$(CONFIG_XENOPROF) += xenoprof.o + obj-y += xmalloc_tlsf.o ++obj-y += pv-iommu.o + + obj-bin-$(CONFIG_X86) += $(foreach n,decompress bunzip2 unxz unlzma lzo unlzo unlz4 unzstd earlycpio,$(n).init.o) + +diff --git a/xen/common/pv-iommu.c b/xen/common/pv-iommu.c +new file mode 100644 +index 000000000000..9c7d04b4c7e6 +--- /dev/null ++++ b/xen/common/pv-iommu.c +@@ -0,0 +1,539 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * xen/common/pv_iommu.c ++ * ++ * PV-IOMMU hypercall interface. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define PVIOMMU_PREFIX "[PV-IOMMU] " ++ ++static int get_paged_frame(struct domain *d, gfn_t gfn, mfn_t *mfn, ++ struct page_info **page, bool readonly) ++{ ++ int ret = 0; ++ p2m_type_t p2mt = p2m_invalid; ++ ++ #ifdef CONFIG_X86 ++ p2m_query_t query = P2M_ALLOC; ++ ++ if ( !readonly ) ++ query |= P2M_UNSHARE; ++ ++ *mfn = get_gfn_type(d, gfn_x(gfn), &p2mt, query); ++ #else ++ *mfn = p2m_lookup(d, gfn, &p2mt); ++ #endif ++ ++ if ( mfn_eq(*mfn, INVALID_MFN) ) ++ { ++ /* No mapping ? */ ++ printk(XENLOG_G_WARNING PVIOMMU_PREFIX ++ "Trying to map to non-backed page frame (gfn=%"PRI_gfn ++ " p2mt=%d d%d)\n", gfn_x(gfn), p2mt, d->domain_id); ++ ++ ret = -ENOENT; ++ } ++ else if ( p2m_is_any_ram(p2mt) && mfn_valid(*mfn) ) ++ { ++ *page = get_page_from_mfn(*mfn, d); ++ ret = 0; ++ } ++ else if ( p2m_is_mmio(p2mt) || ++ iomem_access_permitted(d, mfn_x(*mfn),mfn_x(*mfn)) ) ++ { ++ *page = NULL; ++ ret = 0; ++ } ++ else ++ { ++ printk(XENLOG_G_WARNING PVIOMMU_PREFIX ++ "Unexpected p2mt %d (d%d gfn=%"PRI_gfn" mfn=%"PRI_mfn")\n", ++ p2mt, d->domain_id, gfn_x(gfn), mfn_x(*mfn)); ++ ++ ret = -EPERM; ++ } ++ ++ put_gfn(d, gfn_x(gfn)); ++ return ret; ++} ++ ++static bool can_use_iommu_check(struct domain *d) ++{ ++ if ( !is_iommu_enabled(d) ) ++ { ++ printk(XENLOG_G_WARNING PVIOMMU_PREFIX ++ "IOMMU disabled for this domain\n"); ++ return false; ++ } ++ ++ if ( !dom_iommu(d)->allow_pv_iommu ) ++ { ++ printk(XENLOG_G_WARNING PVIOMMU_PREFIX ++ "PV-IOMMU disabled for this domain\n"); ++ return false; ++ } ++ ++ return true; ++} ++ ++static long capabilities_op(struct pv_iommu_capabilities *cap, struct domain *d) ++{ ++ cap->max_ctx_no = d->iommu.other_contexts.count; ++ cap->max_iova_addr = iommu_get_max_iova(d); ++ ++ cap->max_pasid = 0; /* TODO */ ++ cap->cap_flags = 0; ++ ++ if ( !dom_iommu(d)->no_dma ) ++ cap->cap_flags |= IOMMUCAP_default_identity; ++ ++ cap->pgsize_mask = PAGE_SIZE_4K; ++ ++ return 0; ++} ++ ++static long init_op(struct pv_iommu_init *init, struct domain *d) ++{ ++ if (init->max_ctx_no == UINT32_MAX) ++ return -E2BIG; ++ ++ return iommu_domain_pviommu_init(d, init->max_ctx_no + 1, init->arena_order); ++} ++ ++static long alloc_context_op(struct pv_iommu_alloc *alloc, struct domain *d) ++{ ++ u16 ctx_no = 0; ++ int status = 0; ++ ++ 
status = iommu_context_alloc(d, &ctx_no, 0); ++ ++ if ( status ) ++ return status; ++ ++ printk(XENLOG_G_INFO PVIOMMU_PREFIX ++ "Created IOMMU context %hu in d%d\n", ctx_no, d->domain_id); ++ ++ alloc->ctx_no = ctx_no; ++ return 0; ++} ++ ++static long free_context_op(struct pv_iommu_free *free, struct domain *d) ++{ ++ int flags = IOMMU_TEARDOWN_PREEMPT; ++ ++ if ( !free->ctx_no ) ++ return -EINVAL; ++ ++ if ( free->free_flags & IOMMU_FREE_reattach_default ) ++ flags |= IOMMU_TEARDOWN_REATTACH_DEFAULT; ++ ++ return iommu_context_free(d, free->ctx_no, flags); ++} ++ ++static long reattach_device_op(struct pv_iommu_reattach_device *reattach, ++ struct domain *d) ++{ ++ int ret; ++ device_t *pdev; ++ struct physdev_pci_device dev = reattach->dev; ++ ++ pcidevs_lock(); ++ pdev = pci_get_pdev(d, PCI_SBDF(dev.seg, dev.bus, dev.devfn)); ++ ++ if ( !pdev ) ++ { ++ pcidevs_unlock(); ++ return -ENOENT; ++ } ++ ++ ret = iommu_reattach_context(d, d, pdev, reattach->ctx_no); ++ ++ pcidevs_unlock(); ++ return ret; ++} ++ ++static long map_pages_op(struct pv_iommu_map_pages *map, struct domain *d) ++{ ++ struct iommu_context *ctx; ++ int ret = 0, flush_ret; ++ struct page_info *page = NULL; ++ mfn_t mfn, mfn_lookup; ++ unsigned int flags = 0, flush_flags = 0; ++ size_t i = 0; ++ dfn_t dfn0 = _dfn(map->dfn); /* original map->dfn */ ++ ++ if ( !map->ctx_no || !(ctx = iommu_get_context(d, map->ctx_no)) ) ++ return -EINVAL; ++ ++ if ( map->map_flags & IOMMU_MAP_readable ) ++ flags |= IOMMUF_readable; ++ ++ if ( map->map_flags & IOMMU_MAP_writeable ) ++ flags |= IOMMUF_writable; ++ ++ for (i = 0; i < map->nr_pages; i++) ++ { ++ gfn_t gfn = _gfn(map->gfn + i); ++ dfn_t dfn = _dfn(map->dfn + i); ++ ++#ifdef CONFIG_X86 ++ if ( iommu_identity_map_check(d, ctx, _mfn(map->dfn)) ) ++ { ++ ret = -EADDRNOTAVAIL; ++ break; ++ } ++#endif ++ ++ ret = get_paged_frame(d, gfn, &mfn, &page, 0); ++ ++ if ( ret ) ++ break; ++ ++ /* Check for conflict with existing mappings */ ++ if ( !iommu_lookup_page(d, dfn, &mfn_lookup, &flags, map->ctx_no) ) ++ { ++ if ( page ) ++ put_page(page); ++ ++ ret = -EADDRINUSE; ++ break; ++ } ++ ++ ret = iommu_map(d, dfn, mfn, 1, flags, &flush_flags, map->ctx_no); ++ ++ if ( ret ) ++ { ++ if ( page ) ++ put_page(page); ++ ++ break; ++ } ++ ++ map->mapped++; ++ ++ if ( (i & 0xff) && hypercall_preempt_check() ) ++ { ++ i++; ++ ++ map->gfn += i; ++ map->dfn += i; ++ map->nr_pages -= i; ++ ++ ret = -ERESTART; ++ break; ++ } ++ } ++ ++ flush_ret = iommu_iotlb_flush(d, dfn0, i, flush_flags, map->ctx_no); ++ ++ iommu_put_context(ctx); ++ ++ if ( flush_ret ) ++ printk(XENLOG_G_WARNING PVIOMMU_PREFIX ++ "Flush operation failed for d%dc%d (%d)\n", d->domain_id, ++ ctx->id, flush_ret); ++ ++ return ret; ++} ++ ++static long unmap_pages_op(struct pv_iommu_unmap_pages *unmap, struct domain *d) ++{ ++ struct iommu_context *ctx; ++ mfn_t mfn; ++ int ret = 0, flush_ret; ++ unsigned int flags, flush_flags = 0; ++ size_t i = 0; ++ dfn_t dfn0 = _dfn(unmap->dfn); /* original unmap->dfn */ ++ ++ if ( !unmap->ctx_no || !(ctx = iommu_get_context(d, unmap->ctx_no)) ) ++ return -EINVAL; ++ ++ for (i = 0; i < unmap->nr_pages; i++) ++ { ++ dfn_t dfn = _dfn(unmap->dfn + i); ++ ++#ifdef CONFIG_X86 ++ if ( iommu_identity_map_check(d, ctx, _mfn(unmap->dfn)) ) ++ { ++ ret = -EADDRNOTAVAIL; ++ break; ++ } ++#endif ++ ++ /* Check if there is a valid mapping for this domain */ ++ if ( iommu_lookup_page(d, dfn, &mfn, &flags, unmap->ctx_no) ) { ++ ret = -ENOENT; ++ break; ++ } ++ ++ ret = iommu_unmap(d, dfn, 1, 0, &flush_flags, 
unmap->ctx_no); ++ ++ if ( ret ) ++ break; ++ ++ unmap->unmapped++; ++ ++ /* Decrement reference counter (if needed) */ ++ if ( mfn_valid(mfn) ) ++ put_page(mfn_to_page(mfn)); ++ ++ if ( (i & 0xff) && hypercall_preempt_check() ) ++ { ++ i++; ++ ++ unmap->dfn += i; ++ unmap->nr_pages -= i; ++ ++ ret = -ERESTART; ++ break; ++ } ++ } ++ ++ flush_ret = iommu_iotlb_flush(d, dfn0, i, flush_flags, unmap->ctx_no); ++ ++ iommu_put_context(ctx); ++ ++ if ( flush_ret ) ++ printk(XENLOG_WARNING PVIOMMU_PREFIX ++ "Flush operation failed for d%dc%d (%d)\n", d->domain_id, ++ ctx->id, flush_ret); ++ ++ return ret; ++} ++ ++static long do_iommu_subop(int subop, XEN_GUEST_HANDLE_PARAM(void) arg, ++ struct domain *d, bool remote); ++ ++static long remote_cmd_op(struct pv_iommu_remote_cmd *remote_cmd, ++ struct domain *current_domain) ++{ ++ long ret = 0; ++ struct domain *d; ++ ++ /* TODO: use a better permission logic */ ++ if ( !is_hardware_domain(current_domain) ) ++ return -EPERM; ++ ++ d = get_domain_by_id(remote_cmd->domid); ++ ++ if ( !d ) ++ return -ENOENT; ++ ++ ret = do_iommu_subop(remote_cmd->subop, remote_cmd->arg, d, true); ++ ++ put_domain(d); ++ ++ return ret; ++} ++ ++static long do_iommu_subop(int subop, XEN_GUEST_HANDLE_PARAM(void) arg, ++ struct domain *d, bool remote) ++{ ++ long ret = 0; ++ ++ switch ( subop ) ++ { ++ case IOMMU_noop: ++ break; ++ ++ case IOMMU_query_capabilities: ++ { ++ struct pv_iommu_capabilities cap; ++ ++ ret = capabilities_op(&cap, d); ++ ++ if ( unlikely(copy_to_guest(arg, &cap, 1)) ) ++ ret = -EFAULT; ++ ++ break; ++ } ++ ++ case IOMMU_init: ++ { ++ struct pv_iommu_init init; ++ ++ if ( unlikely(copy_from_guest(&init, arg, 1)) ) ++ { ++ ret = -EFAULT; ++ break; ++ } ++ ++ ret = init_op(&init, d); ++ } ++ ++ case IOMMU_alloc_context: ++ { ++ struct pv_iommu_alloc alloc; ++ ++ if ( unlikely(copy_from_guest(&alloc, arg, 1)) ) ++ { ++ ret = -EFAULT; ++ break; ++ } ++ ++ ret = alloc_context_op(&alloc, d); ++ ++ if ( unlikely(copy_to_guest(arg, &alloc, 1)) ) ++ ret = -EFAULT; ++ ++ break; ++ } ++ ++ case IOMMU_free_context: ++ { ++ struct pv_iommu_free free; ++ ++ if ( unlikely(copy_from_guest(&free, arg, 1)) ) ++ { ++ ret = -EFAULT; ++ break; ++ } ++ ++ ret = free_context_op(&free, d); ++ break; ++ } ++ ++ case IOMMU_reattach_device: ++ { ++ struct pv_iommu_reattach_device reattach; ++ ++ if ( unlikely(copy_from_guest(&reattach, arg, 1)) ) ++ { ++ ret = -EFAULT; ++ break; ++ } ++ ++ ret = reattach_device_op(&reattach, d); ++ break; ++ } ++ ++ case IOMMU_map_pages: ++ { ++ struct pv_iommu_map_pages map; ++ ++ if ( unlikely(copy_from_guest(&map, arg, 1)) ) ++ { ++ ret = -EFAULT; ++ break; ++ } ++ ++ ret = map_pages_op(&map, d); ++ ++ if ( unlikely(copy_to_guest(arg, &map, 1)) ) ++ ret = -EFAULT; ++ ++ break; ++ } ++ ++ case IOMMU_unmap_pages: ++ { ++ struct pv_iommu_unmap_pages unmap; ++ ++ if ( unlikely(copy_from_guest(&unmap, arg, 1)) ) ++ { ++ ret = -EFAULT; ++ break; ++ } ++ ++ ret = unmap_pages_op(&unmap, d); ++ ++ if ( unlikely(copy_to_guest(arg, &unmap, 1)) ) ++ ret = -EFAULT; ++ ++ break; ++ } ++ ++ case IOMMU_remote_cmd: ++ { ++ struct pv_iommu_remote_cmd remote_cmd; ++ ++ if ( remote ) ++ { ++ /* Prevent remote_cmd from being called recursively */ ++ ret = -EINVAL; ++ break; ++ } ++ ++ if ( unlikely(copy_from_guest(&remote_cmd, arg, 1)) ) ++ { ++ ret = -EFAULT; ++ break; ++ } ++ ++ ret = remote_cmd_op(&remote_cmd, d); ++ break; ++ } ++ ++ /* ++ * TODO ++ */ ++ case IOMMU_alloc_nested: ++ { ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ case IOMMU_flush_nested: ++ 
{ ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ case IOMMU_attach_pasid: ++ { ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ case IOMMU_detach_pasid: ++ { ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ default: ++ return -EOPNOTSUPP; ++ } ++ ++ return ret; ++} ++ ++long do_iommu_op(unsigned int subop, XEN_GUEST_HANDLE_PARAM(void) arg) ++{ ++ long ret = 0; ++ ++ if ( !can_use_iommu_check(current->domain) ) ++ return -ENODEV; ++ ++ ret = do_iommu_subop(subop, arg, current->domain, false); ++ ++ if ( ret == -ERESTART ) ++ return hypercall_create_continuation(__HYPERVISOR_iommu_op, "ih", subop, arg); ++ ++ return ret; ++} ++ ++/* ++ * Local variables: ++ * mode: C ++ * c-file-style: "BSD" ++ * c-basic-offset: 4 ++ * tab-width: 4 ++ * indent-tabs-mode: nil ++ * End: ++ */ +diff --git a/xen/include/hypercall-defs.c b/xen/include/hypercall-defs.c +index 7720a29ade0b..78ca87b57fab 100644 +--- a/xen/include/hypercall-defs.c ++++ b/xen/include/hypercall-defs.c +@@ -209,6 +209,9 @@ hypfs_op(unsigned int cmd, const char *arg1, unsigned long arg2, void *arg3, uns + #ifdef CONFIG_X86 + xenpmu_op(unsigned int op, xen_pmu_params_t *arg) + #endif ++#ifdef CONFIG_HAS_PASSTHROUGH ++iommu_op(unsigned int subop, void *arg) ++#endif + + #ifdef CONFIG_PV + caller: pv64 +@@ -295,5 +298,8 @@ mca do do - - - + #ifndef CONFIG_PV_SHIM_EXCLUSIVE + paging_domctl_cont do do do do - + #endif ++#ifdef CONFIG_HAS_PASSTHROUGH ++iommu_op do do do do - ++#endif + + #endif /* !CPPCHECK */ +diff --git a/xen/include/public/pv-iommu.h b/xen/include/public/pv-iommu.h +new file mode 100644 +index 000000000000..c14b8435c980 +--- /dev/null ++++ b/xen/include/public/pv-iommu.h +@@ -0,0 +1,341 @@ ++/* SPDX-License-Identifier: MIT */ ++/** ++ * pv-iommu.h ++ * ++ * Paravirtualized IOMMU driver interface. ++ * ++ * Copyright (c) 2024 Teddy Astie ++ */ ++ ++#ifndef __XEN_PUBLIC_PV_IOMMU_H__ ++#define __XEN_PUBLIC_PV_IOMMU_H__ ++ ++#include "xen.h" ++#include "physdev.h" ++ ++#ifndef uint64_aligned_t ++#define uint64_aligned_t uint64_t ++#endif ++ ++#define IOMMU_DEFAULT_CONTEXT (0) ++ ++enum { ++ /* Basic cmd */ ++ IOMMU_noop = 0, ++ IOMMU_query_capabilities, ++ IOMMU_init, ++ IOMMU_alloc_context, ++ IOMMU_free_context, ++ IOMMU_reattach_device, ++ IOMMU_map_pages, ++ IOMMU_unmap_pages, ++ IOMMU_remote_cmd, ++ ++ /* Extended cmd */ ++ IOMMU_alloc_nested, /* if IOMMUCAP_nested */ ++ IOMMU_flush_nested, /* if IOMMUCAP_nested */ ++ IOMMU_attach_pasid, /* if IOMMUCAP_pasid */ ++ IOMMU_detach_pasid, /* if IOMMUCAP_pasid */ ++}; ++ ++/** ++ * Indicate if the default context is a identity mapping to domain memory. ++ * If not defined, default context blocks all DMA to domain memory. ++ */ ++#define IOMMUCAP_default_identity (1 << 0) ++ ++/** ++ * IOMMU_MAP_cache support. ++ */ ++#define IOMMUCAP_cache (1 << 1) ++ ++/** ++ * Support for IOMMU_alloc_nested. ++ */ ++#define IOMMUCAP_nested (1 << 2) ++ ++/** ++ * Support for IOMMU_attach_pasid and IOMMU_detach_pasid and pasid parameter in ++ * reattach_context. ++ */ ++#define IOMMUCAP_pasid (1 << 3) ++ ++/** ++ * Support for IOMMU_ALLOC_identity ++ */ ++#define IOMMUCAP_identity (1 << 4) ++ ++/** ++ * IOMMU_query_capabilities ++ * Query PV-IOMMU capabilities for this domain. ++ */ ++struct pv_iommu_capabilities { ++ /* ++ * OUT: Maximum device address (iova) that the guest can use for mappings. ++ */ ++ uint64_aligned_t max_iova_addr; ++ ++ /* OUT: IOMMU capabilities flags */ ++ uint32_t cap_flags; ++ ++ /* OUT: Mask of all supported page sizes. 
*/ ++ uint32_t pgsize_mask; ++ ++ /* OUT: Maximum pasid (if IOMMUCAP_pasid) */ ++ uint32_t max_pasid; ++ ++ /* OUT: Maximum number of IOMMU context this domain can use. */ ++ uint16_t max_ctx_no; ++}; ++typedef struct pv_iommu_capabilities pv_iommu_capabilities_t; ++DEFINE_XEN_GUEST_HANDLE(pv_iommu_capabilities_t); ++ ++/** ++ * IOMMU_init ++ * Initialize PV-IOMMU for this domain. ++ * ++ * Fails with -EACCESS if PV-IOMMU is already initialized. ++ */ ++struct pv_iommu_init { ++ /* IN: Maximum number of IOMMU context this domain can use. */ ++ uint32_t max_ctx_no; ++ ++ /* IN: Arena size in pages (in power of two) */ ++ uint32_t arena_order; ++}; ++typedef struct pv_iommu_init pv_iommu_init_t; ++DEFINE_XEN_GUEST_HANDLE(pv_iommu_init_t); ++ ++/** ++ * Create a 1:1 identity mapped context to domain memory ++ * (needs IOMMUCAP_identity). ++ */ ++#define IOMMU_ALLOC_identity (1 << 0) ++ ++/** ++ * IOMMU_alloc_context ++ * Allocate an IOMMU context. ++ * Fails with -ENOSPC if no context number is available. ++ */ ++struct pv_iommu_alloc { ++ /* OUT: allocated IOMMU context number */ ++ uint16_t ctx_no; ++ ++ /* IN: allocation flags */ ++ uint32_t alloc_flags; ++}; ++typedef struct pv_iommu_alloc pv_iommu_alloc_t; ++DEFINE_XEN_GUEST_HANDLE(pv_iommu_alloc_t); ++ ++/** ++ * Move all devices to default context before freeing the context. ++ */ ++#define IOMMU_FREE_reattach_default (1 << 0) ++ ++/** ++ * IOMMU_free_context ++ * Destroy a IOMMU context. ++ * ++ * If IOMMU_FREE_reattach_default is specified, move all context devices to ++ * default context before destroying this context. ++ * ++ * If there are devices in the context and IOMMU_FREE_reattach_default is not ++ * specified, fail with -EBUSY. ++ * ++ * The default context can't be destroyed. ++ */ ++struct pv_iommu_free { ++ /* IN: IOMMU context number to free */ ++ uint16_t ctx_no; ++ ++ /* IN: Free operation specific flags */ ++ uint32_t free_flags; ++}; ++typedef struct pv_iommu_free pv_iommu_free_t; ++DEFINE_XEN_GUEST_HANDLE(pv_iommu_free_t); ++ ++/* Device has read access */ ++#define IOMMU_MAP_readable (1 << 0) ++ ++/* Device has write access */ ++#define IOMMU_MAP_writeable (1 << 1) ++ ++/* Enforce DMA coherency */ ++#define IOMMU_MAP_cache (1 << 2) ++ ++/** ++ * IOMMU_map_pages ++ * Map pages on a IOMMU context. ++ * ++ * pgsize must be supported by pgsize_mask. ++ * Fails with -EINVAL if mapping on top of another mapping. ++ * Report actually mapped page count in mapped field (regardless of failure). ++ */ ++struct pv_iommu_map_pages { ++ /* IN: IOMMU context number */ ++ uint16_t ctx_no; ++ ++ /* IN: Guest frame number */ ++ uint64_aligned_t gfn; ++ ++ /* IN: Device frame number */ ++ uint64_aligned_t dfn; ++ ++ /* IN: Map flags */ ++ uint32_t map_flags; ++ ++ /* IN: Size of pages to map */ ++ uint32_t pgsize; ++ ++ /* IN: Number of pages to map */ ++ uint32_t nr_pages; ++ ++ /* OUT: Number of pages actually mapped */ ++ uint32_t mapped; ++}; ++typedef struct pv_iommu_map_pages pv_iommu_map_pages_t; ++DEFINE_XEN_GUEST_HANDLE(pv_iommu_map_pages_t); ++ ++/** ++ * IOMMU_unmap_pages ++ * Unmap pages on a IOMMU context. ++ * ++ * pgsize must be supported by pgsize_mask. ++ * Report actually unmapped page count in mapped field (regardless of failure). 
++ * Fails with -ENOENT when attempting to unmap a page without any mapping
++ */
++struct pv_iommu_unmap_pages {
++    /* IN: IOMMU context number */
++    uint16_t ctx_no;
++
++    /* IN: Device frame number */
++    uint64_aligned_t dfn;
++
++    /* IN: Size of pages to unmap */
++    uint32_t pgsize;
++
++    /* IN: Number of pages to unmap */
++    uint32_t nr_pages;
++
++    /* OUT: Number of pages actually unmapped */
++    uint32_t unmapped;
++};
++typedef struct pv_iommu_unmap_pages pv_iommu_unmap_pages_t;
++DEFINE_XEN_GUEST_HANDLE(pv_iommu_unmap_pages_t);
++
++/**
++ * IOMMU_reattach_device
++ * Reattach a device to another IOMMU context.
++ * Fails with -ENODEV if no such device exists.
++ */
++struct pv_iommu_reattach_device {
++    /* IN: Target IOMMU context number */
++    uint16_t ctx_no;
++
++    /* IN: Physical device to move */
++    struct physdev_pci_device dev;
++
++    /* IN: PASID of the device (if IOMMUCAP_pasid) */
++    uint32_t pasid;
++};
++typedef struct pv_iommu_reattach_device pv_iommu_reattach_device_t;
++DEFINE_XEN_GUEST_HANDLE(pv_iommu_reattach_device_t);
++
++
++/**
++ * IOMMU_remote_cmd
++ * Do a PV-IOMMU operation on another domain.
++ * The current domain needs to be allowed to act on the target domain,
++ * otherwise fails with -EPERM.
++ */
++struct pv_iommu_remote_cmd {
++    /* IN: Target domain to do the subop on */
++    uint16_t domid;
++
++    /* IN: Command to do on target domain. */
++    uint16_t subop;
++
++    /* INOUT: Command argument from current domain memory */
++    XEN_GUEST_HANDLE(void) arg;
++};
++typedef struct pv_iommu_remote_cmd pv_iommu_remote_cmd_t;
++DEFINE_XEN_GUEST_HANDLE(pv_iommu_remote_cmd_t);
++
++/**
++ * IOMMU_alloc_nested
++ * Create a nested IOMMU context (needs IOMMUCAP_nested).
++ *
++ * This context uses a platform-specific page table from domain address space
++ * specified in pgtable_gfn and uses it for nested translations.
++ *
++ * Explicit flushes need to be submitted with IOMMU_flush_nested on
++ * modification of the nested page table to ensure coherency between the
++ * IOTLB and the nested page table.
++ *
++ * This context can be destroyed using IOMMU_free_context.
++ * This context cannot be modified using map_pages or unmap_pages.
++ */
++struct pv_iommu_alloc_nested {
++    /* OUT: allocated IOMMU context number */
++    uint16_t ctx_no;
++
++    /* IN: guest frame number of the nested page table */
++    uint64_aligned_t pgtable_gfn;
++
++    /* IN: nested mode flags */
++    uint64_aligned_t nested_flags;
++};
++typedef struct pv_iommu_alloc_nested pv_iommu_alloc_nested_t;
++DEFINE_XEN_GUEST_HANDLE(pv_iommu_alloc_nested_t);
++
++/**
++ * IOMMU_flush_nested (needs IOMMUCAP_nested)
++ * Flush the IOTLB for nested translation.
++ */
++struct pv_iommu_flush_nested {
++    /* TODO */
++};
++typedef struct pv_iommu_flush_nested pv_iommu_flush_nested_t;
++DEFINE_XEN_GUEST_HANDLE(pv_iommu_flush_nested_t);
++
++/**
++ * IOMMU_attach_pasid (needs IOMMUCAP_pasid)
++ * Attach a new device-with-pasid to an IOMMU context.
++ * If a matching device-with-pasid already exists (globally),
++ * fail with -EEXIST.
++ * If pasid is 0, fails with -EINVAL.
++ * If the physical device doesn't exist in the domain, fail with -ENOENT.
++ */ ++struct pv_iommu_attach_pasid { ++ /* IN: IOMMU context to add the device-with-pasid in */ ++ uint16_t ctx_no; ++ ++ /* IN: Physical device */ ++ struct physdev_pci_device dev; ++ ++ /* IN: pasid of the device to attach */ ++ uint32_t pasid; ++}; ++typedef struct pv_iommu_attach_pasid pv_iommu_attach_pasid_t; ++DEFINE_XEN_GUEST_HANDLE(pv_iommu_attach_pasid_t); ++ ++/** ++ * IOMMU_detach_pasid (needs IOMMUCAP_pasid) ++ * detach a device-with-pasid. ++ * If the device-with-pasid doesn't exist or belong to the domain, ++ * fail with -ENOENT. ++ * If pasid is 0, fails with -EINVAL. ++ */ ++struct pv_iommu_detach_pasid { ++ /* IN: Physical device */ ++ struct physdev_pci_device dev; ++ ++ /* pasid of the device to detach */ ++ uint32_t pasid; ++}; ++typedef struct pv_iommu_detach_pasid pv_iommu_detach_pasid_t; ++DEFINE_XEN_GUEST_HANDLE(pv_iommu_detach_pasid_t); ++ ++/* long do_iommu_op(int subop, XEN_GUEST_HANDLE_PARAM(void) arg) */ ++ ++#endif +\ No newline at end of file +diff --git a/xen/include/public/xen.h b/xen/include/public/xen.h +index e051f989a5ca..d5bdedfee5ee 100644 +--- a/xen/include/public/xen.h ++++ b/xen/include/public/xen.h +@@ -118,6 +118,7 @@ DEFINE_XEN_GUEST_HANDLE(xen_ulong_t); + #define __HYPERVISOR_xenpmu_op 40 + #define __HYPERVISOR_dm_op 41 + #define __HYPERVISOR_hypfs_op 42 ++#define __HYPERVISOR_iommu_op 43 + + /* Architecture-specific hypercall definitions. */ + #define __HYPERVISOR_arch_0 48 +-- +2.46.0 + diff --git a/xen.spec.in b/xen.spec.in index 8252188..a1201c3 100644 --- a/xen.spec.in +++ b/xen.spec.in @@ -98,6 +98,12 @@ Patch0203: 0203-xen.efi.build.patch # Backports (300+) +Patch0400: 0400-docs-designs-Add-a-design-document-for-PV-IOMMU.patch +Patch0401: 0401-docs-designs-Add-a-design-document-for-IOMMU-subsyst.patch +Patch0402: 0402-IOMMU-Introduce-redesigned-IOMMU-subsystem.patch +Patch0403: 0403-VT-d-Port-IOMMU-driver-to-new-subsystem.patch +Patch0404: 0404-xen-public-Introduce-PV-IOMMU-hypercall-interface.patch + # Security fixes (500+) # Upstreamable patches (600+) From ea8ecfcc28b0d03d340124607eec25f47c85f95a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= Date: Wed, 8 Jan 2025 21:33:17 +0100 Subject: [PATCH 5/6] WIP Disable AMD IOMMU AMD IOMMU isn't ported to the new IOMMU API yet. --- config | 2 ++ 1 file changed, 2 insertions(+) diff --git a/config b/config index 4bebb45..90108c8 100644 --- a/config +++ b/config @@ -120,6 +120,8 @@ CONFIG_SERIAL_TX_BUFSIZE=16384 CONFIG_XHCI=y CONFIG_HAS_CPUFREQ=y CONFIG_HAS_PASSTHROUGH=y +# CONFIG_AMD_IOMMU is not set +CONFIG_INTEL_IOMMU=y # CONFIG_IOMMU_QUARANTINE_NONE is not set CONFIG_IOMMU_QUARANTINE_BASIC=y # CONFIG_IOMMU_QUARANTINE_SCRATCH_PAGE is not set From d8dcbccdf830e6f3133269fb648c4b37e4b58b7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= Date: Thu, 9 Jan 2025 01:51:58 +0100 Subject: [PATCH 6/6] Fix setting configure flags Move setting $CONFIG_EXTRA before its use for the configure call. This also makes --disable-pvshim option effective, so remove pvshim from the files list. 
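For reference, the intended %build ordering after this change is roughly the
following (a simplified sketch, not the literal spec contents; the configure
invocation is abbreviated here and the hunks below are authoritative):

  # BEGIN QUBES SPECIFIC PART -- now runs before configure
  %ifnarch armv7hl aarch64
  CONFIG_EXTRA="$CONFIG_EXTRA --disable-ipxe --disable-rombios"
  CONFIG_EXTRA="$CONFIG_EXTRA --disable-pvshim"
  %endif
  CONFIG_EXTRA="$CONFIG_EXTRA --with-system-qemu=/usr/bin/qemu-system-x86_64"
  export PATH="/usr/bin:$PATH"
  autoreconf -i
  # END QUBES SPECIFIC PART
  ./configure --prefix=%{_prefix} ... $CONFIG_EXTRA

Previously this block ran only after ./configure had already consumed
$CONFIG_EXTRA, so the extra flags (including --disable-pvshim) were silently
ignored.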
--- xen.spec.in | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/xen.spec.in b/xen.spec.in index a1201c3..0dc7a16 100644 --- a/xen.spec.in +++ b/xen.spec.in @@ -428,6 +428,18 @@ CONFIG_EXTRA="$CONFIG_EXTRA --with-system-seabios=/usr/share/seabios/bios-256k.b %else CONFIG_EXTRA="$CONFIG_EXTRA --disable-seabios" %endif + +# BEGIN QUBES SPECIFIC PART +%ifnarch armv7hl aarch64 +#CONFIG_EXTRA="$CONFIG_EXTRA --with-system-ipxe=/usr/share/ipxe" +CONFIG_EXTRA="$CONFIG_EXTRA --disable-ipxe --disable-rombios" +CONFIG_EXTRA="$CONFIG_EXTRA --disable-pvshim" +%endif +CONFIG_EXTRA="$CONFIG_EXTRA --with-system-qemu=/usr/bin/qemu-system-x86_64" +export PATH="/usr/bin:$PATH" +autoreconf -i +# END QUBES SPECIFIC PART + ./configure --prefix=%{_prefix} --libdir=%{_libdir} --libexecdir=%{_libexecdir} --with-system-qemu=/usr/bin/qemu-system-i386 --with-linux-backend-modules="xen-evtchn xen-gntdev xen-gntalloc xen-blkback xen-netback xen-pciback xen-scsiback xen-acpi-processor" --enable-systemd --disable-pygrub $CONFIG_EXTRA unset CFLAGS CXXFLAGS FFLAGS LDFLAGS export LDFLAGS="$LDFLAGS_SAVE" @@ -449,17 +461,6 @@ export CFLAGS=`echo $CFLAGS | sed -e 's/-mfloat-abi=hard//g' -e 's/-march=armv7- %endif unset CFLAGS CXXFLAGS FFLAGS LDFLAGS -# BEGIN QUBES SPECIFIC PART -%ifnarch armv7hl aarch64 -#CONFIG_EXTRA="$CONFIG_EXTRA --with-system-ipxe=/usr/share/ipxe" -CONFIG_EXTRA="$CONFIG_EXTRA --disable-ipxe --disable-rombios" -CONFIG_EXTRA="$CONFIG_EXTRA --disable-pvshim" -%endif -CONFIG_EXTRA="$CONFIG_EXTRA --with-system-qemu=/usr/bin/qemu-system-x86_64" -export PATH="/usr/bin:$PATH" -autoreconf -i -# END QUBES SPECIFIC PART - %make_build %{?ocaml_flags} prefix=/usr tools %if %build_docs make prefix=/usr docs @@ -897,10 +898,10 @@ fi %ifarch %{ix86} x86_64 %dir %{_libexecdir}/%{name}/boot %{_libexecdir}/xen/boot/hvmloader -%ifnarch %{ix86} -%{_libexecdir}/%{name}/boot/xen-shim -/usr/lib/debug%{_libexecdir}/xen/boot/xen-shim-syms -%endif +%dnl %ifnarch %{ix86} +%dnl %{_libexecdir}/%{name}/boot/xen-shim +%dnl /usr/lib/debug%{_libexecdir}/xen/boot/xen-shim-syms +%dnl %endif %if %build_stubdom %if %build_qemutrad %{_libexecdir}/xen/boot/ioemu-stubdom.gz