Re: [PATCH] x86/PCI: Revert: "Clip only host bridge windows for E820 regions"

From: Bjorn Helgaas
Date: Mon Jun 13 2022 - 19:15:49 EST


On Sun, Jun 12, 2022 at 04:43:25PM +0200, Hans de Goede wrote:
> Clipping the bridge windows directly from pci_acpi_root_prepare_resources()
> instead of clipping from arch_remove_reservations(), has a number of
> unforeseen consequences.
>
> If there is an e820 reservation in the middle of a bridge window, then
> the smaller of the 2 remaining parts of the window will also be clipped
> off. Whereas the previous code would clip regions requested by devices,
> rather than the entire window, leaving regions which were either entirely
> above or below a reservation in the middle of the window alone.
>
> E.g. on the Steam Deck this leads to this log message:
>
> acpi PNP0A08:00: clipped [mem 0x80000000-0xf7ffffff window] to [mem 0xa0100000-0xf7ffffff window]
>
> which then gets followed by these log messages:
>
> pci 0000:00:01.2: can't claim BAR 14 [mem 0x80600000-0x806fffff]: no compatible bridge window
> pci 0000:00:01.3: can't claim BAR 14 [mem 0x80500000-0x805fffff]: no compatible bridge window
>
> and many more of these. Ultimately this leads to the Steam Deck
> no longer booting properly, so revert the change.
>
> Note this is not a clean revert, this revert keeps the later change
> to make the clipping dependent on a new pci_use_e820 bool, moving
> the checking of this bool to arch_remove_reservations().

4c5e242d3e93 was definitely a mistake (my fault). My intent was
mainly to improve logging of the clipping, but I didn't implement it
well.

That said, I'd like to understand the connection between the messages
you mention and the failure. There are four bridges whose MMIO
windows were in the [mem 0x80000000-0x9fffffff] area that we clipped
out. The log shows that we moved all those windows and the devices in
them to the [mem 0xa0100000-0xf7ffffff] area that remained after
clipping.

So I think this *should* have worked even though we moved things
around unnecessarily. What am I missing?

The E820 map reports [mem 0xa0000000-0xa00fffff] in the middle of the
_CRS, and we currently trim that out. We think this is a firmware
defect, so it's likely to break in 2023 if we stop clipping by
default. I'm concerned that there may be other things in _CRS that we
need to avoid, but firmware isn't telling us about them.

Or perhaps the devices that we moved have some dependency on their
original addresses, e.g., firmware on the device latched the address
and didn't notice the reassignment.

[1] https://bugzilla.kernel.org/attachment.cgi?id=301154

> BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=216109
> Fixes: 4c5e242d3e93 ("x86/PCI: Clip only host bridge windows for E820 regions")
> Reported-and-tested-by: Guilherme G. Piccoli <gpiccoli@xxxxxxxxxx>
> Signed-off-by: Hans de Goede <hdegoede@xxxxxxxxxx>
> ---
> arch/x86/include/asm/e820/api.h | 5 -----
> arch/x86/include/asm/pci_x86.h | 8 ++++++++
> arch/x86/kernel/resource.c | 14 +++++++++-----
> arch/x86/pci/acpi.c | 8 +-------
> 4 files changed, 18 insertions(+), 17 deletions(-)
>
> diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h
> index 5a39ed59b6db..e8f58ddd06d9 100644
> --- a/arch/x86/include/asm/e820/api.h
> +++ b/arch/x86/include/asm/e820/api.h
> @@ -4,9 +4,6 @@
>
> #include <asm/e820/types.h>
>
> -struct device;
> -struct resource;
> -
> extern struct e820_table *e820_table;
> extern struct e820_table *e820_table_kexec;
> extern struct e820_table *e820_table_firmware;
> @@ -46,8 +43,6 @@ extern void e820__register_nosave_regions(unsigned long limit_pfn);
>
> extern int e820__get_entry_type(u64 start, u64 end);
>
> -extern void remove_e820_regions(struct device *dev, struct resource *avail);
> -
> /*
> * Returns true iff the specified range [start,end) is completely contained inside
> * the ISA region.
> diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
> index f52a886d35cf..70533fdcbf02 100644
> --- a/arch/x86/include/asm/pci_x86.h
> +++ b/arch/x86/include/asm/pci_x86.h
> @@ -69,6 +69,8 @@ void pcibios_scan_specific_bus(int busn);
>
> /* pci-irq.c */
>
> +struct pci_dev;
> +
> struct irq_info {
> u8 bus, devfn; /* Bus, device and function */
> struct {
> @@ -246,3 +248,9 @@ static inline void mmio_config_writel(void __iomem *pos, u32 val)
> # define x86_default_pci_init_irq NULL
> # define x86_default_pci_fixup_irqs NULL
> #endif
> +
> +#if defined(CONFIG_PCI) && defined(CONFIG_ACPI)
> +extern bool pci_use_e820;
> +#else
> +#define pci_use_e820 false
> +#endif
> diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c
> index db2b350a37b7..bba1abd05bfe 100644
> --- a/arch/x86/kernel/resource.c
> +++ b/arch/x86/kernel/resource.c
> @@ -1,7 +1,8 @@
> // SPDX-License-Identifier: GPL-2.0
> -#include <linux/dev_printk.h>
> #include <linux/ioport.h>
> +#include <linux/printk.h>
> #include <asm/e820/api.h>
> +#include <asm/pci_x86.h>
>
> static void resource_clip(struct resource *res, resource_size_t start,
> resource_size_t end)
> @@ -24,14 +25,14 @@ static void resource_clip(struct resource *res, resource_size_t start,
> res->start = end + 1;
> }
>
> -void remove_e820_regions(struct device *dev, struct resource *avail)
> +static void remove_e820_regions(struct resource *avail)
> {
> int i;
> struct e820_entry *entry;
> u64 e820_start, e820_end;
> struct resource orig = *avail;
>
> - if (!(avail->flags & IORESOURCE_MEM))
> + if (!pci_use_e820)
> return;
>
> for (i = 0; i < e820_table->nr_entries; i++) {
> @@ -41,7 +42,7 @@ void remove_e820_regions(struct device *dev, struct resource *avail)
>
> resource_clip(avail, e820_start, e820_end);
> if (orig.start != avail->start || orig.end != avail->end) {
> - dev_info(dev, "clipped %pR to %pR for e820 entry [mem %#010Lx-%#010Lx]\n",
> + pr_info("clipped %pR to %pR for e820 entry [mem %#010Lx-%#010Lx]\n",
> &orig, avail, e820_start, e820_end);
> orig = *avail;
> }
> @@ -55,6 +56,9 @@ void arch_remove_reservations(struct resource *avail)
> * the low 1MB unconditionally, as this area is needed for some ISA
> * cards requiring a memory range, e.g. the i82365 PCMCIA controller.
> */
> - if (avail->flags & IORESOURCE_MEM)
> + if (avail->flags & IORESOURCE_MEM) {
> resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END);
> +
> + remove_e820_regions(avail);
> + }
> }
> diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
> index a4f43054bc79..2f82480fd430 100644
> --- a/arch/x86/pci/acpi.c
> +++ b/arch/x86/pci/acpi.c
> @@ -8,7 +8,6 @@
> #include <linux/pci-acpi.h>
> #include <asm/numa.h>
> #include <asm/pci_x86.h>
> -#include <asm/e820/api.h>
>
> struct pci_root_info {
> struct acpi_pci_root_info common;
> @@ -20,7 +19,7 @@ struct pci_root_info {
> #endif
> };
>
> -static bool pci_use_e820 = true;
> +bool pci_use_e820 = true;
> static bool pci_use_crs = true;
> static bool pci_ignore_seg;
>
> @@ -387,11 +386,6 @@ static int pci_acpi_root_prepare_resources(struct acpi_pci_root_info *ci)
>
> status = acpi_pci_probe_root_resources(ci);
>
> - if (pci_use_e820) {
> - resource_list_for_each_entry(entry, &ci->resources)
> - remove_e820_regions(&device->dev, entry->res);
> - }
> -
> if (pci_use_crs) {
> resource_list_for_each_entry_safe(entry, tmp, &ci->resources)
> if (resource_is_pcicfg_ioport(entry->res))
> --
> 2.36.0
>