From a94d072b20239f6f615dc20f0a54f63e9b642f9e Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 5 Oct 2011 22:35:03 +0100 Subject: PCI: Add quirk for known incorrect MPSS Using legacy interrupts and TLPs > 256 bytes on the SFC4000 (all revisions) may cause interrupt messages to be replayed. In some systems this results in a non-recoverable MCE. Early boards using the SFC4000 set the maximum payload size supported (MPSS) to 1024 bytes and we should override that. There are probably other devices with similar issues, so give this quirk a generic name. Signed-off-by: Ben Hutchings Signed-off-by: Jesse Barnes --- drivers/pci/quirks.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'drivers/pci') diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 1196f61..f70c36f 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -2822,6 +2822,20 @@ static void __devinit fixup_ti816x_class(struct pci_dev* dev) } DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_TI, 0xb800, fixup_ti816x_class); +/* Some PCIe devices do not work reliably with the claimed maximum + * payload size supported. + */ +static void __devinit fixup_mpss_256(struct pci_dev *dev) +{ + dev->pcie_mpss = 1; /* 256 bytes */ +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SOLARFLARE, + PCI_DEVICE_ID_SOLARFLARE_SFC4000A_0, fixup_mpss_256); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SOLARFLARE, + PCI_DEVICE_ID_SOLARFLARE_SFC4000A_1, fixup_mpss_256); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SOLARFLARE, + PCI_DEVICE_ID_SOLARFLARE_SFC4000B, fixup_mpss_256); + static void pci_do_fixups(struct pci_dev *dev, struct pci_fixup *f, struct pci_fixup *end) { -- cgit v1.1 From e24442733ee486c99d03fe2ecd98924d1bc14c51 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Sun, 11 Sep 2011 14:08:38 -0300 Subject: PCI: Make pci_setup_bridge() non-static for use by arch code The "powernv" platform of the powerpc architecture needs to assign PCI resources using a specific algorithm to fit some HW constraints of the IBM "IODA" architecture (related to the ability to create error handling domains that encompass specific segments of MMIO space). For doing so, it wants to call pci_setup_bridge() from architecture specific resource management in order to configure bridges after all resources have been assigned. So make it non-static. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Jesse Barnes --- drivers/pci/setup-bus.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/pci') diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 784da9d..86b69f85 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -426,7 +426,7 @@ static void __pci_setup_bridge(struct pci_bus *bus, unsigned long type) pci_write_config_word(bridge, PCI_BRIDGE_CONTROL, bus->bridge_ctl); } -static void pci_setup_bridge(struct pci_bus *bus) +void pci_setup_bridge(struct pci_bus *bus) { unsigned long type = IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH; -- cgit v1.1 From 3e309cdf07c930f29a4e0f233e47d399bea34c68 Mon Sep 17 00:00:00 2001 From: Josh Boyer Date: Wed, 5 Oct 2011 11:44:50 -0400 Subject: PCI quirk: mmc: Always check for lower base frequency quirk for Ricoh 1180:e823 Commit 15bed0f2f added a quirk for the e823 Ricoh card reader to lower the base frequency. However, the quirk first checks to see if the proprietary MMC controller is disabled, and returns if so. On some devices, such as the Lenovo X220, the MMC controller is already disabled by firmware it seems, but the frequency change is still needed so sdhci-pci can talk to the cards. Since the MMC controller is disabled, the frequency fixup was never being run on these machines. This moves the e823 check above the MMC controller check so that it always gets run. This fixes https://bugzilla.redhat.com/show_bug.cgi?id=722509 Signed-off-by: Josh Boyer Signed-off-by: Jesse Barnes --- drivers/pci/quirks.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'drivers/pci') diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index f70c36f..e5aadf35 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -2745,20 +2745,6 @@ static void ricoh_mmc_fixup_r5c832(struct pci_dev *dev) /* disable must be done via function #0 */ if (PCI_FUNC(dev->devfn)) return; - - pci_read_config_byte(dev, 0xCB, &disable); - - if (disable & 0x02) - return; - - pci_read_config_byte(dev, 0xCA, &write_enable); - pci_write_config_byte(dev, 0xCA, 0x57); - pci_write_config_byte(dev, 0xCB, disable | 0x02); - pci_write_config_byte(dev, 0xCA, write_enable); - - dev_notice(&dev->dev, "proprietary Ricoh MMC controller disabled (via firewire function)\n"); - dev_notice(&dev->dev, "MMC cards are now supported by standard SDHCI controller\n"); - /* * RICOH 0xe823 SD/MMC card reader fails to recognize * certain types of SD/MMC cards. Lowering the SD base @@ -2781,6 +2767,20 @@ static void ricoh_mmc_fixup_r5c832(struct pci_dev *dev) dev_notice(&dev->dev, "MMC controller base frequency changed to 50Mhz.\n"); } + + pci_read_config_byte(dev, 0xCB, &disable); + + if (disable & 0x02) + return; + + pci_read_config_byte(dev, 0xCA, &write_enable); + pci_write_config_byte(dev, 0xCA, 0x57); + pci_write_config_byte(dev, 0xCB, disable | 0x02); + pci_write_config_byte(dev, 0xCA, write_enable); + + dev_notice(&dev->dev, "proprietary Ricoh MMC controller disabled (via firewire function)\n"); + dev_notice(&dev->dev, "MMC cards are now supported by standard SDHCI controller\n"); + } DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_RICOH, PCI_DEVICE_ID_RICOH_R5C832, ricoh_mmc_fixup_r5c832); DECLARE_PCI_FIXUP_RESUME_EARLY(PCI_VENDOR_ID_RICOH, PCI_DEVICE_ID_RICOH_R5C832, ricoh_mmc_fixup_r5c832); -- cgit v1.1 From 379021d5c0899fcf9410cae4ca7a59a5a94ca769 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 3 Oct 2011 23:16:33 +0200 Subject: PCI / PM: Extend PME polling to all PCI devices The land of PCI power management is a land of sorrow and ugliness, especially in the area of signaling events by devices. There are devices that set their PME Status bits, but don't really bother to send a PME message or assert PME#. There are hardware vendors who don't connect PME# lines to the system core logic (they know who they are). There are PCI Express Root Ports that don't bother to trigger interrupts when they receive PME messages from the devices below. There are ACPI BIOSes that forget to provide _PRW methods for devices capable of signaling wakeup. Finally, there are BIOSes that do provide _PRW methods for such devices, but then don't bother to call Notify() for those devices from the corresponding _Lxx/_Exx GPE-handling methods. In all of these cases the kernel doesn't have a chance to receive a proper notification that it should wake up a device, so devices stay in low-power states forever. Worse yet, in some cases they continuously send PME Messages that are silently ignored, because the kernel simply doesn't know that it should clear the device's PME Status bit. This problem was first observed for "parallel" (non-Express) PCI devices on add-on cards and Matthew Garrett addressed it by adding code that polls PME Status bits of such devices, if they are enabled to signal PME, to the kernel. Recently, however, it has turned out that PCI Express devices are also affected by this issue and that it is not limited to add-on devices, so it seems necessary to extend the PME polling to all PCI devices, including PCI Express and planar ones. Still, it would be wasteful to poll the PME Status bits of devices that are known to receive proper PME notifications, so make the kernel (1) poll the PME Status bits of all PCI and PCIe devices enabled to signal PME and (2) disable the PME Status polling for devices for which correct PME notifications are received. Tested-by: Sarah Sharp Signed-off-by: Rafael J. Wysocki Signed-off-by: Jesse Barnes --- drivers/pci/pci-acpi.c | 3 +++ drivers/pci/pci.c | 41 ++++++++++++++++++++--------------------- drivers/pci/pcie/pme.c | 9 +++++++++ 3 files changed, 32 insertions(+), 21 deletions(-) (limited to 'drivers/pci') diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index d36f41e..cd3c4f1 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -46,6 +46,9 @@ static void pci_acpi_wake_dev(acpi_handle handle, u32 event, void *context) struct pci_dev *pci_dev = context; if (event == ACPI_NOTIFY_DEVICE_WAKE && pci_dev) { + if (pci_dev->pme_poll) + pci_dev->pme_poll = false; + pci_wakeup_event(pci_dev); pci_check_pme_status(pci_dev); pm_runtime_resume(&pci_dev->dev); diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index e9651f0..7cd417e 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1407,13 +1407,16 @@ bool pci_check_pme_status(struct pci_dev *dev) /** * pci_pme_wakeup - Wake up a PCI device if its PME Status bit is set. * @dev: Device to handle. - * @ign: Ignored. + * @pme_poll_reset: Whether or not to reset the device's pme_poll flag. * * Check if @dev has generated PME and queue a resume request for it in that * case. */ -static int pci_pme_wakeup(struct pci_dev *dev, void *ign) +static int pci_pme_wakeup(struct pci_dev *dev, void *pme_poll_reset) { + if (pme_poll_reset && dev->pme_poll) + dev->pme_poll = false; + if (pci_check_pme_status(dev)) { pci_wakeup_event(dev); pm_request_resume(&dev->dev); @@ -1428,7 +1431,7 @@ static int pci_pme_wakeup(struct pci_dev *dev, void *ign) void pci_pme_wakeup_bus(struct pci_bus *bus) { if (bus) - pci_walk_bus(bus, pci_pme_wakeup, NULL); + pci_walk_bus(bus, pci_pme_wakeup, (void *)true); } /** @@ -1446,31 +1449,26 @@ bool pci_pme_capable(struct pci_dev *dev, pci_power_t state) static void pci_pme_list_scan(struct work_struct *work) { - struct pci_pme_device *pme_dev; + struct pci_pme_device *pme_dev, *n; mutex_lock(&pci_pme_list_mutex); if (!list_empty(&pci_pme_list)) { - list_for_each_entry(pme_dev, &pci_pme_list, list) - pci_pme_wakeup(pme_dev->dev, NULL); - schedule_delayed_work(&pci_pme_work, msecs_to_jiffies(PME_TIMEOUT)); + list_for_each_entry_safe(pme_dev, n, &pci_pme_list, list) { + if (pme_dev->dev->pme_poll) { + pci_pme_wakeup(pme_dev->dev, NULL); + } else { + list_del(&pme_dev->list); + kfree(pme_dev); + } + } + if (!list_empty(&pci_pme_list)) + schedule_delayed_work(&pci_pme_work, + msecs_to_jiffies(PME_TIMEOUT)); } mutex_unlock(&pci_pme_list_mutex); } /** - * pci_external_pme - is a device an external PCI PME source? - * @dev: PCI device to check - * - */ - -static bool pci_external_pme(struct pci_dev *dev) -{ - if (pci_is_pcie(dev) || dev->bus->number == 0) - return false; - return true; -} - -/** * pci_pme_active - enable or disable PCI device's PME# function * @dev: PCI device to handle. * @enable: 'true' to enable PME# generation; 'false' to disable it. @@ -1503,7 +1501,7 @@ void pci_pme_active(struct pci_dev *dev, bool enable) hit, and the power savings from the devices will still be a win. */ - if (pci_external_pme(dev)) { + if (dev->pme_poll) { struct pci_pme_device *pme_dev; if (enable) { pme_dev = kmalloc(sizeof(struct pci_pme_device), @@ -1821,6 +1819,7 @@ void pci_pm_init(struct pci_dev *dev) (pmc & PCI_PM_CAP_PME_D3) ? " D3hot" : "", (pmc & PCI_PM_CAP_PME_D3cold) ? " D3cold" : ""); dev->pme_support = pmc >> PCI_PM_CAP_PME_SHIFT; + dev->pme_poll = true; /* * Make device's PM flags reflect the wake-up capability, but * let the user space enable it to wake up the system as needed. diff --git a/drivers/pci/pcie/pme.c b/drivers/pci/pcie/pme.c index 0057344..001f1b7 100644 --- a/drivers/pci/pcie/pme.c +++ b/drivers/pci/pcie/pme.c @@ -84,6 +84,9 @@ static bool pcie_pme_walk_bus(struct pci_bus *bus) list_for_each_entry(dev, &bus->devices, bus_list) { /* Skip PCIe devices in case we started from a root port. */ if (!pci_is_pcie(dev) && pci_check_pme_status(dev)) { + if (dev->pme_poll) + dev->pme_poll = false; + pci_wakeup_event(dev); pm_request_resume(&dev->dev); ret = true; @@ -142,6 +145,9 @@ static void pcie_pme_handle_request(struct pci_dev *port, u16 req_id) /* First, check if the PME is from the root port itself. */ if (port->devfn == devfn && port->bus->number == busnr) { + if (port->pme_poll) + port->pme_poll = false; + if (pci_check_pme_status(port)) { pm_request_resume(&port->dev); found = true; @@ -187,6 +193,9 @@ static void pcie_pme_handle_request(struct pci_dev *port, u16 req_id) /* The device is there, but we have to check its PME status. */ found = pci_check_pme_status(dev); if (found) { + if (dev->pme_poll) + dev->pme_poll = false; + pci_wakeup_event(dev); pm_request_resume(&dev->dev); } -- cgit v1.1 From 6af8bef14d6fc9e4e52c83fd646412e9dedadd26 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Wed, 28 Sep 2011 19:40:53 -0400 Subject: PCI hotplug: acpiphp: Prevent deadlock on PCI-to-PCI bridge remove I originally submitted a patch to workaround this by pushing all Ejection Requests and Device Checks onto the kacpi_hotplug queue. http://marc.info/?l=linux-acpi&m=131678270930105&w=2 The patch is still insufficient in that Bus Checks also need to be added. Rather than add all events, including non-PCI-hotplug events, to the hotplug queue, mjg suggested that a better approach would be to modify the acpiphp driver so only acpiphp events would be added to the kacpi_hotplug queue. It's a longer patch, but at least we maintain the benefit of having separate queues in ACPI. This, of course, is still only a workaround the problem. As Bjorn and mjg pointed out, we have to refactor a lot of this code to do the right thing but at this point it is a better to have this code working. The acpi core places all events on the kacpi_notify queue. When the acpiphp driver is loaded and a PCI card with a PCI-to-PCI bridge is removed the following call sequence occurs: cleanup_p2p_bridge() -> cleanup_bridge() -> acpi_remove_notify_handler() -> acpi_os_wait_events_complete() -> flush_workqueue(kacpi_notify_wq) which is the queue we are currently executing on and the process will hang. Move all hotplug acpiphp events onto the kacpi_hotplug workqueue. In handle_hotplug_event_bridge() and handle_hotplug_event_func() we can simply push the rest of the work onto the kacpi_hotplug queue and then avoid the deadlock. Signed-off-by: Prarit Bhargava Cc: mjg@redhat.com Cc: bhelgaas@google.com Cc: linux-acpi@vger.kernel.org Signed-off-by: Jesse Barnes --- drivers/pci/hotplug/acpiphp_glue.c | 109 ++++++++++++++++++++++++++++++++----- 1 file changed, 94 insertions(+), 15 deletions(-) (limited to 'drivers/pci') diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c index 2202857..596172b 100644 --- a/drivers/pci/hotplug/acpiphp_glue.c +++ b/drivers/pci/hotplug/acpiphp_glue.c @@ -48,6 +48,7 @@ #include #include #include +#include #include "../pci.h" #include "acpiphp.h" @@ -1149,15 +1150,35 @@ check_sub_bridges(acpi_handle handle, u32 lvl, void *context, void **rv) return AE_OK ; } -/** - * handle_hotplug_event_bridge - handle ACPI event on bridges - * @handle: Notify()'ed acpi_handle - * @type: Notify code - * @context: pointer to acpiphp_bridge structure - * - * Handles ACPI event notification on {host,p2p} bridges. - */ -static void handle_hotplug_event_bridge(acpi_handle handle, u32 type, void *context) +struct acpiphp_hp_work { + struct work_struct work; + acpi_handle handle; + u32 type; + void *context; +}; + +static void alloc_acpiphp_hp_work(acpi_handle handle, u32 type, + void *context, + void (*func)(struct work_struct *work)) +{ + struct acpiphp_hp_work *hp_work; + int ret; + + hp_work = kmalloc(sizeof(*hp_work), GFP_KERNEL); + if (!hp_work) + return; + + hp_work->handle = handle; + hp_work->type = type; + hp_work->context = context; + + INIT_WORK(&hp_work->work, func); + ret = queue_work(kacpi_hotplug_wq, &hp_work->work); + if (!ret) + kfree(hp_work); +} + +static void _handle_hotplug_event_bridge(struct work_struct *work) { struct acpiphp_bridge *bridge; char objname[64]; @@ -1165,11 +1186,18 @@ static void handle_hotplug_event_bridge(acpi_handle handle, u32 type, void *cont .pointer = objname }; struct acpi_device *device; int num_sub_bridges = 0; + struct acpiphp_hp_work *hp_work; + acpi_handle handle; + u32 type; + + hp_work = container_of(work, struct acpiphp_hp_work, work); + handle = hp_work->handle; + type = hp_work->type; if (acpi_bus_get_device(handle, &device)) { /* This bridge must have just been physically inserted */ handle_bridge_insertion(handle, type); - return; + goto out; } bridge = acpiphp_handle_to_bridge(handle); @@ -1180,7 +1208,7 @@ static void handle_hotplug_event_bridge(acpi_handle handle, u32 type, void *cont if (!bridge && !num_sub_bridges) { err("cannot get bridge info\n"); - return; + goto out; } acpi_get_name(handle, ACPI_FULL_PATHNAME, &buffer); @@ -1241,22 +1269,49 @@ static void handle_hotplug_event_bridge(acpi_handle handle, u32 type, void *cont warn("notify_handler: unknown event type 0x%x for %s\n", type, objname); break; } + +out: + kfree(hp_work); /* allocated in handle_hotplug_event_bridge */ } /** - * handle_hotplug_event_func - handle ACPI event on functions (i.e. slots) + * handle_hotplug_event_bridge - handle ACPI event on bridges * @handle: Notify()'ed acpi_handle * @type: Notify code - * @context: pointer to acpiphp_func structure + * @context: pointer to acpiphp_bridge structure * - * Handles ACPI event notification on slots. + * Handles ACPI event notification on {host,p2p} bridges. */ -static void handle_hotplug_event_func(acpi_handle handle, u32 type, void *context) +static void handle_hotplug_event_bridge(acpi_handle handle, u32 type, + void *context) +{ + /* + * Currently the code adds all hotplug events to the kacpid_wq + * queue when it should add hotplug events to the kacpi_hotplug_wq. + * The proper way to fix this is to reorganize the code so that + * drivers (dock, etc.) do not call acpi_os_execute(), etc. + * For now just re-add this work to the kacpi_hotplug_wq so we + * don't deadlock on hotplug actions. + */ + alloc_acpiphp_hp_work(handle, type, context, + _handle_hotplug_event_bridge); +} + +static void _handle_hotplug_event_func(struct work_struct *work) { struct acpiphp_func *func; char objname[64]; struct acpi_buffer buffer = { .length = sizeof(objname), .pointer = objname }; + struct acpiphp_hp_work *hp_work; + acpi_handle handle; + u32 type; + void *context; + + hp_work = container_of(work, struct acpiphp_hp_work, work); + handle = hp_work->handle; + type = hp_work->type; + context = hp_work->context; acpi_get_name(handle, ACPI_FULL_PATHNAME, &buffer); @@ -1291,8 +1346,32 @@ static void handle_hotplug_event_func(acpi_handle handle, u32 type, void *contex warn("notify_handler: unknown event type 0x%x for %s\n", type, objname); break; } + + kfree(hp_work); /* allocated in handle_hotplug_event_func */ } +/** + * handle_hotplug_event_func - handle ACPI event on functions (i.e. slots) + * @handle: Notify()'ed acpi_handle + * @type: Notify code + * @context: pointer to acpiphp_func structure + * + * Handles ACPI event notification on slots. + */ +static void handle_hotplug_event_func(acpi_handle handle, u32 type, + void *context) +{ + /* + * Currently the code adds all hotplug events to the kacpid_wq + * queue when it should add hotplug events to the kacpi_hotplug_wq. + * The proper way to fix this is to reorganize the code so that + * drivers (dock, etc.) do not call acpi_os_execute(), etc. + * For now just re-add this work to the kacpi_hotplug_wq so we + * don't deadlock on hotplug actions. + */ + alloc_acpiphp_hp_work(handle, type, context, + _handle_hotplug_event_func); +} static acpi_status find_root_bridges(acpi_handle handle, u32 lvl, void *context, void **rv) -- cgit v1.1 From 78d090b0be3f072a3c95022771c35183af961aaa Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 28 Sep 2011 21:44:36 +0200 Subject: PCI / PM: Remove unnecessary error variable from acpi_dev_run_wake() The result returned by acpi_dev_run_wake() is always either -EINVAL or -ENODEV, while obviously it should return 0 on success. The problem is that the leftover error variable, that's not really used in the function, is initialized with -ENODEV and then returned without modification. To fix this issue remove the error variable from acpi_dev_run_wake() and make the function return 0 on success as appropriate. Signed-off-by: Rafael J. Wysocki Signed-off-by: Jesse Barnes --- drivers/pci/pci-acpi.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/pci') diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index cd3c4f1..4ecb640 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -285,7 +285,6 @@ static int acpi_dev_run_wake(struct device *phys_dev, bool enable) { struct acpi_device *dev; acpi_handle handle; - int error = -ENODEV; if (!device_run_wake(phys_dev)) return -EINVAL; @@ -305,7 +304,7 @@ static int acpi_dev_run_wake(struct device *phys_dev, bool enable) acpi_disable_wakeup_device_power(dev); } - return error; + return 0; } static void acpi_pci_propagate_run_wake(struct pci_bus *bus, bool enable) -- cgit v1.1 From db3c33c6d3fa04ee46b491e9d75d0d3b4798d074 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 27 Sep 2011 15:57:13 +0200 Subject: PCI: Move ATS implementation into own file ATS does not depend on IOV support, so move the code into its own file. This file will also include support for the PRI and PASID capabilities later. Also give ATS its own Kconfig variable to allow selecting it without IOV support. Reviewed-by: Bjorn Helgaas Signed-off-by: Joerg Roedel Signed-off-by: Jesse Barnes --- drivers/pci/Kconfig | 4 ++ drivers/pci/Makefile | 1 + drivers/pci/ats.c | 155 +++++++++++++++++++++++++++++++++++++++++++++++++++ drivers/pci/iov.c | 142 ---------------------------------------------- 4 files changed, 160 insertions(+), 142 deletions(-) create mode 100644 drivers/pci/ats.c (limited to 'drivers/pci') diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index 0fa466a..1d8ce83 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -71,9 +71,13 @@ config HT_IRQ If unsure say Y. +config PCI_ATS + bool + config PCI_IOV bool "PCI IOV support" depends on PCI + select PCI_ATS help I/O Virtualization is a PCI feature supported by some devices which allows them to create virtual devices which share their diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile index 6fadae3..083a49f 100644 --- a/drivers/pci/Makefile +++ b/drivers/pci/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_PCI_MSI) += msi.o # Build the Hypertransport interrupt support obj-$(CONFIG_HT_IRQ) += htirq.o +obj-$(CONFIG_PCI_ATS) += ats.o obj-$(CONFIG_PCI_IOV) += iov.o # diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c new file mode 100644 index 0000000..ae4bf87 --- /dev/null +++ b/drivers/pci/ats.c @@ -0,0 +1,155 @@ +/* + * drivers/pci/ats.c + * + * Copyright (C) 2009 Intel Corporation, Yu Zhao + * + * PCI Express I/O Virtualization (IOV) support. + * Address Translation Service 1.0 + */ + +#include +#include + +#include "pci.h" + +static int ats_alloc_one(struct pci_dev *dev, int ps) +{ + int pos; + u16 cap; + struct pci_ats *ats; + + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ATS); + if (!pos) + return -ENODEV; + + ats = kzalloc(sizeof(*ats), GFP_KERNEL); + if (!ats) + return -ENOMEM; + + ats->pos = pos; + ats->stu = ps; + pci_read_config_word(dev, pos + PCI_ATS_CAP, &cap); + ats->qdep = PCI_ATS_CAP_QDEP(cap) ? PCI_ATS_CAP_QDEP(cap) : + PCI_ATS_MAX_QDEP; + dev->ats = ats; + + return 0; +} + +static void ats_free_one(struct pci_dev *dev) +{ + kfree(dev->ats); + dev->ats = NULL; +} + +/** + * pci_enable_ats - enable the ATS capability + * @dev: the PCI device + * @ps: the IOMMU page shift + * + * Returns 0 on success, or negative on failure. + */ +int pci_enable_ats(struct pci_dev *dev, int ps) +{ + int rc; + u16 ctrl; + + BUG_ON(dev->ats && dev->ats->is_enabled); + + if (ps < PCI_ATS_MIN_STU) + return -EINVAL; + + if (dev->is_physfn || dev->is_virtfn) { + struct pci_dev *pdev = dev->is_physfn ? dev : dev->physfn; + + mutex_lock(&pdev->sriov->lock); + if (pdev->ats) + rc = pdev->ats->stu == ps ? 0 : -EINVAL; + else + rc = ats_alloc_one(pdev, ps); + + if (!rc) + pdev->ats->ref_cnt++; + mutex_unlock(&pdev->sriov->lock); + if (rc) + return rc; + } + + if (!dev->is_physfn) { + rc = ats_alloc_one(dev, ps); + if (rc) + return rc; + } + + ctrl = PCI_ATS_CTRL_ENABLE; + if (!dev->is_virtfn) + ctrl |= PCI_ATS_CTRL_STU(ps - PCI_ATS_MIN_STU); + pci_write_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, ctrl); + + dev->ats->is_enabled = 1; + + return 0; +} + +/** + * pci_disable_ats - disable the ATS capability + * @dev: the PCI device + */ +void pci_disable_ats(struct pci_dev *dev) +{ + u16 ctrl; + + BUG_ON(!dev->ats || !dev->ats->is_enabled); + + pci_read_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, &ctrl); + ctrl &= ~PCI_ATS_CTRL_ENABLE; + pci_write_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, ctrl); + + dev->ats->is_enabled = 0; + + if (dev->is_physfn || dev->is_virtfn) { + struct pci_dev *pdev = dev->is_physfn ? dev : dev->physfn; + + mutex_lock(&pdev->sriov->lock); + pdev->ats->ref_cnt--; + if (!pdev->ats->ref_cnt) + ats_free_one(pdev); + mutex_unlock(&pdev->sriov->lock); + } + + if (!dev->is_physfn) + ats_free_one(dev); +} + +/** + * pci_ats_queue_depth - query the ATS Invalidate Queue Depth + * @dev: the PCI device + * + * Returns the queue depth on success, or negative on failure. + * + * The ATS spec uses 0 in the Invalidate Queue Depth field to + * indicate that the function can accept 32 Invalidate Request. + * But here we use the `real' values (i.e. 1~32) for the Queue + * Depth; and 0 indicates the function shares the Queue with + * other functions (doesn't exclusively own a Queue). + */ +int pci_ats_queue_depth(struct pci_dev *dev) +{ + int pos; + u16 cap; + + if (dev->is_virtfn) + return 0; + + if (dev->ats) + return dev->ats->qdep; + + pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ATS); + if (!pos) + return -ENODEV; + + pci_read_config_word(dev, pos + PCI_ATS_CAP, &cap); + + return PCI_ATS_CAP_QDEP(cap) ? PCI_ATS_CAP_QDEP(cap) : + PCI_ATS_MAX_QDEP; +} diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index 42fae47..9b4e88c 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -722,145 +722,3 @@ int pci_num_vf(struct pci_dev *dev) return dev->sriov->nr_virtfn; } EXPORT_SYMBOL_GPL(pci_num_vf); - -static int ats_alloc_one(struct pci_dev *dev, int ps) -{ - int pos; - u16 cap; - struct pci_ats *ats; - - pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ATS); - if (!pos) - return -ENODEV; - - ats = kzalloc(sizeof(*ats), GFP_KERNEL); - if (!ats) - return -ENOMEM; - - ats->pos = pos; - ats->stu = ps; - pci_read_config_word(dev, pos + PCI_ATS_CAP, &cap); - ats->qdep = PCI_ATS_CAP_QDEP(cap) ? PCI_ATS_CAP_QDEP(cap) : - PCI_ATS_MAX_QDEP; - dev->ats = ats; - - return 0; -} - -static void ats_free_one(struct pci_dev *dev) -{ - kfree(dev->ats); - dev->ats = NULL; -} - -/** - * pci_enable_ats - enable the ATS capability - * @dev: the PCI device - * @ps: the IOMMU page shift - * - * Returns 0 on success, or negative on failure. - */ -int pci_enable_ats(struct pci_dev *dev, int ps) -{ - int rc; - u16 ctrl; - - BUG_ON(dev->ats && dev->ats->is_enabled); - - if (ps < PCI_ATS_MIN_STU) - return -EINVAL; - - if (dev->is_physfn || dev->is_virtfn) { - struct pci_dev *pdev = dev->is_physfn ? dev : dev->physfn; - - mutex_lock(&pdev->sriov->lock); - if (pdev->ats) - rc = pdev->ats->stu == ps ? 0 : -EINVAL; - else - rc = ats_alloc_one(pdev, ps); - - if (!rc) - pdev->ats->ref_cnt++; - mutex_unlock(&pdev->sriov->lock); - if (rc) - return rc; - } - - if (!dev->is_physfn) { - rc = ats_alloc_one(dev, ps); - if (rc) - return rc; - } - - ctrl = PCI_ATS_CTRL_ENABLE; - if (!dev->is_virtfn) - ctrl |= PCI_ATS_CTRL_STU(ps - PCI_ATS_MIN_STU); - pci_write_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, ctrl); - - dev->ats->is_enabled = 1; - - return 0; -} - -/** - * pci_disable_ats - disable the ATS capability - * @dev: the PCI device - */ -void pci_disable_ats(struct pci_dev *dev) -{ - u16 ctrl; - - BUG_ON(!dev->ats || !dev->ats->is_enabled); - - pci_read_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, &ctrl); - ctrl &= ~PCI_ATS_CTRL_ENABLE; - pci_write_config_word(dev, dev->ats->pos + PCI_ATS_CTRL, ctrl); - - dev->ats->is_enabled = 0; - - if (dev->is_physfn || dev->is_virtfn) { - struct pci_dev *pdev = dev->is_physfn ? dev : dev->physfn; - - mutex_lock(&pdev->sriov->lock); - pdev->ats->ref_cnt--; - if (!pdev->ats->ref_cnt) - ats_free_one(pdev); - mutex_unlock(&pdev->sriov->lock); - } - - if (!dev->is_physfn) - ats_free_one(dev); -} - -/** - * pci_ats_queue_depth - query the ATS Invalidate Queue Depth - * @dev: the PCI device - * - * Returns the queue depth on success, or negative on failure. - * - * The ATS spec uses 0 in the Invalidate Queue Depth field to - * indicate that the function can accept 32 Invalidate Request. - * But here we use the `real' values (i.e. 1~32) for the Queue - * Depth; and 0 indicates the function shares the Queue with - * other functions (doesn't exclusively own a Queue). - */ -int pci_ats_queue_depth(struct pci_dev *dev) -{ - int pos; - u16 cap; - - if (dev->is_virtfn) - return 0; - - if (dev->ats) - return dev->ats->qdep; - - pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ATS); - if (!pos) - return -ENODEV; - - pci_read_config_word(dev, pos + PCI_ATS_CAP, &cap); - - return PCI_ATS_CAP_QDEP(cap) ? PCI_ATS_CAP_QDEP(cap) : - PCI_ATS_MAX_QDEP; -} -- cgit v1.1 From d4c0636c2107010f0ef8c4dfbb1d6368ae3b3ed9 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 27 Sep 2011 15:57:14 +0200 Subject: PCI: Export ATS functions to modules This patch makes the ATS functions usable for modules. They will be used by a module implementing some advanced AMD IOMMU features. Reviewed-by: Bjorn Helgaas Signed-off-by: Joerg Roedel Signed-off-by: Jesse Barnes --- drivers/pci/ats.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/pci') diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index ae4bf87..5ceff3e 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -90,6 +90,7 @@ int pci_enable_ats(struct pci_dev *dev, int ps) return 0; } +EXPORT_SYMBOL_GPL(pci_enable_ats); /** * pci_disable_ats - disable the ATS capability @@ -120,6 +121,7 @@ void pci_disable_ats(struct pci_dev *dev) if (!dev->is_physfn) ats_free_one(dev); } +EXPORT_SYMBOL_GPL(pci_disable_ats); /** * pci_ats_queue_depth - query the ATS Invalidate Queue Depth @@ -153,3 +155,4 @@ int pci_ats_queue_depth(struct pci_dev *dev) return PCI_ATS_CAP_QDEP(cap) ? PCI_ATS_CAP_QDEP(cap) : PCI_ATS_MAX_QDEP; } +EXPORT_SYMBOL_GPL(pci_ats_queue_depth); -- cgit v1.1 From c320b976d7837c561ce4aa49dfe0a64f0e527ce4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 27 Sep 2011 15:57:15 +0200 Subject: PCI: Add implementation for PRI capability Implement the necessary functions to handle PRI capabilities on PCIe devices. With PRI devices behind an IOMMU can signal page fault conditions to software and recover from such faults. Reviewed-by: Bjorn Helgaas Signed-off-by: Joerg Roedel Signed-off-by: Jesse Barnes --- drivers/pci/Kconfig | 9 +++ drivers/pci/ats.c | 167 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 176 insertions(+) (limited to 'drivers/pci') diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index 1d8ce83..fb1e9707 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -85,6 +85,15 @@ config PCI_IOV If unsure, say N. +config PCI_PRI + bool "PCI PRI support" + select PCI_ATS + help + PRI is the PCI Page Request Interface. It allows PCI devices that are + behind an IOMMU to recover from page faults. + + If unsure, say N. + config PCI_IOAPIC bool depends on PCI diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index 5ceff3e..bf892a0 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -2,9 +2,11 @@ * drivers/pci/ats.c * * Copyright (C) 2009 Intel Corporation, Yu Zhao + * Copyright (C) 2011 Advanced Micro Devices, * * PCI Express I/O Virtualization (IOV) support. * Address Translation Service 1.0 + * Page Request Interface added by Joerg Roedel */ #include @@ -156,3 +158,168 @@ int pci_ats_queue_depth(struct pci_dev *dev) PCI_ATS_MAX_QDEP; } EXPORT_SYMBOL_GPL(pci_ats_queue_depth); + +#ifdef CONFIG_PCI_PRI +/** + * pci_enable_pri - Enable PRI capability + * @ pdev: PCI device structure + * + * Returns 0 on success, negative value on error + */ +int pci_enable_pri(struct pci_dev *pdev, u32 reqs) +{ + u16 control, status; + u32 max_requests; + int pos; + + pos = pci_find_ext_capability(pdev, PCI_PRI_CAP); + if (!pos) + return -EINVAL; + + pci_read_config_word(pdev, pos + PCI_PRI_CONTROL_OFF, &control); + pci_read_config_word(pdev, pos + PCI_PRI_STATUS_OFF, &status); + if ((control & PCI_PRI_ENABLE) || !(status & PCI_PRI_STATUS_STOPPED)) + return -EBUSY; + + pci_read_config_dword(pdev, pos + PCI_PRI_MAX_REQ_OFF, &max_requests); + reqs = min(max_requests, reqs); + pci_write_config_dword(pdev, pos + PCI_PRI_ALLOC_REQ_OFF, reqs); + + control |= PCI_PRI_ENABLE; + pci_write_config_word(pdev, pos + PCI_PRI_CONTROL_OFF, control); + + return 0; +} +EXPORT_SYMBOL_GPL(pci_enable_pri); + +/** + * pci_disable_pri - Disable PRI capability + * @pdev: PCI device structure + * + * Only clears the enabled-bit, regardless of its former value + */ +void pci_disable_pri(struct pci_dev *pdev) +{ + u16 control; + int pos; + + pos = pci_find_ext_capability(pdev, PCI_PRI_CAP); + if (!pos) + return; + + pci_read_config_word(pdev, pos + PCI_PRI_CONTROL_OFF, &control); + control &= ~PCI_PRI_ENABLE; + pci_write_config_word(pdev, pos + PCI_PRI_CONTROL_OFF, control); +} +EXPORT_SYMBOL_GPL(pci_disable_pri); + +/** + * pci_pri_enabled - Checks if PRI capability is enabled + * @pdev: PCI device structure + * + * Returns true if PRI is enabled on the device, false otherwise + */ +bool pci_pri_enabled(struct pci_dev *pdev) +{ + u16 control; + int pos; + + pos = pci_find_ext_capability(pdev, PCI_PRI_CAP); + if (!pos) + return false; + + pci_read_config_word(pdev, pos + PCI_PRI_CONTROL_OFF, &control); + + return (control & PCI_PRI_ENABLE) ? true : false; +} +EXPORT_SYMBOL_GPL(pci_pri_enabled); + +/** + * pci_reset_pri - Resets device's PRI state + * @pdev: PCI device structure + * + * The PRI capability must be disabled before this function is called. + * Returns 0 on success, negative value on error. + */ +int pci_reset_pri(struct pci_dev *pdev) +{ + u16 control; + int pos; + + pos = pci_find_ext_capability(pdev, PCI_PRI_CAP); + if (!pos) + return -EINVAL; + + pci_read_config_word(pdev, pos + PCI_PRI_CONTROL_OFF, &control); + if (control & PCI_PRI_ENABLE) + return -EBUSY; + + control |= PCI_PRI_RESET; + + pci_write_config_word(pdev, pos + PCI_PRI_CONTROL_OFF, control); + + return 0; +} +EXPORT_SYMBOL_GPL(pci_reset_pri); + +/** + * pci_pri_stopped - Checks whether the PRI capability is stopped + * @pdev: PCI device structure + * + * Returns true if the PRI capability on the device is disabled and the + * device has no outstanding PRI requests, false otherwise. The device + * indicates this via the STOPPED bit in the status register of the + * capability. + * The device internal state can be cleared by resetting the PRI state + * with pci_reset_pri(). This can force the capability into the STOPPED + * state. + */ +bool pci_pri_stopped(struct pci_dev *pdev) +{ + u16 control, status; + int pos; + + pos = pci_find_ext_capability(pdev, PCI_PRI_CAP); + if (!pos) + return true; + + pci_read_config_word(pdev, pos + PCI_PRI_CONTROL_OFF, &control); + pci_read_config_word(pdev, pos + PCI_PRI_STATUS_OFF, &status); + + if (control & PCI_PRI_ENABLE) + return false; + + return (status & PCI_PRI_STATUS_STOPPED) ? true : false; +} +EXPORT_SYMBOL_GPL(pci_pri_stopped); + +/** + * pci_pri_status - Request PRI status of a device + * @pdev: PCI device structure + * + * Returns negative value on failure, status on success. The status can + * be checked against status-bits. Supported bits are currently: + * PCI_PRI_STATUS_RF: Response failure + * PCI_PRI_STATUS_UPRGI: Unexpected Page Request Group Index + * PCI_PRI_STATUS_STOPPED: PRI has stopped + */ +int pci_pri_status(struct pci_dev *pdev) +{ + u16 status, control; + int pos; + + pos = pci_find_ext_capability(pdev, PCI_PRI_CAP); + if (!pos) + return -EINVAL; + + pci_read_config_word(pdev, pos + PCI_PRI_CONTROL_OFF, &control); + pci_read_config_word(pdev, pos + PCI_PRI_STATUS_OFF, &status); + + /* Stopped bit is undefined when enable == 1, so clear it */ + if (control & PCI_PRI_ENABLE) + status &= ~PCI_PRI_STATUS_STOPPED; + + return status; +} +EXPORT_SYMBOL_GPL(pci_pri_status); +#endif /* CONFIG_PCI_PRI */ -- cgit v1.1 From 086ac11f6435c9dc2fe5025fc8ea3a1dbca273d6 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 27 Sep 2011 15:57:16 +0200 Subject: PCI: Add support for PASID capability Devices supporting Process Address Space Identifiers (PASIDs) can use an IOMMU to access multiple IO address spaces at the same time. A PCIe device indicates support for this feature by implementing the PASID capability. This patch adds support for the capability to the Linux kernel. Reviewed-by: Bjorn Helgaas Signed-off-by: Joerg Roedel Signed-off-by: Jesse Barnes --- drivers/pci/Kconfig | 13 ++++++ drivers/pci/ats.c | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 126 insertions(+) (limited to 'drivers/pci') diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index fb1e9707..cec6606 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -94,6 +94,19 @@ config PCI_PRI If unsure, say N. +config PCI_PASID + bool "PCI PASID support" + depends on PCI + select PCI_ATS + help + Process Address Space Identifiers (PASIDs) can be used by PCI devices + to access more than one IO address space at the same time. To make + use of this feature an IOMMU is required which also supports PASIDs. + Select this option if you have such an IOMMU and want to compile the + driver for it into your kernel. + + If unsure, say N. + config PCI_IOAPIC bool depends on PCI diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index bf892a0..f727a09 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -7,6 +7,7 @@ * PCI Express I/O Virtualization (IOV) support. * Address Translation Service 1.0 * Page Request Interface added by Joerg Roedel + * PASID support added by Joerg Roedel */ #include @@ -323,3 +324,115 @@ int pci_pri_status(struct pci_dev *pdev) } EXPORT_SYMBOL_GPL(pci_pri_status); #endif /* CONFIG_PCI_PRI */ + +#ifdef CONFIG_PCI_PASID +/** + * pci_enable_pasid - Enable the PASID capability + * @pdev: PCI device structure + * @features: Features to enable + * + * Returns 0 on success, negative value on error. This function checks + * whether the features are actually supported by the device and returns + * an error if not. + */ +int pci_enable_pasid(struct pci_dev *pdev, int features) +{ + u16 control, supported; + int pos; + + pos = pci_find_ext_capability(pdev, PCI_PASID_CAP); + if (!pos) + return -EINVAL; + + pci_read_config_word(pdev, pos + PCI_PASID_CONTROL_OFF, &control); + pci_read_config_word(pdev, pos + PCI_PASID_CAP_OFF, &supported); + + if (!(supported & PCI_PASID_ENABLE)) + return -EINVAL; + + supported &= PCI_PASID_EXEC | PCI_PASID_PRIV; + + /* User wants to enable anything unsupported? */ + if ((supported & features) != features) + return -EINVAL; + + control = PCI_PASID_ENABLE | features; + + pci_write_config_word(pdev, pos + PCI_PASID_CONTROL_OFF, control); + + return 0; +} +EXPORT_SYMBOL_GPL(pci_enable_pasid); + +/** + * pci_disable_pasid - Disable the PASID capability + * @pdev: PCI device structure + * + */ +void pci_disable_pasid(struct pci_dev *pdev) +{ + u16 control = 0; + int pos; + + pos = pci_find_ext_capability(pdev, PCI_PASID_CAP); + if (!pos) + return; + + pci_write_config_word(pdev, pos + PCI_PASID_CONTROL_OFF, control); +} +EXPORT_SYMBOL_GPL(pci_disable_pasid); + +/** + * pci_pasid_features - Check which PASID features are supported + * @pdev: PCI device structure + * + * Returns a negative value when no PASI capability is present. + * Otherwise is returns a bitmask with supported features. Current + * features reported are: + * PCI_PASID_ENABLE - PASID capability can be enabled + * PCI_PASID_EXEC - Execute permission supported + * PCI_PASID_PRIV - Priviledged mode supported + */ +int pci_pasid_features(struct pci_dev *pdev) +{ + u16 supported; + int pos; + + pos = pci_find_ext_capability(pdev, PCI_PASID_CAP); + if (!pos) + return -EINVAL; + + pci_read_config_word(pdev, pos + PCI_PASID_CAP_OFF, &supported); + + supported &= PCI_PASID_ENABLE | PCI_PASID_EXEC | PCI_PASID_PRIV; + + return supported; +} +EXPORT_SYMBOL_GPL(pci_pasid_features); + +#define PASID_NUMBER_SHIFT 8 +#define PASID_NUMBER_MASK (0x1f << PASID_NUMBER_SHIFT) +/** + * pci_max_pasid - Get maximum number of PASIDs supported by device + * @pdev: PCI device structure + * + * Returns negative value when PASID capability is not present. + * Otherwise it returns the numer of supported PASIDs. + */ +int pci_max_pasids(struct pci_dev *pdev) +{ + u16 supported; + int pos; + + pos = pci_find_ext_capability(pdev, PCI_PASID_CAP); + if (!pos) + return -EINVAL; + + pci_read_config_word(pdev, pos + PCI_PASID_CAP_OFF, &supported); + + supported = (supported & PASID_NUMBER_MASK) >> PASID_NUMBER_SHIFT; + + return (1 << supported); +} +EXPORT_SYMBOL_GPL(pci_max_pasids); +#endif /* CONFIG_PCI_PASID */ -- cgit v1.1 From d387a8d66670371e6be3b6d6bde2e38b8cade076 Mon Sep 17 00:00:00 2001 From: Jon Mason Date: Fri, 14 Oct 2011 14:56:13 -0500 Subject: PCI: Workaround for Intel MPS errata Intel 5000 and 5100 series memory controllers have a known issue if read completion coalescing is enabled and the PCI-E Maximum Payload Size is set to 256B. To work around this issue, disable read completion coalescing in the memory controller and root complexes. Unfortunately, it must always be disabled, even if no 256B MPS devices are present, due to the possibility of one being hotplugged. Links to erratas: http://www.intel.com/content/dam/doc/specification-update/5000-chipset-memory-controller-hub-specification-update.pdf http://www.intel.com/content/dam/doc/specification-update/5100-memory-controller-hub-chipset-specification-update.pdf Thanks to Jesse Brandeburg and Ben Hutchings for providing insight into the problem. Tested-and-Reported-by: Avi Kivity Signed-off-by: Jon Mason Signed-off-by: Jesse Barnes --- drivers/pci/quirks.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) (limited to 'drivers/pci') diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index e5aadf35..dca58b9 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -2836,6 +2836,75 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SOLARFLARE, DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SOLARFLARE, PCI_DEVICE_ID_SOLARFLARE_SFC4000B, fixup_mpss_256); +/* Intel 5000 and 5100 Memory controllers have an errata with read completion + * coalescing (which is enabled by default on some BIOSes) and MPS of 256B. + * Since there is no way of knowing what the PCIE MPS on each fabric will be + * until all of the devices are discovered and buses walked, read completion + * coalescing must be disabled. Unfortunately, it cannot be re-enabled because + * it is possible to hotplug a device with MPS of 256B. + */ +static void __devinit quirk_intel_mc_errata(struct pci_dev *dev) +{ + int err; + u16 rcc; + + if (pcie_bus_config == PCIE_BUS_TUNE_OFF) + return; + + /* Intel errata specifies bits to change but does not say what they are. + * Keeping them magical until such time as the registers and values can + * be explained. + */ + err = pci_read_config_word(dev, 0x48, &rcc); + if (err) { + dev_err(&dev->dev, "Error attempting to read the read " + "completion coalescing register.\n"); + return; + } + + if (!(rcc & (1 << 10))) + return; + + rcc &= ~(1 << 10); + + err = pci_write_config_word(dev, 0x48, rcc); + if (err) { + dev_err(&dev->dev, "Error attempting to write the read " + "completion coalescing register.\n"); + return; + } + + pr_info_once("Read completion coalescing disabled due to hardware " + "errata relating to 256B MPS.\n"); +} +/* Intel 5000 series memory controllers and ports 2-7 */ +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x25c0, quirk_intel_mc_errata); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x25d0, quirk_intel_mc_errata); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x25d4, quirk_intel_mc_errata); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x25d8, quirk_intel_mc_errata); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x25e2, quirk_intel_mc_errata); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x25e3, quirk_intel_mc_errata); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x25e4, quirk_intel_mc_errata); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x25e5, quirk_intel_mc_errata); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x25e6, quirk_intel_mc_errata); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x25e7, quirk_intel_mc_errata); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x25f7, quirk_intel_mc_errata); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x25f8, quirk_intel_mc_errata); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x25f9, quirk_intel_mc_errata); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x25fa, quirk_intel_mc_errata); +/* Intel 5100 series memory controllers and ports 2-7 */ +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x65c0, quirk_intel_mc_errata); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x65e2, quirk_intel_mc_errata); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x65e3, quirk_intel_mc_errata); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x65e4, quirk_intel_mc_errata); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x65e5, quirk_intel_mc_errata); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x65e6, quirk_intel_mc_errata); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x65e7, quirk_intel_mc_errata); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x65f7, quirk_intel_mc_errata); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x65f8, quirk_intel_mc_errata); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x65f9, quirk_intel_mc_errata); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x65fa, quirk_intel_mc_errata); + static void pci_do_fixups(struct pci_dev *dev, struct pci_fixup *f, struct pci_fixup *end) { -- cgit v1.1 From 62f392ea5b5f87b641e16e61a4cedda21ef7341f Mon Sep 17 00:00:00 2001 From: Jon Mason Date: Fri, 14 Oct 2011 14:56:14 -0500 Subject: PCI: enable MPS "performance" setting to properly handle bridge MPS Rework the "performance" MPS option to configure the device MPS with the smaller of the device MPSS or the bridge MPS (which is assumed to be properly configured at this point to the largest allowable MPS based on its parent bus). Also, rework the MRRS setting to report an inability to set the MRRS to a valid setting. Signed-off-by: Jon Mason Acked-by: Benjamin Herrenschmidt Signed-off-by: Jesse Barnes --- drivers/pci/probe.c | 55 +++++++++++++++++++++++------------------------------ 1 file changed, 24 insertions(+), 31 deletions(-) (limited to 'drivers/pci') diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 6ab6bd3..4829424 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1363,31 +1363,25 @@ static int pcie_find_smpss(struct pci_dev *dev, void *data) static void pcie_write_mps(struct pci_dev *dev, int mps) { - int rc, dev_mpss; - - dev_mpss = 128 << dev->pcie_mpss; + int rc; if (pcie_bus_config == PCIE_BUS_PERFORMANCE) { - if (dev->bus->self) { - dev_dbg(&dev->bus->dev, "Bus MPSS %d\n", - 128 << dev->bus->self->pcie_mpss); + mps = 128 << dev->pcie_mpss; - /* For "MPS Force Max", the assumption is made that + if (dev->pcie_type != PCI_EXP_TYPE_ROOT_PORT && dev->bus->self) + /* For "Performance", the assumption is made that * downstream communication will never be larger than * the MRRS. So, the MPS only needs to be configured * for the upstream communication. This being the case, * walk from the top down and set the MPS of the child * to that of the parent bus. + * + * Configure the device MPS with the smaller of the + * device MPSS or the bridge MPS (which is assumed to be + * properly configured at this point to the largest + * allowable MPS based on its parent bus). */ - mps = 128 << dev->bus->self->pcie_mpss; - if (mps > dev_mpss) - dev_warn(&dev->dev, "MPS configured higher than" - " maximum supported by the device. If" - " a bus issue occurs, try running with" - " pci=pcie_bus_safe.\n"); - } - - dev->pcie_mpss = ffs(mps) - 8; + mps = min(mps, pcie_get_mps(dev->bus->self)); } rc = pcie_set_mps(dev, mps); @@ -1395,25 +1389,22 @@ static void pcie_write_mps(struct pci_dev *dev, int mps) dev_err(&dev->dev, "Failed attempting to set the MPS\n"); } -static void pcie_write_mrrs(struct pci_dev *dev, int mps) +static void pcie_write_mrrs(struct pci_dev *dev) { - int rc, mrrs, dev_mpss; + int rc, mrrs; /* In the "safe" case, do not configure the MRRS. There appear to be * issues with setting MRRS to 0 on a number of devices. */ - if (pcie_bus_config != PCIE_BUS_PERFORMANCE) return; - dev_mpss = 128 << dev->pcie_mpss; - /* For Max performance, the MRRS must be set to the largest supported * value. However, it cannot be configured larger than the MPS the - * device or the bus can support. This assumes that the largest MRRS - * available on the device cannot be smaller than the device MPSS. + * device or the bus can support. This should already be properly + * configured by a prior call to pcie_write_mps. */ - mrrs = min(mps, dev_mpss); + mrrs = pcie_get_mps(dev); /* MRRS is a R/W register. Invalid values can be written, but a * subsequent read will verify if the value is acceptable or not. @@ -1421,16 +1412,18 @@ static void pcie_write_mrrs(struct pci_dev *dev, int mps) * shrink the value until it is acceptable to the HW. */ while (mrrs != pcie_get_readrq(dev) && mrrs >= 128) { - dev_warn(&dev->dev, "Attempting to modify the PCI-E MRRS value" - " to %d. If any issues are encountered, please try " - "running with pci=pcie_bus_safe\n", mrrs); rc = pcie_set_readrq(dev, mrrs); - if (rc) - dev_err(&dev->dev, - "Failed attempting to set the MRRS\n"); + if (!rc) + break; + dev_warn(&dev->dev, "Failed attempting to set the MRRS\n"); mrrs /= 2; } + + if (mrrs < 128) + dev_err(&dev->dev, "MRRS was unable to be configured with a " + "safe value. If problems are experienced, try running " + "with pci=pcie_bus_safe.\n"); } static int pcie_bus_configure_set(struct pci_dev *dev, void *data) @@ -1444,7 +1437,7 @@ static int pcie_bus_configure_set(struct pci_dev *dev, void *data) pcie_get_mps(dev), 128<pcie_mpss, pcie_get_readrq(dev)); pcie_write_mps(dev, mps); - pcie_write_mrrs(dev, mps); + pcie_write_mrrs(dev); dev_dbg(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n", pcie_get_mps(dev), 128<pcie_mpss, pcie_get_readrq(dev)); -- cgit v1.1 From a1c473aa11e61bc871be16279c9bf976acf22504 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 14 Oct 2011 14:56:15 -0500 Subject: pci: Clamp pcie_set_readrq() when using "performance" settings When configuring the PCIe settings for "performance", we allow parents to have a larger Max Payload Size than children and rely on children Max Read Request Size to not be larger than their own MPS to avoid having the host bridge generate responses they can't cope with. However, various drivers in Linux call pci_set_readrq() with arbitrary values, assuming this to be a simple performance tweak. This breaks under our "performance" configuration. Fix that by making sure the value programmed by pcie_set_readrq() is never larger than the configured MPS for that device. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Jon Mason Signed-off-by: Jesse Barnes --- drivers/pci/pci.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'drivers/pci') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 7cd417e..6f45a73 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -3202,8 +3202,6 @@ int pcie_set_readrq(struct pci_dev *dev, int rq) if (rq < 128 || rq > 4096 || !is_power_of_2(rq)) goto out; - v = (ffs(rq) - 8) << 12; - cap = pci_pcie_cap(dev); if (!cap) goto out; @@ -3211,6 +3209,22 @@ int pcie_set_readrq(struct pci_dev *dev, int rq) err = pci_read_config_word(dev, cap + PCI_EXP_DEVCTL, &ctl); if (err) goto out; + /* + * If using the "performance" PCIe config, we clamp the + * read rq size to the max packet size to prevent the + * host bridge generating requests larger than we can + * cope with + */ + if (pcie_bus_config == PCIE_BUS_PERFORMANCE) { + int mps = pcie_get_mps(dev); + + if (mps < 0) + return mps; + if (mps < rq) + rq = mps; + } + + v = (ffs(rq) - 8) << 12; if ((ctl & PCI_EXP_DEVCTL_READRQ) != v) { ctl &= ~PCI_EXP_DEVCTL_READRQ; -- cgit v1.1 From a513a99a7cebfb452839cc09c9c0586f72d96414 Mon Sep 17 00:00:00 2001 From: Jon Mason Date: Fri, 14 Oct 2011 14:56:16 -0500 Subject: PCI: Clean-up MPS debug output Clean-up MPS debug output to make it a single line and aligned, thus making it more readable for a large number of buses and devices in a single system. Suggested by Benjamin Herrenschmidt Signed-off-by: Jon Mason Signed-off-by: Jesse Barnes --- drivers/pci/probe.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'drivers/pci') diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 4829424..04e74f4 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1428,24 +1428,25 @@ static void pcie_write_mrrs(struct pci_dev *dev) static int pcie_bus_configure_set(struct pci_dev *dev, void *data) { - int mps = 128 << *(u8 *)data; + int mps, orig_mps; if (!pci_is_pcie(dev)) return 0; - dev_dbg(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n", - pcie_get_mps(dev), 128<pcie_mpss, pcie_get_readrq(dev)); + mps = 128 << *(u8 *)data; + orig_mps = pcie_get_mps(dev); pcie_write_mps(dev, mps); pcie_write_mrrs(dev); - dev_dbg(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n", - pcie_get_mps(dev), 128<pcie_mpss, pcie_get_readrq(dev)); + dev_info(&dev->dev, "PCI-E Max Payload Size set to %4d/%4d (was %4d), " + "Max Read Rq %4d\n", pcie_get_mps(dev), 128 << dev->pcie_mpss, + orig_mps, pcie_get_readrq(dev)); return 0; } -/* pcie_bus_configure_mps requires that pci_walk_bus work in a top-down, +/* pcie_bus_configure_settings requires that pci_walk_bus work in a top-down, * parents then children fashion. If this changes, then this code will not * work as designed. */ -- cgit v1.1