Re: [PATCH] AMD Thermal Interrupt Support

From: Russell Leidich
Date: Fri Jan 04 2008 - 19:53:47 EST


On Jan 4, 2008 2:26 PM, Andi Kleen <andi@xxxxxxxxxxxxxx> wrote:
> > There are 2 pending issues, to which I have received insufficient feedback:
> >
> > 1. Andi would like to eliminate the trampoline in mce_thermal.c, but
> > no one has responded to my proposed disgusting hack on entry_64.S in
> > order to do so.
>
> There are two ways to do it:
>
> either duplicate the entry point in entry_64.S and set different
> vectors or let the asm glue jump through a call vector (use
> apicinterrupt ...,*thermal_vector(%rip) ) Later is probably better.

OK, I did it the latter way. Your thermal_vector is my
smp_thermal_interrupt, which I have converted from a function to a
pointer to the CPU-specific function.

>
> >
> > 2. Ingo pointed out that a given config file did not build. But when
>
> The patch that was in git-x86 didn't build on 32bit. Perhaps it was a
> 32bit config file? Run it under "linux32" or equivalent on your distro.

I _think_ it's fixed now. I'll let him rerun it at his end for verification.

>
> > + */
> > + if (therm_throt_process(1))
> > + mce_log_therm_throt_event(cpu, 1);
> > + /*
> > + * We'll still get subsequent interrupts even if we don't clear the
> > + * status bit in THERM_CTL_F3X64. Take advantage of this fact to avoid
> > + * touching PCI space. (If this assumption fails at some point, we'll
> > + * need to schedule_work() in order to enter a process context, so that
> > + * PCI locks can be asserted for proper access. This requirement, in
>
> PCI locks are spinlocks so they don't need process context. In PREEMPT-RT
> kernels they can sleep, but there the handler will be likely already
> a thread. Touching config space from interrupt context is legal, although
> not very popular because it tends to be slow (but a few drivers do it)

But if PCI locks are spinlocks, then how can one access config space
in an interrupt handler, as it might be locked by the foreground? (No
locks would be required at all, if everyone just saved 0xCF8 and
0xCFC, but I digress.) And it's one thing to be "likely" already in a
thread, and another to be sure. If you can address these issues, I'll
change or remove the comment. I just want to prevent a
reasonable-looking but bad coding change from happening in the future.

>
> > + /*
> > + * If any of the northbridges has PCI ID 0x1103, then its thermal
> > + * hardware suffers from an erratum which prevents this code from
> > + * working, so abort.
>
> Please add
>
> * This implies it only works on Family 10h aka AMD Quad Core.
>
> Otherwise I can just see the support questions of people asking why this
> doesn't work for them.

Agreed. I had it at the top of the function, but now I've worked it
into both places.

>
> Anyways I'm unsure about the blacklist here -- white list would
> probably have been better. k8_northbridges[] will certainly include
> Griffin northtbridges and who knows if TT will work there or not.
> [sorry for mentioning that not earlier]

Ideally, every ID in the k8_northbridges[] whitelist would have
functional thermal throttling. If more IDs than 0x1103 turn out to
have this feature broken, then we may need to add a blacklist as well.
But I expect that most future IDs will function correctly, hence my
reliance on the whitelist. In my view, anyone who adds an ID to a
whitelist (or just a list, for that matter) is obligated to perform a
static analysis (i.e. grep for "k8_northbridges") of the implications
of such act; therefore, I'm not too concerned about Griffin.

I've attached an updated patch. In the unlikely event that you want
to check this in, let me know so I can give it a final test.

--
Russell Leidich
diff -uprN linux-2.6.24-rc5.orig/arch/x86/kernel/cpu/mcheck/Makefile linux-2.6.24-rc5/arch/x86/kernel/cpu/mcheck/Makefile
--- linux-2.6.24-rc5.orig/arch/x86/kernel/cpu/mcheck/Makefile 2007-12-11 14:30:53.533297000 -0800
+++ linux-2.6.24-rc5/arch/x86/kernel/cpu/mcheck/Makefile 2007-12-11 11:56:20.549185000 -0800
@@ -1,4 +1,4 @@
-obj-y = mce_$(BITS).o therm_throt.o
+obj-y = mce_$(BITS).o therm_throt.o mce_thermal.o

obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o
obj-$(CONFIG_X86_MCE_INTEL) += mce_intel_64.o
diff -uprN linux-2.6.24-rc5.orig/arch/x86/kernel/cpu/mcheck/mce_amd_64.c linux-2.6.24-rc5/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
--- linux-2.6.24-rc5.orig/arch/x86/kernel/cpu/mcheck/mce_amd_64.c 2007-12-11 14:30:53.559298000 -0800
+++ linux-2.6.24-rc5/arch/x86/kernel/cpu/mcheck/mce_amd_64.c 2008-01-04 16:28:05.676044000 -0800
@@ -20,15 +20,18 @@
#include <linux/interrupt.h>
#include <linux/kobject.h>
#include <linux/notifier.h>
+#include <linux/pci_ids.h>
#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/sysdev.h>
#include <linux/sysfs.h>
#include <asm/apic.h>
-#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/percpu.h>
#include <asm/idle.h>
+#include <asm/therm_throt.h>
+#include <asm/k8.h>
+#include "mce_thermal.h"

#define PFX "mce_threshold: "
#define VERSION "version 1.1.1"
@@ -46,6 +49,16 @@
#define MASK_ERR_COUNT_HI 0x00000FFF
#define MASK_BLKPTR_LO 0xFF000000
#define MCG_XBLK_ADDR 0xC0000400
+#define THERM_CTL_F3X64 0x64
+#define PSL_APIC_LO_EN 0x80
+#define PSL_APIC_HI_EN 0x40
+#define HTC_ACTIVE 0x10
+#define HTC_EN 1
+#define NB_PCI_DEV_BASE 0x18
+
+static int smp_thermal_interrupt_init(void);
+static int thermal_apic_init_allowed;
+static void thermal_apic_init(void *unused);

struct threshold_block {
unsigned int block;
@@ -657,6 +670,9 @@ static int threshold_cpu_callback(struct
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
threshold_create_device(cpu);
+ if (thermal_apic_init_allowed)
+ smp_call_function_single(cpu,
+ &thermal_apic_init, NULL, 1, 0);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
@@ -665,7 +681,7 @@ static int threshold_cpu_callback(struct
default:
break;
}
- out:
+out:
return NOTIFY_OK;
}

@@ -688,3 +704,187 @@ static __init int threshold_init_device(
}

device_initcall(threshold_init_device);
+
+/*
+ * AMD-specific thermal interrupt handler.
+ */
+void amd_smp_thermal_interrupt(void)
+{
+ unsigned int cpu = smp_processor_id();
+
+ ack_APIC_irq();
+ exit_idle();
+ irq_enter();
+ /*
+ * We're here because thermal throttling has just been activated -- not
+ * deactivated -- hence therm_throt_process(1).
+ */
+ if (therm_throt_process(1))
+ mce_log_therm_throt_event(cpu, 1);
+ /*
+ * We'll still get subsequent interrupts even if we don't clear the
+ * status bit in THERM_CTL_F3X64. Take advantage of this fact to avoid
+ * touching PCI space. (If this assumption fails at some point, we'll
+ * need to schedule_work() in order to enter a process context, so that
+ * PCI locks can be asserted for proper access. This requirement, in
+ * turn, creates the need to remember which core interrupted, as the
+ * core which ultimately takes the scheduled work may be different.
+ * With any luck, we'll never need to do this.)
+ */
+ irq_exit();
+}
+
+/*
+ * Initialize each northbridge's thermal throttling logic.
+ */
+static void smp_thermal_northbridge_init(void)
+{
+ int nb_num;
+ u32 therm_ctl_f3x64;
+
+ for (nb_num = 0; nb_num < num_k8_northbridges; nb_num++) {
+ /*
+ * Configure the thermal interrupt for this northbridge.
+ */
+ pci_read_config_dword(k8_northbridges[nb_num],
+ THERM_CTL_F3X64, &therm_ctl_f3x64);
+ therm_ctl_f3x64 |= PSL_APIC_HI_EN | HTC_EN;
+ therm_ctl_f3x64 &= (~PSL_APIC_LO_EN);
+ pci_write_config_dword(k8_northbridges[nb_num],
+ THERM_CTL_F3X64, therm_ctl_f3x64);
+ printk(KERN_INFO "Northbridge at PCI device 0x%x: "
+ "thermal monitoring enabled.\n", NB_PCI_DEV_BASE +
+ nb_num);
+ }
+}
+
+/*
+ * Enable the delivery of thermal interrupts via the local APIC.
+ */
+static void thermal_apic_init(void *unused)
+{
+ unsigned int apic_lv_therm;
+
+ /* Set up APIC_LVTTHMR to issue THERMAL_APIC_VECTOR. */
+ apic_lv_therm = apic_read(APIC_LVTTHMR);
+ /*
+ * See if some agent other than this routine has already initialized
+ * APIC_LVTTHMR, i.e. if it's unmasked, but not equal to the value that
+ * we would have programmed, had we been here before on this core.
+ */
+ if ((!(apic_lv_therm & APIC_LVT_MASKED)) && ((apic_lv_therm &
+ (APIC_MODE_MASK | APIC_VECTOR_MASK)) != (APIC_DM_FIXED |
+ THERMAL_APIC_VECTOR))) {
+ unsigned int cpu = smp_processor_id();
+
+ printk(KERN_WARNING "CPU 0x%x: Thermal monitoring not "
+ "functional.\n", cpu);
+ if ((apic_lv_therm & APIC_MODE_MASK) == APIC_DM_SMI)
+ printk(KERN_DEBUG "Thermal interrupts already "
+ "handled by SMI according to (((local APIC "
+ "base) + 0x330) bit 0x9).\n");
+ else
+ printk(KERN_DEBUG "Thermal interrupts unexpectedly "
+ "enabled at (((local APIC base) + 0x330) bit "
+ "0x10).\n");
+ } else {
+ /*
+ * Configure the Local Thermal Vector Table Entry to issue
+ * issue thermal interrupts to THERMAL_APIC_VECTOR.
+ *
+ * Start by masking off Delivery Mode and Vector.
+ */
+ apic_lv_therm &= ~(APIC_MODE_MASK | APIC_VECTOR_MASK);
+ /* Fixed interrupt, masked for now. */
+ apic_lv_therm |= APIC_LVT_MASKED | APIC_DM_FIXED |
+ THERMAL_APIC_VECTOR;
+ apic_write(APIC_LVTTHMR, apic_lv_therm);
+ /*
+ * The Intel thermal kernel code implies that there may be a
+ * race involving the mask bit, so clear it only now, after
+ * the other bits have settled.
+ */
+ apic_write(APIC_LVTTHMR, apic_lv_therm & ~APIC_LVT_MASKED);
+ }
+}
+
+/*
+* This function is intended to be called just after thermal throttling has
+ * been enabled. It warns the user if throttling is already active, which
+ * could indicate a failed cooling system. It may be the last chance to get
+ * a warning out before thermal shutdown occurs.
+ */
+static void smp_thermal_early_throttle_check(void)
+{
+ int nb_num;
+ u32 therm_ctl_f3x64;
+
+ for (nb_num = 0; nb_num < num_k8_northbridges; nb_num++) {
+ /*
+ * Read back THERM_CTL_F3X64 to check whther HTC_ACTIVE is
+ * asserted, in which case, warn the user.
+ */
+ pci_read_config_dword(k8_northbridges[nb_num],
+ THERM_CTL_F3X64, &therm_ctl_f3x64);
+ if (therm_ctl_f3x64 & HTC_ACTIVE)
+ printk(KERN_WARNING "High temperature on northbridge "
+ "at PCI device 0x%x. Throttling enabled.\n",
+ NB_PCI_DEV_BASE + nb_num);
+ }
+}
+
+/*
+ * Determine whether or not the northbridges support thermal throttling
+ * interrupts. If so, initialize them for receiving the same, then perform
+ * corresponding APIC initialization on each core.
+ *
+ * Due to an erratum involving the thermal sensor, this code does not work
+ * on any CPU prior to Family 0x10 (AMD Quad Core).
+ */
+static int smp_thermal_interrupt_init(void)
+{
+ int nb_num;
+
+ if (num_k8_northbridges == 0)
+ goto out;
+ /*
+ * If any of the northbridges has PCI ID 0x1103, then its thermal
+ * hardware suffers from an erratum which prevents this code from
+ * working, so abort. (This implies that it only works on Family
+ * 0x10 (AMD Quad Core).)
+ */
+ for (nb_num = 0; nb_num < num_k8_northbridges; nb_num++) {
+ if ((k8_northbridges[nb_num]->device) == 0x1103)
+ goto out;
+ }
+ /*
+ * Assert that we should log thermal throttling events, whenever
+ * we eventually get around to enabling them.
+ */
+ atomic_set(&therm_throt_en, 1);
+ /*
+ * Bind smp_thermal_interrupt() to amd_smp_thermal_interrupt().
+ */
+ smp_thermal_interrupt = amd_smp_thermal_interrupt;
+ smp_thermal_northbridge_init();
+ /*
+ * We've now initialized sufficient fabric to permit the
+ * initialization of the thermal interrupt APIC vectors, such as
+ * when a core comes online and calls threshold_cpu_callback().
+ */
+ thermal_apic_init_allowed = 1;
+ /*
+ * Call thermal_apic_init() on each core.
+ */
+ on_each_cpu(&thermal_apic_init, NULL, 1, 0);
+ smp_thermal_early_throttle_check();
+out:
+ return 0;
+}
+
+/*
+ * Ensure that smp_thermal_interrupt_init() executes after num_k8_northbridges
+ * has been initialized.
+ */
+late_initcall(smp_thermal_interrupt_init);
+
diff -uprN linux-2.6.24-rc5.orig/arch/x86/kernel/cpu/mcheck/mce_intel_64.c linux-2.6.24-rc5/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
--- linux-2.6.24-rc5.orig/arch/x86/kernel/cpu/mcheck/mce_intel_64.c 2007-12-11 14:30:53.564303000 -0800
+++ linux-2.6.24-rc5/arch/x86/kernel/cpu/mcheck/mce_intel_64.c 2008-01-04 16:25:08.547999000 -0800
@@ -8,24 +8,21 @@
#include <linux/percpu.h>
#include <asm/processor.h>
#include <asm/msr.h>
-#include <asm/mce.h>
#include <asm/hw_irq.h>
#include <asm/idle.h>
#include <asm/therm_throt.h>
+#include "mce_thermal.h"

-asmlinkage void smp_thermal_interrupt(void)
+void intel_smp_thermal_interrupt(void)
{
__u64 msr_val;

ack_APIC_irq();
-
exit_idle();
irq_enter();
-
rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
if (therm_throt_process(msr_val & 1))
mce_log_therm_throt_event(smp_processor_id(), msr_val);
-
add_pda(irq_thermal_count, 1);
irq_exit();
}
@@ -75,6 +72,10 @@ static void __cpuinit intel_init_thermal
wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h);

l = apic_read(APIC_LVTTHMR);
+ /*
+ * Bind smp_thermal_interrupt() to intel_smp_thermal_interrupt().
+ */
+ smp_thermal_interrupt = intel_smp_thermal_interrupt;
apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
cpu, tm2 ? "TM2" : "TM1");
diff -uprN linux-2.6.24-rc5.orig/arch/x86/kernel/cpu/mcheck/mce_thermal.c linux-2.6.24-rc5/arch/x86/kernel/cpu/mcheck/mce_thermal.c
--- linux-2.6.24-rc5.orig/arch/x86/kernel/cpu/mcheck/mce_thermal.c 1969-12-31 16:00:00.000000000 -0800
+++ linux-2.6.24-rc5/arch/x86/kernel/cpu/mcheck/mce_thermal.c 2008-01-04 15:45:40.686775000 -0800
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2007 Google Inc.
+ *
+ * Written by Mike Waychison <mikew@xxxxxxxxxx> and Russell Leidich
+ * <rml@xxxxxxxxxx>.
+ *
+ * CPU-independent thermal interrupt handler.
+ */
+
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <asm/hw_irq.h>
+#include <asm/idle.h>
+#include "mce_thermal.h"
+
+static void default_smp_thermal_interrupt(void)
+{
+ if (printk_ratelimit())
+ printk(KERN_DEBUG "Unexpected thermal throttling interrupt.\n");
+}
+
+smp_thermal_interrupt_callback_t smp_thermal_interrupt
+ = default_smp_thermal_interrupt;
+
diff -uprN linux-2.6.24-rc5.orig/arch/x86/kernel/cpu/mcheck/mce_thermal.h linux-2.6.24-rc5/arch/x86/kernel/cpu/mcheck/mce_thermal.h
--- linux-2.6.24-rc5.orig/arch/x86/kernel/cpu/mcheck/mce_thermal.h 1969-12-31 16:00:00.000000000 -0800
+++ linux-2.6.24-rc5/arch/x86/kernel/cpu/mcheck/mce_thermal.h 2008-01-04 15:53:07.614658000 -0800
@@ -0,0 +1,8 @@
+#include <linux/init.h>
+#include <asm/mce.h>
+
+typedef void (*smp_thermal_interrupt_callback_t)(void);
+extern smp_thermal_interrupt_callback_t smp_thermal_interrupt;
+
+void mce_log_therm_throt_event(unsigned int cpu, __u64 status);
+
diff -uprN linux-2.6.24-rc5.orig/arch/x86/kernel/entry_64.S linux-2.6.24-rc5/arch/x86/kernel/entry_64.S
--- linux-2.6.24-rc5.orig/arch/x86/kernel/entry_64.S 2007-12-11 14:30:53.756297000 -0800
+++ linux-2.6.24-rc5/arch/x86/kernel/entry_64.S 2008-01-04 15:34:16.761342000 -0800
@@ -649,7 +649,7 @@ END(common_interrupt)
.endm

ENTRY(thermal_interrupt)
- apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
+ apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt(%rip)
END(thermal_interrupt)

ENTRY(threshold_interrupt)