Re: [patch 2/2] add x86-64 support for memory hot-add

From: Yasunori Goto
Date: Tue Jan 10 2006 - 07:43:14 EST


> IIRC, the SRAT is only valid at boot time, so once a hotplug event has
> occurred it is no longer reliable. The DSDT should be used instead of the
> SRAT in that case, as in the following 2 patches.
> First is to get pxm from physical address.
> I'll post the second patch after this post.

Here is the second one.
It maps/unmaps between pxm and nid. This is just for ia64,
but I guess x86-64 would not be very different.


Signed-off-by: Keiichiro Tokunaga <tokunaga.keiich@xxxxxxxxxxxxxx>
Signed-off-by: Yasunori Goto <y-goto@xxxxxxxxxxxxxx>


Index: current_source/arch/ia64/kernel/acpi.c
===================================================================
--- current_source.orig/arch/ia64/kernel/acpi.c 2005-12-27 17:05:19.000000000 +0900
+++ current_source/arch/ia64/kernel/acpi.c 2005-12-27 17:08:18.000000000 +0900
@@ -67,6 +67,7 @@ EXPORT_SYMBOL(pm_power_off);

unsigned char acpi_kbd_controller_present = 1;
unsigned char acpi_legacy_devices;
+static nodemask_t node_present_map = NODE_MASK_NONE;

static unsigned int __initdata acpi_madt_rev;

@@ -408,10 +409,11 @@ static int __init acpi_parse_madt(unsign
static int __initdata srat_num_cpus; /* number of cpus */
static u32 __devinitdata pxm_flag[PXM_FLAG_LEN];
#define pxm_bit_set(bit) (set_bit(bit,(void *)pxm_flag))
+#define pxm_bit_clear(bit) (clear_bit(bit,(void *)pxm_flag))
#define pxm_bit_test(bit) (test_bit(bit,(void *)pxm_flag))
/* maps to convert between proximity domain and logical node ID */
int __devinitdata pxm_to_nid_map[MAX_PXM_DOMAINS];
-int __initdata nid_to_pxm_map[MAX_NUMNODES];
+int __devinitdata nid_to_pxm_map[MAX_NUMNODES];
static struct acpi_table_slit __initdata *slit_table;

/*
@@ -447,6 +449,36 @@ acpi_numa_processor_affinity_init(struct
srat_num_cpus++;
}

+/* Map @pxm to a node id, taking the first free id on first use; -1 on failure. */
+int __devinit acpi_map_pxm_to_nid(int pxm)
+{
+	nodemask_t unused;
+	int nid;
+	if (pxm < 0 || pxm >= MAX_PXM_DOMAINS)
+		return -1;	/* would index outside pxm_to_nid_map[] */
+	nid = pxm_to_nid_map[pxm];
+	if (nid == -1) {
+		nodes_complement(unused, node_present_map);
+		nid = first_node(unused);
+		if (nid >= MAX_NUMNODES)
+			return -1;	/* no unused node id left */
+		pxm_to_nid_map[pxm] = nid;
+		nid_to_pxm_map[nid] = pxm;
+		pxm_bit_set(pxm);
+	}
+	node_set(nid, node_present_map);
+	return nid;
+}
+
+/* Release @nid's bit in node_present_map once it holds no cpus and no memblks. */
+void acpi_unmap_pxm_to_nid(int nid)
+{
+	if (nid < 0 || nid >= MAX_NUMNODES)
+		return;		/* not a valid node_items[] index */
+	if (!node_items[nid].num_cpus && !node_items[nid].num_memblks)
+		node_clear(nid, node_present_map);
+}
+
void __init
acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
{
@@ -504,18 +536,19 @@ void __init acpi_numa_arch_fixup(void)
memset(pxm_to_nid_map, -1, sizeof(pxm_to_nid_map));
memset(nid_to_pxm_map, -1, sizeof(nid_to_pxm_map));
nodes_clear(node_online_map);
+ nodes_clear(node_present_map);
for (i = 0; i < MAX_PXM_DOMAINS; i++) {
if (pxm_bit_test(i)) {
- int nid = num_online_nodes();
- pxm_to_nid_map[i] = nid;
- nid_to_pxm_map[nid] = i;
+ int nid = acpi_map_pxm_to_nid(i);
node_set_online(nid);
}
}

/* set logical node id in memory chunk structure */
- for (i = 0; i < num_node_memblks; i++)
+ for (i = 0; i < num_node_memblks; i++) {
node_memblk[i].nid = pxm_to_nid_map[node_memblk[i].nid];
+ node_items[node_memblk[i].nid].num_memblks++;
+ }

/* assign memory bank numbers for each chunk on each node */
for_each_online_node(i) {
@@ -528,8 +561,10 @@ void __init acpi_numa_arch_fixup(void)
}

/* set logical node id in cpu structure */
- for (i = 0; i < srat_num_cpus; i++)
+ for (i = 0; i < srat_num_cpus; i++) {
node_cpuid[i].nid = pxm_to_nid_map[node_cpuid[i].nid];
+ node_items[node_cpuid[i].nid].num_cpus++;
+ }

printk(KERN_INFO "Number of logical nodes in system = %d\n",
num_online_nodes());
@@ -751,16 +786,50 @@ int acpi_map_cpu2node(acpi_handle handle
pxm_id = acpi_get_pxm(handle);

/*
- * Assuming that the container driver would have set the proximity
- * domain and would have initialized pxm_to_nid_map[pxm_id] && pxm_flag
+ * Assuming that if at least one processor's PXM < 0, the system does
+ * not have multiple PXMs. In this case, there is one PXM and all the
+ * devices belong to it.
*/
- node_cpuid[cpu].nid = (pxm_id < 0) ? 0 : pxm_to_nid_map[pxm_id];
+ if (pxm_id < 0)
+ pxm_id = 0;
+
+ /*
+ * Container driver might call cpu hotplug driver before memory hot-add.
+ * So, pxm_to_nid must be mapped here.
+ */
+ if ((pxm_id >= 0) && (pxm_id < MAX_PXM_DOMAINS)){
+ acpi_map_pxm_to_nid(pxm_id);
+ arch_register_node(pxm_to_nid_map[pxm_id]);
+ }
+
+ node_cpuid[cpu].nid = pxm_to_nid_map[pxm_id];

node_cpuid[cpu].phys_id = physid;
+ node_items[node_cpuid[cpu].nid].num_cpus++;
#endif
return (0);
}

+/* Undo acpi_map_cpu2node() for @cpu: drop it from its node's CPU count. */
+static void acpi_unmap_cpu2node(int cpu)
+{
+#ifdef CONFIG_ACPI_NUMA
+	int nid = node_cpuid[cpu].nid;
+
+	if (node_items[nid].num_cpus > 0)
+		node_items[nid].num_cpus--;
+
+	/*
+	 * acpi_unmap_pxm_to_nid() takes a node id (it indexes node_items[]),
+	 * not a proximity domain, so pass nid rather than nid_to_pxm_map[nid].
+	 */
+	acpi_unmap_pxm_to_nid(nid);
+
+	node_cpuid[cpu].phys_id = 0;
+	node_cpuid[cpu].nid = 0;
+#endif
+}
+
int acpi_map_lsapic(acpi_handle handle, int *pcpu)
{
struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
@@ -828,7 +897,7 @@ int acpi_unmap_lsapic(int cpu)
cpu_clear(cpu, cpu_present_map);

#ifdef CONFIG_ACPI_NUMA
- /* NUMA specific cleanup's */
+ acpi_unmap_cpu2node(cpu);
#endif

return (0);
Index: current_source/arch/ia64/mm/numa.c
===================================================================
--- current_source.orig/arch/ia64/mm/numa.c 2005-12-27 17:05:19.000000000 +0900
+++ current_source/arch/ia64/mm/numa.c 2005-12-27 17:08:18.000000000 +0900
@@ -28,6 +28,7 @@
int num_node_memblks;
struct node_memblk_s node_memblk[NR_NODE_MEMBLKS];
struct node_cpuid_s node_cpuid[NR_CPUS];
+struct node_items_s node_items[MAX_NUMNODES];
/*
* This is a matrix with "distances" between nodes, they should be
* proportional to the memory access latency ratios.
Index: current_source/include/asm-ia64/acpi.h
===================================================================
--- current_source.orig/include/asm-ia64/acpi.h 2005-12-27 17:05:19.000000000 +0900
+++ current_source/include/asm-ia64/acpi.h 2005-12-27 17:08:18.000000000 +0900
@@ -111,7 +111,7 @@ extern unsigned int get_cpei_target_cpu(
/* Proximity bitmap length; _PXM is at most 255 (8 bit)*/
#define MAX_PXM_DOMAINS (256)
extern int __devinitdata pxm_to_nid_map[MAX_PXM_DOMAINS];
-extern int __initdata nid_to_pxm_map[MAX_NUMNODES];
+extern int __devinitdata nid_to_pxm_map[MAX_NUMNODES];
#endif

extern u16 ia64_acpiid_to_sapicid[];
Index: current_source/include/asm-ia64/numa.h
===================================================================
--- current_source.orig/include/asm-ia64/numa.h 2005-12-27 17:05:19.000000000 +0900
+++ current_source/include/asm-ia64/numa.h 2005-12-27 18:44:16.000000000 +0900
@@ -47,8 +47,14 @@ struct node_cpuid_s {
int nid; /* logical node containing this CPU */
};

+struct node_items_s {
+ int num_cpus; /* number of CPUs currently assigned to this node */
+ int num_memblks; /* number of memory blocks assigned to this node */
+};
+
extern struct node_memblk_s node_memblk[NR_NODE_MEMBLKS];
extern struct node_cpuid_s node_cpuid[NR_CPUS];
+extern struct node_items_s node_items[MAX_NUMNODES];

/*
* ACPI 2.0 SLIT (System Locality Information Table)
@@ -68,11 +74,17 @@ extern int paddr_to_nid(unsigned long pa
extern int acpi_search_node_id(u64, u64);
#define firmware_phys_to_nid(start_addr, size) acpi_search_node_id(start_addr, size)

+extern int acpi_map_pxm_to_nid(int);
+extern void acpi_unmap_pxm_to_nid(int);
+#define arch_release_node_id(nid) acpi_unmap_pxm_to_nid(nid)
+
#else /* !CONFIG_NUMA */

#define paddr_to_nid(addr) 0
#define firmware_phys_to_nid(start_addr, size) 0

+#define acpi_map_pxm_to_nid(pxm) 0
+#define acpi_unmap_pxm_to_nid(pxm) {}
#endif /* CONFIG_NUMA */

#endif /* _ASM_IA64_NUMA_H */

--
Yasunori Goto


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/