Re: [RFC PATCH v2 00/15][Sorted-buddy] mm: Memory Power Management

From: Srinivas Pandruvada
Date: Thu Apr 18 2013 - 11:09:04 EST


On 04/18/2013 02:54 AM, Srivatsa S. Bhat wrote:
On 04/17/2013 10:23 PM, Srinivas Pandruvada wrote:
On 04/09/2013 02:45 PM, Srivatsa S. Bhat wrote:
[I know, this cover letter is a little too long, but I wanted to clearly
explain the overall goals and the high-level design of this patchset in
detail. I hope this helps more than it annoys, and makes it easier for
reviewers to relate to the background and the goals of this patchset.]


Overview of Memory Power Management and its implications to the Linux MM
========================================================================

[...]
One thing you need to prevent is boot time allocation. You have to make
sure that frequently accessed per node data stored at the end of memory
will keep all ranks of memory active.

When I was experimenting I did something like this.
/////////////////////////////////


+/*
+ * Experimental MPST implemenentation
+ * Copyright (c) 2012, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/kthread.h>
+#include <linux/acpi.h>
+#include <linux/export.h>
+#include <linux/bootmem.h>
+#include <linux/delay.h>
+#include <linux/pfn.h>
+#include <linux/suspend.h>
+#include <linux/acpi.h>
+#include <linux/memblock.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/migrate.h>
+#include <linux/mm_inline.h>
+#include <linux/page-isolation.h>
+#include <linux/vmalloc.h>
+#include <linux/compaction.h>
+#include "internal.h"
+
+#define phys_to_pfn(p) ((p) >> PAGE_SHIFT)
+#define pfn_to_phys(p) ((p) << PAGE_SHIFT)
+#define MAX_MPST_ZONES 16
+/* Atleast 4G of non MPST memory. */
+#define MINIMAL_NON_MPST_MEMORY_PFN (0x100000000 >> PAGE_SHIFT)
+
+struct mpst_mem_zone {
+ phys_addr_t start_addr;
+ phys_addr_t end_addr;
+};
+
+static struct mpst_mem_zone mpst_zones[MAX_MPST_ZONES];
+static int mpst_zone_cnt;
+static unsigned long mpst_start_pfn;
+static unsigned long mpst_end_pfn;
+static bool mpst_enabled;
+
+/* Minimal parsing for just getting node ranges */
+static int __init acpi_parse_mpst_table(struct acpi_table_header *table)
+{
+ struct acpi_table_mpst *mpst;
+ struct acpi_mpst_power_node *node;
+ u16 node_count;
+ int i;
+
+ mpst = (struct acpi_table_mpst *)table;
+ if (!mpst) {
+ pr_warn("Unable to map MPST\n");
+ return -ENODEV;
+ }
+ node_count = mpst->power_node_count;
+ node = (struct acpi_mpst_power_node *)((u8 *)mpst + sizeof(*mpst));
+
+ for (i = mpst_zone_cnt; (i < node_count) && (i < MAX_MPST_ZONES);
+ ++i) {
+ if ((node->flags & ACPI_MPST_ENABLED) &&
+ (node->flags & ACPI_MPST_POWER_MANAGED)) {
+ mpst_zones[mpst_zone_cnt].start_addr =
+ node->range_address;
+ mpst_zones[mpst_zone_cnt].end_addr =
+ node->range_address + node->range_length;
+ ++mpst_zone_cnt;
+ }
+ ++node;
+ }
+
+ return 0;
+}
+
+static unsigned long local_ahex_to_long(const char *name)
+{
+ unsigned long val = 0;
+
+ for (;; name++) {
+ switch (*name) {
+ case '0' ... '9':
+ val = 16*val+(*name-'0');
+ break;
+ case 'A' ... 'F':
+ val = 16*val+(*name-'A'+10);
+ break;
+ case 'a' ... 'f':
+ val = 16*val+(*name-'a'+10);
+ break;
+ default:
+ return val;
+ }
+ }
+
+ return val;
+}
+
+/* Specify MPST range by command line for test till ACPI - MPST is available */
+static int __init parse_mpst_opt(char *str)
+{
+ char *ptr;
+ phys_addr_t start_at = 0, end_at = 0;
+ u64 mem_size = 0;
+
+ if (!str)
+ return -EINVAL;
+ ptr = str;
+ while (1) {
+ if (*str == '-') {
+ *str = '\0';
+ start_at = local_ahex_to_long(ptr);
+ ++str;
+ ptr = str;
+ }
+ if (start_at && (*str == '\0' || *str == ',' || *str == ' ')) {
+ *str = '\0';
+ end_at = local_ahex_to_long(ptr);
+ mem_size = end_at-start_at;
+ ++str;
+ ptr = str;
+ pr_info("-mpst[%#018Lx-%#018Lx size: %#018Lx]\n",
+ start_at, end_at, mem_size);
+ if (IS_ALIGNED(phys_to_pfn(start_at),
+ pageblock_nr_pages) &&
+ IS_ALIGNED(phys_to_pfn(end_at),
+ pageblock_nr_pages)) {
+ mpst_zones[mpst_zone_cnt].start_addr =
+ start_at;
+ mpst_zones[mpst_zone_cnt].end_addr =
+ end_at;
+ } else {
+ pr_err("mpst invalid range\n");
+ return -EINVAL;
+ }
+ mpst_zone_cnt++;
+ start_at = mem_size = end_at = 0;
+ }
+ if (*str == '\0')
+ break;
+ else
+ ++str;
+ }
+
+ return 0;
+}
+early_param("mpst_range", parse_mpst_opt);
+
+/* Specify MPST range by command line for test till ACPI - MPST is available */
+static int __init parse_mpst_enable_opt(char *str)
+{
+ long value;
+ if (kstrtol(str, 10, &value))
+ return -EINVAL;
+ mpst_enabled = value ? true : false;
+
+ return 0;
+}
+early_param("mpst_enable", parse_mpst_enable_opt);
+
+/* Set the minimum and maximum PFN */
+static void mpst_set_min_max_pfn(void)
+{
+ int i;
+
+ if (!mpst_zone_cnt)
+ return;
+
+ mpst_start_pfn = phys_to_pfn(mpst_zones[0].start_addr);
+ mpst_end_pfn = phys_to_pfn(mpst_zones[0].end_addr);
+
+ for (i = 1; i < mpst_zone_cnt; ++i) {
+ if (mpst_start_pfn > phys_to_pfn(mpst_zones[i].start_addr))
+ mpst_start_pfn = phys_to_pfn(mpst_zones[i].start_addr);
+ if (mpst_end_pfn < phys_to_pfn(mpst_zones[i].end_addr))
+ mpst_end_pfn = phys_to_pfn(mpst_zones[i].end_addr);
+ }
+}
+
+/* Change migrate type for the MPST ranges */
+int mpst_set_migrate_type(void)
+{
+ int i;
+ struct page *page;
+ unsigned long start_pfn, end_pfn;
+
+ if (!mpst_start_pfn || !mpst_end_pfn)
+ return -EINVAL;
+ if (!IS_ALIGNED(mpst_start_pfn, pageblock_nr_pages))
+ return -EINVAL;
+ if (!IS_ALIGNED(mpst_end_pfn, pageblock_nr_pages))
+ return -EINVAL;
+ memblock_free(pfn_to_phys(mpst_start_pfn),
+ pfn_to_phys(mpst_end_pfn) - pfn_to_phys(mpst_start_pfn));
+ for (i = 0; i < mpst_zone_cnt; ++i) {
+ start_pfn = phys_to_pfn(mpst_zones[i].start_addr);
+ end_pfn = phys_to_pfn(mpst_zones[i].end_addr);
+ for (; start_pfn < end_pfn; ++start_pfn) {
+ page = pfn_to_page(start_pfn);
+ if (page)
+ set_pageblock_migratetype(page,
+ MIGRATE_LP_MEMORY);
+ }
+ }
+
+ return 0;
+}
+
+/* Parse ACPI table and find start and end of MPST zone.
+Assuming zones are contiguous */
+int mpst_init(void)
+{
+ if (!mpst_enabled) {
+ pr_info("mpst not enabled in command line\n");
+ return 0;
+ }
+
+ acpi_table_parse(ACPI_SIG_MPST, acpi_parse_mpst_table);
+ mpst_set_min_max_pfn();
+ if (mpst_zone_cnt) {
+
+ if (mpst_start_pfn < MINIMAL_NON_MPST_MEMORY_PFN) {
+ pr_err("Not enough memory: Ignore MPST\n");
+ mpst_start_pfn = mpst_end_pfn = 0;
+ return -EINVAL;
+ }
+ memblock_reserve(pfn_to_phys(mpst_start_pfn),
+ pfn_to_phys(mpst_end_pfn) -
+ pfn_to_phys(mpst_start_pfn));
+ pr_info("mpst_init memblock limit set to pfn %lu 0x%#018lx\n",
+ mpst_start_pfn, pfn_to_phys(mpst_start_pfn));
+ }
+
+ return 0;
+}





/////////////////////////////
I think you meant to say "... stored at the end of memory will NOT keep all
ranks of memory active".

Yep, that's a good point! I'll think about how to achieve that. Thanks!

Regards,
Srivatsa S. Bhat

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxxx For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/