Hi Dan
on 5/8/2023 5:45 PM, Zhijian Li (Fujitsu) wrote:
Dan, the kernel and makedumpfile have been updated. It's still at an early stage, but in order to make sure I'm following your proposal.
On 29/04/2023 02:59, Dan Williams wrote:
Li Zhijian wrote:Sorry for the late reply. I'm just back from the vacation.
Hello folks,I think the reason this patchkit is difficult to follow is that it
About 2 months ago, we posted our first RFC[3] and received your kind feedback. Thank you :)
Now, I'm back with the code.
Currently, this RFC already implements support for case D*, and cases A & B are disabled
deliberately in makedumpfile. It includes changes to 3 source trees, as below:
spends a lot of time describing a chosen solution, but not enough time
describing the problem and the tradeoffs.
For example why is updating /proc/vmcore with pmem metadata the chosen
solution? Why not leave the kernel out of it and have makedumpfile
tooling aware of how to parse persistent memory namespace info-blocks
and retrieve that dump itself? This is what I proposed here:
http://lore.kernel.org/r/641484f7ef780_a52e2940@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.notmuch
And sorry again for missing your previous *important* information in V1.
Your proposal also sounds to me like it needs fewer kernel changes, but more coupling between ndctl and the makedumpfile tools.
In my current understanding, it will include the following source changes.
I want to share the changes with you early. Alternatively, you can refer to my github for the full details.
https://github.com/zhijianli88/makedumpfile/commit/8ebfe38c015cfca0545cb3b1d7a6cc9a58fc9bb3
If I'm going the wrong way, feel free to let me know :)
-----------+-------------------------------------------------------------------+kernel should adapt it so that the metadata of pmem will be updated again in the kdump kernel:
Source | changes |
-----------+-------------------------------------------------------------------+
I. | 1. enter force_raw in kdump kernel automatically(avoid metadata being updated again)|
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index c60ec0b373c5..2e59be8b9c78 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -8,6 +8,7 @@
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/nd.h>
+#include <linux/crash_dump.h>
#include "nd-core.h"
#include "pmem.h"
#include "pfn.h"
@@ -1504,6 +1505,8 @@ struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev)
return ERR_PTR(-ENODEV);
}
+ if (is_kdump_kernel())
+ ndns->force_raw = true;
return ndns;
}
EXPORT_SYMBOL(nvdimm_namespace_common_probe);
kernel | |inspect_pmem_namespace() will walk the namespaces and then read each one's resource.start and infoblock. With this
| 2. mark the whole pmem's PT_LOAD for kexec_file_load(2) syscall |
-----------+-------------------------------------------------------------------+
II. kexec- | 1. mark the whole pmem's PT_LOAD for kexec_load(2) syscall |
tool | |
-----------+-------------------------------------------------------------------+
III. | 1. parse the infoblock and calculate the boundaries of userdata and metadata |
makedump- | 2. skip pmem userdata region |
file | 3. exclude pmem metadata region if needed |
-----------+-------------------------------------------------------------------+
I will try to rewrite it with your proposal ASAP
information, we can calculate the boundaries of userdata and metadata easily. But currently these changes are
strongly coupled with ndctl/pmem, which looks a bit messy and ugly.
============makedumpfile=======
diff --git a/Makefile b/Makefile
index a289e41ef44d..4b4ded639cfd 100644
--- a/Makefile
+++ b/Makefile
@@ -50,7 +50,7 @@ OBJ_PART=$(patsubst %.c,%.o,$(SRC_PART))
SRC_ARCH = arch/arm.c arch/arm64.c arch/x86.c arch/x86_64.c arch/ia64.c arch/ppc64.c arch/s390x.c arch/ppc.c arch/sparc64.c arch/mips64.c arch/loongarch64.c
OBJ_ARCH=$(patsubst %.c,%.o,$(SRC_ARCH))
-LIBS = -ldw -lbz2 -ldl -lelf -lz
+LIBS = -ldw -lbz2 -ldl -lelf -lz -lndctl
ifneq ($(LINKTYPE), dynamic)
LIBS := -static $(LIBS) -llzma
endif
diff --git a/makedumpfile.c b/makedumpfile.c
index 98c3b8c7ced9..db68d05a29f9 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -27,6 +27,8 @@
#include <limits.h>
#include <assert.h>
#include <zlib.h>
+#include <sys/types.h>
+#include <ndctl/libndctl.h>
+
+#define INFOBLOCK_SZ (8192) /* number of bytes nd_read_infoblock_dataoff() reads from the start of the block device */
+#define SZ_4K (4096) /* byte offset inside that read buffer at which struct pfn_sb is overlaid */
+#define PFN_SIG_LEN 16 /* length of pfn_sb.signature[] */
+
+typedef uint64_t u64; /* fixed-width integer aliases; of these only u8 is used by the code shown here */
+typedef int64_t s64;
+typedef uint32_t u32;
+typedef int32_t s32;
+typedef uint16_t u16;
+typedef int16_t s16;
+typedef uint8_t u8;
+typedef int8_t s8;
+
+typedef int64_t le64; /* aliases for the little-endian fields of struct pfn_sb; plain integers, nothing here converts byte order */
+typedef int32_t le32; /* NOTE(review): these are signed, whereas the Linux kernel's __le16/__le32/__le64 are unsigned -- confirm signed is intended */
+typedef int16_t le16;
+
+struct pfn_sb { /* info block layout; nd_read_infoblock_dataoff() overlays it at offset SZ_4K of the data read from the device */
+ u8 signature[PFN_SIG_LEN];
+ u8 uuid[16];
+ u8 parent_uuid[16];
+ le32 flags;
+ le16 version_major;
+ le16 version_minor;
+ le64 dataoff; /* relative to namespace_base + start_pad */
+ le64 npfns;
+ le32 mode;
+ /* minor-version-1 additions for section alignment */
+ le32 start_pad;
+ le32 end_trunc;
+ /* minor-version-2 record the base alignment of the mapping */
+ le32 align;
+ /* minor-version-3 guarantee the padding and flags are zero */
+ /* minor-version-4 record the page size and struct page size */
+ le32 page_size;
+ le16 page_struct_size;
+ u8 padding[3994]; /* fills the structure out so that checksum sits at byte offset 4088 (4096 bytes total, assuming no compiler-inserted padding) */
+ le64 checksum; /* NOTE(review): neither checksum nor signature is verified by the code shown here */
+};
+
+/*
+ * Read the info block of a namespace's block device and return its
+ * 'dataoff' field.
+ *
+ * The first INFOBLOCK_SZ bytes of /dev/<blockdev> are read and struct
+ * pfn_sb is taken from offset SZ_4K of that buffer.
+ *
+ * Returns the dataoff value on success, or -1 when the block device name
+ * is missing or too long for the path buffer, the device cannot be
+ * opened, the read is short, or dataoff does not fit the int return type.
+ *
+ * NOTE(review): dataoff is declared little-endian but is used without any
+ * byte-order conversion, and neither the signature nor the checksum of
+ * the info block is validated -- confirm this is acceptable.
+ */
+static int nd_read_infoblock_dataoff(struct ndctl_namespace *ndns)
+{
+	int fd, rc;
+	char path[50];
+	char buf[INFOBLOCK_SZ + 1];
+	struct pfn_sb *pfn_sb = (struct pfn_sb *)(buf + SZ_4K);
+	const char *bdev = ndctl_namespace_get_block_device(ndns);
+	long long dataoff;
+
+	/* without a device name there is nothing to open */
+	if (!bdev || bdev[0] == '\0')
+		return -1;
+
+	/* bounded formatting: refuse names that would overflow path[] */
+	rc = snprintf(path, sizeof(path), "/dev/%s", bdev);
+	if (rc < 0 || (size_t)rc >= sizeof(path))
+		return -1;
+
+	fd = open(path, O_RDONLY|O_EXCL);
+	if (fd < 0)
+		return -1;
+
+	rc = read(fd, buf, INFOBLOCK_SZ);
+	/* the descriptor is no longer needed whether or not the read succeeded */
+	close(fd);
+	if (rc < INFOBLOCK_SZ)
+		return -1;
+
+	/* dataoff is 64-bit on disk; do not silently truncate it to int */
+	dataoff = pfn_sb->dataoff;
+	if (dataoff < 0 || dataoff > INT_MAX)
+		return -1;
+
+	return (int)dataoff;
+}
+
+/*
+ * Walk every namespace of every region on every bus known to libndctl
+ * and hand to pmem_add_next() those namespaces whose resource address is
+ * available and whose info-block dataoff is larger than 2M.
+ *
+ * Returns -1 if the libndctl context cannot be created, 0 otherwise.
+ */
+int inspect_pmem_namespace(void)
+{
+	struct ndctl_ctx *nd_ctx;
+	struct ndctl_bus *nd_bus;
+	int err = -1;
+
+	fprintf(stderr, "\n\ninspect_pmem_namespace!!\n\n");
+	err = ndctl_new(&nd_ctx);
+	if (err != 0)
+		return -1;
+
+	ndctl_bus_foreach(nd_ctx, nd_bus) {
+		struct ndctl_region *nd_region;
+
+		ndctl_region_foreach(nd_bus, nd_region) {
+			struct ndctl_namespace *ns;
+
+			ndctl_namespace_foreach(nd_region, ns) {
+				long long base, meta_end;
+
+				/* kdump kernel should set force_raw; namespaces reported as *safe* are skipped */
+				if (ndctl_namespace_get_mode(ns) == NDCTL_NS_MODE_SAFE) {
+					fprintf(stderr, "Only raw can be dumpable\n");
+					continue;
+				}
+
+				base = ndctl_namespace_get_resource(ns);
+				meta_end = nd_read_infoblock_dataoff(ns);
+
+				/* skip when the resource is unknown or dataoff is not above 2M */
+				if (base == ULLONG_MAX || meta_end <= 2 * 1024 * 1024)
+					continue;
+
+				pmem_add_next(base, meta_end);
+			}
+		}
+	}
+
+	ndctl_unref(nd_ctx);
+	return 0;
+}
+
Thanks
Zhijian
Thanks again_______________________________________________
Thanks
Zhijian
...but never got an answer, or I missed the answer._______________________________________________
kexec mailing list
kexec@xxxxxxxxxxxxxxxxxxx
http://lists.infradead.org/mailman/listinfo/kexec
kexec mailing list
kexec@xxxxxxxxxxxxxxxxxxx
http://lists.infradead.org/mailman/listinfo/kexec