This patchset is version 3 of my /proc/pid/pagemaps code. Rather than submit about 30 incremental patches atop an existing 20 or so where many of the intermediate states are broken and get undone anyway, I've respun this as a much smaller set of 11 patches. Changes in this series: - headers gone again (as recommended by Dave Hansen and Alan Cox) - 64-bit entries (as per discussion with Andi Kleen) - swap pte information exported (from Dave Hansen) - page walker callback for holes (from Dave Hansen) - direct put_user I/O (as suggested by Rusty Russell) - split kpagemap into kpagecount and kpageflags I've dropped one cleanup patch from Rusty from the current series, mmaps2-vma-out-of-mem_size_stats.patch, which I didn't find to be an improvement. Andrew, please replace the current maps2* patches with this set at your convenience. I've included the above change list in the relevant patches. -
Move is_swap_pte helper function to swapops.h for use by pagemap code
Signed-off-by: Matt Mackall <mpm@selenic.com>
Index: l/include/linux/swapops.h
===================================================================
--- l.orig/include/linux/swapops.h 2007-10-09 17:36:25.000000000 -0500
+++ l/include/linux/swapops.h 2007-10-10 11:46:34.000000000 -0500
@@ -42,6 +42,12 @@ static inline pgoff_t swp_offset(swp_ent
return entry.val & SWP_OFFSET_MASK(entry);
}
+/* check whether a pte points to a swap entry */
+static inline int is_swap_pte(pte_t pte)
+{
+ return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
+}
+
/*
* Convert the arch-dependent pte representation of a swp_entry_t into an
* arch-independent swp_entry_t.
Index: l/mm/migrate.c
===================================================================
--- l.orig/mm/migrate.c 2007-10-09 17:37:59.000000000 -0500
+++ l/mm/migrate.c 2007-10-10 11:46:34.000000000 -0500
@@ -114,11 +114,6 @@ int putback_lru_pages(struct list_head *
return count;
}
-static inline int is_swap_pte(pte_t pte)
-{
- return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
-}
-
/*
* Restore a potential migration pte to a working pte entry
*/
-
Use the generic pagewalker for smaps and clear_refs
Signed-off-by: Matt Mackall <mpm@selenic.com>
Index: l/fs/proc/task_mmu.c
===================================================================
--- l.orig/fs/proc/task_mmu.c 2007-10-14 13:36:56.000000000 -0500
+++ l/fs/proc/task_mmu.c 2007-10-14 13:37:08.000000000 -0500
@@ -116,6 +116,7 @@ static void pad_len_spaces(struct seq_fi
struct mem_size_stats
{
+ struct vm_area_struct *vma;
unsigned long resident;
unsigned long shared_clean;
unsigned long shared_dirty;
@@ -145,13 +146,6 @@ struct mem_size_stats
#define PSS_DIV_BITS 12
};
-struct pmd_walker {
- struct vm_area_struct *vma;
- void *private;
- void (*action)(struct vm_area_struct *, pmd_t *, unsigned long,
- unsigned long, void *);
-};
-
static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss)
{
struct proc_maps_private *priv = m->private;
@@ -241,11 +235,11 @@ static int show_map(struct seq_file *m,
return show_map_internal(m, v, NULL);
}
-static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, unsigned long end,
- void *private)
+static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ void *private)
{
struct mem_size_stats *mss = private;
+ struct vm_area_struct *vma = mss->vma;
pte_t *pte, ptent;
spinlock_t *ptl;
struct page *page;
@@ -283,12 +277,13 @@ static void smaps_pte_range(struct vm_ar
}
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
+ return 0;
}
-static void clear_refs_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, unsigned long end,
- void *private)
+static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, void *private)
{
+ struct vm_area_struct *vma = private;
pte_t *pte, ptent;
spinlock_t *ptl;
struct page *page;
@@ -309,71 +304,10 @@ static void clear_refs_pte_range(struct
}
pte_unmap_unlock(pte ...Acked-by: David Rientjes <rientjes@google.com> -
From: Fengguang Wu <wfg@mail.ustc.edu.cn>
The "proportional set size" (PSS) of a process is the count of pages it has
in memory, where each page is divided by the number of processes sharing
it. So if a process has 1000 pages all to itself, and 1000 shared with one
other process, its PSS will be 1500.
- lwn.net: "ELC: How much memory are applications really using?"
The PSS proposed by Matt Mackall is a very nice metric for measuring a
process's memory footprint. So collect and export it via
/proc/<pid>/smaps.
Matt Mackall's pagemap/kpagemap and John Berthels's exmap can also do the
job. They are comprehensive tools. But for PSS, let's do it in the simple
way.
Cc: John Berthels <jjberthels@gmail.com>
Cc: Bernardo Innocenti <bernie@codewiz.org>
Cc: Padraig Brady <P@draigBrady.com>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Matt Mackall <mpm@selenic.com>
Signed-off-by: Fengguang Wu <wfg@mail.ustc.edu.cn>
Cc: Hugh Dickins <hugh@veritas.com>
---
fs/proc/task_mmu.c | 29 ++++++++++++++++++++++++++++-
1 files changed, 28 insertions(+), 1 deletion(-)
Index: l/fs/proc/task_mmu.c
===================================================================
--- l.orig/fs/proc/task_mmu.c 2007-10-14 13:35:31.000000000 -0500
+++ l/fs/proc/task_mmu.c 2007-10-14 13:36:56.000000000 -0500
@@ -122,6 +122,27 @@ struct mem_size_stats
unsigned long private_clean;
unsigned long private_dirty;
unsigned long referenced;
+
+ /*
+ * Proportional Set Size(PSS): my share of RSS.
+ *
+ * PSS of a process is the count of pages it has in memory, where each
+ * page is divided by the number of processes sharing it. So if a
+ * process has 1000 pages all to itself, and 1000 shared with one other
+ * process, its PSS will be 1500. - Matt Mackall, lwn.net
+ */
+ u64 pss;
+ /*
+ * To keep (accumulated) division errors low, we adopt 64bit pss and
+ * use some low bits for division ...I know this gets moved again in the eighth patch of the series, but the
#define still has no place inside the struct definition.
The pss is going to need accessor functions, preferably inlined, and the
comment adjusted stating that all accesses should be through those
functions and not directly to the mem_size_stats struct.
static inline u64 pss_up(unsigned long pss)
{
return pss << PSS_DIV_BITS;
}
static inline unsigned long pss_down(u64 pss)
{
return pss >> PSS_DIV_BITS;
}
-
I think that's overkill for something that has exactly one use of each. -- Mathematics is the supreme nostalgia of our time. -
There's no overkill at all, the current uses are already accessed with these bitshifts so there's no overhead when using an inlined function instead. To correctly access the pss, these bitshifts are required because the decision was made to use the lower PSS_DIV_BITS for rounding. Thus, you need to include accessor functions so that they are always accessed correctly now and in the future. David -
From: Matt Mackall <mpm@selenic.com>
This pulls the shared map display code out of show_map and puts it in
show_smap where it belongs.
Signed-off-by: Matt Mackall <mpm@selenic.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Index: l/fs/proc/task_mmu.c
===================================================================
--- l.orig/fs/proc/task_mmu.c 2007-10-14 13:37:08.000000000 -0500
+++ l/fs/proc/task_mmu.c 2007-10-14 13:38:43.000000000 -0500
@@ -146,7 +146,7 @@ struct mem_size_stats
#define PSS_DIV_BITS 12
};
-static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss)
+static int show_map(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
struct task_struct *task = priv->task;
@@ -206,35 +206,11 @@ static int show_map_internal(struct seq_
}
seq_putc(m, '\n');
- if (mss)
- seq_printf(m,
- "Size: %8lu kB\n"
- "Rss: %8lu kB\n"
- "Pss: %8lu kB\n"
- "Shared_Clean: %8lu kB\n"
- "Shared_Dirty: %8lu kB\n"
- "Private_Clean: %8lu kB\n"
- "Private_Dirty: %8lu kB\n"
- "Referenced: %8lu kB\n",
- (vma->vm_end - vma->vm_start) >> 10,
- mss->resident >> 10,
- (unsigned long)(mss.pss >> (10 + PSS_DIV_BITS)),
- mss->shared_clean >> 10,
- mss->shared_dirty >> 10,
- mss->private_clean >> 10,
- mss->private_dirty >> 10,
- mss->referenced >> 10);
-
if (m->count < m->size) /* vma is copied successfully */
m->version = (vma != get_gate_vma(task))? vma->vm_start: 0;
return 0;
}
-static int show_map(struct seq_file *m, void *v)
-{
- return show_map_internal(m, v, NULL);
-}
-
static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
void *private)
{
@@ -313,13 +289,37 @@ static int show_smap(struct seq_file *m,
{
struct vm_area_struct *vma = v;
...Introduce a general page table walker
Signed-off-by: Matt Mackall <mpm@selenic.com>
Index: l/include/linux/mm.h
===================================================================
--- l.orig/include/linux/mm.h 2007-10-09 17:37:59.000000000 -0500
+++ l/include/linux/mm.h 2007-10-10 11:46:37.000000000 -0500
@@ -773,6 +773,17 @@ unsigned long unmap_vmas(struct mmu_gath
struct vm_area_struct *start_vma, unsigned long start_addr,
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *);
+
+struct mm_walk {
+ int (*pgd_entry)(pgd_t *, unsigned long, unsigned long, void *);
+ int (*pud_entry)(pud_t *, unsigned long, unsigned long, void *);
+ int (*pmd_entry)(pmd_t *, unsigned long, unsigned long, void *);
+ int (*pte_entry)(pte_t *, unsigned long, unsigned long, void *);
+ int (*pte_hole) (unsigned long, unsigned long, void *);
+};
+
+int walk_page_range(struct mm_struct *, unsigned long addr, unsigned long end,
+ struct mm_walk *walk, void *private);
void free_pgd_range(struct mmu_gather **tlb, unsigned long addr,
unsigned long end, unsigned long floor, unsigned long ceiling);
void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma,
Index: l/mm/Makefile
===================================================================
--- l.orig/mm/Makefile 2007-10-09 17:37:59.000000000 -0500
+++ l/mm/Makefile 2007-10-10 11:46:37.000000000 -0500
@@ -5,7 +5,7 @@
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
- vmalloc.o
+ vmalloc.o pagewalk.o
obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
page_alloc.o page-writeback.o pdflush.o \
Index: l/mm/pagewalk.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ l/mm/pagewalk.c 2007-10-10 11:46:37.000000000 -0500
@@ -0,0 +1,120 @@
+#include <linux/mm.h>
+#include ...It would be nice to have some clue about when each of these functions are called (depth first? pre or post order?), and what their params are. Does it call a callback for folded pagetable levels? Can pte_hole be used to create new mappings while we're traversing the Should this be (pte, addr, addr+PAGE_SIZE, private)? Is the second addr argument for the address range being mapped by this thing? Why pass -
For now, we should probably document that these functions assume that
the appropriate locks are held, and that there are no changes being made
to the pagetables as we walk.
However, I can see that people might want to use these in the future for
establishing ptes. Perhaps a special code coming back from the
->pte_hole() function could indicate changes were made to the
pagetables. I guess we could at least retry part of the loop where the
hole call was made, like:
+int walk_page_range(struct mm_struct *mm,...
+{
...
+ pgd = pgd_offset(mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none_or_clear_bad(pgd)) {
+ if (walk->pte_hole)
+ err = walk->pte_hole(addr, next, private);
if (err == -EAGAIN) { // or whatever we want
pgd--;
err = 0;
}
+ if (err)
+ break;
+ continue;
+ }
That wouldn't allow changes behind the walker, but it should allow them
in the range that was walked by the ->pte_hole() function.
-- Dave
-
Yes. We already have apply_to_page_range(), which has the side effect
of creating the page range in order to apply a function to it. It would
be nice to be able to replicate its functionality with this page walker
Yep.
J
-
Probably - the pattern is [start, end). Either that or we should have Oops. -- Mathematics is the supreme nostalgia of our time. -
Should walk_page_range be exported? Is it trivial to convert ioremap to use this new pte walker? -
From: Matt Mackall <mpm@selenic.com>
This puts all the clear_refs code where it belongs and probably lets things
compile on MMU-less systems as well.
Signed-off-by: Matt Mackall <mpm@selenic.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Index: l/fs/proc/base.c
===================================================================
--- l.orig/fs/proc/base.c 2007-10-14 13:35:08.000000000 -0500
+++ l/fs/proc/base.c 2007-10-14 13:39:00.000000000 -0500
@@ -713,42 +713,6 @@ static const struct file_operations proc
.write = oom_adjust_write,
};
-#ifdef CONFIG_MMU
-static ssize_t clear_refs_write(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct task_struct *task;
- char buffer[PROC_NUMBUF], *end;
- struct mm_struct *mm;
-
- memset(buffer, 0, sizeof(buffer));
- if (count > sizeof(buffer) - 1)
- count = sizeof(buffer) - 1;
- if (copy_from_user(buffer, buf, count))
- return -EFAULT;
- if (!simple_strtol(buffer, &end, 0))
- return -EINVAL;
- if (*end == '\n')
- end++;
- task = get_proc_task(file->f_path.dentry->d_inode);
- if (!task)
- return -ESRCH;
- mm = get_task_mm(task);
- if (mm) {
- clear_refs_smap(mm);
- mmput(mm);
- }
- put_task_struct(task);
- if (end - buffer == 0)
- return -EIO;
- return end - buffer;
-}
-
-static struct file_operations proc_clear_refs_operations = {
- .write = clear_refs_write,
-};
-#endif
-
#ifdef CONFIG_AUDITSYSCALL
#define TMPBUFLEN 21
static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
Index: l/fs/proc/internal.h
===================================================================
--- l.orig/fs/proc/internal.h 2007-10-14 13:35:08.000000000 -0500
+++ l/fs/proc/internal.h 2007-10-14 13:39:00.000000000 -0500
@@ -49,11 +49,7 @@ extern int proc_pid_statm(struct task_st
extern const struct file_operations proc_maps_operations;
extern const struct ...The #define for PROC_NUMBUF will need to be moved from fs/proc/base.c to include/linux/proc_fs.h and used here instead of hardcoding it. -
From: Matt Mackall <mpm@selenic.com>
This interface provides a mapping for each page in an address space to its
physical page frame number, allowing precise determination of what pages are
mapped and what pages are shared between processes.
New in this version:
- headers gone again (as recommended by Dave Hansen and Alan Cox)
- 64-bit entries (as per discussion with Andi Kleen)
- swap pte information exported (from Dave Hansen)
- page walker callback for holes (from Dave Hansen)
- direct put_user I/O (as suggested by Rusty Russell)
This patch folds in cleanups and swap PTE support from Dave Hansen
<haveblue@us.ibm.com>.
Signed-off-by: Matt Mackall <mpm@selenic.com>
Index: l/fs/proc/base.c
===================================================================
--- l.orig/fs/proc/base.c 2007-10-14 13:39:00.000000000 -0500
+++ l/fs/proc/base.c 2007-10-15 17:18:09.000000000 -0500
@@ -635,7 +635,7 @@ out_no_task:
}
#endif
-static loff_t mem_lseek(struct file * file, loff_t offset, int orig)
+loff_t mem_lseek(struct file * file, loff_t offset, int orig)
{
switch (orig) {
case 0:
@@ -2034,6 +2034,7 @@ static const struct pid_entry tgid_base_
#ifdef CONFIG_MMU
REG("clear_refs", S_IWUSR, clear_refs),
REG("smaps", S_IRUGO, smaps),
+ REG("pagemap", S_IRUSR, pagemap),
#endif
#ifdef CONFIG_SECURITY
DIR("attr", S_IRUGO|S_IXUGO, attr_dir),
@@ -2320,6 +2321,7 @@ static const struct pid_entry tid_base_s
#ifdef CONFIG_MMU
REG("clear_refs", S_IWUSR, clear_refs),
REG("smaps", S_IRUGO, smaps),
+ REG("pagemap", S_IRUSR, pagemap),
#endif
#ifdef CONFIG_SECURITY
DIR("attr", S_IRUGO|S_IXUGO, attr_dir),
Index: l/fs/proc/internal.h
===================================================================
--- l.orig/fs/proc/internal.h 2007-10-14 13:39:00.000000000 -0500
+++ l/fs/proc/internal.h 2007-10-15 17:18:09.000000000 -0500
@@ -45,11 +45,13 @@ extern int proc_tid_stat(struct task_str
extern int proc_tgid_stat(struct task_struct *, ...From: Matt Mackall <mpm@selenic.com>
Reorder source so that all the code and data for each interface is together.
Signed-off-by: Matt Mackall <mpm@selenic.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Index: l/fs/proc/task_mmu.c
===================================================================
--- l.orig/fs/proc/task_mmu.c 2007-10-14 13:42:11.000000000 -0500
+++ l/fs/proc/task_mmu.c 2007-10-14 18:07:26.000000000 -0500
@@ -114,37 +114,121 @@ static void pad_len_spaces(struct seq_fi
seq_printf(m, "%*c", len, ' ');
}
-struct mem_size_stats
+static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma)
{
- struct vm_area_struct *vma;
- unsigned long resident;
- unsigned long shared_clean;
- unsigned long shared_dirty;
- unsigned long private_clean;
- unsigned long private_dirty;
- unsigned long referenced;
+ if (vma && vma != priv->tail_vma) {
+ struct mm_struct *mm = vma->vm_mm;
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+ }
+}
+
+static void *m_start(struct seq_file *m, loff_t *pos)
+{
+ struct proc_maps_private *priv = m->private;
+ unsigned long last_addr = m->version;
+ struct mm_struct *mm;
+ struct vm_area_struct *vma, *tail_vma = NULL;
+ loff_t l = *pos;
+
+ /* Clear the per syscall fields in priv */
+ priv->task = NULL;
+ priv->tail_vma = NULL;
/*
- * Proportional Set Size(PSS): my share of RSS.
- *
- * PSS of a process is the count of pages it has in memory, where each
- * page is divided by the number of processes sharing it. So if a
- * process has 1000 pages all to itself, and 1000 shared with one other
- * process, its PSS will be 1500. - Matt Mackall, lwn.net
+ * We remember last_addr rather than next_addr to hit with
+ * mmap_cache most of the time. We have zero last_addr at
+ * the beginning and also after lseek. We will have -1 last_addr
+ * after the end of the vmas.
*/
- u64 ...For the /proc/<pid>/pagemap code[1], we need to able to query how much virtual address space a particular task has. The trick is that we do it through /proc and can't use TASK_SIZE since it references "current" on some arches. The process opening the /proc file might be a 32-bit process opening a 64-bit process's pagemap file. x86_64 already has a TASK_SIZE_OF() macro: #define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64) I'd like to have that for other architectures. So, add it for all the architectures that actually use "current" in their TASK_SIZE. For the others, just add a quick #define in sched.h to use plain old TASK_SIZE. 1. http://www.linuxworld.com/news/2007/042407-kernel.html - MIPS portion from Ralf Baechle <ralf@linux-mips.org> Signed-off-by: Dave Hansen <haveblue@us.ibm.com> Signed-off-by: Ralf Baechle <ralf@linux-mips.org> Signed-off-by: Matt Mackall <mpm@selenic.com> --- lxc-dave/include/asm-ia64/processor.h | 3 ++- lxc-dave/include/asm-mips/processor.h | 4 ++++ lxc-dave/include/asm-parisc/processor.h | 3 ++- lxc-dave/include/asm-powerpc/processor.h | 4 +++- lxc-dave/include/asm-s390/processor.h | 2 ++ lxc-dave/include/linux/sched.h | 4 ++++ 6 files changed, 17 insertions(+), 3 deletions(-) Index: l/include/asm-ia64/processor.h =================================================================== --- l.orig/include/asm-ia64/processor.h 2007-10-09 17:37:58.000000000 -0500 +++ l/include/asm-ia64/processor.h 2007-10-10 11:46:30.000000000 -0500 @@ -31,7 +31,8 @@ * each (assuming 8KB page size), for a total of 8TB of user virtual * address space. */ -#define TASK_SIZE (current->thread.task_size) +#define TASK_SIZE_OF(tsk) ((tsk)->thread.task_size) +#define TASK_SIZE TASK_SIZE_OF(current) /* * This decides where the kernel will search for a free chunk of vm Index: ...
TASK_SIZE_OF() should be defined in terms of TASK_SIZE, just like it is Same. -
David, All of your comments looked pretty valid to me. I've refreshed that patch. I haven't even compile-tested this so there may be some fat fingering somewhere. I'll run compile tests on it now. -- Dave For the /proc/<pid>/pagemap code[1], we need to be able to query how much virtual address space a particular task has. The trick is that we do it through /proc and can't use TASK_SIZE since it references "current" on some arches. The process opening the /proc file might be a 32-bit process opening a 64-bit process's pagemap file. x86_64 already has a TASK_SIZE_OF() macro: #define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64) I'd like to have that for other architectures. So, add it for all the architectures that actually use "current" in their TASK_SIZE. For the others, just add a quick #define in sched.h to use plain old TASK_SIZE. 1. http://www.linuxworld.com/news/2007/042407-kernel.html - MIPS portion from Ralf Baechle <ralf@linux-mips.org> Signed-off-by: Dave Hansen <haveblue@us.ibm.com> Signed-off-by: Ralf Baechle <ralf@linux-mips.org> Signed-off-by: Matt Mackall <mpm@selenic.com> --- lxc-dave/include/asm-ia64/processor.h | 3 ++- lxc-dave/include/asm-mips/processor.h | 4 ++++ lxc-dave/include/asm-parisc/processor.h | 3 ++- lxc-dave/include/asm-powerpc/processor.h | 3 ++- lxc-dave/include/asm-s390/processor.h | 3 ++- lxc-dave/include/linux/sched.h | 4 ++++ 6 files changed, 16 insertions(+), 4 deletions(-) diff -puN include/asm-ia64/processor.h~PATCH_2_11_maps3-_introduce_task_size_of_for_all_arches include/asm-ia64/processor.h --- lxc/include/asm-ia64/processor.h~PATCH_2_11_maps3-_introduce_task_size_of_for_all_arches 2007-10-15 17:29:22.000000000 -0700 +++ lxc-dave/include/asm-ia64/processor.h 2007-10-15 17:29:22.000000000 -0700 @@ -31,7 +31,8 @@ * each (assuming 8KB page size), for a total of 8TB of user virtual * address space. ...
test_tsk_thread_flag() takes two arguments. -
The following replaces the earlier patches sent. It should address David Rientjes's comments, and has been compile tested on all the architectures that it touches, save for parisc. ---- For the /proc/<pid>/pagemap code[1], we need to be able to query how much virtual address space a particular task has. The trick is that we do it through /proc and can't use TASK_SIZE since it references "current" on some arches. The process opening the /proc file might be a 32-bit process opening a 64-bit process's pagemap file. x86_64 already has a TASK_SIZE_OF() macro: #define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64) I'd like to have that for other architectures. So, add it for all the architectures that actually use "current" in their TASK_SIZE. For the others, just add a quick #define in sched.h to use plain old TASK_SIZE. 1. http://www.linuxworld.com/news/2007/042407-kernel.html - MIPS portion from Ralf Baechle <ralf@linux-mips.org> Signed-off-by: Dave Hansen <haveblue@us.ibm.com> Signed-off-by: Ralf Baechle <ralf@linux-mips.org> Signed-off-by: Matt Mackall <mpm@selenic.com> --- lxc-dave/include/asm-ia64/processor.h | 3 ++- lxc-dave/include/asm-mips/processor.h | 4 ++++ lxc-dave/include/asm-parisc/processor.h | 3 ++- lxc-dave/include/asm-powerpc/processor.h | 3 ++- lxc-dave/include/asm-s390/processor.h | 3 ++- lxc-dave/include/linux/sched.h | 4 ++++ 6 files changed, 16 insertions(+), 4 deletions(-) diff -puN include/asm-ia64/processor.h~PATCH_2_11_maps3-_introduce_task_size_of_for_all_arches include/asm-ia64/processor.h --- lxc/include/asm-ia64/processor.h~PATCH_2_11_maps3-_introduce_task_size_of_for_all_arches 2007-10-15 17:29:22.000000000 -0700 +++ lxc-dave/include/asm-ia64/processor.h 2007-10-15 17:29:22.000000000 -0700 @@ -31,7 +31,8 @@ * each (assuming 8KB page size), for a total of 8TB of user virtual * address space. */ -#define ...
Acked-by: David Rientjes <rientjes@google.com> -
From: Matt Mackall <mpm@selenic.com>
This makes physical page map counts available to userspace. Together
with /proc/pid/pagemap and /proc/pid/clear_refs, this can be used to
monitor memory usage on a per-page basis.
[bunk@stusta.de: make struct proc_kpagemap static]
Signed-off-by: Matt Mackall <mpm@selenic.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
Index: l/fs/proc/proc_misc.c
===================================================================
--- l.orig/fs/proc/proc_misc.c 2007-10-09 17:37:57.000000000 -0500
+++ l/fs/proc/proc_misc.c 2007-10-10 11:46:50.000000000 -0500
@@ -46,6 +46,7 @@
#include <linux/vmalloc.h>
#include <linux/crash_dump.h>
#include <linux/pid_namespace.h>
+#include <linux/bootmem.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/io.h>
@@ -656,6 +657,106 @@ static const struct file_operations proc
};
#endif
+#define KPMSIZE sizeof(u64)
+#define KPMMASK (KPMSIZE - 1)
+/* /proc/kpagecount - an array exposing page counts
+ *
+ * Each entry is a u64 representing the corresponding
+ * physical page count.
+ */
+static ssize_t kpagecount_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ u64 __user *out = (u64 __user *)buf;
+ struct page *ppage;
+ unsigned long src = *ppos;
+ unsigned long pfn;
+ ssize_t ret = 0;
+ u64 pcount;
+
+ if (!access_ok(VERIFY_WRITE, buf, count))
+ return -EFAULT;
+
+ pfn = src / KPMSIZE;
+ count = min_t(size_t, count, (max_pfn * KPMSIZE) - src);
+ if (src & KPMMASK || count & KPMMASK)
+ return -EIO;
+
+ while (count > 0) {
+ ppage = pfn_to_page(pfn++);
+ if (!ppage)
+ pcount = 0;
+ else
+ pcount = atomic_read(&ppage->_count);
+
+ if (put_user(pcount, out++)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ count -= KPMSIZE;
+ }
+
+ *ppos += (char __user *)out - buf;
+ if (!ret)
+ ret ...This one makes me worry a little bit. Are we sure that this won't expose a wee bit too much to userspace? I can see it making sense to clear the page refs, then inspect whether the page has been referenced again. But, I worry that people are going to start doing things like read NUMA, SPARSEMEM, or other internal information out of these. I've seen quite a few patches lately that do creative things with these *cough*clameter*cough*, and I worry that they're too fluid to get exposed to userspace. Could we just have /proc/kpagereferenced? Is there a legitimate need for other flags to be visible? -- Dave -
Hmm, I would have thought you'd find the NUMA bits especially interesting. Being able to, say, colorize a process' memory map by what nodes its That is a concern. In general, I think getting too cute with page flags and struct page in general is a bad idea because the rules here are already so complex/fragile/confusing/underdocumented, but there's Referenced, dirty, uptodate, lru, active, slab, writeback, reclaim, and buddy all look like they might be interesting to me from the point of view of watching what's happening in the VM graphically in real-time. For instance, watching the slab bit I can watch a 'find /' fill up huge swaths of contiguous dcache memory, then get fragmented to hell and never recover when I do a large userspace malloc. In other words, this thing actually lets you see all the crap that happens in the VM that we usually handwave about. -- Mathematics is the supreme nostalgia of our time. -
This is true, but it forces a lot of logic from the kernel to be run in userspace to figure out what is going on. Looking at mainline today: #define PG_reclaim 17 /* To be reclaimed asap */ ... #define PG_readahead PG_reclaim /* Reminder to do async read-ahead */ All of a sudden, to figure out which flag it actually is, we need to have all of the logic that the kernel does. Does this establish a fixed user<->kernel ABI that will keep us from doing this in the future: -#define PG_slab 7 /* slab debug (Suparna wants this) */ +#define PG_slab 14 /* slab debug (Suparna wants this) */ Or, even something like this: -#define PageSlab(page) test_bit(PG_slab, &(page)->flags) +#define PageSlab(page) (!PageLRU(page) && !PageHighmem(page)) If we actually had several (or even still one file) that exposed this state, independent of the actual content of page->flags, I think we'd be better off. I think that's the difference between a fun, super-useful debugging feature and one that can stay in mainline and have applications stay using it (without breaking) for a long time. The flags you listed are things that I would imagine will always exist, logically. But, we might not always have a specific page flag for pages under writeback or in the buddy list for that matter. PG_buddy isn't that old. Perhaps that would be better abstracted to something like page_in_main_allocator(). -- Dave -
Yeah, there are a bunch of flags that aren't mutually exclusive and we Perhaps we need something like: flags = page->flags; userflags = FLAG_BIT(USER_REFERENCED, flags & PG_referenced) | ... etc. for the flags we want to export. This will let us change to FLAG_BIT(USER_SLAB, PageSlab(page)) | if we make a virtual slab bit. And it shows up in grep. Unfortunately, i386 test_bit is an asm inline and not a macro so we can't hope for the compiler to fold up a bunch of identity bit mappings for us. -- Mathematics is the supreme nostalgia of our time. -
Yeah, that looks like a pretty sane scheme. Do we want to be any more abstract about it? Perhaps instead of USER_SLAB, it should be USER_KERNEL_INTERNAL, or USER_KERNEL_USE. The slab itself is going away as we speak. For the bits that we want to export, we could also add the unoptimized access functions for any that don't already have them: #define __ClearPageReserved(page) __clear_bit(PG_reserved, &(page)->flags) Anybody changing bit behavior will certainly go check all of the callers, such as ClearPageReserved() *and* __ClearPageReserved(). -- Dave -
Perhaps. SLUB is still "a slab-based allocator". SLOB isn't, but I Confused. Why are we interested in clear? -- Mathematics is the supreme nostalgia of our time. -
We're not. I just grabbed a random line to show the non-atomic accessors. Any actual one we'd need to add would be: #define __PageBuddy(page) __test_bit(PG_buddy, &(page)->flags) It looks like we don't have any of these non-atomic ones for plain __PageFoo(). So, we'd have to add them for each one that we wanted. Still not much work, and still satisfies the "grep test". :) -- Dave -
Make /proc/ page monitoring configurable This puts the following files under an embedded config option: /proc/pid/clear_refs /proc/pid/smaps /proc/pid/pagemap /proc/kpagecount /proc/kpageflags Signed-off-by: Matt Mackall <mpm@selenic.com> Index: l/fs/proc/base.c =================================================================== --- l.orig/fs/proc/base.c 2007-10-15 17:18:09.000000000 -0500 +++ l/fs/proc/base.c 2007-10-15 17:18:16.000000000 -0500 @@ -2031,7 +2031,7 @@ static const struct pid_entry tgid_base_ LNK("exe", exe), REG("mounts", S_IRUGO, mounts), REG("mountstats", S_IRUSR, mountstats), -#ifdef CONFIG_MMU +#ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, clear_refs), REG("smaps", S_IRUGO, smaps), REG("pagemap", S_IRUSR, pagemap), @@ -2318,7 +2318,7 @@ static const struct pid_entry tid_base_s LNK("root", root), LNK("exe", exe), REG("mounts", S_IRUGO, mounts), -#ifdef CONFIG_MMU +#ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, clear_refs), REG("smaps", S_IRUGO, smaps), REG("pagemap", S_IRUSR, pagemap), Index: l/fs/proc/proc_misc.c =================================================================== --- l.orig/fs/proc/proc_misc.c 2007-10-15 17:18:13.000000000 -0500 +++ l/fs/proc/proc_misc.c 2007-10-15 17:18:16.000000000 -0500 @@ -657,6 +657,7 @@ static const struct file_operations proc }; #endif +#ifdef CONFIG_PROC_PAGE_MONITOR #define KPMSIZE sizeof(u64) #define KPMMASK (KPMSIZE - 1) /* /proc/kpagecount - an array exposing page counts @@ -756,6 +757,7 @@ static struct file_operations proc_kpage .llseek = mem_lseek, .read = kpageflags_read, }; +#endif /* CONFIG_PROC_PAGE_MONITOR */ struct proc_dir_entry *proc_root_kcore; @@ -836,8 +838,10 @@ void __init proc_misc_init(void) (size_t)high_memory - PAGE_OFFSET + PAGE_SIZE; } #endif +#ifdef CONFIG_PROC_PAGE_MONITOR create_seq_entry("kpagecount", S_IRUSR, &proc_kpagecount_operations); ...
How about pulling the EMBEDDED off there? I certainly want it for non-embedded reasons. ;) -- Dave -
That means it will only bother asking you if you've set EMBEDDED;
otherwise it's always on.
J
-
But it's at the least confusing. Surely this option should depend on MMU and PROC_FS, and the prompt depend on EMBEDDED? That might be implied by the Kconfig layout, but AFAICT this patch removed the explicit MMU dependency. Rusty. -
Wasn't this your patch? You're right, it ought to say "depends on PROC_FS && MMU". Will fix. -- Mathematics is the supreme nostalgia of our time. -
