kernel: backport MGLRU to linux 6.0 for Tianling Shen

This commit is contained in:
coolsnowwolf 2022-10-16 02:16:00 +08:00
parent 415a25a683
commit 351d4bb63b
14 changed files with 7089 additions and 0 deletions

View File

@ -0,0 +1,154 @@
From e3264035bdac67898d685423ffb2f3a9c3a5964a Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 4 Aug 2021 01:31:34 -0600
Subject: [PATCH 01/14] mm: x86, arm64: add arch_has_hw_pte_young()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Some architectures automatically set the accessed bit in PTEs, e.g.,
x86 and arm64 v8.2. On architectures that do not have this capability,
clearing the accessed bit in a PTE usually triggers a page fault
following the TLB miss of this PTE (to emulate the accessed bit).
Being aware of this capability can help make better decisions, e.g.,
whether to spread the work out over a period of time to reduce bursty
page faults when trying to clear the accessed bit in many PTEs.
Note that theoretically this capability can be unreliable, e.g.,
hotplugged CPUs might be different from builtin ones. Therefore it
should not be used in architecture-independent code that involves
correctness, e.g., to determine whether TLB flushes are required (in
combination with the accessed bit).
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Acked-by: Will Deacon <will@kernel.org>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: Ib49b44fb56df3333a2ff1fcc496fb1980b976e7a
---
arch/arm64/include/asm/pgtable.h | 15 ++-------------
arch/x86/include/asm/pgtable.h | 6 +++---
include/linux/pgtable.h | 13 +++++++++++++
mm/memory.c | 14 +-------------
4 files changed, 19 insertions(+), 29 deletions(-)
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index b5df82aa99e6..71a1af42f0e8 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1082,24 +1082,13 @@ static inline void update_mmu_cache(struct vm_area_struct *vma,
* page after fork() + CoW for pfn mappings. We don't always have a
* hardware-managed access flag on arm64.
*/
-static inline bool arch_faults_on_old_pte(void)
-{
- /* The register read below requires a stable CPU to make any sense */
- cant_migrate();
-
- return !cpu_has_hw_af();
-}
-#define arch_faults_on_old_pte arch_faults_on_old_pte
+#define arch_has_hw_pte_young cpu_has_hw_af
/*
* Experimentally, it's cheap to set the access flag in hardware and we
* benefit from prefaulting mappings as 'old' to start with.
*/
-static inline bool arch_wants_old_prefaulted_pte(void)
-{
- return !arch_faults_on_old_pte();
-}
-#define arch_wants_old_prefaulted_pte arch_wants_old_prefaulted_pte
+#define arch_wants_old_prefaulted_pte cpu_has_hw_af
static inline bool pud_sect_supported(void)
{
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 44e2d6f1dbaa..dc5f7d8ef68a 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1431,10 +1431,10 @@ static inline bool arch_has_pfn_modify_check(void)
return boot_cpu_has_bug(X86_BUG_L1TF);
}
-#define arch_faults_on_old_pte arch_faults_on_old_pte
-static inline bool arch_faults_on_old_pte(void)
+#define arch_has_hw_pte_young arch_has_hw_pte_young
+static inline bool arch_has_hw_pte_young(void)
{
- return false;
+ return true;
}
#ifdef CONFIG_PAGE_TABLE_CHECK
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 014ee8f0fbaa..95f408df4695 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -260,6 +260,19 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma,
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
+#ifndef arch_has_hw_pte_young
+/*
+ * Return whether the accessed bit is supported on the local CPU.
+ *
+ * This stub assumes accessing through an old PTE triggers a page fault.
+ * Architectures that automatically set the access bit should overwrite it.
+ */
+static inline bool arch_has_hw_pte_young(void)
+{
+ return false;
+}
+#endif
+
#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
unsigned long address,
diff --git a/mm/memory.c b/mm/memory.c
index a78814413ac0..68294ce1cb06 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -125,18 +125,6 @@ int randomize_va_space __read_mostly =
2;
#endif
-#ifndef arch_faults_on_old_pte
-static inline bool arch_faults_on_old_pte(void)
-{
- /*
- * Those arches which don't have hw access flag feature need to
- * implement their own helper. By default, "true" means pagefault
- * will be hit on old pte.
- */
- return true;
-}
-#endif
-
#ifndef arch_wants_old_prefaulted_pte
static inline bool arch_wants_old_prefaulted_pte(void)
{
@@ -2870,7 +2858,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
* On architectures with software "accessed" bits, we would
* take a double page fault, so mark it accessed here.
*/
- if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
+ if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) {
pte_t entry;
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
--
2.17.1

View File

@ -0,0 +1,145 @@
From 0c0016e6f53b52166fe4da61c81fa6b27f4650cd Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sat, 26 Sep 2020 21:17:18 -0600
Subject: [PATCH 02/14] mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Some architectures support the accessed bit in non-leaf PMD entries,
e.g., x86 sets the accessed bit in a non-leaf PMD entry when using it
as part of linear address translation [1]. Page table walkers that
clear the accessed bit may use this capability to reduce their search
space.
Note that:
1. Although an inline function is preferable, this capability is added
as a configuration option for consistency with the existing macros.
2. Due to the little interest in other varieties, this capability was
only tested on Intel and AMD CPUs.
Thanks to the following developers for their efforts [2][3].
Randy Dunlap <rdunlap@infradead.org>
Stephen Rothwell <sfr@canb.auug.org.au>
[1]: Intel 64 and IA-32 Architectures Software Developer's Manual
Volume 3 (June 2021), section 4.8
[2] https://lore.kernel.org/r/bfdcc7c8-922f-61a9-aa15-7e7250f04af7@infradead.org/
[3] https://lore.kernel.org/r/20220413151513.5a0d7a7e@canb.auug.org.au/
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I1a17be3ae926f721f7b17ea1539e5c39e8c4f9a8
---
arch/Kconfig | 8 ++++++++
arch/x86/Kconfig | 1 +
arch/x86/include/asm/pgtable.h | 3 ++-
arch/x86/mm/pgtable.c | 5 ++++-
include/linux/pgtable.h | 4 ++--
5 files changed, 17 insertions(+), 4 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index 8b311e400ec1..bf19a84fffa2 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1418,6 +1418,14 @@ config DYNAMIC_SIGFRAME
config HAVE_ARCH_NODE_DEV_GROUP
bool
+config ARCH_HAS_NONLEAF_PMD_YOUNG
+ bool
+ help
+ Architectures that select this option are capable of setting the
+ accessed bit in non-leaf PMD entries when using them as part of linear
+ address translations. Page table walkers that clear the accessed bit
+ may use this capability to reduce their search space.
+
source "kernel/gcov/Kconfig"
source "scripts/gcc-plugins/Kconfig"
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f9920f1341c8..674d694a665e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -85,6 +85,7 @@ config X86
select ARCH_HAS_PMEM_API if X86_64
select ARCH_HAS_PTE_DEVMAP if X86_64
select ARCH_HAS_PTE_SPECIAL
+ select ARCH_HAS_NONLEAF_PMD_YOUNG if PGTABLE_LEVELS > 2
select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
select ARCH_HAS_COPY_MC if X86_64
select ARCH_HAS_SET_MEMORY
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index dc5f7d8ef68a..5059799bebe3 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -815,7 +815,8 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
static inline int pmd_bad(pmd_t pmd)
{
- return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
+ return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) !=
+ (_KERNPG_TABLE & ~_PAGE_ACCESSED);
}
static inline unsigned long pages_to_mb(unsigned long npg)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index a932d7712d85..8525f2876fb4 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -550,7 +550,7 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
return ret;
}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
@@ -562,6 +562,9 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma,
return ret;
}
+#endif
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pudp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pud_t *pudp)
{
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 95f408df4695..d9095251bffd 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -213,7 +213,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
#endif
#ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long address,
pmd_t *pmdp)
@@ -234,7 +234,7 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
BUILD_BUG();
return 0;
}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */
#endif
#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
--
2.17.1

View File

@ -0,0 +1,259 @@
From d8e0edcddc441574410a047ede56f79c849a6d37 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 27 Sep 2020 20:49:08 -0600
Subject: [PATCH 03/14] mm/vmscan.c: refactor shrink_node()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This patch refactors shrink_node() to improve readability for the
upcoming changes to mm/vmscan.c.
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: Iae734b5b4030205b7db6e8c841f747b6f6ae1a04
---
mm/vmscan.c | 198 +++++++++++++++++++++++++++-------------------------
1 file changed, 104 insertions(+), 94 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 382dbe97329f..4e4331367db9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2728,6 +2728,109 @@ enum scan_balance {
SCAN_FILE,
};
+static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
+{
+ unsigned long file;
+ struct lruvec *target_lruvec;
+
+ target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
+
+ /*
+ * Flush the memory cgroup stats, so that we read accurate per-memcg
+ * lruvec stats for heuristics.
+ */
+ mem_cgroup_flush_stats();
+
+ /*
+ * Determine the scan balance between anon and file LRUs.
+ */
+ spin_lock_irq(&target_lruvec->lru_lock);
+ sc->anon_cost = target_lruvec->anon_cost;
+ sc->file_cost = target_lruvec->file_cost;
+ spin_unlock_irq(&target_lruvec->lru_lock);
+
+ /*
+ * Target desirable inactive:active list ratios for the anon
+ * and file LRU lists.
+ */
+ if (!sc->force_deactivate) {
+ unsigned long refaults;
+
+ refaults = lruvec_page_state(target_lruvec,
+ WORKINGSET_ACTIVATE_ANON);
+ if (refaults != target_lruvec->refaults[0] ||
+ inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
+ sc->may_deactivate |= DEACTIVATE_ANON;
+ else
+ sc->may_deactivate &= ~DEACTIVATE_ANON;
+
+ /*
+ * When refaults are being observed, it means a new
+ * workingset is being established. Deactivate to get
+ * rid of any stale active pages quickly.
+ */
+ refaults = lruvec_page_state(target_lruvec,
+ WORKINGSET_ACTIVATE_FILE);
+ if (refaults != target_lruvec->refaults[1] ||
+ inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
+ sc->may_deactivate |= DEACTIVATE_FILE;
+ else
+ sc->may_deactivate &= ~DEACTIVATE_FILE;
+ } else
+ sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
+
+ /*
+ * If we have plenty of inactive file pages that aren't
+ * thrashing, try to reclaim those first before touching
+ * anonymous pages.
+ */
+ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
+ if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
+ sc->cache_trim_mode = 1;
+ else
+ sc->cache_trim_mode = 0;
+
+ /*
+ * Prevent the reclaimer from falling into the cache trap: as
+ * cache pages start out inactive, every cache fault will tip
+ * the scan balance towards the file LRU. And as the file LRU
+ * shrinks, so does the window for rotation from references.
+ * This means we have a runaway feedback loop where a tiny
+ * thrashing file LRU becomes infinitely more attractive than
+ * anon pages. Try to detect this based on file LRU size.
+ */
+ if (!cgroup_reclaim(sc)) {
+ unsigned long total_high_wmark = 0;
+ unsigned long free, anon;
+ int z;
+
+ free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
+ file = node_page_state(pgdat, NR_ACTIVE_FILE) +
+ node_page_state(pgdat, NR_INACTIVE_FILE);
+
+ for (z = 0; z < MAX_NR_ZONES; z++) {
+ struct zone *zone = &pgdat->node_zones[z];
+
+ if (!managed_zone(zone))
+ continue;
+
+ total_high_wmark += high_wmark_pages(zone);
+ }
+
+ /*
+ * Consider anon: if that's low too, this isn't a
+ * runaway file reclaim problem, but rather just
+ * extreme pressure. Reclaim as per usual then.
+ */
+ anon = node_page_state(pgdat, NR_INACTIVE_ANON);
+
+ sc->file_is_tiny =
+ file + free <= total_high_wmark &&
+ !(sc->may_deactivate & DEACTIVATE_ANON) &&
+ anon >> sc->priority;
+ }
+}
+
/*
* Determine how aggressively the anon and file LRU lists should be
* scanned.
@@ -3197,109 +3300,16 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
unsigned long nr_reclaimed, nr_scanned;
struct lruvec *target_lruvec;
bool reclaimable = false;
- unsigned long file;
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
again:
- /*
- * Flush the memory cgroup stats, so that we read accurate per-memcg
- * lruvec stats for heuristics.
- */
- mem_cgroup_flush_stats();
-
memset(&sc->nr, 0, sizeof(sc->nr));
nr_reclaimed = sc->nr_reclaimed;
nr_scanned = sc->nr_scanned;
- /*
- * Determine the scan balance between anon and file LRUs.
- */
- spin_lock_irq(&target_lruvec->lru_lock);
- sc->anon_cost = target_lruvec->anon_cost;
- sc->file_cost = target_lruvec->file_cost;
- spin_unlock_irq(&target_lruvec->lru_lock);
-
- /*
- * Target desirable inactive:active list ratios for the anon
- * and file LRU lists.
- */
- if (!sc->force_deactivate) {
- unsigned long refaults;
-
- refaults = lruvec_page_state(target_lruvec,
- WORKINGSET_ACTIVATE_ANON);
- if (refaults != target_lruvec->refaults[0] ||
- inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
- sc->may_deactivate |= DEACTIVATE_ANON;
- else
- sc->may_deactivate &= ~DEACTIVATE_ANON;
-
- /*
- * When refaults are being observed, it means a new
- * workingset is being established. Deactivate to get
- * rid of any stale active pages quickly.
- */
- refaults = lruvec_page_state(target_lruvec,
- WORKINGSET_ACTIVATE_FILE);
- if (refaults != target_lruvec->refaults[1] ||
- inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
- sc->may_deactivate |= DEACTIVATE_FILE;
- else
- sc->may_deactivate &= ~DEACTIVATE_FILE;
- } else
- sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
-
- /*
- * If we have plenty of inactive file pages that aren't
- * thrashing, try to reclaim those first before touching
- * anonymous pages.
- */
- file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
- if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
- sc->cache_trim_mode = 1;
- else
- sc->cache_trim_mode = 0;
-
- /*
- * Prevent the reclaimer from falling into the cache trap: as
- * cache pages start out inactive, every cache fault will tip
- * the scan balance towards the file LRU. And as the file LRU
- * shrinks, so does the window for rotation from references.
- * This means we have a runaway feedback loop where a tiny
- * thrashing file LRU becomes infinitely more attractive than
- * anon pages. Try to detect this based on file LRU size.
- */
- if (!cgroup_reclaim(sc)) {
- unsigned long total_high_wmark = 0;
- unsigned long free, anon;
- int z;
-
- free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
- file = node_page_state(pgdat, NR_ACTIVE_FILE) +
- node_page_state(pgdat, NR_INACTIVE_FILE);
-
- for (z = 0; z < MAX_NR_ZONES; z++) {
- struct zone *zone = &pgdat->node_zones[z];
- if (!managed_zone(zone))
- continue;
-
- total_high_wmark += high_wmark_pages(zone);
- }
-
- /*
- * Consider anon: if that's low too, this isn't a
- * runaway file reclaim problem, but rather just
- * extreme pressure. Reclaim as per usual then.
- */
- anon = node_page_state(pgdat, NR_INACTIVE_ANON);
-
- sc->file_is_tiny =
- file + free <= total_high_wmark &&
- !(sc->may_deactivate & DEACTIVATE_ANON) &&
- anon >> sc->priority;
- }
+ prepare_scan_count(pgdat, sc);
shrink_node_memcgs(pgdat, sc);
--
2.17.1

View File

@ -0,0 +1,64 @@
From bc14d2c7c6d0fb8c79ad0fc5eab488b977cbcccf Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 6 Mar 2022 20:22:40 -0700
Subject: [PATCH 04/14] Revert "include/linux/mm_inline.h: fold
__update_lru_size() into its sole caller"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This patch undoes the following refactor:
commit 289ccba18af4 ("include/linux/mm_inline.h: fold __update_lru_size() into its sole caller")
The upcoming changes to include/linux/mm_inline.h will reuse
__update_lru_size().
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I6155c407d50199a43b179c7f45904d4b7c052118
---
include/linux/mm_inline.h | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 7b25b53c474a..fb8aadb81cd6 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -34,7 +34,7 @@ static inline int page_is_file_lru(struct page *page)
return folio_is_file_lru(page_folio(page));
}
-static __always_inline void update_lru_size(struct lruvec *lruvec,
+static __always_inline void __update_lru_size(struct lruvec *lruvec,
enum lru_list lru, enum zone_type zid,
long nr_pages)
{
@@ -43,6 +43,13 @@ static __always_inline void update_lru_size(struct lruvec *lruvec,
__mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
__mod_zone_page_state(&pgdat->node_zones[zid],
NR_ZONE_LRU_BASE + lru, nr_pages);
+}
+
+static __always_inline void update_lru_size(struct lruvec *lruvec,
+ enum lru_list lru, enum zone_type zid,
+ long nr_pages)
+{
+ __update_lru_size(lruvec, lru, zid, nr_pages);
#ifdef CONFIG_MEMCG
mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
#endif
--
2.17.1

View File

@ -0,0 +1,810 @@
From 8c6beb4548c216da9dae5e1a7612a108396e3f9e Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Mon, 25 Jan 2021 21:12:33 -0700
Subject: [PATCH 05/14] mm: multi-gen LRU: groundwork
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Evictable pages are divided into multiple generations for each lruvec.
The youngest generation number is stored in lrugen->max_seq for both
anon and file types as they are aged on an equal footing. The oldest
generation numbers are stored in lrugen->min_seq[] separately for anon
and file types as clean file pages can be evicted regardless of swap
constraints. These three variables are monotonically increasing.
Generation numbers are truncated into order_base_2(MAX_NR_GENS+1) bits
in order to fit into the gen counter in folio->flags. Each truncated
generation number is an index to lrugen->lists[]. The sliding window
technique is used to track at least MIN_NR_GENS and at most
MAX_NR_GENS generations. The gen counter stores a value within [1,
MAX_NR_GENS] while a page is on one of lrugen->lists[]. Otherwise it
stores 0.
There are two conceptually independent procedures: "the aging", which
produces young generations, and "the eviction", which consumes old
generations. They form a closed-loop system, i.e., "the page reclaim".
Both procedures can be invoked from userspace for the purposes of
working set estimation and proactive reclaim. These techniques are
commonly used to optimize job scheduling (bin packing) in data
centers [1][2].
To avoid confusion, the terms "hot" and "cold" will be applied to the
multi-gen LRU, as a new convention; the terms "active" and "inactive"
will be applied to the active/inactive LRU, as usual.
The protection of hot pages and the selection of cold pages are based
on page access channels and patterns. There are two access channels:
one through page tables and the other through file descriptors. The
protection of the former channel is by design stronger because:
1. The uncertainty in determining the access patterns of the former
channel is higher due to the approximation of the accessed bit.
2. The cost of evicting the former channel is higher due to the TLB
flushes required and the likelihood of encountering the dirty bit.
3. The penalty of underprotecting the former channel is higher because
applications usually do not prepare themselves for major page
faults like they do for blocked I/O. E.g., GUI applications
commonly use dedicated I/O threads to avoid blocking rendering
threads.
There are also two access patterns: one with temporal locality and the
other without. For the reasons listed above, the former channel is
assumed to follow the former pattern unless VM_SEQ_READ or
VM_RAND_READ is present; the latter channel is assumed to follow the
latter pattern unless outlying refaults have been observed [3][4].
The next patch will address the "outlying refaults". Three macros,
i.e., LRU_REFS_WIDTH, LRU_REFS_PGOFF and LRU_REFS_MASK, used later are
added in this patch to make the entire patchset less diffy.
A page is added to the youngest generation on faulting. The aging
needs to check the accessed bit at least twice before handing this
page over to the eviction. The first check takes care of the accessed
bit set on the initial fault; the second check makes sure this page
has not been used since then. This protocol, AKA second chance,
requires a minimum of two generations, hence MIN_NR_GENS.
[1] https://dl.acm.org/doi/10.1145/3297858.3304053
[2] https://dl.acm.org/doi/10.1145/3503222.3507731
[3] https://lwn.net/Articles/495543/
[4] https://lwn.net/Articles/815342/
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I71de7cd15b8dfa6f9fdd838023474693c4fee0a7
---
fs/fuse/dev.c | 3 +-
include/linux/mm_inline.h | 175 ++++++++++++++++++++++++++++++
include/linux/mmzone.h | 102 +++++++++++++++++
include/linux/page-flags-layout.h | 13 ++-
include/linux/page-flags.h | 4 +-
include/linux/sched.h | 4 +
kernel/bounds.c | 5 +
mm/Kconfig | 8 ++
mm/huge_memory.c | 3 +-
mm/memcontrol.c | 2 +
mm/memory.c | 25 +++++
mm/mm_init.c | 6 +-
mm/mmzone.c | 2 +
mm/swap.c | 11 +-
mm/vmscan.c | 75 +++++++++++++
15 files changed, 424 insertions(+), 14 deletions(-)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 51897427a534..b4a6e0a1b945 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -776,7 +776,8 @@ static int fuse_check_page(struct page *page)
1 << PG_active |
1 << PG_workingset |
1 << PG_reclaim |
- 1 << PG_waiters))) {
+ 1 << PG_waiters |
+ LRU_GEN_MASK | LRU_REFS_MASK))) {
dump_page(page, "fuse: trying to steal weird page");
return 1;
}
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index fb8aadb81cd6..2ff703900fd0 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -40,6 +40,9 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec,
{
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ lockdep_assert_held(&lruvec->lru_lock);
+ WARN_ON_ONCE(nr_pages != (int)nr_pages);
+
__mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
__mod_zone_page_state(&pgdat->node_zones[zid],
NR_ZONE_LRU_BASE + lru, nr_pages);
@@ -101,11 +104,177 @@ static __always_inline enum lru_list folio_lru_list(struct folio *folio)
return lru;
}
+#ifdef CONFIG_LRU_GEN
+
+static inline bool lru_gen_enabled(void)
+{
+ return true;
+}
+
+static inline bool lru_gen_in_fault(void)
+{
+ return current->in_lru_fault;
+}
+
+static inline int lru_gen_from_seq(unsigned long seq)
+{
+ return seq % MAX_NR_GENS;
+}
+
+static inline int folio_lru_gen(struct folio *folio)
+{
+ unsigned long flags = READ_ONCE(folio->flags);
+
+ return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+}
+
+static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
+{
+ unsigned long max_seq = lruvec->lrugen.max_seq;
+
+ VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
+
+ /* see the comment on MIN_NR_GENS */
+ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
+}
+
+static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *folio,
+ int old_gen, int new_gen)
+{
+ int type = folio_is_file_lru(folio);
+ int zone = folio_zonenum(folio);
+ int delta = folio_nr_pages(folio);
+ enum lru_list lru = type * LRU_INACTIVE_FILE;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
+ VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
+ VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1);
+
+ if (old_gen >= 0)
+ WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone],
+ lrugen->nr_pages[old_gen][type][zone] - delta);
+ if (new_gen >= 0)
+ WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone],
+ lrugen->nr_pages[new_gen][type][zone] + delta);
+
+ /* addition */
+ if (old_gen < 0) {
+ if (lru_gen_is_active(lruvec, new_gen))
+ lru += LRU_ACTIVE;
+ __update_lru_size(lruvec, lru, zone, delta);
+ return;
+ }
+
+ /* deletion */
+ if (new_gen < 0) {
+ if (lru_gen_is_active(lruvec, old_gen))
+ lru += LRU_ACTIVE;
+ __update_lru_size(lruvec, lru, zone, -delta);
+ return;
+ }
+}
+
+static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
+{
+ unsigned long seq;
+ unsigned long flags;
+ int gen = folio_lru_gen(folio);
+ int type = folio_is_file_lru(folio);
+ int zone = folio_zonenum(folio);
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ VM_WARN_ON_ONCE_FOLIO(gen != -1, folio);
+
+ if (folio_test_unevictable(folio))
+ return false;
+ /*
+ * There are three common cases for this page:
+ * 1. If it's hot, e.g., freshly faulted in or previously hot and
+ * migrated, add it to the youngest generation.
+ * 2. If it's cold but can't be evicted immediately, i.e., an anon page
+ * not in swapcache or a dirty page pending writeback, add it to the
+ * second oldest generation.
+ * 3. Everything else (clean, cold) is added to the oldest generation.
+ */
+ if (folio_test_active(folio))
+ seq = lrugen->max_seq;
+ else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) ||
+ (folio_test_reclaim(folio) &&
+ (folio_test_dirty(folio) || folio_test_writeback(folio))))
+ seq = lrugen->min_seq[type] + 1;
+ else
+ seq = lrugen->min_seq[type];
+
+ gen = lru_gen_from_seq(seq);
+ flags = (gen + 1UL) << LRU_GEN_PGOFF;
+ /* see the comment on MIN_NR_GENS about PG_active */
+ set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags);
+
+ lru_gen_update_size(lruvec, folio, -1, gen);
+ /* for folio_rotate_reclaimable() */
+ if (reclaiming)
+ list_add_tail(&folio->lru, &lrugen->lists[gen][type][zone]);
+ else
+ list_add(&folio->lru, &lrugen->lists[gen][type][zone]);
+
+ return true;
+}
+
+static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
+{
+ unsigned long flags;
+ int gen = folio_lru_gen(folio);
+
+ if (gen < 0)
+ return false;
+
+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
+
+ /* for folio_migrate_flags() */
+ flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0;
+ flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags);
+ gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+
+ lru_gen_update_size(lruvec, folio, gen, -1);
+ list_del(&folio->lru);
+
+ return true;
+}
+
+#else /* !CONFIG_LRU_GEN */
+
+static inline bool lru_gen_enabled(void)
+{
+ return false;
+}
+
+static inline bool lru_gen_in_fault(void)
+{
+ return false;
+}
+
+static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
+{
+ return false;
+}
+
+static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
+{
+ return false;
+}
+
+#endif /* CONFIG_LRU_GEN */
+
static __always_inline
void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio)
{
enum lru_list lru = folio_lru_list(folio);
+ if (lru_gen_add_folio(lruvec, folio, false))
+ return;
+
update_lru_size(lruvec, lru, folio_zonenum(folio),
folio_nr_pages(folio));
if (lru != LRU_UNEVICTABLE)
@@ -123,6 +292,9 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio)
{
enum lru_list lru = folio_lru_list(folio);
+ if (lru_gen_add_folio(lruvec, folio, true))
+ return;
+
update_lru_size(lruvec, lru, folio_zonenum(folio),
folio_nr_pages(folio));
/* This is not expected to be used on LRU_UNEVICTABLE */
@@ -140,6 +312,9 @@ void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio)
{
enum lru_list lru = folio_lru_list(folio);
+ if (lru_gen_del_folio(lruvec, folio, false))
+ return;
+
if (lru != LRU_UNEVICTABLE)
list_del(&folio->lru);
update_lru_size(lruvec, lru, folio_zonenum(folio),
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e24b40c52468..1abb6d38ed86 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -314,6 +314,102 @@ enum lruvec_flags {
*/
};
+#endif /* !__GENERATING_BOUNDS_H */
+
+/*
+ * Evictable pages are divided into multiple generations. The youngest and the
+ * oldest generation numbers, max_seq and min_seq, are monotonically increasing.
+ * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An
+ * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the
+ * corresponding generation. The gen counter in folio->flags stores gen+1 while
+ * a page is on one of lrugen->lists[]. Otherwise it stores 0.
+ *
+ * A page is added to the youngest generation on faulting. The aging needs to
+ * check the accessed bit at least twice before handing this page over to the
+ * eviction. The first check takes care of the accessed bit set on the initial
+ * fault; the second check makes sure this page hasn't been used since then.
+ * This process, AKA second chance, requires a minimum of two generations,
+ * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive
+ * LRU, e.g., /proc/vmstat, these two generations are considered active; the
+ * rest of generations, if they exist, are considered inactive. See
+ * lru_gen_is_active().
+ *
+ * PG_active is always cleared while a page is on one of lrugen->lists[] so that
+ * the aging needs not to worry about it. And it's set again when a page
+ * considered active is isolated for non-reclaiming purposes, e.g., migration.
+ * See lru_gen_add_folio() and lru_gen_del_folio().
+ *
+ * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the
+ * number of categories of the active/inactive LRU when keeping track of
+ * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits
+ * in folio->flags.
+ */
+#define MIN_NR_GENS 2U
+#define MAX_NR_GENS 4U
+
+#ifndef __GENERATING_BOUNDS_H
+
+struct lruvec;
+
+#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
+#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
+
+#ifdef CONFIG_LRU_GEN
+
+enum {
+ LRU_GEN_ANON,
+ LRU_GEN_FILE,
+};
+
+/*
+ * The youngest generation number is stored in max_seq for both anon and file
+ * types as they are aged on an equal footing. The oldest generation numbers are
+ * stored in min_seq[] separately for anon and file types as clean file pages
+ * can be evicted regardless of swap constraints.
+ *
+ * Normally anon and file min_seq are in sync. But if swapping is constrained,
+ * e.g., out of swap space, file min_seq is allowed to advance and leave anon
+ * min_seq behind.
+ *
+ * The number of pages in each generation is eventually consistent and therefore
+ * can be transiently negative.
+ */
+struct lru_gen_struct {
+ /* the aging increments the youngest generation number */
+ unsigned long max_seq;
+ /* the eviction increments the oldest generation numbers */
+ unsigned long min_seq[ANON_AND_FILE];
+ /* the multi-gen LRU lists, lazily sorted on eviction */
+ struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+ /* the multi-gen LRU sizes, eventually consistent */
+ long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+};
+
+void lru_gen_init_lruvec(struct lruvec *lruvec);
+
+#ifdef CONFIG_MEMCG
+void lru_gen_init_memcg(struct mem_cgroup *memcg);
+void lru_gen_exit_memcg(struct mem_cgroup *memcg);
+#endif
+
+#else /* !CONFIG_LRU_GEN */
+
+static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
+{
+}
+
+#ifdef CONFIG_MEMCG
+static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
+{
+}
+#endif
+
+#endif /* CONFIG_LRU_GEN */
+
struct lruvec {
struct list_head lists[NR_LRU_LISTS];
/* per lruvec lru_lock for memcg */
@@ -331,6 +427,10 @@ struct lruvec {
unsigned long refaults[ANON_AND_FILE];
/* Various lruvec state flags (enum lruvec_flags) */
unsigned long flags;
+#ifdef CONFIG_LRU_GEN
+ /* evictable pages divided into generations */
+ struct lru_gen_struct lrugen;
+#endif
#ifdef CONFIG_MEMCG
struct pglist_data *pgdat;
#endif
@@ -746,6 +846,8 @@ static inline bool zone_is_empty(struct zone *zone)
#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH)
#define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
+#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
+#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH)
/*
* Define the bit shifts to access each section. For non-existent
diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h
index ef1e3e736e14..240905407a18 100644
--- a/include/linux/page-flags-layout.h
+++ b/include/linux/page-flags-layout.h
@@ -55,7 +55,8 @@
#define SECTIONS_WIDTH 0
#endif
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \
+ <= BITS_PER_LONG - NR_PAGEFLAGS
#define NODES_WIDTH NODES_SHIFT
#elif defined(CONFIG_SPARSEMEM_VMEMMAP)
#error "Vmemmap: No space for nodes field in page flags"
@@ -89,8 +90,8 @@
#define LAST_CPUPID_SHIFT 0
#endif
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \
- <= BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
+ KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
#define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
#else
#define LAST_CPUPID_WIDTH 0
@@ -100,10 +101,12 @@
#define LAST_CPUPID_NOT_IN_PAGE_FLAGS
#endif
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \
- > BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
+ KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
#error "Not enough bits in page flags"
#endif
+#define LRU_REFS_WIDTH 0
+
#endif
#endif /* _LINUX_PAGE_FLAGS_LAYOUT */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 465ff35a8c00..0b0ae5084e60 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -1058,7 +1058,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page)
1UL << PG_private | 1UL << PG_private_2 | \
1UL << PG_writeback | 1UL << PG_reserved | \
1UL << PG_slab | 1UL << PG_active | \
- 1UL << PG_unevictable | __PG_MLOCKED)
+ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK)
/*
* Flags checked when a page is prepped for return by the page allocator.
@@ -1069,7 +1069,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page)
* alloc-free cycle to prevent from reusing the page.
*/
#define PAGE_FLAGS_CHECK_AT_PREP \
- (PAGEFLAGS_MASK & ~__PG_HWPOISON)
+ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)
#define PAGE_FLAGS_PRIVATE \
(1UL << PG_private | 1UL << PG_private_2)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e7b2f8a5c711..8cc46a789193 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -914,6 +914,10 @@ struct task_struct {
#ifdef CONFIG_MEMCG
unsigned in_user_fault:1;
#endif
+#ifdef CONFIG_LRU_GEN
+ /* whether the LRU algorithm may apply to this access */
+ unsigned in_lru_fault:1;
+#endif
#ifdef CONFIG_COMPAT_BRK
unsigned brk_randomized:1;
#endif
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 9795d75b09b2..5ee60777d8e4 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -22,6 +22,11 @@ int main(void)
DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
#endif
DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
+#ifdef CONFIG_LRU_GEN
+ DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1));
+#else
+ DEFINE(LRU_GEN_WIDTH, 0);
+#endif
/* End of constants */
return 0;
diff --git a/mm/Kconfig b/mm/Kconfig
index 0331f1461f81..d95f07cd6dcf 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1124,6 +1124,14 @@ config PTE_MARKER_UFFD_WP
purposes. It is required to enable userfaultfd write protection on
file-backed memory types like shmem and hugetlbfs.
+config LRU_GEN
+ bool "Multi-Gen LRU"
+ depends on MMU
+ # make sure folio->flags has enough spare bits
+ depends on 64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP
+ help
+ A high performance LRU implementation to overcommit memory.
+
source "mm/damon/Kconfig"
endmenu
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f42bb51e023a..79e0b08b4cf9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2438,7 +2438,8 @@ static void __split_huge_page_tail(struct page *head, int tail,
#ifdef CONFIG_64BIT
(1L << PG_arch_2) |
#endif
- (1L << PG_dirty)));
+ (1L << PG_dirty) |
+ LRU_GEN_MASK | LRU_REFS_MASK));
/* ->mapping in first tail page is compound_mapcount */
VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b69979c9ced5..5fd38d12149c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5170,6 +5170,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
+ lru_gen_exit_memcg(memcg);
memcg_wb_domain_exit(memcg);
__mem_cgroup_free(memcg);
}
@@ -5228,6 +5229,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
memcg->deferred_split_queue.split_queue_len = 0;
#endif
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
+ lru_gen_init_memcg(memcg);
return memcg;
fail:
mem_cgroup_id_remove(memcg);
diff --git a/mm/memory.c b/mm/memory.c
index 68294ce1cb06..cd1b5bfd9f3e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5108,6 +5108,27 @@ static inline void mm_account_fault(struct pt_regs *regs,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
}
+#ifdef CONFIG_LRU_GEN
+static void lru_gen_enter_fault(struct vm_area_struct *vma)
+{
+ /* the LRU algorithm doesn't apply to sequential or random reads */
+ current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ));
+}
+
+static void lru_gen_exit_fault(void)
+{
+ current->in_lru_fault = false;
+}
+#else
+static void lru_gen_enter_fault(struct vm_area_struct *vma)
+{
+}
+
+static void lru_gen_exit_fault(void)
+{
+}
+#endif /* CONFIG_LRU_GEN */
+
/*
* By the time we get here, we already hold the mm semaphore
*
@@ -5139,11 +5160,15 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
if (flags & FAULT_FLAG_USER)
mem_cgroup_enter_user_fault();
+ lru_gen_enter_fault(vma);
+
if (unlikely(is_vm_hugetlb_page(vma)))
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
else
ret = __handle_mm_fault(vma, address, flags);
+ lru_gen_exit_fault();
+
if (flags & FAULT_FLAG_USER) {
mem_cgroup_exit_user_fault();
/*
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 9ddaf0e1b0ab..0d7b2bd2454a 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layout(void)
shift = 8 * sizeof(unsigned long);
width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
+ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
SECTIONS_WIDTH,
NODES_WIDTH,
ZONES_WIDTH,
LAST_CPUPID_WIDTH,
KASAN_TAG_WIDTH,
+ LRU_GEN_WIDTH,
+ LRU_REFS_WIDTH,
NR_PAGEFLAGS);
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 0ae7571e35ab..68e1511be12d 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -88,6 +88,8 @@ void lruvec_init(struct lruvec *lruvec)
* Poison its list head, so that any operations on it would crash.
*/
list_del(&lruvec->lists[LRU_UNEVICTABLE]);
+
+ lru_gen_init_lruvec(lruvec);
}
#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
diff --git a/mm/swap.c b/mm/swap.c
index 9cee7f6a3809..0e423b7d458b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -484,6 +484,11 @@ void folio_add_lru(struct folio *folio)
folio_test_unevictable(folio), folio);
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+ /* see the comment in lru_gen_add_folio() */
+ if (lru_gen_enabled() && !folio_test_unevictable(folio) &&
+ lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
+ folio_set_active(folio);
+
folio_get(folio);
local_lock(&cpu_fbatches.lock);
fbatch = this_cpu_ptr(&cpu_fbatches.lru_add);
@@ -575,7 +580,7 @@ static void lru_deactivate_file_fn(struct lruvec *lruvec, struct folio *folio)
static void lru_deactivate_fn(struct lruvec *lruvec, struct folio *folio)
{
- if (folio_test_active(folio) && !folio_test_unevictable(folio)) {
+ if (!folio_test_unevictable(folio) && (folio_test_active(folio) || lru_gen_enabled())) {
long nr_pages = folio_nr_pages(folio);
lruvec_del_folio(lruvec, folio);
@@ -688,8 +693,8 @@ void deactivate_page(struct page *page)
{
struct folio *folio = page_folio(page);
- if (folio_test_lru(folio) && folio_test_active(folio) &&
- !folio_test_unevictable(folio)) {
+ if (folio_test_lru(folio) && !folio_test_unevictable(folio) &&
+ (folio_test_active(folio) || lru_gen_enabled())) {
struct folio_batch *fbatch;
folio_get(folio);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4e4331367db9..fb76cfe2fdc2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3050,6 +3050,81 @@ static bool can_age_anon_pages(struct pglist_data *pgdat,
return can_demote(pgdat->node_id, sc);
}
+#ifdef CONFIG_LRU_GEN
+
+/******************************************************************************
+ * shorthand helpers
+ ******************************************************************************/
+
+#define for_each_gen_type_zone(gen, type, zone) \
+ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
+ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
+ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
+
+static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int nid)
+{
+ struct pglist_data *pgdat = NODE_DATA(nid);
+
+#ifdef CONFIG_MEMCG
+ if (memcg) {
+ struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
+
+ /* for hotadd_new_pgdat() */
+ if (!lruvec->pgdat)
+ lruvec->pgdat = pgdat;
+
+ return lruvec;
+ }
+#endif
+ VM_WARN_ON_ONCE(!mem_cgroup_disabled());
+
+ return pgdat ? &pgdat->__lruvec : NULL;
+}
+
+/******************************************************************************
+ * initialization
+ ******************************************************************************/
+
+void lru_gen_init_lruvec(struct lruvec *lruvec)
+{
+ int gen, type, zone;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ lrugen->max_seq = MIN_NR_GENS + 1;
+
+ for_each_gen_type_zone(gen, type, zone)
+ INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
+}
+
+#ifdef CONFIG_MEMCG
+void lru_gen_init_memcg(struct mem_cgroup *memcg)
+{
+}
+
+void lru_gen_exit_memcg(struct mem_cgroup *memcg)
+{
+ int nid;
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
+ sizeof(lruvec->lrugen.nr_pages)));
+ }
+}
+#endif
+
+static int __init init_lru_gen(void)
+{
+ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
+ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
+
+ return 0;
+};
+late_initcall(init_lru_gen);
+
+#endif /* CONFIG_LRU_GEN */
+
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];
--
2.17.1

View File

@ -0,0 +1,495 @@
From 93fa87bdef9e7fa9977355c4712c000f31639231 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Thu, 27 Jan 2022 20:43:22 -0700
Subject: [PATCH 07/14] mm: multi-gen LRU: exploit locality in rmap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Searching the rmap for PTEs mapping each page on an LRU list (to test
and clear the accessed bit) can be expensive because pages from
different VMAs (PA space) are not cache friendly to the rmap (VA
space). For workloads mostly using mapped pages, searching the rmap
can incur the highest CPU cost in the reclaim path.
This patch exploits spatial locality to reduce the trips into the
rmap. When shrink_page_list() walks the rmap and finds a young PTE, a
new function lru_gen_look_around() scans at most BITS_PER_LONG-1
adjacent PTEs. On finding another young PTE, it clears the accessed
bit and updates the gen counter of the page mapped by this PTE to
(max_seq%MAX_NR_GENS)+1.
Server benchmark results:
Single workload:
fio (buffered I/O): no change
Single workload:
memcached (anon): +[3, 5]%
Ops/sec KB/sec
patch1-6: 1106168.46 43025.04
patch1-7: 1147696.57 44640.29
Configurations:
no change
Client benchmark results:
kswapd profiles:
patch1-6
39.03% lzo1x_1_do_compress (real work)
18.47% page_vma_mapped_walk (overhead)
6.74% _raw_spin_unlock_irq
3.97% do_raw_spin_lock
2.49% ptep_clear_flush
2.48% anon_vma_interval_tree_iter_first
1.92% folio_referenced_one
1.88% __zram_bvec_write
1.48% memmove
1.31% vma_interval_tree_iter_next
patch1-7
48.16% lzo1x_1_do_compress (real work)
8.20% page_vma_mapped_walk (overhead)
7.06% _raw_spin_unlock_irq
2.92% ptep_clear_flush
2.53% __zram_bvec_write
2.11% do_raw_spin_lock
2.02% memmove
1.93% lru_gen_look_around
1.56% free_unref_page_list
1.40% memset
Configurations:
no change
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Barry Song <baohua@kernel.org>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I4b9ca0fd20f566ce554e703f14cee3fe0048c2fd
---
include/linux/memcontrol.h | 31 +++++++
include/linux/mm.h | 5 +
include/linux/mmzone.h | 6 ++
mm/internal.h | 1 +
mm/memcontrol.c | 1 +
mm/rmap.c | 6 ++
mm/swap.c | 4 +-
mm/vmscan.c | 184 +++++++++++++++++++++++++++++++++++++
8 files changed, 236 insertions(+), 2 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 567f12323f55..d2b7f6b9998c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -444,6 +444,7 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
+ * - mem_cgroup_trylock_pages()
*
* For a kmem folio a caller should hold an rcu read lock to protect memcg
* associated with a kmem folio from being released.
@@ -505,6 +506,7 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
+ * - mem_cgroup_trylock_pages()
*
* For a kmem page a caller should hold an rcu read lock to protect memcg
* associated with a kmem page from being released.
@@ -959,6 +961,23 @@ void unlock_page_memcg(struct page *page);
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val);
+/* try to stablize folio_memcg() for all the pages in a memcg */
+static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
+{
+ rcu_read_lock();
+
+ if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account))
+ return true;
+
+ rcu_read_unlock();
+ return false;
+}
+
+static inline void mem_cgroup_unlock_pages(void)
+{
+ rcu_read_unlock();
+}
+
/* idx can be of type enum memcg_stat_item or node_stat_item */
static inline void mod_memcg_state(struct mem_cgroup *memcg,
int idx, int val)
@@ -1433,6 +1452,18 @@ static inline void folio_memcg_unlock(struct folio *folio)
{
}
+static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
+{
+ /* to match folio_memcg_rcu() */
+ rcu_read_lock();
+ return true;
+}
+
+static inline void mem_cgroup_unlock_pages(void)
+{
+ rcu_read_unlock();
+}
+
static inline void mem_cgroup_handle_over_high(void)
{
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 21f8b27bd9fd..88976a521ef5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1465,6 +1465,11 @@ static inline unsigned long folio_pfn(struct folio *folio)
return page_to_pfn(&folio->page);
}
+static inline struct folio *pfn_folio(unsigned long pfn)
+{
+ return page_folio(pfn_to_page(pfn));
+}
+
static inline atomic_t *folio_pincount_ptr(struct folio *folio)
{
return &folio_page(folio, 1)->compound_pincount;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 07bd22149c22..2b4dc60d0f6c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -372,6 +372,7 @@ enum lruvec_flags {
#ifndef __GENERATING_BOUNDS_H
struct lruvec;
+struct page_vma_mapped_walk;
#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
@@ -427,6 +428,7 @@ struct lru_gen_struct {
};
void lru_gen_init_lruvec(struct lruvec *lruvec);
+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
#ifdef CONFIG_MEMCG
void lru_gen_init_memcg(struct mem_cgroup *memcg);
@@ -439,6 +441,10 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
{
}
+static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+{
+}
+
#ifdef CONFIG_MEMCG
static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
diff --git a/mm/internal.h b/mm/internal.h
index 785409805ed7..a1fddea6b34f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -83,6 +83,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf);
void folio_rotate_reclaimable(struct folio *folio);
bool __folio_end_writeback(struct folio *folio);
void deactivate_file_folio(struct folio *folio);
+void folio_activate(struct folio *folio);
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5fd38d12149c..882180866e31 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2789,6 +2789,7 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
+ * - mem_cgroup_trylock_pages()
*/
folio->memcg_data = (unsigned long)memcg;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 93d5a6f793d2..9e0ce48bca08 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -833,6 +833,12 @@ static bool folio_referenced_one(struct folio *folio,
}
if (pvmw.pte) {
+ if (lru_gen_enabled() && pte_young(*pvmw.pte) &&
+ !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) {
+ lru_gen_look_around(&pvmw);
+ referenced++;
+ }
+
if (ptep_clear_flush_young_notify(vma, address,
pvmw.pte)) {
/*
diff --git a/mm/swap.c b/mm/swap.c
index f74fd51fa9e1..0a3871a70952 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -366,7 +366,7 @@ static void folio_activate_drain(int cpu)
folio_batch_move_lru(fbatch, folio_activate_fn);
}
-static void folio_activate(struct folio *folio)
+void folio_activate(struct folio *folio)
{
if (folio_test_lru(folio) && !folio_test_active(folio) &&
!folio_test_unevictable(folio)) {
@@ -385,7 +385,7 @@ static inline void folio_activate_drain(int cpu)
{
}
-static void folio_activate(struct folio *folio)
+void folio_activate(struct folio *folio)
{
struct lruvec *lruvec;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8e63f95a5f53..8686918e238d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1635,6 +1635,11 @@ static unsigned int shrink_page_list(struct list_head *page_list,
if (!sc->may_unmap && folio_mapped(folio))
goto keep_locked;
+ /* folio_update_gen() tried to promote this page? */
+ if (lru_gen_enabled() && !ignore_references &&
+ folio_mapped(folio) && folio_test_referenced(folio))
+ goto keep_locked;
+
/*
* The number of dirty pages determines if a node is marked
* reclaim_congested. kswapd will stall and start writing
@@ -3219,6 +3224,29 @@ static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
* the aging
******************************************************************************/
+/* promote pages accessed through page tables */
+static int folio_update_gen(struct folio *folio, int gen)
+{
+ unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
+
+ VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
+ VM_WARN_ON_ONCE(!rcu_read_lock_held());
+
+ do {
+ /* lru_gen_del_folio() has isolated this page? */
+ if (!(old_flags & LRU_GEN_MASK)) {
+ /* for shrink_page_list() */
+ new_flags = old_flags | BIT(PG_referenced);
+ continue;
+ }
+
+ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
+ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
+ } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
+
+ return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+}
+
/* protect pages accessed multiple times through file descriptors */
static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
@@ -3230,6 +3258,11 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio);
do {
+ new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+ /* folio_update_gen() has promoted this page? */
+ if (new_gen >= 0 && new_gen != old_gen)
+ return new_gen;
+
new_gen = (old_gen + 1) % MAX_NR_GENS;
new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
@@ -3244,6 +3277,43 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
return new_gen;
}
+static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
+{
+ unsigned long pfn = pte_pfn(pte);
+
+ VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
+
+ if (!pte_present(pte) || is_zero_pfn(pfn))
+ return -1;
+
+ if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
+ return -1;
+
+ if (WARN_ON_ONCE(!pfn_valid(pfn)))
+ return -1;
+
+ return pfn;
+}
+
+static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
+ struct pglist_data *pgdat)
+{
+ struct folio *folio;
+
+ /* try to avoid unnecessary memory loads */
+ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+ return NULL;
+
+ folio = pfn_folio(pfn);
+ if (folio_nid(folio) != pgdat->node_id)
+ return NULL;
+
+ if (folio_memcg_rcu(folio) != memcg)
+ return NULL;
+
+ return folio;
+}
+
static void inc_min_seq(struct lruvec *lruvec, int type)
{
struct lru_gen_struct *lrugen = &lruvec->lrugen;
@@ -3443,6 +3513,114 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
}
+/*
+ * This function exploits spatial locality when shrink_page_list() walks the
+ * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages.
+ */
+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+{
+ int i;
+ pte_t *pte;
+ unsigned long start;
+ unsigned long end;
+ unsigned long addr;
+ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
+ struct folio *folio = pfn_folio(pvmw->pfn);
+ struct mem_cgroup *memcg = folio_memcg(folio);
+ struct pglist_data *pgdat = folio_pgdat(folio);
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ DEFINE_MAX_SEQ(lruvec);
+ int old_gen, new_gen = lru_gen_from_seq(max_seq);
+
+ lockdep_assert_held(pvmw->ptl);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
+
+ if (spin_is_contended(pvmw->ptl))
+ return;
+
+ start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start);
+ end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
+
+ if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
+ if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
+ end = start + MIN_LRU_BATCH * PAGE_SIZE;
+ else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2)
+ start = end - MIN_LRU_BATCH * PAGE_SIZE;
+ else {
+ start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2;
+ end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2;
+ }
+ }
+
+ pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE;
+
+ rcu_read_lock();
+ arch_enter_lazy_mmu_mode();
+
+ for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
+ unsigned long pfn;
+
+ pfn = get_pte_pfn(pte[i], pvmw->vma, addr);
+ if (pfn == -1)
+ continue;
+
+ if (!pte_young(pte[i]))
+ continue;
+
+ folio = get_pfn_folio(pfn, memcg, pgdat);
+ if (!folio)
+ continue;
+
+ if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
+ VM_WARN_ON_ONCE(true);
+
+ if (pte_dirty(pte[i]) && !folio_test_dirty(folio) &&
+ !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_test_swapcache(folio)))
+ folio_mark_dirty(folio);
+
+ old_gen = folio_lru_gen(folio);
+ if (old_gen < 0)
+ folio_set_referenced(folio);
+ else if (old_gen != new_gen)
+ __set_bit(i, bitmap);
+ }
+
+ arch_leave_lazy_mmu_mode();
+ rcu_read_unlock();
+
+ if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
+ for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
+ folio = pfn_folio(pte_pfn(pte[i]));
+ folio_activate(folio);
+ }
+ return;
+ }
+
+ /* folio_update_gen() requires stable folio_memcg() */
+ if (!mem_cgroup_trylock_pages(memcg))
+ return;
+
+ spin_lock_irq(&lruvec->lru_lock);
+ new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
+
+ for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
+ folio = pfn_folio(pte_pfn(pte[i]));
+ if (folio_memcg_rcu(folio) != memcg)
+ continue;
+
+ old_gen = folio_update_gen(folio, new_gen);
+ if (old_gen < 0 || old_gen == new_gen)
+ continue;
+
+ lru_gen_update_size(lruvec, folio, old_gen, new_gen);
+ }
+
+ spin_unlock_irq(&lruvec->lru_lock);
+
+ mem_cgroup_unlock_pages();
+}
+
/******************************************************************************
* the eviction
******************************************************************************/
@@ -3479,6 +3657,12 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)
return true;
}
+ /* promoted */
+ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
+ list_move(&folio->lru, &lrugen->lists[gen][type][zone]);
+ return true;
+ }
+
/* protected */
if (tier > tier_idx) {
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
--
2.17.1

View File

@ -0,0 +1,295 @@
From 6b9670b94ba2b49b289b997121062500e32fc3e4 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Thu, 27 Jan 2022 19:59:54 -0700
Subject: [PATCH 09/14] mm: multi-gen LRU: optimize multiple memcgs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
When multiple memcgs are available, it is possible to use generations
as a frame of reference to make better choices and improve overall
performance under global memory pressure. This patch adds a basic
optimization to select memcgs that can drop single-use unmapped clean
pages first. Doing so reduces the chance of going into the aging path
or swapping, which can be costly.
A typical example that benefits from this optimization is a server
running mixed types of workloads, e.g., heavy anon workload in one
memcg and heavy buffered I/O workload in the other.
Though this optimization can be applied to both kswapd and direct
reclaim, it is only added to kswapd to keep the patchset manageable.
Later improvements may cover the direct reclaim path.
While ensuring certain fairness to all eligible memcgs, proportional
scans of individual memcgs also require proper backoff to avoid
overshooting their aggregate reclaim target by too much. Otherwise it
can cause high direct reclaim latency. The conditions for backoff are:
1. At low priorities, for direct reclaim, if aging fairness or direct
reclaim latency is at risk, i.e., aging one memcg multiple times or
swapping after the target is met.
2. At high priorities, for global reclaim, if per-zone free pages are
above respective watermarks.
Server benchmark results:
Mixed workloads:
fio (buffered I/O): +[19, 21]%
IOPS BW
patch1-8: 1880k 7343MiB/s
patch1-9: 2252k 8796MiB/s
memcached (anon): +[119, 123]%
Ops/sec KB/sec
patch1-8: 862768.65 33514.68
patch1-9: 1911022.12 74234.54
Mixed workloads:
fio (buffered I/O): +[75, 77]%
IOPS BW
5.19-rc1: 1279k 4996MiB/s
patch1-9: 2252k 8796MiB/s
memcached (anon): +[13, 15]%
Ops/sec KB/sec
5.19-rc1: 1673524.04 65008.87
patch1-9: 1911022.12 74234.54
Configurations:
(changes since patch 6)
cat mixed.sh
modprobe brd rd_nr=2 rd_size=56623104
swapoff -a
mkswap /dev/ram0
swapon /dev/ram0
mkfs.ext4 /dev/ram1
mount -t ext4 /dev/ram1 /mnt
memtier_benchmark -S /var/run/memcached/memcached.sock \
-P memcache_binary -n allkeys --key-minimum=1 \
--key-maximum=50000000 --key-pattern=P:P -c 1 -t 36 \
--ratio 1:0 --pipeline 8 -d 2000
fio -name=mglru --numjobs=36 --directory=/mnt --size=1408m \
--buffered=1 --ioengine=io_uring --iodepth=128 \
--iodepth_batch_submit=32 --iodepth_batch_complete=32 \
--rw=randread --random_distribution=random --norandommap \
--time_based --ramp_time=10m --runtime=90m --group_reporting &
pid=$!
sleep 200
memtier_benchmark -S /var/run/memcached/memcached.sock \
-P memcache_binary -n allkeys --key-minimum=1 \
--key-maximum=50000000 --key-pattern=R:R -c 1 -t 36 \
--ratio 0:1 --pipeline 8 --randomize --distinct-client-seed
kill -INT $pid
wait
Client benchmark results:
no change (CONFIG_MEMCG=n)
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I7e00e0c733437e534ac98031cf8154a681becc00
---
mm/vmscan.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 95 insertions(+), 9 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fcb437769a60..e7b74ab67973 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -131,6 +131,12 @@ struct scan_control {
/* Always discard instead of demoting to lower tier memory */
unsigned int no_demotion:1;
+#ifdef CONFIG_LRU_GEN
+ /* help kswapd make better choices among multiple memcgs */
+ unsigned int memcgs_need_aging:1;
+ unsigned long last_reclaimed;
+#endif
+
/* Allocation order */
s8 order;
@@ -4429,6 +4435,19 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
VM_WARN_ON_ONCE(!current_is_kswapd());
+ sc->last_reclaimed = sc->nr_reclaimed;
+
+ /*
+ * To reduce the chance of going into the aging path, which can be
+ * costly, optimistically skip it if the flag below was cleared in the
+ * eviction path. This improves the overall performance when multiple
+ * memcgs are available.
+ */
+ if (!sc->memcgs_need_aging) {
+ sc->memcgs_need_aging = true;
+ return;
+ }
+
set_mm_walk(pgdat);
memcg = mem_cgroup_iter(NULL, NULL, NULL);
@@ -4840,7 +4859,8 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
return scanned;
}
-static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
+static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+ bool *need_swapping)
{
int type;
int scanned;
@@ -4903,6 +4923,9 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
sc->nr_reclaimed += reclaimed;
+ if (need_swapping && type == LRU_GEN_ANON)
+ *need_swapping = true;
+
return scanned;
}
@@ -4912,9 +4935,8 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
* reclaim.
*/
static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
- bool can_swap)
+ bool can_swap, bool *need_aging)
{
- bool need_aging;
unsigned long nr_to_scan;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
@@ -4924,8 +4946,8 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
(mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
return 0;
- need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
- if (!need_aging)
+ *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
+ if (!*need_aging)
return nr_to_scan;
/* skip the aging path at the default priority */
@@ -4942,10 +4964,67 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
}
+static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq,
+ struct scan_control *sc, bool need_swapping)
+{
+ int i;
+ DEFINE_MAX_SEQ(lruvec);
+
+ if (!current_is_kswapd()) {
+ /* age each memcg at most once to ensure fairness */
+ if (max_seq - seq > 1)
+ return true;
+
+ /* over-swapping can increase allocation latency */
+ if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
+ return true;
+
+ /* give this thread a chance to exit and free its memory */
+ if (fatal_signal_pending(current)) {
+ sc->nr_reclaimed += MIN_LRU_BATCH;
+ return true;
+ }
+
+ if (cgroup_reclaim(sc))
+ return false;
+ } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim)
+ return false;
+
+ /* keep scanning at low priorities to ensure fairness */
+ if (sc->priority > DEF_PRIORITY - 2)
+ return false;
+
+ /*
+ * A minimum amount of work was done under global memory pressure. For
+ * kswapd, it may be overshooting. For direct reclaim, the allocation
+ * may succeed if all suitable zones are somewhat safe. In either case,
+ * it's better to stop now, and restart later if necessary.
+ */
+ for (i = 0; i <= sc->reclaim_idx; i++) {
+ unsigned long wmark;
+ struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
+
+ if (!managed_zone(zone))
+ continue;
+
+ wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone);
+ if (wmark > zone_page_state(zone, NR_FREE_PAGES))
+ return false;
+ }
+
+ sc->nr_reclaimed += MIN_LRU_BATCH;
+
+ return true;
+}
+
static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
struct blk_plug plug;
+ bool need_aging = false;
+ bool need_swapping = false;
unsigned long scanned = 0;
+ unsigned long reclaimed = sc->nr_reclaimed;
+ DEFINE_MAX_SEQ(lruvec);
lru_add_drain();
@@ -4965,21 +5044,28 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
else
swappiness = 0;
- nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
if (!nr_to_scan)
- break;
+ goto done;
- delta = evict_folios(lruvec, sc, swappiness);
+ delta = evict_folios(lruvec, sc, swappiness, &need_swapping);
if (!delta)
- break;
+ goto done;
scanned += delta;
if (scanned >= nr_to_scan)
break;
+ if (should_abort_scan(lruvec, max_seq, sc, need_swapping))
+ break;
+
cond_resched();
}
+ /* see the comment in lru_gen_age_node() */
+ if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
+ sc->memcgs_need_aging = false;
+done:
clear_mm_walk();
blk_finish_plug(&plug);
--
2.17.1

View File

@ -0,0 +1,490 @@
From ef61bb3622ee0f36e055dfd5006badff08f5ce61 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Thu, 27 Jan 2022 19:52:09 -0700
Subject: [PATCH 10/14] mm: multi-gen LRU: kill switch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Add /sys/kernel/mm/lru_gen/enabled as a kill switch. Components that
can be disabled include:
0x0001: the multi-gen LRU core
0x0002: walking page table, when arch_has_hw_pte_young() returns
true
0x0004: clearing the accessed bit in non-leaf PMD entries, when
CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y
[yYnN]: apply to all the components above
E.g.,
echo y >/sys/kernel/mm/lru_gen/enabled
cat /sys/kernel/mm/lru_gen/enabled
0x0007
echo 5 >/sys/kernel/mm/lru_gen/enabled
cat /sys/kernel/mm/lru_gen/enabled
0x0005
NB: the page table walks happen on the scale of seconds under heavy
memory pressure, in which case the mmap_lock contention is a lesser
concern, compared with the LRU lock contention and the I/O congestion.
So far the only well-known case of the mmap_lock contention happens on
Android, due to Scudo [1] which allocates several thousand VMAs for
merely a few hundred MBs. The SPF and the Maple Tree also have
provided their own assessments [2][3]. However, if walking page tables
does worsen the mmap_lock contention, the kill switch can be used to
disable it. In this case the multi-gen LRU will suffer a minor
performance degradation, as shown previously.
Clearing the accessed bit in non-leaf PMD entries can also be
disabled, since this behavior was not tested on x86 varieties other
than Intel and AMD.
[1] https://source.android.com/devices/tech/debug/scudo
[2] https://lore.kernel.org/r/20220128131006.67712-1-michel@lespinasse.org/
[3] https://lore.kernel.org/r/20220426150616.3937571-1-Liam.Howlett@oracle.com/
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I4c909618e8fed7fb1337f6624bbe542ec920a515
---
include/linux/cgroup.h | 15 ++-
include/linux/mm_inline.h | 15 ++-
include/linux/mmzone.h | 9 ++
kernel/cgroup/cgroup-internal.h | 1 -
mm/Kconfig | 6 +
mm/vmscan.c | 228 +++++++++++++++++++++++++++++++-
6 files changed, 265 insertions(+), 9 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index ac5d0515680e..9179463c3c9f 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgroup *cgrp)
css_put(&cgrp->self);
}
+extern struct mutex cgroup_mutex;
+
+static inline void cgroup_lock(void)
+{
+ mutex_lock(&cgroup_mutex);
+}
+
+static inline void cgroup_unlock(void)
+{
+ mutex_unlock(&cgroup_mutex);
+}
+
/**
* task_css_set_check - obtain a task's css_set with extra access conditions
* @task: the task to obtain css_set for
@@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgroup *cgrp)
* as locks used during the cgroup_subsys::attach() methods.
*/
#ifdef CONFIG_PROVE_RCU
-extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
#define task_css_set_check(task, __c) \
rcu_dereference_check((task)->cgroups, \
@@ -708,6 +719,8 @@ struct cgroup;
static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
static inline void css_get(struct cgroup_subsys_state *css) {}
static inline void css_put(struct cgroup_subsys_state *css) {}
+static inline void cgroup_lock(void) {}
+static inline void cgroup_unlock(void) {}
static inline int cgroup_attach_task_all(struct task_struct *from,
struct task_struct *t) { return 0; }
static inline int cgroupstats_build(struct cgroupstats *stats,
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index f2b2296a42f9..4949eda9a9a2 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -106,10 +106,21 @@ static __always_inline enum lru_list folio_lru_list(struct folio *folio)
#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_GEN_ENABLED
static inline bool lru_gen_enabled(void)
{
- return true;
+ DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]);
+
+ return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]);
+}
+#else
+static inline bool lru_gen_enabled(void)
+{
+ DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]);
+
+ return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]);
}
+#endif
static inline bool lru_gen_in_fault(void)
{
@@ -222,7 +233,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio,
VM_WARN_ON_ONCE_FOLIO(gen != -1, folio);
- if (folio_test_unevictable(folio))
+ if (folio_test_unevictable(folio) || !lrugen->enabled)
return false;
/*
* There are three common cases for this page:
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e5cf37dc41a4..39bca2e420b7 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -384,6 +384,13 @@ enum {
LRU_GEN_FILE,
};
+enum {
+ LRU_GEN_CORE,
+ LRU_GEN_MM_WALK,
+ LRU_GEN_NONLEAF_YOUNG,
+ NR_LRU_GEN_CAPS
+};
+
#define MIN_LRU_BATCH BITS_PER_LONG
#define MAX_LRU_BATCH (MIN_LRU_BATCH * 64)
@@ -425,6 +432,8 @@ struct lru_gen_struct {
/* can be modified without holding the LRU lock */
atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
+ /* whether the multi-gen LRU is enabled */
+ bool enabled;
};
enum {
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 36b740cb3d59..63dc3e82be4f 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -164,7 +164,6 @@ struct cgroup_mgctx {
#define DEFINE_CGROUP_MGCTX(name) \
struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
-extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
extern struct cgroup_subsys *cgroup_subsys[];
extern struct list_head cgroup_roots;
diff --git a/mm/Kconfig b/mm/Kconfig
index 5101dca8f21c..6c86849c4db9 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1133,6 +1133,12 @@ config LRU_GEN
help
A high performance LRU implementation to overcommit memory.
+config LRU_GEN_ENABLED
+ bool "Enable by default"
+ depends on LRU_GEN
+ help
+ This option enables the multi-gen LRU by default.
+
config LRU_GEN_STATS
bool "Full stats for debugging"
depends on LRU_GEN
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e7b74ab67973..ea3d497019ab 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -51,6 +51,7 @@
#include <linux/psi.h>
#include <linux/pagewalk.h>
#include <linux/shmem_fs.h>
+#include <linux/ctype.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -3070,6 +3071,14 @@ static bool can_age_anon_pages(struct pglist_data *pgdat,
#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_GEN_ENABLED
+DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS);
+#define get_cap(cap) static_branch_likely(&lru_gen_caps[cap])
+#else
+DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
+#define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap])
+#endif
+
/******************************************************************************
* shorthand helpers
******************************************************************************/
@@ -3946,7 +3955,8 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area
goto next;
if (!pmd_trans_huge(pmd[i])) {
- if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
+ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
+ get_cap(LRU_GEN_NONLEAF_YOUNG))
pmdp_test_and_clear_young(vma, addr, pmd + i);
goto next;
}
@@ -4044,10 +4054,12 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
walk->mm_stats[MM_NONLEAF_TOTAL]++;
#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
- if (!pmd_young(val))
- continue;
+ if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
+ if (!pmd_young(val))
+ continue;
- walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
+ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
+ }
#endif
if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
continue;
@@ -4309,7 +4321,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
* handful of PTEs. Spreading the work out over a period of time usually
* is less efficient, but it avoids bursty page faults.
*/
- if (!arch_has_hw_pte_young()) {
+ if (!(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
success = iterate_mm_list_nowalk(lruvec, max_seq);
goto done;
}
@@ -5071,6 +5083,208 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
blk_finish_plug(&plug);
}
+/******************************************************************************
+ * state change
+ ******************************************************************************/
+
+static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
+{
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ if (lrugen->enabled) {
+ enum lru_list lru;
+
+ for_each_evictable_lru(lru) {
+ if (!list_empty(&lruvec->lists[lru]))
+ return false;
+ }
+ } else {
+ int gen, type, zone;
+
+ for_each_gen_type_zone(gen, type, zone) {
+ if (!list_empty(&lrugen->lists[gen][type][zone]))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static bool fill_evictable(struct lruvec *lruvec)
+{
+ enum lru_list lru;
+ int remaining = MAX_LRU_BATCH;
+
+ for_each_evictable_lru(lru) {
+ int type = is_file_lru(lru);
+ bool active = is_active_lru(lru);
+ struct list_head *head = &lruvec->lists[lru];
+
+ while (!list_empty(head)) {
+ bool success;
+ struct folio *folio = lru_to_folio(head);
+
+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio) != active, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_lru_gen(folio) != -1, folio);
+
+ lruvec_del_folio(lruvec, folio);
+ success = lru_gen_add_folio(lruvec, folio, false);
+ VM_WARN_ON_ONCE(!success);
+
+ if (!--remaining)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static bool drain_evictable(struct lruvec *lruvec)
+{
+ int gen, type, zone;
+ int remaining = MAX_LRU_BATCH;
+
+ for_each_gen_type_zone(gen, type, zone) {
+ struct list_head *head = &lruvec->lrugen.lists[gen][type][zone];
+
+ while (!list_empty(head)) {
+ bool success;
+ struct folio *folio = lru_to_folio(head);
+
+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
+
+ success = lru_gen_del_folio(lruvec, folio, false);
+ VM_WARN_ON_ONCE(!success);
+ lruvec_add_folio(lruvec, folio);
+
+ if (!--remaining)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static void lru_gen_change_state(bool enabled)
+{
+ static DEFINE_MUTEX(state_mutex);
+
+ struct mem_cgroup *memcg;
+
+ cgroup_lock();
+ cpus_read_lock();
+ get_online_mems();
+ mutex_lock(&state_mutex);
+
+ if (enabled == lru_gen_enabled())
+ goto unlock;
+
+ if (enabled)
+ static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
+ else
+ static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ int nid;
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ if (!lruvec)
+ continue;
+
+ spin_lock_irq(&lruvec->lru_lock);
+
+ VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
+ VM_WARN_ON_ONCE(!state_is_valid(lruvec));
+
+ lruvec->lrugen.enabled = enabled;
+
+ while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) {
+ spin_unlock_irq(&lruvec->lru_lock);
+ cond_resched();
+ spin_lock_irq(&lruvec->lru_lock);
+ }
+
+ spin_unlock_irq(&lruvec->lru_lock);
+ }
+
+ cond_resched();
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+unlock:
+ mutex_unlock(&state_mutex);
+ put_online_mems();
+ cpus_read_unlock();
+ cgroup_unlock();
+}
+
+/******************************************************************************
+ * sysfs interface
+ ******************************************************************************/
+
+static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ unsigned int caps = 0;
+
+ if (get_cap(LRU_GEN_CORE))
+ caps |= BIT(LRU_GEN_CORE);
+
+ if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
+ caps |= BIT(LRU_GEN_MM_WALK);
+
+ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG))
+ caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
+
+ return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
+}
+
+static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ int i;
+ unsigned int caps;
+
+ if (tolower(*buf) == 'n')
+ caps = 0;
+ else if (tolower(*buf) == 'y')
+ caps = -1;
+ else if (kstrtouint(buf, 0, &caps))
+ return -EINVAL;
+
+ for (i = 0; i < NR_LRU_GEN_CAPS; i++) {
+ bool enabled = caps & BIT(i);
+
+ if (i == LRU_GEN_CORE)
+ lru_gen_change_state(enabled);
+ else if (enabled)
+ static_branch_enable(&lru_gen_caps[i]);
+ else
+ static_branch_disable(&lru_gen_caps[i]);
+ }
+
+ return len;
+}
+
+static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
+ enabled, 0644, show_enabled, store_enabled
+);
+
+static struct attribute *lru_gen_attrs[] = {
+ &lru_gen_enabled_attr.attr,
+ NULL
+};
+
+static struct attribute_group lru_gen_attr_group = {
+ .name = "lru_gen",
+ .attrs = lru_gen_attrs,
+};
+
/******************************************************************************
* initialization
******************************************************************************/
@@ -5081,6 +5295,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
struct lru_gen_struct *lrugen = &lruvec->lrugen;
lrugen->max_seq = MIN_NR_GENS + 1;
+ lrugen->enabled = lru_gen_enabled();
for_each_gen_type_zone(gen, type, zone)
INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
@@ -5120,6 +5335,9 @@ static int __init init_lru_gen(void)
BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
+ if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
+ pr_err("lru_gen: failed to create sysfs group\n");
+
return 0;
};
late_initcall(init_lru_gen);
--
2.17.1

View File

@ -0,0 +1,209 @@
From 9d92c76fb8ac09ff195024139575d8c4db66b672 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Thu, 27 Jan 2022 20:08:50 -0700
Subject: [PATCH 11/14] mm: multi-gen LRU: thrashing prevention
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Add /sys/kernel/mm/lru_gen/min_ttl_ms for thrashing prevention, as
requested by many desktop users [1].
When set to value N, it prevents the working set of N milliseconds
from getting evicted. The OOM killer is triggered if this working set
cannot be kept in memory. Based on the average human detectable lag
(~100ms), N=1000 usually eliminates intolerable lags due to thrashing.
Larger values like N=3000 make lags less noticeable at the risk of
premature OOM kills.
Compared with the size-based approach [2], this time-based approach
has the following advantages:
1. It is easier to configure because it is agnostic to applications
and memory sizes.
2. It is more reliable because it is directly wired to the OOM killer.
[1] https://lore.kernel.org/r/Ydza%2FzXKY9ATRoh6@google.com/
[2] https://lore.kernel.org/r/20101028191523.GA14972@google.com/
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I007499d7e47374b59fd620e8c3962940bc9f788e
---
include/linux/mmzone.h | 2 ++
mm/vmscan.c | 74 ++++++++++++++++++++++++++++++++++++++++--
2 files changed, 73 insertions(+), 3 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 39bca2e420b7..0c502618b37b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -419,6 +419,8 @@ struct lru_gen_struct {
unsigned long max_seq;
/* the eviction increments the oldest generation numbers */
unsigned long min_seq[ANON_AND_FILE];
+ /* the birth time of each generation in jiffies */
+ unsigned long timestamps[MAX_NR_GENS];
/* the multi-gen LRU lists, lazily sorted on eviction */
struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
/* the multi-gen LRU sizes, eventually consistent */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ea3d497019ab..0df253819edc 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4293,6 +4293,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap)
for (type = 0; type < ANON_AND_FILE; type++)
reset_ctrl_pos(lruvec, type, false);
+ WRITE_ONCE(lrugen->timestamps[next], jiffies);
/* make sure preceding modifications appear */
smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
@@ -4420,7 +4421,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsig
return false;
}
-static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl)
{
bool need_aging;
unsigned long nr_to_scan;
@@ -4434,16 +4435,36 @@ static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc)
mem_cgroup_calculate_protection(NULL, memcg);
if (mem_cgroup_below_min(memcg))
- return;
+ return false;
need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
+
+ if (min_ttl) {
+ int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
+ unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
+
+ if (time_is_after_jiffies(birth + min_ttl))
+ return false;
+
+ /* the size is likely too small to be helpful */
+ if (!nr_to_scan && sc->priority != DEF_PRIORITY)
+ return false;
+ }
+
if (need_aging)
try_to_inc_max_seq(lruvec, max_seq, sc, swappiness);
+
+ return true;
}
+/* to protect the working set of the last N jiffies */
+static unsigned long lru_gen_min_ttl __read_mostly;
+
static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
{
struct mem_cgroup *memcg;
+ bool success = false;
+ unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
VM_WARN_ON_ONCE(!current_is_kswapd());
@@ -4466,12 +4487,32 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
do {
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
- age_lruvec(lruvec, sc);
+ if (age_lruvec(lruvec, sc, min_ttl))
+ success = true;
cond_resched();
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
clear_mm_walk();
+
+ /* check the order to exclude compaction-induced reclaim */
+ if (success || !min_ttl || sc->order)
+ return;
+
+ /*
+ * The main goal is to OOM kill if every generation from all memcgs is
+ * younger than min_ttl. However, another possibility is all memcgs are
+ * either below min or empty.
+ */
+ if (mutex_trylock(&oom_lock)) {
+ struct oom_control oc = {
+ .gfp_mask = sc->gfp_mask,
+ };
+
+ out_of_memory(&oc);
+
+ mutex_unlock(&oom_lock);
+ }
}
/*
@@ -5228,6 +5269,28 @@ static void lru_gen_change_state(bool enabled)
* sysfs interface
******************************************************************************/
+static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
+}
+
+static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ unsigned int msecs;
+
+ if (kstrtouint(buf, 0, &msecs))
+ return -EINVAL;
+
+ WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs));
+
+ return len;
+}
+
+static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR(
+ min_ttl_ms, 0644, show_min_ttl, store_min_ttl
+);
+
static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
unsigned int caps = 0;
@@ -5276,6 +5339,7 @@ static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
);
static struct attribute *lru_gen_attrs[] = {
+ &lru_gen_min_ttl_attr.attr,
&lru_gen_enabled_attr.attr,
NULL
};
@@ -5291,12 +5355,16 @@ static struct attribute_group lru_gen_attr_group = {
void lru_gen_init_lruvec(struct lruvec *lruvec)
{
+ int i;
int gen, type, zone;
struct lru_gen_struct *lrugen = &lruvec->lrugen;
lrugen->max_seq = MIN_NR_GENS + 1;
lrugen->enabled = lru_gen_enabled();
+ for (i = 0; i <= MIN_NR_GENS + 1; i++)
+ lrugen->timestamps[i] = jiffies;
+
for_each_gen_type_zone(gen, type, zone)
INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
--
2.17.1

View File

@ -0,0 +1,564 @@
From d1e0e5fcdea16d4ceead496a0ea2fdbb6bc5bfe4 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Thu, 27 Jan 2022 20:12:41 -0700
Subject: [PATCH 12/14] mm: multi-gen LRU: debugfs interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Add /sys/kernel/debug/lru_gen for working set estimation and proactive
reclaim. These techniques are commonly used to optimize job scheduling
(bin packing) in data centers [1][2].
Compared with the page table-based approach and the PFN-based
approach, this lruvec-based approach has the following advantages:
1. It offers better choices because it is aware of memcgs, NUMA nodes,
shared mappings and unmapped page cache.
2. It is more scalable because it is O(nr_hot_pages), whereas the
PFN-based approach is O(nr_total_pages).
Add /sys/kernel/debug/lru_gen_full for debugging.
[1] https://dl.acm.org/doi/10.1145/3297858.3304053
[2] https://dl.acm.org/doi/10.1145/3503222.3507731
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Qi Zheng <zhengqi.arch@bytedance.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I7bb06f14e0a94901a076cc3767d0855d4f1ea3ab
---
include/linux/nodemask.h | 1 +
mm/vmscan.c | 411 ++++++++++++++++++++++++++++++++++++++-
2 files changed, 402 insertions(+), 10 deletions(-)
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 4b71a96190a8..3a0eec9f2faa 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -493,6 +493,7 @@ static inline int num_node_state(enum node_states state)
#define first_online_node 0
#define first_memory_node 0
#define next_online_node(nid) (MAX_NUMNODES)
+#define next_memory_node(nid) (MAX_NUMNODES)
#define nr_node_ids 1U
#define nr_online_nodes 1U
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0df253819edc..3e7aad06299b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -52,6 +52,7 @@
#include <linux/pagewalk.h>
#include <linux/shmem_fs.h>
#include <linux/ctype.h>
+#include <linux/debugfs.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -4197,12 +4198,40 @@ static void clear_mm_walk(void)
kfree(walk);
}
-static void inc_min_seq(struct lruvec *lruvec, int type)
+static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
{
+ int zone;
+ int remaining = MAX_LRU_BATCH;
struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
+
+ if (type == LRU_GEN_ANON && !can_swap)
+ goto done;
+
+ /* prevent cold/hot inversion if force_scan is true */
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+ struct list_head *head = &lrugen->lists[old_gen][type][zone];
+
+ while (!list_empty(head)) {
+ struct folio *folio = lru_to_folio(head);
+
+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
+ new_gen = folio_inc_gen(lruvec, folio, false);
+ list_move_tail(&folio->lru, &lrugen->lists[new_gen][type][zone]);
+
+ if (!--remaining)
+ return false;
+ }
+ }
+done:
reset_ctrl_pos(lruvec, type, true);
WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
+
+ return true;
}
static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
@@ -4248,7 +4277,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
return success;
}
-static void inc_max_seq(struct lruvec *lruvec, bool can_swap)
+static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
{
int prev, next;
int type, zone;
@@ -4262,9 +4291,13 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap)
if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
continue;
- VM_WARN_ON_ONCE(type == LRU_GEN_FILE || can_swap);
+ VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap));
- inc_min_seq(lruvec, type);
+ while (!inc_min_seq(lruvec, type, can_swap)) {
+ spin_unlock_irq(&lruvec->lru_lock);
+ cond_resched();
+ spin_lock_irq(&lruvec->lru_lock);
+ }
}
/*
@@ -4301,7 +4334,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap)
}
static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
- struct scan_control *sc, bool can_swap)
+ struct scan_control *sc, bool can_swap, bool force_scan)
{
bool success;
struct lru_gen_mm_walk *walk;
@@ -4322,7 +4355,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
* handful of PTEs. Spreading the work out over a period of time usually
* is less efficient, but it avoids bursty page faults.
*/
- if (!(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
+ if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
success = iterate_mm_list_nowalk(lruvec, max_seq);
goto done;
}
@@ -4336,7 +4369,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
walk->lruvec = lruvec;
walk->max_seq = max_seq;
walk->can_swap = can_swap;
- walk->force_scan = false;
+ walk->force_scan = force_scan;
do {
success = iterate_mm_list(lruvec, walk, &mm);
@@ -4356,7 +4389,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
- inc_max_seq(lruvec, can_swap);
+ inc_max_seq(lruvec, can_swap, force_scan);
/* either this sees any waiters or they will see updated max_seq */
if (wq_has_sleeper(&lruvec->mm_state.wait))
wake_up_all(&lruvec->mm_state.wait);
@@ -4452,7 +4485,7 @@ static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned
}
if (need_aging)
- try_to_inc_max_seq(lruvec, max_seq, sc, swappiness);
+ try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false);
return true;
}
@@ -5011,7 +5044,7 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
if (current_is_kswapd())
return 0;
- if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap))
+ if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false))
return nr_to_scan;
done:
return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
@@ -5349,6 +5382,361 @@ static struct attribute_group lru_gen_attr_group = {
.attrs = lru_gen_attrs,
};
+/******************************************************************************
+ * debugfs interface
+ ******************************************************************************/
+
+static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos)
+{
+ struct mem_cgroup *memcg;
+ loff_t nr_to_skip = *pos;
+
+ m->private = kvmalloc(PATH_MAX, GFP_KERNEL);
+ if (!m->private)
+ return ERR_PTR(-ENOMEM);
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY) {
+ if (!nr_to_skip--)
+ return get_lruvec(memcg, nid);
+ }
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+
+ return NULL;
+}
+
+static void lru_gen_seq_stop(struct seq_file *m, void *v)
+{
+ if (!IS_ERR_OR_NULL(v))
+ mem_cgroup_iter_break(NULL, lruvec_memcg(v));
+
+ kvfree(m->private);
+ m->private = NULL;
+}
+
+static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ int nid = lruvec_pgdat(v)->node_id;
+ struct mem_cgroup *memcg = lruvec_memcg(v);
+
+ ++*pos;
+
+ nid = next_memory_node(nid);
+ if (nid == MAX_NUMNODES) {
+ memcg = mem_cgroup_iter(NULL, memcg, NULL);
+ if (!memcg)
+ return NULL;
+
+ nid = first_memory_node;
+ }
+
+ return get_lruvec(memcg, nid);
+}
+
+static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
+ unsigned long max_seq, unsigned long *min_seq,
+ unsigned long seq)
+{
+ int i;
+ int type, tier;
+ int hist = lru_hist_from_seq(seq);
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ for (tier = 0; tier < MAX_NR_TIERS; tier++) {
+ seq_printf(m, " %10d", tier);
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ const char *s = " ";
+ unsigned long n[3] = {};
+
+ if (seq == max_seq) {
+ s = "RT ";
+ n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
+ n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
+ } else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
+ s = "rep";
+ n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+ n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
+ if (tier)
+ n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]);
+ }
+
+ for (i = 0; i < 3; i++)
+ seq_printf(m, " %10lu%c", n[i], s[i]);
+ }
+ seq_putc(m, '\n');
+ }
+
+ seq_puts(m, " ");
+ for (i = 0; i < NR_MM_STATS; i++) {
+ const char *s = " ";
+ unsigned long n = 0;
+
+ if (seq == max_seq && NR_HIST_GENS == 1) {
+ s = "LOYNFA";
+ n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
+ } else if (seq != max_seq && NR_HIST_GENS > 1) {
+ s = "loynfa";
+ n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
+ }
+
+ seq_printf(m, " %10lu%c", n, s[i]);
+ }
+ seq_putc(m, '\n');
+}
+
+static int lru_gen_seq_show(struct seq_file *m, void *v)
+{
+ unsigned long seq;
+ bool full = !debugfs_real_fops(m->file)->write;
+ struct lruvec *lruvec = v;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ int nid = lruvec_pgdat(lruvec)->node_id;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MAX_SEQ(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (nid == first_memory_node) {
+ const char *path = memcg ? m->private : "";
+
+#ifdef CONFIG_MEMCG
+ if (memcg)
+ cgroup_path(memcg->css.cgroup, m->private, PATH_MAX);
+#endif
+ seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path);
+ }
+
+ seq_printf(m, " node %5d\n", nid);
+
+ if (!full)
+ seq = min_seq[LRU_GEN_ANON];
+ else if (max_seq >= MAX_NR_GENS)
+ seq = max_seq - MAX_NR_GENS + 1;
+ else
+ seq = 0;
+
+ for (; seq <= max_seq; seq++) {
+ int type, zone;
+ int gen = lru_gen_from_seq(seq);
+ unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
+
+ seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth));
+
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ unsigned long size = 0;
+ char mark = full && seq < min_seq[type] ? 'x' : ' ';
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
+ size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
+
+ seq_printf(m, " %10lu%c", size, mark);
+ }
+
+ seq_putc(m, '\n');
+
+ if (full)
+ lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq);
+ }
+
+ return 0;
+}
+
+static const struct seq_operations lru_gen_seq_ops = {
+ .start = lru_gen_seq_start,
+ .stop = lru_gen_seq_stop,
+ .next = lru_gen_seq_next,
+ .show = lru_gen_seq_show,
+};
+
+static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
+ bool can_swap, bool force_scan)
+{
+ DEFINE_MAX_SEQ(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (seq < max_seq)
+ return 0;
+
+ if (seq > max_seq)
+ return -EINVAL;
+
+ if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq)
+ return -ERANGE;
+
+ try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan);
+
+ return 0;
+}
+
+static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
+ int swappiness, unsigned long nr_to_reclaim)
+{
+ DEFINE_MAX_SEQ(lruvec);
+
+ if (seq + MIN_NR_GENS > max_seq)
+ return -EINVAL;
+
+ sc->nr_reclaimed = 0;
+
+ while (!signal_pending(current)) {
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (seq < min_seq[!swappiness])
+ return 0;
+
+ if (sc->nr_reclaimed >= nr_to_reclaim)
+ return 0;
+
+ if (!evict_folios(lruvec, sc, swappiness, NULL))
+ return 0;
+
+ cond_resched();
+ }
+
+ return -EINTR;
+}
+
+static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
+ struct scan_control *sc, int swappiness, unsigned long opt)
+{
+ struct lruvec *lruvec;
+ int err = -EINVAL;
+ struct mem_cgroup *memcg = NULL;
+
+ if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY))
+ return -EINVAL;
+
+ if (!mem_cgroup_disabled()) {
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(memcg_id);
+#ifdef CONFIG_MEMCG
+ if (memcg && !css_tryget(&memcg->css))
+ memcg = NULL;
+#endif
+ rcu_read_unlock();
+
+ if (!memcg)
+ return -EINVAL;
+ }
+
+ if (memcg_id != mem_cgroup_id(memcg))
+ goto done;
+
+ lruvec = get_lruvec(memcg, nid);
+
+ if (swappiness < 0)
+ swappiness = get_swappiness(lruvec, sc);
+ else if (swappiness > 200)
+ goto done;
+
+ switch (cmd) {
+ case '+':
+ err = run_aging(lruvec, seq, sc, swappiness, opt);
+ break;
+ case '-':
+ err = run_eviction(lruvec, seq, sc, swappiness, opt);
+ break;
+ }
+done:
+ mem_cgroup_put(memcg);
+
+ return err;
+}
+
+static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
+ size_t len, loff_t *pos)
+{
+ void *buf;
+ char *cur, *next;
+ unsigned int flags;
+ struct blk_plug plug;
+ int err = -EINVAL;
+ struct scan_control sc = {
+ .may_writepage = true,
+ .may_unmap = true,
+ .may_swap = true,
+ .reclaim_idx = MAX_NR_ZONES - 1,
+ .gfp_mask = GFP_KERNEL,
+ };
+
+ buf = kvmalloc(len + 1, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ if (copy_from_user(buf, src, len)) {
+ kvfree(buf);
+ return -EFAULT;
+ }
+
+ set_task_reclaim_state(current, &sc.reclaim_state);
+ flags = memalloc_noreclaim_save();
+ blk_start_plug(&plug);
+ if (!set_mm_walk(NULL)) {
+ err = -ENOMEM;
+ goto done;
+ }
+
+ next = buf;
+ next[len] = '\0';
+
+ while ((cur = strsep(&next, ",;\n"))) {
+ int n;
+ int end;
+ char cmd;
+ unsigned int memcg_id;
+ unsigned int nid;
+ unsigned long seq;
+ unsigned int swappiness = -1;
+ unsigned long opt = -1;
+
+ cur = skip_spaces(cur);
+ if (!*cur)
+ continue;
+
+ n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
+ &seq, &end, &swappiness, &end, &opt, &end);
+ if (n < 4 || cur[end]) {
+ err = -EINVAL;
+ break;
+ }
+
+ err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt);
+ if (err)
+ break;
+ }
+done:
+ clear_mm_walk();
+ blk_finish_plug(&plug);
+ memalloc_noreclaim_restore(flags);
+ set_task_reclaim_state(current, NULL);
+
+ kvfree(buf);
+
+ return err ? : len;
+}
+
+static int lru_gen_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &lru_gen_seq_ops);
+}
+
+static const struct file_operations lru_gen_rw_fops = {
+ .open = lru_gen_seq_open,
+ .read = seq_read,
+ .write = lru_gen_seq_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct file_operations lru_gen_ro_fops = {
+ .open = lru_gen_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
/******************************************************************************
* initialization
******************************************************************************/
@@ -5406,6 +5794,9 @@ static int __init init_lru_gen(void)
if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
pr_err("lru_gen: failed to create sysfs group\n");
+ debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
+ debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
+
return 0;
};
late_initcall(init_lru_gen);
--
2.17.1

View File

@ -0,0 +1,265 @@
From 22199c9b30ffcc332be643577709a2af960e6786 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 23 Jan 2022 16:44:43 -0700
Subject: [PATCH 13/14] mm: multi-gen LRU: admin guide
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Add an admin guide.
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I1902178bcbb5adfa0a748c4d284a6456059bdd7e
---
Documentation/admin-guide/mm/index.rst | 1 +
Documentation/admin-guide/mm/multigen_lru.rst | 162 ++++++++++++++++++
mm/Kconfig | 3 +-
mm/vmscan.c | 4 +
4 files changed, 169 insertions(+), 1 deletion(-)
create mode 100644 Documentation/admin-guide/mm/multigen_lru.rst
diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst
index 1bd11118dfb1..d1064e0ba34a 100644
--- a/Documentation/admin-guide/mm/index.rst
+++ b/Documentation/admin-guide/mm/index.rst
@@ -32,6 +32,7 @@ the Linux memory management.
idle_page_tracking
ksm
memory-hotplug
+ multigen_lru
nommu-mmap
numa_memory_policy
numaperf
diff --git a/Documentation/admin-guide/mm/multigen_lru.rst b/Documentation/admin-guide/mm/multigen_lru.rst
new file mode 100644
index 000000000000..33e068830497
--- /dev/null
+++ b/Documentation/admin-guide/mm/multigen_lru.rst
@@ -0,0 +1,162 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=============
+Multi-Gen LRU
+=============
+The multi-gen LRU is an alternative LRU implementation that optimizes
+page reclaim and improves performance under memory pressure. Page
+reclaim decides the kernel's caching policy and ability to overcommit
+memory. It directly impacts the kswapd CPU usage and RAM efficiency.
+
+Quick start
+===========
+Build the kernel with the following configurations.
+
+* ``CONFIG_LRU_GEN=y``
+* ``CONFIG_LRU_GEN_ENABLED=y``
+
+All set!
+
+Runtime options
+===============
+``/sys/kernel/mm/lru_gen/`` contains stable ABIs described in the
+following subsections.
+
+Kill switch
+-----------
+``enabled`` accepts different values to enable or disable the
+following components. Its default value depends on
+``CONFIG_LRU_GEN_ENABLED``. All the components should be enabled
+unless some of them have unforeseen side effects. Writing to
+``enabled`` has no effect when a component is not supported by the
+hardware, and valid values will be accepted even when the main switch
+is off.
+
+====== ===============================================================
+Values Components
+====== ===============================================================
+0x0001 The main switch for the multi-gen LRU.
+0x0002 Clearing the accessed bit in leaf page table entries in large
+ batches, when MMU sets it (e.g., on x86). This behavior can
+ theoretically worsen lock contention (mmap_lock). If it is
+ disabled, the multi-gen LRU will suffer a minor performance
+ degradation for workloads that contiguously map hot pages,
+ whose accessed bits can be otherwise cleared by fewer larger
+ batches.
+0x0004 Clearing the accessed bit in non-leaf page table entries as
+ well, when MMU sets it (e.g., on x86). This behavior was not
+ verified on x86 varieties other than Intel and AMD. If it is
+ disabled, the multi-gen LRU will suffer a negligible
+ performance degradation.
+[yYnN] Apply to all the components above.
+====== ===============================================================
+
+E.g.,
+::
+
+ echo y >/sys/kernel/mm/lru_gen/enabled
+ cat /sys/kernel/mm/lru_gen/enabled
+ 0x0007
+ echo 5 >/sys/kernel/mm/lru_gen/enabled
+ cat /sys/kernel/mm/lru_gen/enabled
+ 0x0005
+
+Thrashing prevention
+--------------------
+Personal computers are more sensitive to thrashing because it can
+cause janks (lags when rendering UI) and negatively impact user
+experience. The multi-gen LRU offers thrashing prevention to the
+majority of laptop and desktop users who do not have ``oomd``.
+
+Users can write ``N`` to ``min_ttl_ms`` to prevent the working set of
+``N`` milliseconds from getting evicted. The OOM killer is triggered
+if this working set cannot be kept in memory. In other words, this
+option works as an adjustable pressure relief valve, and when open, it
+terminates applications that are hopefully not being used.
+
+Based on the average human detectable lag (~100ms), ``N=1000`` usually
+eliminates intolerable janks due to thrashing. Larger values like
+``N=3000`` make janks less noticeable at the risk of premature OOM
+kills.
+
+The default value ``0`` means disabled.
+
+Experimental features
+=====================
+``/sys/kernel/debug/lru_gen`` accepts commands described in the
+following subsections. Multiple command lines are supported, so does
+concatenation with delimiters ``,`` and ``;``.
+
+``/sys/kernel/debug/lru_gen_full`` provides additional stats for
+debugging. ``CONFIG_LRU_GEN_STATS=y`` keeps historical stats from
+evicted generations in this file.
+
+Working set estimation
+----------------------
+Working set estimation measures how much memory an application needs
+in a given time interval, and it is usually done with little impact on
+the performance of the application. E.g., data centers want to
+optimize job scheduling (bin packing) to improve memory utilizations.
+When a new job comes in, the job scheduler needs to find out whether
+each server it manages can allocate a certain amount of memory for
+this new job before it can pick a candidate. To do so, the job
+scheduler needs to estimate the working sets of the existing jobs.
+
+When it is read, ``lru_gen`` returns a histogram of numbers of pages
+accessed over different time intervals for each memcg and node.
+``MAX_NR_GENS`` decides the number of bins for each histogram. The
+histograms are noncumulative.
+::
+
+ memcg memcg_id memcg_path
+ node node_id
+ min_gen_nr age_in_ms nr_anon_pages nr_file_pages
+ ...
+ max_gen_nr age_in_ms nr_anon_pages nr_file_pages
+
+Each bin contains an estimated number of pages that have been accessed
+within ``age_in_ms``. E.g., ``min_gen_nr`` contains the coldest pages
+and ``max_gen_nr`` contains the hottest pages, since ``age_in_ms`` of
+the former is the largest and that of the latter is the smallest.
+
+Users can write the following command to ``lru_gen`` to create a new
+generation ``max_gen_nr+1``:
+
+ ``+ memcg_id node_id max_gen_nr [can_swap [force_scan]]``
+
+``can_swap`` defaults to the swap setting and, if it is set to ``1``,
+it forces the scan of anon pages when swap is off, and vice versa.
+``force_scan`` defaults to ``1`` and, if it is set to ``0``, it
+employs heuristics to reduce the overhead, which is likely to reduce
+the coverage as well.
+
+A typical use case is that a job scheduler runs this command at a
+certain time interval to create new generations, and it ranks the
+servers it manages based on the sizes of their cold pages defined by
+this time interval.
+
+Proactive reclaim
+-----------------
+Proactive reclaim induces page reclaim when there is no memory
+pressure. It usually targets cold pages only. E.g., when a new job
+comes in, the job scheduler wants to proactively reclaim cold pages on
+the server it selected, to improve the chance of successfully landing
+this new job.
+
+Users can write the following command to ``lru_gen`` to evict
+generations less than or equal to ``min_gen_nr``.
+
+ ``- memcg_id node_id min_gen_nr [swappiness [nr_to_reclaim]]``
+
+``min_gen_nr`` should be less than ``max_gen_nr-1``, since
+``max_gen_nr`` and ``max_gen_nr-1`` are not fully aged (equivalent to
+the active list) and therefore cannot be evicted. ``swappiness``
+overrides the default value in ``/proc/sys/vm/swappiness``.
+``nr_to_reclaim`` limits the number of pages to evict.
+
+A typical use case is that a job scheduler runs this command before it
+tries to land a new job on a server. If it fails to materialize enough
+cold pages because of the overestimation, it retries on the next
+server according to the ranking result obtained from the working set
+estimation step. This less forceful approach limits the impacts on the
+existing jobs.
diff --git a/mm/Kconfig b/mm/Kconfig
index 6c86849c4db9..96cd3ae25c6f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1131,7 +1131,8 @@ config LRU_GEN
# make sure folio->flags has enough spare bits
depends on 64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP
help
- A high performance LRU implementation to overcommit memory.
+ A high performance LRU implementation to overcommit memory. See
+ Documentation/admin-guide/mm/multigen_lru.rst for details.
config LRU_GEN_ENABLED
bool "Enable by default"
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3e7aad06299b..146a54cf1bd9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -5307,6 +5307,7 @@ static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, c
return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
}
+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t len)
{
@@ -5340,6 +5341,7 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c
return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
}
+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t len)
{
@@ -5487,6 +5489,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
seq_putc(m, '\n');
}
+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
static int lru_gen_seq_show(struct seq_file *m, void *v)
{
unsigned long seq;
@@ -5645,6 +5648,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
return err;
}
+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
size_t len, loff_t *pos)
{
--
2.17.1

View File

@ -0,0 +1,210 @@
From bd82a74f6b5c0a75ef61be5e9be34319bb17328f Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 6 Mar 2022 20:35:00 -0700
Subject: [PATCH 14/14] mm: multi-gen LRU: design doc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Add a design doc.
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I958afcabf5abc37b3e58f72638d35a349c31b98d
---
Documentation/mm/index.rst | 1 +
Documentation/mm/multigen_lru.rst | 159 ++++++++++++++++++++++++++++++
2 files changed, 160 insertions(+)
create mode 100644 Documentation/mm/multigen_lru.rst
diff --git a/Documentation/mm/index.rst b/Documentation/mm/index.rst
index 575ccd40e30c..4aa12b8be278 100644
--- a/Documentation/mm/index.rst
+++ b/Documentation/mm/index.rst
@@ -51,6 +51,7 @@ above structured documentation, or deleted if it has served its purpose.
ksm
memory-model
mmu_notifier
+ multigen_lru
numa
overcommit-accounting
page_migration
diff --git a/Documentation/mm/multigen_lru.rst b/Documentation/mm/multigen_lru.rst
new file mode 100644
index 000000000000..d7062c6a8946
--- /dev/null
+++ b/Documentation/mm/multigen_lru.rst
@@ -0,0 +1,159 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=============
+Multi-Gen LRU
+=============
+The multi-gen LRU is an alternative LRU implementation that optimizes
+page reclaim and improves performance under memory pressure. Page
+reclaim decides the kernel's caching policy and ability to overcommit
+memory. It directly impacts the kswapd CPU usage and RAM efficiency.
+
+Design overview
+===============
+Objectives
+----------
+The design objectives are:
+
+* Good representation of access recency
+* Try to profit from spatial locality
+* Fast paths to make obvious choices
+* Simple self-correcting heuristics
+
+The representation of access recency is at the core of all LRU
+implementations. In the multi-gen LRU, each generation represents a
+group of pages with similar access recency. Generations establish a
+(time-based) common frame of reference and therefore help make better
+choices, e.g., between different memcgs on a computer or different
+computers in a data center (for job scheduling).
+
+Exploiting spatial locality improves efficiency when gathering the
+accessed bit. A rmap walk targets a single page and does not try to
+profit from discovering a young PTE. A page table walk can sweep all
+the young PTEs in an address space, but the address space can be too
+sparse to make a profit. The key is to optimize both methods and use
+them in combination.
+
+Fast paths reduce code complexity and runtime overhead. Unmapped pages
+do not require TLB flushes; clean pages do not require writeback.
+These facts are only helpful when other conditions, e.g., access
+recency, are similar. With generations as a common frame of reference,
+additional factors stand out. But obvious choices might not be good
+choices; thus self-correction is necessary.
+
+The benefits of simple self-correcting heuristics are self-evident.
+Again, with generations as a common frame of reference, this becomes
+attainable. Specifically, pages in the same generation can be
+categorized based on additional factors, and a feedback loop can
+statistically compare the refault percentages across those categories
+and infer which of them are better choices.
+
+Assumptions
+-----------
+The protection of hot pages and the selection of cold pages are based
+on page access channels and patterns. There are two access channels:
+
+* Accesses through page tables
+* Accesses through file descriptors
+
+The protection of the former channel is by design stronger because:
+
+1. The uncertainty in determining the access patterns of the former
+ channel is higher due to the approximation of the accessed bit.
+2. The cost of evicting the former channel is higher due to the TLB
+ flushes required and the likelihood of encountering the dirty bit.
+3. The penalty of underprotecting the former channel is higher because
+ applications usually do not prepare themselves for major page
+ faults like they do for blocked I/O. E.g., GUI applications
+ commonly use dedicated I/O threads to avoid blocking rendering
+ threads.
+
+There are also two access patterns:
+
+* Accesses exhibiting temporal locality
+* Accesses not exhibiting temporal locality
+
+For the reasons listed above, the former channel is assumed to follow
+the former pattern unless ``VM_SEQ_READ`` or ``VM_RAND_READ`` is
+present, and the latter channel is assumed to follow the latter
+pattern unless outlying refaults have been observed.
+
+Workflow overview
+=================
+Evictable pages are divided into multiple generations for each
+``lruvec``. The youngest generation number is stored in
+``lrugen->max_seq`` for both anon and file types as they are aged on
+an equal footing. The oldest generation numbers are stored in
+``lrugen->min_seq[]`` separately for anon and file types as clean file
+pages can be evicted regardless of swap constraints. These three
+variables are monotonically increasing.
+
+Generation numbers are truncated into ``order_base_2(MAX_NR_GENS+1)``
+bits in order to fit into the gen counter in ``folio->flags``. Each
+truncated generation number is an index to ``lrugen->lists[]``. The
+sliding window technique is used to track at least ``MIN_NR_GENS`` and
+at most ``MAX_NR_GENS`` generations. The gen counter stores a value
+within ``[1, MAX_NR_GENS]`` while a page is on one of
+``lrugen->lists[]``; otherwise it stores zero.
+
+Each generation is divided into multiple tiers. A page accessed ``N``
+times through file descriptors is in tier ``order_base_2(N)``. Unlike
+generations, tiers do not have dedicated ``lrugen->lists[]``. In
+contrast to moving across generations, which requires the LRU lock,
+moving across tiers only involves atomic operations on
+``folio->flags`` and therefore has a negligible cost. A feedback loop
+modeled after the PID controller monitors refaults over all the tiers
+from anon and file types and decides which tiers from which types to
+evict or protect.
+
+There are two conceptually independent procedures: the aging and the
+eviction. They form a closed-loop system, i.e., the page reclaim.
+
+Aging
+-----
+The aging produces young generations. Given an ``lruvec``, it
+increments ``max_seq`` when ``max_seq-min_seq+1`` approaches
+``MIN_NR_GENS``. The aging promotes hot pages to the youngest
+generation when it finds them accessed through page tables; the
+demotion of cold pages happens consequently when it increments
+``max_seq``. The aging uses page table walks and rmap walks to find
+young PTEs. For the former, it iterates ``lruvec_memcg()->mm_list``
+and calls ``walk_page_range()`` with each ``mm_struct`` on this list
+to scan PTEs, and after each iteration, it increments ``max_seq``. For
+the latter, when the eviction walks the rmap and finds a young PTE,
+the aging scans the adjacent PTEs. For both, on finding a young PTE,
+the aging clears the accessed bit and updates the gen counter of the
+page mapped by this PTE to ``(max_seq%MAX_NR_GENS)+1``.
+
+Eviction
+--------
+The eviction consumes old generations. Given an ``lruvec``, it
+increments ``min_seq`` when ``lrugen->lists[]`` indexed by
+``min_seq%MAX_NR_GENS`` becomes empty. To select a type and a tier to
+evict from, it first compares ``min_seq[]`` to select the older type.
+If both types are equally old, it selects the one whose first tier has
+a lower refault percentage. The first tier contains single-use
+unmapped clean pages, which are the best bet. The eviction sorts a
+page according to its gen counter if the aging has found this page
+accessed through page tables and updated its gen counter. It also
+moves a page to the next generation, i.e., ``min_seq+1``, if this page
+was accessed multiple times through file descriptors and the feedback
+loop has detected outlying refaults from the tier this page is in. To
+this end, the feedback loop uses the first tier as the baseline, for
+the reason stated earlier.
+
+Summary
+-------
+The multi-gen LRU can be disassembled into the following parts:
+
+* Generations
+* Rmap walks
+* Page table walks
+* Bloom filters
+* PID controller
+
+The aging and the eviction form a producer-consumer model;
+specifically, the latter drives the former by the sliding window over
+generations. Within the aging, rmap walks drive page table walks by
+inserting hot densely populated page tables to the Bloom filters.
+Within the eviction, the PID controller uses refaults as the feedback
+to select types to evict and tiers to protect.
--
2.17.1