kernel: backport MGLRU to linux 6.0 for Tianling Shen

commit 351d4bb63b (parent 415a25a683)
@@ -0,0 +1,154 @@
From e3264035bdac67898d685423ffb2f3a9c3a5964a Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 4 Aug 2021 01:31:34 -0600
Subject: [PATCH 01/14] mm: x86, arm64: add arch_has_hw_pte_young()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some architectures automatically set the accessed bit in PTEs, e.g.,
x86 and arm64 v8.2. On architectures that do not have this capability,
clearing the accessed bit in a PTE usually triggers a page fault
following the TLB miss of this PTE (to emulate the accessed bit).

Being aware of this capability can help make better decisions, e.g.,
whether to spread the work out over a period of time to reduce bursty
page faults when trying to clear the accessed bit in many PTEs.

Note that theoretically this capability can be unreliable, e.g.,
hotplugged CPUs might be different from builtin ones. Therefore it
should not be used in architecture-independent code that involves
correctness, e.g., to determine whether TLB flushes are required (in
combination with the accessed bit).

Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Acked-by: Will Deacon <will@kernel.org>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: Ib49b44fb56df3333a2ff1fcc496fb1980b976e7a
---
 arch/arm64/include/asm/pgtable.h | 15 ++-------------
 arch/x86/include/asm/pgtable.h   |  6 +++---
 include/linux/pgtable.h          | 13 +++++++++++++
 mm/memory.c                      | 14 +-------------
 4 files changed, 19 insertions(+), 29 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index b5df82aa99e6..71a1af42f0e8 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1082,24 +1082,13 @@ static inline void update_mmu_cache(struct vm_area_struct *vma,
  * page after fork() + CoW for pfn mappings. We don't always have a
  * hardware-managed access flag on arm64.
  */
-static inline bool arch_faults_on_old_pte(void)
-{
-	/* The register read below requires a stable CPU to make any sense */
-	cant_migrate();
-
-	return !cpu_has_hw_af();
-}
-#define arch_faults_on_old_pte		arch_faults_on_old_pte
+#define arch_has_hw_pte_young		cpu_has_hw_af
 
 /*
  * Experimentally, it's cheap to set the access flag in hardware and we
  * benefit from prefaulting mappings as 'old' to start with.
  */
-static inline bool arch_wants_old_prefaulted_pte(void)
-{
-	return !arch_faults_on_old_pte();
-}
-#define arch_wants_old_prefaulted_pte	arch_wants_old_prefaulted_pte
+#define arch_wants_old_prefaulted_pte	cpu_has_hw_af
 
 static inline bool pud_sect_supported(void)
 {
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 44e2d6f1dbaa..dc5f7d8ef68a 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1431,10 +1431,10 @@ static inline bool arch_has_pfn_modify_check(void)
 	return boot_cpu_has_bug(X86_BUG_L1TF);
 }
 
-#define arch_faults_on_old_pte arch_faults_on_old_pte
-static inline bool arch_faults_on_old_pte(void)
+#define arch_has_hw_pte_young arch_has_hw_pte_young
+static inline bool arch_has_hw_pte_young(void)
 {
-	return false;
+	return true;
 }
 
 #ifdef CONFIG_PAGE_TABLE_CHECK
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 014ee8f0fbaa..95f408df4695 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -260,6 +260,19 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma,
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
 
+#ifndef arch_has_hw_pte_young
+/*
+ * Return whether the accessed bit is supported on the local CPU.
+ *
+ * This stub assumes accessing through an old PTE triggers a page fault.
+ * Architectures that automatically set the access bit should overwrite it.
+ */
+static inline bool arch_has_hw_pte_young(void)
+{
+	return false;
+}
+#endif
+
 #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 				       unsigned long address,
diff --git a/mm/memory.c b/mm/memory.c
index a78814413ac0..68294ce1cb06 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -125,18 +125,6 @@ int randomize_va_space __read_mostly =
 					2;
 #endif
 
-#ifndef arch_faults_on_old_pte
-static inline bool arch_faults_on_old_pte(void)
-{
-	/*
-	 * Those arches which don't have hw access flag feature need to
-	 * implement their own helper. By default, "true" means pagefault
-	 * will be hit on old pte.
-	 */
-	return true;
-}
-#endif
-
 #ifndef arch_wants_old_prefaulted_pte
 static inline bool arch_wants_old_prefaulted_pte(void)
 {
@@ -2870,7 +2858,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
 	 * On architectures with software "accessed" bits, we would
 	 * take a double page fault, so mark it accessed here.
 	 */
-	if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
+	if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) {
 		pte_t entry;
 
 		vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
-- 
2.17.1
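[Editor's note] A minimal sketch (not part of the patch) of the kind of decision the new helper enables. The walker below and its batching policy are illustrative assumptions, not kernel code: when the CPU manages the accessed bit in hardware, clearing it induces no faults, so a range can be processed in one pass; otherwise every cleared PTE costs an emulated-accessed-bit fault on the next TLB miss, so the work is better spread out.

/* Hypothetical helper: clear the accessed bit over a range of PTEs. */
static void clear_young_range(struct vm_area_struct *vma, unsigned long addr,
			      pte_t *ptep, unsigned int nr)
{
	/* Cheap on x86 and arm64 v8.2: take the whole range at once.
	 * Elsewhere, take smaller steps to avoid a burst of page faults
	 * (the step size here is an arbitrary illustrative value). */
	unsigned int i, step = arch_has_hw_pte_young() ? nr : 64;

	for (i = 0; i < nr; i++, addr += PAGE_SIZE) {
		ptep_test_and_clear_young(vma, addr, ptep + i);
		if (!((i + 1) % step))
			cond_resched();	/* yield between bursts */
	}
}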
@@ -0,0 +1,145 @@
From 0c0016e6f53b52166fe4da61c81fa6b27f4650cd Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sat, 26 Sep 2020 21:17:18 -0600
Subject: [PATCH 02/14] mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some architectures support the accessed bit in non-leaf PMD entries,
e.g., x86 sets the accessed bit in a non-leaf PMD entry when using it
as part of linear address translation [1]. Page table walkers that
clear the accessed bit may use this capability to reduce their search
space.

Note that:
1. Although an inline function is preferable, this capability is added
   as a configuration option for consistency with the existing macros.
2. Due to the little interest in other varieties, this capability was
   only tested on Intel and AMD CPUs.

Thanks to the following developers for their efforts [2][3].
Randy Dunlap <rdunlap@infradead.org>
Stephen Rothwell <sfr@canb.auug.org.au>

[1]: Intel 64 and IA-32 Architectures Software Developer's Manual
     Volume 3 (June 2021), section 4.8
[2] https://lore.kernel.org/r/bfdcc7c8-922f-61a9-aa15-7e7250f04af7@infradead.org/
[3] https://lore.kernel.org/r/20220413151513.5a0d7a7e@canb.auug.org.au/

Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I1a17be3ae926f721f7b17ea1539e5c39e8c4f9a8
---
 arch/Kconfig                   | 8 ++++++++
 arch/x86/Kconfig               | 1 +
 arch/x86/include/asm/pgtable.h | 3 ++-
 arch/x86/mm/pgtable.c          | 5 ++++-
 include/linux/pgtable.h        | 4 ++--
 5 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 8b311e400ec1..bf19a84fffa2 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1418,6 +1418,14 @@ config DYNAMIC_SIGFRAME
 config HAVE_ARCH_NODE_DEV_GROUP
 	bool
 
+config ARCH_HAS_NONLEAF_PMD_YOUNG
+	bool
+	help
+	  Architectures that select this option are capable of setting the
+	  accessed bit in non-leaf PMD entries when using them as part of linear
+	  address translations. Page table walkers that clear the accessed bit
+	  may use this capability to reduce their search space.
+
 source "kernel/gcov/Kconfig"
 
 source "scripts/gcc-plugins/Kconfig"
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f9920f1341c8..674d694a665e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -85,6 +85,7 @@ config X86
 	select ARCH_HAS_PMEM_API		if X86_64
 	select ARCH_HAS_PTE_DEVMAP		if X86_64
 	select ARCH_HAS_PTE_SPECIAL
+	select ARCH_HAS_NONLEAF_PMD_YOUNG	if PGTABLE_LEVELS > 2
 	select ARCH_HAS_UACCESS_FLUSHCACHE	if X86_64
 	select ARCH_HAS_COPY_MC			if X86_64
 	select ARCH_HAS_SET_MEMORY
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index dc5f7d8ef68a..5059799bebe3 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -815,7 +815,8 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
 
 static inline int pmd_bad(pmd_t pmd)
 {
-	return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
+	return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) !=
+	       (_KERNPG_TABLE & ~_PAGE_ACCESSED);
 }
 
 static inline unsigned long pages_to_mb(unsigned long npg)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index a932d7712d85..8525f2876fb4 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -550,7 +550,7 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
 	return ret;
 }
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
 int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 			      unsigned long addr, pmd_t *pmdp)
 {
@@ -562,6 +562,9 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 
 	return ret;
 }
+#endif
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 int pudp_test_and_clear_young(struct vm_area_struct *vma,
 			     unsigned long addr, pud_t *pudp)
 {
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 95f408df4695..d9095251bffd 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -213,7 +213,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
 #endif
 
 #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
 static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 					    unsigned long address,
 					    pmd_t *pmdp)
@@ -234,7 +234,7 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 	BUILD_BUG();
 	return 0;
 }
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */
 #endif
 
 #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
-- 
2.17.1
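[Editor's note] A minimal sketch (not part of the patch) of the search-space pruning the commit message describes. The surrounding walker loop is an assumption; pmd_young() and pmdp_test_and_clear_young() are existing helpers. On architectures that select ARCH_HAS_NONLEAF_PMD_YOUNG, a clear accessed bit in a non-leaf PMD proves that no PTE beneath it was used, so a walker can skip the whole range:

#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
		/* hypothetical walker loop body, one non-leaf PMD per pass */
		if (!pmd_young(*pmd))
			continue;	/* nothing under this PMD was accessed;
					 * skip scanning all 512 PTEs below it */
		pmdp_test_and_clear_young(vma, addr, pmd);	/* re-arm for the next scan */
#endif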
@@ -0,0 +1,259 @@
From d8e0edcddc441574410a047ede56f79c849a6d37 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 27 Sep 2020 20:49:08 -0600
Subject: [PATCH 03/14] mm/vmscan.c: refactor shrink_node()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch refactors shrink_node() to improve readability for the
upcoming changes to mm/vmscan.c.

Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: Iae734b5b4030205b7db6e8c841f747b6f6ae1a04
---
 mm/vmscan.c | 198 +++++++++++++++++++++++++++-------------------------
 1 file changed, 104 insertions(+), 94 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 382dbe97329f..4e4331367db9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2728,6 +2728,109 @@ enum scan_balance {
 	SCAN_FILE,
 };
 
+static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
+{
+	unsigned long file;
+	struct lruvec *target_lruvec;
+
+	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
+
+	/*
+	 * Flush the memory cgroup stats, so that we read accurate per-memcg
+	 * lruvec stats for heuristics.
+	 */
+	mem_cgroup_flush_stats();
+
+	/*
+	 * Determine the scan balance between anon and file LRUs.
+	 */
+	spin_lock_irq(&target_lruvec->lru_lock);
+	sc->anon_cost = target_lruvec->anon_cost;
+	sc->file_cost = target_lruvec->file_cost;
+	spin_unlock_irq(&target_lruvec->lru_lock);
+
+	/*
+	 * Target desirable inactive:active list ratios for the anon
+	 * and file LRU lists.
+	 */
+	if (!sc->force_deactivate) {
+		unsigned long refaults;
+
+		refaults = lruvec_page_state(target_lruvec,
+				WORKINGSET_ACTIVATE_ANON);
+		if (refaults != target_lruvec->refaults[0] ||
+			inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
+			sc->may_deactivate |= DEACTIVATE_ANON;
+		else
+			sc->may_deactivate &= ~DEACTIVATE_ANON;
+
+		/*
+		 * When refaults are being observed, it means a new
+		 * workingset is being established. Deactivate to get
+		 * rid of any stale active pages quickly.
+		 */
+		refaults = lruvec_page_state(target_lruvec,
+				WORKINGSET_ACTIVATE_FILE);
+		if (refaults != target_lruvec->refaults[1] ||
+			inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
+			sc->may_deactivate |= DEACTIVATE_FILE;
+		else
+			sc->may_deactivate &= ~DEACTIVATE_FILE;
+	} else
+		sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
+
+	/*
+	 * If we have plenty of inactive file pages that aren't
+	 * thrashing, try to reclaim those first before touching
+	 * anonymous pages.
+	 */
+	file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
+	if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
+		sc->cache_trim_mode = 1;
+	else
+		sc->cache_trim_mode = 0;
+
+	/*
+	 * Prevent the reclaimer from falling into the cache trap: as
+	 * cache pages start out inactive, every cache fault will tip
+	 * the scan balance towards the file LRU. And as the file LRU
+	 * shrinks, so does the window for rotation from references.
+	 * This means we have a runaway feedback loop where a tiny
+	 * thrashing file LRU becomes infinitely more attractive than
+	 * anon pages. Try to detect this based on file LRU size.
+	 */
+	if (!cgroup_reclaim(sc)) {
+		unsigned long total_high_wmark = 0;
+		unsigned long free, anon;
+		int z;
+
+		free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
+		file = node_page_state(pgdat, NR_ACTIVE_FILE) +
+			node_page_state(pgdat, NR_INACTIVE_FILE);
+
+		for (z = 0; z < MAX_NR_ZONES; z++) {
+			struct zone *zone = &pgdat->node_zones[z];
+
+			if (!managed_zone(zone))
+				continue;
+
+			total_high_wmark += high_wmark_pages(zone);
+		}
+
+		/*
+		 * Consider anon: if that's low too, this isn't a
+		 * runaway file reclaim problem, but rather just
+		 * extreme pressure. Reclaim as per usual then.
+		 */
+		anon = node_page_state(pgdat, NR_INACTIVE_ANON);
+
+		sc->file_is_tiny =
+			file + free <= total_high_wmark &&
+			!(sc->may_deactivate & DEACTIVATE_ANON) &&
+			anon >> sc->priority;
+	}
+}
+
 /*
  * Determine how aggressively the anon and file LRU lists should be
  * scanned.
@@ -3197,109 +3300,16 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 	unsigned long nr_reclaimed, nr_scanned;
 	struct lruvec *target_lruvec;
 	bool reclaimable = false;
-	unsigned long file;
 
 	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
 
 again:
-	/*
-	 * Flush the memory cgroup stats, so that we read accurate per-memcg
-	 * lruvec stats for heuristics.
-	 */
-	mem_cgroup_flush_stats();
-
 	memset(&sc->nr, 0, sizeof(sc->nr));
 
 	nr_reclaimed = sc->nr_reclaimed;
 	nr_scanned = sc->nr_scanned;
 
-	/*
-	 * Determine the scan balance between anon and file LRUs.
-	 */
-	spin_lock_irq(&target_lruvec->lru_lock);
-	sc->anon_cost = target_lruvec->anon_cost;
-	sc->file_cost = target_lruvec->file_cost;
-	spin_unlock_irq(&target_lruvec->lru_lock);
-
-	/*
-	 * Target desirable inactive:active list ratios for the anon
-	 * and file LRU lists.
-	 */
-	if (!sc->force_deactivate) {
-		unsigned long refaults;
-
-		refaults = lruvec_page_state(target_lruvec,
-				WORKINGSET_ACTIVATE_ANON);
-		if (refaults != target_lruvec->refaults[0] ||
-			inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
-			sc->may_deactivate |= DEACTIVATE_ANON;
-		else
-			sc->may_deactivate &= ~DEACTIVATE_ANON;
-
-		/*
-		 * When refaults are being observed, it means a new
-		 * workingset is being established. Deactivate to get
-		 * rid of any stale active pages quickly.
-		 */
-		refaults = lruvec_page_state(target_lruvec,
-				WORKINGSET_ACTIVATE_FILE);
-		if (refaults != target_lruvec->refaults[1] ||
-			inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
-			sc->may_deactivate |= DEACTIVATE_FILE;
-		else
-			sc->may_deactivate &= ~DEACTIVATE_FILE;
-	} else
-		sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
-
-	/*
-	 * If we have plenty of inactive file pages that aren't
-	 * thrashing, try to reclaim those first before touching
-	 * anonymous pages.
-	 */
-	file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
-	if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
-		sc->cache_trim_mode = 1;
-	else
-		sc->cache_trim_mode = 0;
-
-	/*
-	 * Prevent the reclaimer from falling into the cache trap: as
-	 * cache pages start out inactive, every cache fault will tip
-	 * the scan balance towards the file LRU. And as the file LRU
-	 * shrinks, so does the window for rotation from references.
-	 * This means we have a runaway feedback loop where a tiny
-	 * thrashing file LRU becomes infinitely more attractive than
-	 * anon pages. Try to detect this based on file LRU size.
-	 */
-	if (!cgroup_reclaim(sc)) {
-		unsigned long total_high_wmark = 0;
-		unsigned long free, anon;
-		int z;
-
-		free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
-		file = node_page_state(pgdat, NR_ACTIVE_FILE) +
-			   node_page_state(pgdat, NR_INACTIVE_FILE);
-
-		for (z = 0; z < MAX_NR_ZONES; z++) {
-			struct zone *zone = &pgdat->node_zones[z];
-			if (!managed_zone(zone))
-				continue;
-
-			total_high_wmark += high_wmark_pages(zone);
-		}
-
-		/*
-		 * Consider anon: if that's low too, this isn't a
-		 * runaway file reclaim problem, but rather just
-		 * extreme pressure. Reclaim as per usual then.
-		 */
-		anon = node_page_state(pgdat, NR_INACTIVE_ANON);
-
-		sc->file_is_tiny =
-			file + free <= total_high_wmark &&
-			!(sc->may_deactivate & DEACTIVATE_ANON) &&
-			anon >> sc->priority;
-	}
+	prepare_scan_count(pgdat, sc);
 
 	shrink_node_memcgs(pgdat, sc);
 
-- 
2.17.1
@@ -0,0 +1,64 @@
From bc14d2c7c6d0fb8c79ad0fc5eab488b977cbcccf Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 6 Mar 2022 20:22:40 -0700
Subject: [PATCH 04/14] Revert "include/linux/mm_inline.h: fold
 __update_lru_size() into its sole caller"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch undoes the following refactor:
commit 289ccba18af4 ("include/linux/mm_inline.h: fold __update_lru_size() into its sole caller")

The upcoming changes to include/linux/mm_inline.h will reuse
__update_lru_size().

Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I6155c407d50199a43b179c7f45904d4b7c052118
---
 include/linux/mm_inline.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 7b25b53c474a..fb8aadb81cd6 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -34,7 +34,7 @@ static inline int page_is_file_lru(struct page *page)
 	return folio_is_file_lru(page_folio(page));
 }
 
-static __always_inline void update_lru_size(struct lruvec *lruvec,
+static __always_inline void __update_lru_size(struct lruvec *lruvec,
 				enum lru_list lru, enum zone_type zid,
 				long nr_pages)
 {
@@ -43,6 +43,13 @@ static __always_inline void update_lru_size(struct lruvec *lruvec,
 	__mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
 	__mod_zone_page_state(&pgdat->node_zones[zid],
 				NR_ZONE_LRU_BASE + lru, nr_pages);
+}
+
+static __always_inline void update_lru_size(struct lruvec *lruvec,
+				enum lru_list lru, enum zone_type zid,
+				long nr_pages)
+{
+	__update_lru_size(lruvec, lru, zid, nr_pages);
 #ifdef CONFIG_MEMCG
 	mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
 #endif
-- 
2.17.1
@ -0,0 +1,810 @@
|
|||||||
|
From 8c6beb4548c216da9dae5e1a7612a108396e3f9e Mon Sep 17 00:00:00 2001
|
||||||
|
From: Yu Zhao <yuzhao@google.com>
|
||||||
|
Date: Mon, 25 Jan 2021 21:12:33 -0700
|
||||||
|
Subject: [PATCH 05/14] mm: multi-gen LRU: groundwork
|
||||||
|
MIME-Version: 1.0
|
||||||
|
Content-Type: text/plain; charset=UTF-8
|
||||||
|
Content-Transfer-Encoding: 8bit
|
||||||
|
|
||||||
|
Evictable pages are divided into multiple generations for each lruvec.
|
||||||
|
The youngest generation number is stored in lrugen->max_seq for both
|
||||||
|
anon and file types as they are aged on an equal footing. The oldest
|
||||||
|
generation numbers are stored in lrugen->min_seq[] separately for anon
|
||||||
|
and file types as clean file pages can be evicted regardless of swap
|
||||||
|
constraints. These three variables are monotonically increasing.
|
||||||
|
|
||||||
|
Generation numbers are truncated into order_base_2(MAX_NR_GENS+1) bits
|
||||||
|
in order to fit into the gen counter in folio->flags. Each truncated
|
||||||
|
generation number is an index to lrugen->lists[]. The sliding window
|
||||||
|
technique is used to track at least MIN_NR_GENS and at most
|
||||||
|
MAX_NR_GENS generations. The gen counter stores a value within [1,
|
||||||
|
MAX_NR_GENS] while a page is on one of lrugen->lists[]. Otherwise it
|
||||||
|
stores 0.
|
||||||
|
|
||||||
|
There are two conceptually independent procedures: "the aging", which
|
||||||
|
produces young generations, and "the eviction", which consumes old
|
||||||
|
generations. They form a closed-loop system, i.e., "the page reclaim".
|
||||||
|
Both procedures can be invoked from userspace for the purposes of
|
||||||
|
working set estimation and proactive reclaim. These techniques are
|
||||||
|
commonly used to optimize job scheduling (bin packing) in data
|
||||||
|
centers [1][2].
|
||||||
|
|
||||||
|
To avoid confusion, the terms "hot" and "cold" will be applied to the
|
||||||
|
multi-gen LRU, as a new convention; the terms "active" and "inactive"
|
||||||
|
will be applied to the active/inactive LRU, as usual.
|
||||||
|
|
||||||
|
The protection of hot pages and the selection of cold pages are based
|
||||||
|
on page access channels and patterns. There are two access channels:
|
||||||
|
one through page tables and the other through file descriptors. The
|
||||||
|
protection of the former channel is by design stronger because:
|
||||||
|
1. The uncertainty in determining the access patterns of the former
|
||||||
|
channel is higher due to the approximation of the accessed bit.
|
||||||
|
2. The cost of evicting the former channel is higher due to the TLB
|
||||||
|
flushes required and the likelihood of encountering the dirty bit.
|
||||||
|
3. The penalty of underprotecting the former channel is higher because
|
||||||
|
applications usually do not prepare themselves for major page
|
||||||
|
faults like they do for blocked I/O. E.g., GUI applications
|
||||||
|
commonly use dedicated I/O threads to avoid blocking rendering
|
||||||
|
threads.
|
||||||
|
There are also two access patterns: one with temporal locality and the
|
||||||
|
other without. For the reasons listed above, the former channel is
|
||||||
|
assumed to follow the former pattern unless VM_SEQ_READ or
|
||||||
|
VM_RAND_READ is present; the latter channel is assumed to follow the
|
||||||
|
latter pattern unless outlying refaults have been observed [3][4].
|
||||||
|
|
||||||
|
The next patch will address the "outlying refaults". Three macros,
|
||||||
|
i.e., LRU_REFS_WIDTH, LRU_REFS_PGOFF and LRU_REFS_MASK, used later are
|
||||||
|
added in this patch to make the entire patchset less diffy.
|
||||||
|
|
||||||
|
A page is added to the youngest generation on faulting. The aging
|
||||||
|
needs to check the accessed bit at least twice before handing this
|
||||||
|
page over to the eviction. The first check takes care of the accessed
|
||||||
|
bit set on the initial fault; the second check makes sure this page
|
||||||
|
has not been used since then. This protocol, AKA second chance,
|
||||||
|
requires a minimum of two generations, hence MIN_NR_GENS.
|
||||||
|
|
||||||
|
[1] https://dl.acm.org/doi/10.1145/3297858.3304053
|
||||||
|
[2] https://dl.acm.org/doi/10.1145/3503222.3507731
|
||||||
|
[3] https://lwn.net/Articles/495543/
|
||||||
|
[4] https://lwn.net/Articles/815342/
|
||||||
|
|
||||||
|
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
||||||
|
Acked-by: Brian Geffon <bgeffon@google.com>
|
||||||
|
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
|
||||||
|
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
|
||||||
|
Acked-by: Steven Barrett <steven@liquorix.net>
|
||||||
|
Acked-by: Suleiman Souhlal <suleiman@google.com>
|
||||||
|
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
|
||||||
|
Tested-by: Donald Carr <d@chaos-reins.com>
|
||||||
|
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
|
||||||
|
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
|
||||||
|
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
|
||||||
|
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
|
||||||
|
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
|
||||||
|
Change-Id: I71de7cd15b8dfa6f9fdd838023474693c4fee0a7
|
||||||
|
---
|
||||||
|
fs/fuse/dev.c | 3 +-
|
||||||
|
include/linux/mm_inline.h | 175 ++++++++++++++++++++++++++++++
|
||||||
|
include/linux/mmzone.h | 102 +++++++++++++++++
|
||||||
|
include/linux/page-flags-layout.h | 13 ++-
|
||||||
|
include/linux/page-flags.h | 4 +-
|
||||||
|
include/linux/sched.h | 4 +
|
||||||
|
kernel/bounds.c | 5 +
|
||||||
|
mm/Kconfig | 8 ++
|
||||||
|
mm/huge_memory.c | 3 +-
|
||||||
|
mm/memcontrol.c | 2 +
|
||||||
|
mm/memory.c | 25 +++++
|
||||||
|
mm/mm_init.c | 6 +-
|
||||||
|
mm/mmzone.c | 2 +
|
||||||
|
mm/swap.c | 11 +-
|
||||||
|
mm/vmscan.c | 75 +++++++++++++
|
||||||
|
15 files changed, 424 insertions(+), 14 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
|
||||||
|
index 51897427a534..b4a6e0a1b945 100644
|
||||||
|
--- a/fs/fuse/dev.c
|
||||||
|
+++ b/fs/fuse/dev.c
|
||||||
|
@@ -776,7 +776,8 @@ static int fuse_check_page(struct page *page)
|
||||||
|
1 << PG_active |
|
||||||
|
1 << PG_workingset |
|
||||||
|
1 << PG_reclaim |
|
||||||
|
- 1 << PG_waiters))) {
|
||||||
|
+ 1 << PG_waiters |
|
||||||
|
+ LRU_GEN_MASK | LRU_REFS_MASK))) {
|
||||||
|
dump_page(page, "fuse: trying to steal weird page");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
|
||||||
|
index fb8aadb81cd6..2ff703900fd0 100644
|
||||||
|
--- a/include/linux/mm_inline.h
|
||||||
|
+++ b/include/linux/mm_inline.h
|
||||||
|
@@ -40,6 +40,9 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec,
|
||||||
|
{
|
||||||
|
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
|
||||||
|
|
||||||
|
+ lockdep_assert_held(&lruvec->lru_lock);
|
||||||
|
+ WARN_ON_ONCE(nr_pages != (int)nr_pages);
|
||||||
|
+
|
||||||
|
__mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
|
||||||
|
__mod_zone_page_state(&pgdat->node_zones[zid],
|
||||||
|
NR_ZONE_LRU_BASE + lru, nr_pages);
|
||||||
|
@@ -101,11 +104,177 @@ static __always_inline enum lru_list folio_lru_list(struct folio *folio)
|
||||||
|
return lru;
|
||||||
|
}
|
||||||
|
|
||||||
|
+#ifdef CONFIG_LRU_GEN
|
||||||
|
+
|
||||||
|
+static inline bool lru_gen_enabled(void)
|
||||||
|
+{
|
||||||
|
+ return true;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+static inline bool lru_gen_in_fault(void)
|
||||||
|
+{
|
||||||
|
+ return current->in_lru_fault;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+static inline int lru_gen_from_seq(unsigned long seq)
|
||||||
|
+{
|
||||||
|
+ return seq % MAX_NR_GENS;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+static inline int folio_lru_gen(struct folio *folio)
|
||||||
|
+{
|
||||||
|
+ unsigned long flags = READ_ONCE(folio->flags);
|
||||||
|
+
|
||||||
|
+ return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
|
||||||
|
+{
|
||||||
|
+ unsigned long max_seq = lruvec->lrugen.max_seq;
|
||||||
|
+
|
||||||
|
+ VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
|
||||||
|
+
|
||||||
|
+ /* see the comment on MIN_NR_GENS */
|
||||||
|
+ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *folio,
|
||||||
|
+ int old_gen, int new_gen)
|
||||||
|
+{
|
||||||
|
+ int type = folio_is_file_lru(folio);
|
||||||
|
+ int zone = folio_zonenum(folio);
|
||||||
|
+ int delta = folio_nr_pages(folio);
|
||||||
|
+ enum lru_list lru = type * LRU_INACTIVE_FILE;
|
||||||
|
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||||
|
+
|
||||||
|
+ VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
|
||||||
|
+ VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
|
||||||
|
+ VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1);
|
||||||
|
+
|
||||||
|
+ if (old_gen >= 0)
|
||||||
|
+ WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone],
|
||||||
|
+ lrugen->nr_pages[old_gen][type][zone] - delta);
|
||||||
|
+ if (new_gen >= 0)
|
||||||
|
+ WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone],
|
||||||
|
+ lrugen->nr_pages[new_gen][type][zone] + delta);
|
||||||
|
+
|
||||||
|
+ /* addition */
|
||||||
|
+ if (old_gen < 0) {
|
||||||
|
+ if (lru_gen_is_active(lruvec, new_gen))
|
||||||
|
+ lru += LRU_ACTIVE;
|
||||||
|
+ __update_lru_size(lruvec, lru, zone, delta);
|
||||||
|
+ return;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ /* deletion */
|
||||||
|
+ if (new_gen < 0) {
|
||||||
|
+ if (lru_gen_is_active(lruvec, old_gen))
|
||||||
|
+ lru += LRU_ACTIVE;
|
||||||
|
+ __update_lru_size(lruvec, lru, zone, -delta);
|
||||||
|
+ return;
|
||||||
|
+ }
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
|
||||||
|
+{
|
||||||
|
+ unsigned long seq;
|
||||||
|
+ unsigned long flags;
|
||||||
|
+ int gen = folio_lru_gen(folio);
|
||||||
|
+ int type = folio_is_file_lru(folio);
|
||||||
|
+ int zone = folio_zonenum(folio);
|
||||||
|
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||||
|
+
|
||||||
|
+ VM_WARN_ON_ONCE_FOLIO(gen != -1, folio);
|
||||||
|
+
|
||||||
|
+ if (folio_test_unevictable(folio))
|
||||||
|
+ return false;
|
||||||
|
+ /*
|
||||||
|
+ * There are three common cases for this page:
|
||||||
|
+ * 1. If it's hot, e.g., freshly faulted in or previously hot and
|
||||||
|
+ * migrated, add it to the youngest generation.
|
||||||
|
+ * 2. If it's cold but can't be evicted immediately, i.e., an anon page
|
||||||
|
+ * not in swapcache or a dirty page pending writeback, add it to the
|
||||||
|
+ * second oldest generation.
|
||||||
|
+ * 3. Everything else (clean, cold) is added to the oldest generation.
|
||||||
|
+ */
|
||||||
|
+ if (folio_test_active(folio))
|
||||||
|
+ seq = lrugen->max_seq;
|
||||||
|
+ else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) ||
|
||||||
|
+ (folio_test_reclaim(folio) &&
|
||||||
|
+ (folio_test_dirty(folio) || folio_test_writeback(folio))))
|
||||||
|
+ seq = lrugen->min_seq[type] + 1;
|
||||||
|
+ else
|
||||||
|
+ seq = lrugen->min_seq[type];
|
||||||
|
+
|
||||||
|
+ gen = lru_gen_from_seq(seq);
|
||||||
|
+ flags = (gen + 1UL) << LRU_GEN_PGOFF;
|
||||||
|
+ /* see the comment on MIN_NR_GENS about PG_active */
|
||||||
|
+ set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags);
|
||||||
|
+
|
||||||
|
+ lru_gen_update_size(lruvec, folio, -1, gen);
|
||||||
|
+ /* for folio_rotate_reclaimable() */
|
||||||
|
+ if (reclaiming)
|
||||||
|
+ list_add_tail(&folio->lru, &lrugen->lists[gen][type][zone]);
|
||||||
|
+ else
|
||||||
|
+ list_add(&folio->lru, &lrugen->lists[gen][type][zone]);
|
||||||
|
+
|
||||||
|
+ return true;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
|
||||||
|
+{
|
||||||
|
+ unsigned long flags;
|
||||||
|
+ int gen = folio_lru_gen(folio);
|
||||||
|
+
|
||||||
|
+ if (gen < 0)
|
||||||
|
+ return false;
|
||||||
|
+
|
||||||
|
+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
|
||||||
|
+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
|
||||||
|
+
|
||||||
|
+ /* for folio_migrate_flags() */
|
||||||
|
+ flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0;
|
||||||
|
+ flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags);
|
||||||
|
+ gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
|
||||||
|
+
|
||||||
|
+ lru_gen_update_size(lruvec, folio, gen, -1);
|
||||||
|
+ list_del(&folio->lru);
|
||||||
|
+
|
||||||
|
+ return true;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+#else /* !CONFIG_LRU_GEN */
|
||||||
|
+
|
||||||
|
+static inline bool lru_gen_enabled(void)
|
||||||
|
+{
|
||||||
|
+ return false;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+static inline bool lru_gen_in_fault(void)
|
||||||
|
+{
|
||||||
|
+ return false;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
|
||||||
|
+{
|
||||||
|
+ return false;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
|
||||||
|
+{
|
||||||
|
+ return false;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+#endif /* CONFIG_LRU_GEN */
|
||||||
|
+
|
||||||
|
static __always_inline
|
||||||
|
void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio)
|
||||||
|
{
|
||||||
|
enum lru_list lru = folio_lru_list(folio);
|
||||||
|
|
||||||
|
+ if (lru_gen_add_folio(lruvec, folio, false))
|
||||||
|
+ return;
|
||||||
|
+
|
||||||
|
update_lru_size(lruvec, lru, folio_zonenum(folio),
|
||||||
|
folio_nr_pages(folio));
|
||||||
|
if (lru != LRU_UNEVICTABLE)
|
||||||
|
@@ -123,6 +292,9 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio)
|
||||||
|
{
|
||||||
|
enum lru_list lru = folio_lru_list(folio);
|
||||||
|
|
||||||
|
+ if (lru_gen_add_folio(lruvec, folio, true))
|
||||||
|
+ return;
|
||||||
|
+
|
||||||
|
update_lru_size(lruvec, lru, folio_zonenum(folio),
|
||||||
|
folio_nr_pages(folio));
|
||||||
|
/* This is not expected to be used on LRU_UNEVICTABLE */
|
||||||
|
@@ -140,6 +312,9 @@ void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio)
|
||||||
|
{
|
||||||
|
enum lru_list lru = folio_lru_list(folio);
|
||||||
|
|
||||||
|
+ if (lru_gen_del_folio(lruvec, folio, false))
|
||||||
|
+ return;
|
||||||
|
+
|
||||||
|
if (lru != LRU_UNEVICTABLE)
|
||||||
|
list_del(&folio->lru);
|
||||||
|
update_lru_size(lruvec, lru, folio_zonenum(folio),
|
||||||
|
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
|
||||||
|
index e24b40c52468..1abb6d38ed86 100644
|
||||||
|
--- a/include/linux/mmzone.h
|
||||||
|
+++ b/include/linux/mmzone.h
|
||||||
|
@@ -314,6 +314,102 @@ enum lruvec_flags {
|
||||||
|
*/
|
||||||
|
};
|
||||||
|
|
||||||
|
+#endif /* !__GENERATING_BOUNDS_H */
|
||||||
|
+
|
||||||
|
+/*
|
||||||
|
+ * Evictable pages are divided into multiple generations. The youngest and the
|
||||||
|
+ * oldest generation numbers, max_seq and min_seq, are monotonically increasing.
|
||||||
|
+ * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An
|
||||||
|
+ * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the
|
||||||
|
+ * corresponding generation. The gen counter in folio->flags stores gen+1 while
|
||||||
|
+ * a page is on one of lrugen->lists[]. Otherwise it stores 0.
|
||||||
|
+ *
|
||||||
|
+ * A page is added to the youngest generation on faulting. The aging needs to
|
||||||
|
+ * check the accessed bit at least twice before handing this page over to the
|
||||||
|
+ * eviction. The first check takes care of the accessed bit set on the initial
|
||||||
|
+ * fault; the second check makes sure this page hasn't been used since then.
|
||||||
|
+ * This process, AKA second chance, requires a minimum of two generations,
|
||||||
|
+ * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive
|
||||||
|
+ * LRU, e.g., /proc/vmstat, these two generations are considered active; the
|
||||||
|
+ * rest of generations, if they exist, are considered inactive. See
|
||||||
|
+ * lru_gen_is_active().
|
||||||
|
+ *
|
||||||
|
+ * PG_active is always cleared while a page is on one of lrugen->lists[] so that
|
||||||
|
+ * the aging needs not to worry about it. And it's set again when a page
|
||||||
|
+ * considered active is isolated for non-reclaiming purposes, e.g., migration.
|
||||||
|
+ * See lru_gen_add_folio() and lru_gen_del_folio().
|
||||||
|
+ *
|
||||||
|
+ * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the
|
||||||
|
+ * number of categories of the active/inactive LRU when keeping track of
|
||||||
|
+ * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits
|
||||||
|
+ * in folio->flags.
|
||||||
|
+ */
|
||||||
|
+#define MIN_NR_GENS 2U
|
||||||
|
+#define MAX_NR_GENS 4U
|
||||||
|
+
|
||||||
|
+#ifndef __GENERATING_BOUNDS_H
|
||||||
|
+
|
||||||
|
+struct lruvec;
|
||||||
|
+
|
||||||
|
+#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
|
||||||
|
+#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
|
||||||
|
+
|
||||||
|
+#ifdef CONFIG_LRU_GEN
|
||||||
|
+
|
||||||
|
+enum {
|
||||||
|
+ LRU_GEN_ANON,
|
||||||
|
+ LRU_GEN_FILE,
|
||||||
|
+};
|
||||||
|
+
|
||||||
|
+/*
|
||||||
|
+ * The youngest generation number is stored in max_seq for both anon and file
|
||||||
|
+ * types as they are aged on an equal footing. The oldest generation numbers are
|
||||||
|
+ * stored in min_seq[] separately for anon and file types as clean file pages
|
||||||
|
+ * can be evicted regardless of swap constraints.
|
||||||
|
+ *
|
||||||
|
+ * Normally anon and file min_seq are in sync. But if swapping is constrained,
|
||||||
|
+ * e.g., out of swap space, file min_seq is allowed to advance and leave anon
|
||||||
|
+ * min_seq behind.
|
||||||
|
+ *
|
||||||
|
+ * The number of pages in each generation is eventually consistent and therefore
|
||||||
|
+ * can be transiently negative.
|
||||||
|
+ */
|
||||||
|
+struct lru_gen_struct {
|
||||||
|
+ /* the aging increments the youngest generation number */
|
||||||
|
+ unsigned long max_seq;
|
||||||
|
+ /* the eviction increments the oldest generation numbers */
|
||||||
|
+ unsigned long min_seq[ANON_AND_FILE];
|
||||||
|
+ /* the multi-gen LRU lists, lazily sorted on eviction */
|
||||||
|
+ struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
|
||||||
|
+ /* the multi-gen LRU sizes, eventually consistent */
|
||||||
|
+ long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
|
||||||
|
+};
|
||||||
|
+
|
||||||
|
+void lru_gen_init_lruvec(struct lruvec *lruvec);
|
||||||
|
+
|
||||||
|
+#ifdef CONFIG_MEMCG
|
||||||
|
+void lru_gen_init_memcg(struct mem_cgroup *memcg);
|
||||||
|
+void lru_gen_exit_memcg(struct mem_cgroup *memcg);
|
||||||
|
+#endif
|
||||||
|
+
|
||||||
|
+#else /* !CONFIG_LRU_GEN */
|
||||||
|
+
|
||||||
|
+static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
|
||||||
|
+{
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+#ifdef CONFIG_MEMCG
|
||||||
|
+static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
|
||||||
|
+{
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
|
||||||
|
+{
|
||||||
|
+}
|
||||||
|
+#endif
|
||||||
|
+
|
||||||
|
+#endif /* CONFIG_LRU_GEN */
|
||||||
|
+
|
||||||
|
struct lruvec {
|
||||||
|
struct list_head lists[NR_LRU_LISTS];
|
||||||
|
/* per lruvec lru_lock for memcg */
|
||||||
|
@@ -331,6 +427,10 @@ struct lruvec {
|
||||||
|
unsigned long refaults[ANON_AND_FILE];
|
||||||
|
/* Various lruvec state flags (enum lruvec_flags) */
|
||||||
|
unsigned long flags;
|
||||||
|
+#ifdef CONFIG_LRU_GEN
|
||||||
|
+ /* evictable pages divided into generations */
|
||||||
|
+ struct lru_gen_struct lrugen;
|
||||||
|
+#endif
|
||||||
|
#ifdef CONFIG_MEMCG
|
||||||
|
struct pglist_data *pgdat;
|
||||||
|
#endif
|
||||||
|
@@ -746,6 +846,8 @@ static inline bool zone_is_empty(struct zone *zone)
|
||||||
|
#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
|
||||||
|
#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH)
|
||||||
|
#define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
|
||||||
|
+#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
|
||||||
|
+#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Define the bit shifts to access each section. For non-existent
|
||||||
|
diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h
|
||||||
|
index ef1e3e736e14..240905407a18 100644
|
||||||
|
--- a/include/linux/page-flags-layout.h
|
||||||
|
+++ b/include/linux/page-flags-layout.h
|
||||||
|
@@ -55,7 +55,8 @@
|
||||||
|
#define SECTIONS_WIDTH 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
|
||||||
|
+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \
|
||||||
|
+ <= BITS_PER_LONG - NR_PAGEFLAGS
|
||||||
|
#define NODES_WIDTH NODES_SHIFT
|
||||||
|
#elif defined(CONFIG_SPARSEMEM_VMEMMAP)
|
||||||
|
#error "Vmemmap: No space for nodes field in page flags"
|
||||||
|
@@ -89,8 +90,8 @@
|
||||||
|
#define LAST_CPUPID_SHIFT 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \
|
||||||
|
- <= BITS_PER_LONG - NR_PAGEFLAGS
|
||||||
|
+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
|
||||||
|
+ KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
|
||||||
|
#define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
|
||||||
|
#else
|
||||||
|
#define LAST_CPUPID_WIDTH 0
|
||||||
|
@@ -100,10 +101,12 @@
|
||||||
|
#define LAST_CPUPID_NOT_IN_PAGE_FLAGS
|
||||||
|
#endif
|
||||||
|
|
||||||
|
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \
|
||||||
|
- > BITS_PER_LONG - NR_PAGEFLAGS
|
||||||
|
+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
|
||||||
|
+ KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
|
||||||
|
#error "Not enough bits in page flags"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
+#define LRU_REFS_WIDTH 0
|
||||||
|
+
|
||||||
|
#endif
|
||||||
|
#endif /* _LINUX_PAGE_FLAGS_LAYOUT */
|
||||||
|
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
|
||||||
|
index 465ff35a8c00..0b0ae5084e60 100644
|
||||||
|
--- a/include/linux/page-flags.h
|
||||||
|
+++ b/include/linux/page-flags.h
|
||||||
|
@@ -1058,7 +1058,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page)
|
||||||
|
1UL << PG_private | 1UL << PG_private_2 | \
|
||||||
|
1UL << PG_writeback | 1UL << PG_reserved | \
|
||||||
|
1UL << PG_slab | 1UL << PG_active | \
|
||||||
|
- 1UL << PG_unevictable | __PG_MLOCKED)
|
||||||
|
+ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Flags checked when a page is prepped for return by the page allocator.
|
||||||
|
@@ -1069,7 +1069,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page)
|
||||||
|
* alloc-free cycle to prevent from reusing the page.
|
||||||
|
*/
|
||||||
|
#define PAGE_FLAGS_CHECK_AT_PREP \
|
||||||
|
- (PAGEFLAGS_MASK & ~__PG_HWPOISON)
|
||||||
|
+ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)
|
||||||
|
|
||||||
|
#define PAGE_FLAGS_PRIVATE \
|
||||||
|
(1UL << PG_private | 1UL << PG_private_2)
|
||||||
|
diff --git a/include/linux/sched.h b/include/linux/sched.h
|
||||||
|
index e7b2f8a5c711..8cc46a789193 100644
|
||||||
|
--- a/include/linux/sched.h
|
||||||
|
+++ b/include/linux/sched.h
|
||||||
|
@@ -914,6 +914,10 @@ struct task_struct {
|
||||||
|
#ifdef CONFIG_MEMCG
|
||||||
|
unsigned in_user_fault:1;
|
||||||
|
#endif
|
||||||
|
+#ifdef CONFIG_LRU_GEN
|
||||||
|
+ /* whether the LRU algorithm may apply to this access */
|
||||||
|
+ unsigned in_lru_fault:1;
|
||||||
|
+#endif
|
||||||
|
#ifdef CONFIG_COMPAT_BRK
|
||||||
|
unsigned brk_randomized:1;
|
||||||
|
#endif
|
||||||
|
diff --git a/kernel/bounds.c b/kernel/bounds.c
|
||||||
|
index 9795d75b09b2..5ee60777d8e4 100644
|
||||||
|
--- a/kernel/bounds.c
|
||||||
|
+++ b/kernel/bounds.c
|
||||||
|
@@ -22,6 +22,11 @@ int main(void)
|
||||||
|
DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
|
||||||
|
#endif
|
||||||
|
DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
|
||||||
|
+#ifdef CONFIG_LRU_GEN
|
||||||
|
+ DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1));
|
||||||
|
+#else
|
||||||
|
+ DEFINE(LRU_GEN_WIDTH, 0);
|
||||||
|
+#endif
|
||||||
|
/* End of constants */
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
diff --git a/mm/Kconfig b/mm/Kconfig
index 0331f1461f81..d95f07cd6dcf 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1124,6 +1124,14 @@ config PTE_MARKER_UFFD_WP
purposes. It is required to enable userfaultfd write protection on
file-backed memory types like shmem and hugetlbfs.

+config LRU_GEN
+ bool "Multi-Gen LRU"
+ depends on MMU
+ # make sure folio->flags has enough spare bits
+ depends on 64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP
+ help
+ A high performance LRU implementation to overcommit memory.
+
source "mm/damon/Kconfig"

endmenu
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f42bb51e023a..79e0b08b4cf9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2438,7 +2438,8 @@ static void __split_huge_page_tail(struct page *head, int tail,
#ifdef CONFIG_64BIT
(1L << PG_arch_2) |
#endif
- (1L << PG_dirty)));
+ (1L << PG_dirty) |
+ LRU_GEN_MASK | LRU_REFS_MASK));

/* ->mapping in first tail page is compound_mapcount */
VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b69979c9ced5..5fd38d12149c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5170,6 +5170,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)

static void mem_cgroup_free(struct mem_cgroup *memcg)
{
+ lru_gen_exit_memcg(memcg);
memcg_wb_domain_exit(memcg);
__mem_cgroup_free(memcg);
}
@@ -5228,6 +5229,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
memcg->deferred_split_queue.split_queue_len = 0;
#endif
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
+ lru_gen_init_memcg(memcg);
return memcg;
fail:
mem_cgroup_id_remove(memcg);
diff --git a/mm/memory.c b/mm/memory.c
index 68294ce1cb06..cd1b5bfd9f3e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5108,6 +5108,27 @@ static inline void mm_account_fault(struct pt_regs *regs,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
}

+#ifdef CONFIG_LRU_GEN
+static void lru_gen_enter_fault(struct vm_area_struct *vma)
+{
+ /* the LRU algorithm doesn't apply to sequential or random reads */
+ current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ));
+}
+
+static void lru_gen_exit_fault(void)
+{
+ current->in_lru_fault = false;
+}
+#else
+static void lru_gen_enter_fault(struct vm_area_struct *vma)
+{
+}
+
+static void lru_gen_exit_fault(void)
+{
+}
+#endif /* CONFIG_LRU_GEN */
+
/*
* By the time we get here, we already hold the mm semaphore
*
@@ -5139,11 +5160,15 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
if (flags & FAULT_FLAG_USER)
mem_cgroup_enter_user_fault();

+ lru_gen_enter_fault(vma);
+
if (unlikely(is_vm_hugetlb_page(vma)))
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
else
ret = __handle_mm_fault(vma, address, flags);

+ lru_gen_exit_fault();
+
if (flags & FAULT_FLAG_USER) {
mem_cgroup_exit_user_fault();
/*
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 9ddaf0e1b0ab..0d7b2bd2454a 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layout(void)

shift = 8 * sizeof(unsigned long);
width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
+ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
SECTIONS_WIDTH,
NODES_WIDTH,
ZONES_WIDTH,
LAST_CPUPID_WIDTH,
KASAN_TAG_WIDTH,
+ LRU_GEN_WIDTH,
+ LRU_REFS_WIDTH,
NR_PAGEFLAGS);
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 0ae7571e35ab..68e1511be12d 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -88,6 +88,8 @@ void lruvec_init(struct lruvec *lruvec)
* Poison its list head, so that any operations on it would crash.
*/
list_del(&lruvec->lists[LRU_UNEVICTABLE]);
+
+ lru_gen_init_lruvec(lruvec);
}

#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
diff --git a/mm/swap.c b/mm/swap.c
index 9cee7f6a3809..0e423b7d458b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -484,6 +484,11 @@ void folio_add_lru(struct folio *folio)
folio_test_unevictable(folio), folio);
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

+ /* see the comment in lru_gen_add_folio() */
+ if (lru_gen_enabled() && !folio_test_unevictable(folio) &&
+ lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
+ folio_set_active(folio);
+
folio_get(folio);
local_lock(&cpu_fbatches.lock);
fbatch = this_cpu_ptr(&cpu_fbatches.lru_add);
@@ -575,7 +580,7 @@ static void lru_deactivate_file_fn(struct lruvec *lruvec, struct folio *folio)

static void lru_deactivate_fn(struct lruvec *lruvec, struct folio *folio)
{
- if (folio_test_active(folio) && !folio_test_unevictable(folio)) {
+ if (!folio_test_unevictable(folio) && (folio_test_active(folio) || lru_gen_enabled())) {
long nr_pages = folio_nr_pages(folio);

lruvec_del_folio(lruvec, folio);
@@ -688,8 +693,8 @@ void deactivate_page(struct page *page)
{
struct folio *folio = page_folio(page);

- if (folio_test_lru(folio) && folio_test_active(folio) &&
- !folio_test_unevictable(folio)) {
+ if (folio_test_lru(folio) && !folio_test_unevictable(folio) &&
+ (folio_test_active(folio) || lru_gen_enabled())) {
struct folio_batch *fbatch;

folio_get(folio);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4e4331367db9..fb76cfe2fdc2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3050,6 +3050,81 @@ static bool can_age_anon_pages(struct pglist_data *pgdat,
return can_demote(pgdat->node_id, sc);
}

+#ifdef CONFIG_LRU_GEN
+
+/******************************************************************************
+ * shorthand helpers
+ ******************************************************************************/
+
+#define for_each_gen_type_zone(gen, type, zone) \
+ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
+ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
+ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
+
+static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int nid)
+{
+ struct pglist_data *pgdat = NODE_DATA(nid);
+
+#ifdef CONFIG_MEMCG
+ if (memcg) {
+ struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
+
+ /* for hotadd_new_pgdat() */
+ if (!lruvec->pgdat)
+ lruvec->pgdat = pgdat;
+
+ return lruvec;
+ }
+#endif
+ VM_WARN_ON_ONCE(!mem_cgroup_disabled());
+
+ return pgdat ? &pgdat->__lruvec : NULL;
+}
+
+/******************************************************************************
+ * initialization
+ ******************************************************************************/
+
+void lru_gen_init_lruvec(struct lruvec *lruvec)
+{
+ int gen, type, zone;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ lrugen->max_seq = MIN_NR_GENS + 1;
+
+ for_each_gen_type_zone(gen, type, zone)
+ INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
+}
+
+#ifdef CONFIG_MEMCG
+void lru_gen_init_memcg(struct mem_cgroup *memcg)
+{
+}
+
+void lru_gen_exit_memcg(struct mem_cgroup *memcg)
+{
+ int nid;
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
+ sizeof(lruvec->lrugen.nr_pages)));
+ }
+}
+#endif
+
+static int __init init_lru_gen(void)
+{
+ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
+ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
+
+ return 0;
+};
+late_initcall(init_lru_gen);
+
+#endif /* CONFIG_LRU_GEN */
+
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];
--
2.17.1

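As a quick illustration of the groundwork above: a folio's generation number lives in spare bits of folio->flags behind LRU_GEN_MASK/LRU_GEN_PGOFF, stored off by one so that zero means "not on a multi-gen list". A minimal sketch of the decode, using the names from these patches (the standalone helper itself is hypothetical, not part of the series):

    /* Illustrative only: recover the generation from a flags word,
     * mirroring the arithmetic of folio_update_gen()/folio_inc_gen()
     * in the later patches. Returns -1 when the folio is not on a
     * multi-gen LRU list.
     */
    static int sketch_folio_gen(unsigned long flags)
    {
            return (int)((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
    }
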
File diff suppressed because it is too large
@ -0,0 +1,495 @@
From 93fa87bdef9e7fa9977355c4712c000f31639231 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Thu, 27 Jan 2022 20:43:22 -0700
Subject: [PATCH 07/14] mm: multi-gen LRU: exploit locality in rmap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Searching the rmap for PTEs mapping each page on an LRU list (to test
and clear the accessed bit) can be expensive because pages from
different VMAs (PA space) are not cache friendly to the rmap (VA
space). For workloads mostly using mapped pages, searching the rmap
can incur the highest CPU cost in the reclaim path.

This patch exploits spatial locality to reduce the trips into the
rmap. When shrink_page_list() walks the rmap and finds a young PTE, a
new function lru_gen_look_around() scans at most BITS_PER_LONG-1
adjacent PTEs. On finding another young PTE, it clears the accessed
bit and updates the gen counter of the page mapped by this PTE to
(max_seq%MAX_NR_GENS)+1.

Server benchmark results:
Single workload:
fio (buffered I/O): no change

Single workload:
memcached (anon): +[3, 5]%
Ops/sec KB/sec
patch1-6: 1106168.46 43025.04
patch1-7: 1147696.57 44640.29

Configurations:
no change

Client benchmark results:
kswapd profiles:
patch1-6
39.03% lzo1x_1_do_compress (real work)
18.47% page_vma_mapped_walk (overhead)
6.74% _raw_spin_unlock_irq
3.97% do_raw_spin_lock
2.49% ptep_clear_flush
2.48% anon_vma_interval_tree_iter_first
1.92% folio_referenced_one
1.88% __zram_bvec_write
1.48% memmove
1.31% vma_interval_tree_iter_next

patch1-7
48.16% lzo1x_1_do_compress (real work)
8.20% page_vma_mapped_walk (overhead)
7.06% _raw_spin_unlock_irq
2.92% ptep_clear_flush
2.53% __zram_bvec_write
2.11% do_raw_spin_lock
2.02% memmove
1.93% lru_gen_look_around
1.56% free_unref_page_list
1.40% memset

Configurations:
no change

Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Barry Song <baohua@kernel.org>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I4b9ca0fd20f566ce554e703f14cee3fe0048c2fd
---
include/linux/memcontrol.h | 31 +++++++
include/linux/mm.h | 5 +
include/linux/mmzone.h | 6 ++
mm/internal.h | 1 +
mm/memcontrol.c | 1 +
mm/rmap.c | 6 ++
mm/swap.c | 4 +-
mm/vmscan.c | 184 +++++++++++++++++++++++++++++++++++++
8 files changed, 236 insertions(+), 2 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 567f12323f55..d2b7f6b9998c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -444,6 +444,7 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
+ * - mem_cgroup_trylock_pages()
*
* For a kmem folio a caller should hold an rcu read lock to protect memcg
* associated with a kmem folio from being released.
@@ -505,6 +506,7 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
+ * - mem_cgroup_trylock_pages()
*
* For a kmem page a caller should hold an rcu read lock to protect memcg
* associated with a kmem page from being released.
@@ -959,6 +961,23 @@ void unlock_page_memcg(struct page *page);

void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val);

+/* try to stablize folio_memcg() for all the pages in a memcg */
+static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
+{
+ rcu_read_lock();
+
+ if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account))
+ return true;
+
+ rcu_read_unlock();
+ return false;
+}
+
+static inline void mem_cgroup_unlock_pages(void)
+{
+ rcu_read_unlock();
+}
+
/* idx can be of type enum memcg_stat_item or node_stat_item */
static inline void mod_memcg_state(struct mem_cgroup *memcg,
int idx, int val)
@@ -1433,6 +1452,18 @@ static inline void folio_memcg_unlock(struct folio *folio)
{
}

+static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
+{
+ /* to match folio_memcg_rcu() */
+ rcu_read_lock();
+ return true;
+}
+
+static inline void mem_cgroup_unlock_pages(void)
+{
+ rcu_read_unlock();
+}
+
static inline void mem_cgroup_handle_over_high(void)
{
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 21f8b27bd9fd..88976a521ef5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1465,6 +1465,11 @@ static inline unsigned long folio_pfn(struct folio *folio)
return page_to_pfn(&folio->page);
}

+static inline struct folio *pfn_folio(unsigned long pfn)
+{
+ return page_folio(pfn_to_page(pfn));
+}
+
static inline atomic_t *folio_pincount_ptr(struct folio *folio)
{
return &folio_page(folio, 1)->compound_pincount;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 07bd22149c22..2b4dc60d0f6c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -372,6 +372,7 @@ enum lruvec_flags {
#ifndef __GENERATING_BOUNDS_H

struct lruvec;
+struct page_vma_mapped_walk;

#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
@@ -427,6 +428,7 @@ struct lru_gen_struct {
};

void lru_gen_init_lruvec(struct lruvec *lruvec);
+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);

#ifdef CONFIG_MEMCG
void lru_gen_init_memcg(struct mem_cgroup *memcg);
@@ -439,6 +441,10 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
{
}

+static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+{
+}
+
#ifdef CONFIG_MEMCG
static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
diff --git a/mm/internal.h b/mm/internal.h
index 785409805ed7..a1fddea6b34f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -83,6 +83,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf);
void folio_rotate_reclaimable(struct folio *folio);
bool __folio_end_writeback(struct folio *folio);
void deactivate_file_folio(struct folio *folio);
+void folio_activate(struct folio *folio);

void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5fd38d12149c..882180866e31 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2789,6 +2789,7 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
+ * - mem_cgroup_trylock_pages()
*/
folio->memcg_data = (unsigned long)memcg;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 93d5a6f793d2..9e0ce48bca08 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -833,6 +833,12 @@ static bool folio_referenced_one(struct folio *folio,
}

if (pvmw.pte) {
+ if (lru_gen_enabled() && pte_young(*pvmw.pte) &&
+ !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) {
+ lru_gen_look_around(&pvmw);
+ referenced++;
+ }
+
if (ptep_clear_flush_young_notify(vma, address,
pvmw.pte)) {
/*
diff --git a/mm/swap.c b/mm/swap.c
index f74fd51fa9e1..0a3871a70952 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -366,7 +366,7 @@ static void folio_activate_drain(int cpu)
folio_batch_move_lru(fbatch, folio_activate_fn);
}

-static void folio_activate(struct folio *folio)
+void folio_activate(struct folio *folio)
{
if (folio_test_lru(folio) && !folio_test_active(folio) &&
!folio_test_unevictable(folio)) {
@@ -385,7 +385,7 @@ static inline void folio_activate_drain(int cpu)
{
}

-static void folio_activate(struct folio *folio)
+void folio_activate(struct folio *folio)
{
struct lruvec *lruvec;

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8e63f95a5f53..8686918e238d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1635,6 +1635,11 @@ static unsigned int shrink_page_list(struct list_head *page_list,
if (!sc->may_unmap && folio_mapped(folio))
goto keep_locked;

+ /* folio_update_gen() tried to promote this page? */
+ if (lru_gen_enabled() && !ignore_references &&
+ folio_mapped(folio) && folio_test_referenced(folio))
+ goto keep_locked;
+
/*
* The number of dirty pages determines if a node is marked
* reclaim_congested. kswapd will stall and start writing
@@ -3219,6 +3224,29 @@ static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
* the aging
******************************************************************************/

+/* promote pages accessed through page tables */
+static int folio_update_gen(struct folio *folio, int gen)
+{
+ unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
+
+ VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
+ VM_WARN_ON_ONCE(!rcu_read_lock_held());
+
+ do {
+ /* lru_gen_del_folio() has isolated this page? */
+ if (!(old_flags & LRU_GEN_MASK)) {
+ /* for shrink_page_list() */
+ new_flags = old_flags | BIT(PG_referenced);
+ continue;
+ }
+
+ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
+ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
+ } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
+
+ return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+}
+
/* protect pages accessed multiple times through file descriptors */
static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
@@ -3230,6 +3258,11 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio);

do {
+ new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+ /* folio_update_gen() has promoted this page? */
+ if (new_gen >= 0 && new_gen != old_gen)
+ return new_gen;
+
new_gen = (old_gen + 1) % MAX_NR_GENS;

new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
@@ -3244,6 +3277,43 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
return new_gen;
}

+static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
+{
+ unsigned long pfn = pte_pfn(pte);
+
+ VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
+
+ if (!pte_present(pte) || is_zero_pfn(pfn))
+ return -1;
+
+ if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
+ return -1;
+
+ if (WARN_ON_ONCE(!pfn_valid(pfn)))
+ return -1;
+
+ return pfn;
+}
+
+static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
+ struct pglist_data *pgdat)
+{
+ struct folio *folio;
+
+ /* try to avoid unnecessary memory loads */
+ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+ return NULL;
+
+ folio = pfn_folio(pfn);
+ if (folio_nid(folio) != pgdat->node_id)
+ return NULL;
+
+ if (folio_memcg_rcu(folio) != memcg)
+ return NULL;
+
+ return folio;
+}
+
static void inc_min_seq(struct lruvec *lruvec, int type)
{
struct lru_gen_struct *lrugen = &lruvec->lrugen;
@@ -3443,6 +3513,114 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
}

+/*
+ * This function exploits spatial locality when shrink_page_list() walks the
+ * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages.
+ */
+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+{
+ int i;
+ pte_t *pte;
+ unsigned long start;
+ unsigned long end;
+ unsigned long addr;
+ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
+ struct folio *folio = pfn_folio(pvmw->pfn);
+ struct mem_cgroup *memcg = folio_memcg(folio);
+ struct pglist_data *pgdat = folio_pgdat(folio);
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ DEFINE_MAX_SEQ(lruvec);
+ int old_gen, new_gen = lru_gen_from_seq(max_seq);
+
+ lockdep_assert_held(pvmw->ptl);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
+
+ if (spin_is_contended(pvmw->ptl))
+ return;
+
+ start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start);
+ end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
+
+ if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
+ if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
+ end = start + MIN_LRU_BATCH * PAGE_SIZE;
+ else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2)
+ start = end - MIN_LRU_BATCH * PAGE_SIZE;
+ else {
+ start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2;
+ end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2;
+ }
+ }
+
+ pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE;
+
+ rcu_read_lock();
+ arch_enter_lazy_mmu_mode();
+
+ for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
+ unsigned long pfn;
+
+ pfn = get_pte_pfn(pte[i], pvmw->vma, addr);
+ if (pfn == -1)
+ continue;
+
+ if (!pte_young(pte[i]))
+ continue;
+
+ folio = get_pfn_folio(pfn, memcg, pgdat);
+ if (!folio)
+ continue;
+
+ if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
+ VM_WARN_ON_ONCE(true);
+
+ if (pte_dirty(pte[i]) && !folio_test_dirty(folio) &&
+ !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_test_swapcache(folio)))
+ folio_mark_dirty(folio);
+
+ old_gen = folio_lru_gen(folio);
+ if (old_gen < 0)
+ folio_set_referenced(folio);
+ else if (old_gen != new_gen)
+ __set_bit(i, bitmap);
+ }
+
+ arch_leave_lazy_mmu_mode();
+ rcu_read_unlock();
+
+ if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
+ for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
+ folio = pfn_folio(pte_pfn(pte[i]));
+ folio_activate(folio);
+ }
+ return;
+ }
+
+ /* folio_update_gen() requires stable folio_memcg() */
+ if (!mem_cgroup_trylock_pages(memcg))
+ return;
+
+ spin_lock_irq(&lruvec->lru_lock);
+ new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
+
+ for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
+ folio = pfn_folio(pte_pfn(pte[i]));
+ if (folio_memcg_rcu(folio) != memcg)
+ continue;
+
+ old_gen = folio_update_gen(folio, new_gen);
+ if (old_gen < 0 || old_gen == new_gen)
+ continue;
+
+ lru_gen_update_size(lruvec, folio, old_gen, new_gen);
+ }
+
+ spin_unlock_irq(&lruvec->lru_lock);
+
+ mem_cgroup_unlock_pages();
+}
+
/******************************************************************************
* the eviction
******************************************************************************/
@@ -3479,6 +3657,12 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)
return true;
}

+ /* promoted */
+ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
+ list_move(&folio->lru, &lrugen->lists[gen][type][zone]);
+ return true;
+ }
+
/* protected */
if (tier > tier_idx) {
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
--
2.17.1

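The look-around patch clamps its scan to one bitmap word (MIN_LRU_BATCH, i.e. BITS_PER_LONG PTEs) centered on the faulting address and bounded by the PMD and the VMA. A standalone sketch of that window arithmetic, assuming 4 KiB pages and 2 MiB PMDs (the helper and the SK_* constants are illustrative, not part of the patch):

    /* Illustrative only: the window clamping of lru_gen_look_around(). */
    #define SK_PAGE_SIZE 4096UL                 /* assumed page size */
    #define SK_PMD_MASK  (~((2UL << 20) - 1))   /* assumed 2 MiB PMD */
    #define SK_BATCH     64UL                   /* MIN_LRU_BATCH on 64-bit */

    static void sketch_window(unsigned long addr, unsigned long vm_start,
                              unsigned long vm_end, unsigned long *s,
                              unsigned long *e)
    {
            unsigned long start = addr & SK_PMD_MASK;
            unsigned long end = (addr | ~SK_PMD_MASK) + 1;

            /* stay inside the VMA */
            if (start < vm_start)
                    start = vm_start;
            if (end > vm_end)
                    end = vm_end;

            /* shrink the window to one bitmap word around addr */
            if (end - start > SK_BATCH * SK_PAGE_SIZE) {
                    if (addr - start < SK_BATCH * SK_PAGE_SIZE / 2)
                            end = start + SK_BATCH * SK_PAGE_SIZE;
                    else if (end - addr < SK_BATCH * SK_PAGE_SIZE / 2)
                            start = end - SK_BATCH * SK_PAGE_SIZE;
                    else {
                            start = addr - SK_BATCH * SK_PAGE_SIZE / 2;
                            end = addr + SK_BATCH * SK_PAGE_SIZE / 2;
                    }
            }
            *s = start;
            *e = end;
    }
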
File diff suppressed because it is too large
@ -0,0 +1,295 @@
From 6b9670b94ba2b49b289b997121062500e32fc3e4 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Thu, 27 Jan 2022 19:59:54 -0700
Subject: [PATCH 09/14] mm: multi-gen LRU: optimize multiple memcgs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When multiple memcgs are available, it is possible to use generations
as a frame of reference to make better choices and improve overall
performance under global memory pressure. This patch adds a basic
optimization to select memcgs that can drop single-use unmapped clean
pages first. Doing so reduces the chance of going into the aging path
or swapping, which can be costly.

A typical example that benefits from this optimization is a server
running mixed types of workloads, e.g., heavy anon workload in one
memcg and heavy buffered I/O workload in the other.

Though this optimization can be applied to both kswapd and direct
reclaim, it is only added to kswapd to keep the patchset manageable.
Later improvements may cover the direct reclaim path.

While ensuring certain fairness to all eligible memcgs, proportional
scans of individual memcgs also require proper backoff to avoid
overshooting their aggregate reclaim target by too much. Otherwise it
can cause high direct reclaim latency. The conditions for backoff are:
1. At low priorities, for direct reclaim, if aging fairness or direct
reclaim latency is at risk, i.e., aging one memcg multiple times or
swapping after the target is met.
2. At high priorities, for global reclaim, if per-zone free pages are
above respective watermarks.

Server benchmark results:
Mixed workloads:
fio (buffered I/O): +[19, 21]%
IOPS BW
patch1-8: 1880k 7343MiB/s
patch1-9: 2252k 8796MiB/s

memcached (anon): +[119, 123]%
Ops/sec KB/sec
patch1-8: 862768.65 33514.68
patch1-9: 1911022.12 74234.54

Mixed workloads:
fio (buffered I/O): +[75, 77]%
IOPS BW
5.19-rc1: 1279k 4996MiB/s
patch1-9: 2252k 8796MiB/s

memcached (anon): +[13, 15]%
Ops/sec KB/sec
5.19-rc1: 1673524.04 65008.87
patch1-9: 1911022.12 74234.54

Configurations:
(changes since patch 6)

cat mixed.sh
modprobe brd rd_nr=2 rd_size=56623104

swapoff -a
mkswap /dev/ram0
swapon /dev/ram0

mkfs.ext4 /dev/ram1
mount -t ext4 /dev/ram1 /mnt

memtier_benchmark -S /var/run/memcached/memcached.sock \
-P memcache_binary -n allkeys --key-minimum=1 \
--key-maximum=50000000 --key-pattern=P:P -c 1 -t 36 \
--ratio 1:0 --pipeline 8 -d 2000

fio -name=mglru --numjobs=36 --directory=/mnt --size=1408m \
--buffered=1 --ioengine=io_uring --iodepth=128 \
--iodepth_batch_submit=32 --iodepth_batch_complete=32 \
--rw=randread --random_distribution=random --norandommap \
--time_based --ramp_time=10m --runtime=90m --group_reporting &
pid=$!

sleep 200

memtier_benchmark -S /var/run/memcached/memcached.sock \
-P memcache_binary -n allkeys --key-minimum=1 \
--key-maximum=50000000 --key-pattern=R:R -c 1 -t 36 \
--ratio 0:1 --pipeline 8 --randomize --distinct-client-seed

kill -INT $pid
wait

Client benchmark results:
no change (CONFIG_MEMCG=n)

Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I7e00e0c733437e534ac98031cf8154a681becc00
---
mm/vmscan.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 95 insertions(+), 9 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index fcb437769a60..e7b74ab67973 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -131,6 +131,12 @@ struct scan_control {
/* Always discard instead of demoting to lower tier memory */
unsigned int no_demotion:1;

+#ifdef CONFIG_LRU_GEN
+ /* help kswapd make better choices among multiple memcgs */
+ unsigned int memcgs_need_aging:1;
+ unsigned long last_reclaimed;
+#endif
+
/* Allocation order */
s8 order;

@@ -4429,6 +4435,19 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)

VM_WARN_ON_ONCE(!current_is_kswapd());

+ sc->last_reclaimed = sc->nr_reclaimed;
+
+ /*
+ * To reduce the chance of going into the aging path, which can be
+ * costly, optimistically skip it if the flag below was cleared in the
+ * eviction path. This improves the overall performance when multiple
+ * memcgs are available.
+ */
+ if (!sc->memcgs_need_aging) {
+ sc->memcgs_need_aging = true;
+ return;
+ }
+
set_mm_walk(pgdat);

memcg = mem_cgroup_iter(NULL, NULL, NULL);
@@ -4840,7 +4859,8 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
return scanned;
}

-static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
+static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+ bool *need_swapping)
{
int type;
int scanned;
@@ -4903,6 +4923,9 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap

sc->nr_reclaimed += reclaimed;

+ if (need_swapping && type == LRU_GEN_ANON)
+ *need_swapping = true;
+
return scanned;
}

@@ -4912,9 +4935,8 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
* reclaim.
*/
static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
- bool can_swap)
+ bool can_swap, bool *need_aging)
{
- bool need_aging;
unsigned long nr_to_scan;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
@@ -4924,8 +4946,8 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
(mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
return 0;

- need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
- if (!need_aging)
+ *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
+ if (!*need_aging)
return nr_to_scan;

/* skip the aging path at the default priority */
@@ -4942,10 +4964,67 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
}

+static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq,
+ struct scan_control *sc, bool need_swapping)
+{
+ int i;
+ DEFINE_MAX_SEQ(lruvec);
+
+ if (!current_is_kswapd()) {
+ /* age each memcg at most once to ensure fairness */
+ if (max_seq - seq > 1)
+ return true;
+
+ /* over-swapping can increase allocation latency */
+ if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
+ return true;
+
+ /* give this thread a chance to exit and free its memory */
+ if (fatal_signal_pending(current)) {
+ sc->nr_reclaimed += MIN_LRU_BATCH;
+ return true;
+ }
+
+ if (cgroup_reclaim(sc))
+ return false;
+ } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim)
+ return false;
+
+ /* keep scanning at low priorities to ensure fairness */
+ if (sc->priority > DEF_PRIORITY - 2)
+ return false;
+
+ /*
+ * A minimum amount of work was done under global memory pressure. For
+ * kswapd, it may be overshooting. For direct reclaim, the allocation
+ * may succeed if all suitable zones are somewhat safe. In either case,
+ * it's better to stop now, and restart later if necessary.
+ */
+ for (i = 0; i <= sc->reclaim_idx; i++) {
+ unsigned long wmark;
+ struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
+
+ if (!managed_zone(zone))
+ continue;
+
+ wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone);
+ if (wmark > zone_page_state(zone, NR_FREE_PAGES))
+ return false;
+ }
+
+ sc->nr_reclaimed += MIN_LRU_BATCH;
+
+ return true;
+}
+
static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
struct blk_plug plug;
+ bool need_aging = false;
+ bool need_swapping = false;
unsigned long scanned = 0;
+ unsigned long reclaimed = sc->nr_reclaimed;
+ DEFINE_MAX_SEQ(lruvec);

lru_add_drain();

@@ -4965,21 +5044,28 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
else
swappiness = 0;

- nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
if (!nr_to_scan)
- break;
+ goto done;

- delta = evict_folios(lruvec, sc, swappiness);
+ delta = evict_folios(lruvec, sc, swappiness, &need_swapping);
if (!delta)
- break;
+ goto done;

scanned += delta;
if (scanned >= nr_to_scan)
break;

+ if (should_abort_scan(lruvec, max_seq, sc, need_swapping))
+ break;
+
cond_resched();
}

+ /* see the comment in lru_gen_age_node() */
+ if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
+ sc->memcgs_need_aging = false;
+done:
clear_mm_walk();

blk_finish_plug(&plug);
--
2.17.1

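The aging/eviction handshake above reduces to one bit in scan_control: eviction clears memcgs_need_aging when it reclaimed enough without requesting aging, and the next kswapd pass consumes the cleared bit to skip aging once before re-arming it. A stripped-down sketch of just that control flow (the standalone form is illustrative; only the flag logic follows the patch):

    /* Illustrative only: the memcgs_need_aging handshake. */
    struct sk_scan_control {
            unsigned int memcgs_need_aging:1;
    };

    static void sk_age_node(struct sk_scan_control *sc)
    {
            if (!sc->memcgs_need_aging) {
                    sc->memcgs_need_aging = 1;  /* skip aging once, re-arm */
                    return;
            }
            /* ... age the lruvec of each memcg ... */
    }

    static void sk_shrink_lruvec(struct sk_scan_control *sc,
                                 int made_progress, int need_aging)
    {
            /* ... evict folios ... */
            if (made_progress && !need_aging)
                    sc->memcgs_need_aging = 0;  /* next aging pass may skip */
    }
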
@ -0,0 +1,490 @@
|
|||||||
|
From ef61bb3622ee0f36e055dfd5006badff08f5ce61 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Yu Zhao <yuzhao@google.com>
|
||||||
|
Date: Thu, 27 Jan 2022 19:52:09 -0700
|
||||||
|
Subject: [PATCH 10/14] mm: multi-gen LRU: kill switch
|
||||||
|
MIME-Version: 1.0
|
||||||
|
Content-Type: text/plain; charset=UTF-8
|
||||||
|
Content-Transfer-Encoding: 8bit
|
||||||
|
|
||||||
|
Add /sys/kernel/mm/lru_gen/enabled as a kill switch. Components that
|
||||||
|
can be disabled include:
|
||||||
|
0x0001: the multi-gen LRU core
|
||||||
|
0x0002: walking page table, when arch_has_hw_pte_young() returns
|
||||||
|
true
|
||||||
|
0x0004: clearing the accessed bit in non-leaf PMD entries, when
|
||||||
|
CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y
|
||||||
|
[yYnN]: apply to all the components above
|
||||||
|
E.g.,
|
||||||
|
echo y >/sys/kernel/mm/lru_gen/enabled
|
||||||
|
cat /sys/kernel/mm/lru_gen/enabled
|
||||||
|
0x0007
|
||||||
|
echo 5 >/sys/kernel/mm/lru_gen/enabled
|
||||||
|
cat /sys/kernel/mm/lru_gen/enabled
|
||||||
|
0x0005
|
||||||
|
|
||||||
|
NB: the page table walks happen on the scale of seconds under heavy
|
||||||
|
memory pressure, in which case the mmap_lock contention is a lesser
|
||||||
|
concern, compared with the LRU lock contention and the I/O congestion.
|
||||||
|
So far the only well-known case of the mmap_lock contention happens on
|
||||||
|
Android, due to Scudo [1] which allocates several thousand VMAs for
|
||||||
|
merely a few hundred MBs. The SPF and the Maple Tree also have
|
||||||
|
provided their own assessments [2][3]. However, if walking page tables
|
||||||
|
does worsen the mmap_lock contention, the kill switch can be used to
|
||||||
|
disable it. In this case the multi-gen LRU will suffer a minor
|
||||||
|
performance degradation, as shown previously.
|
||||||
|
|
||||||
|
Clearing the accessed bit in non-leaf PMD entries can also be
|
||||||
|
disabled, since this behavior was not tested on x86 varieties other
|
||||||
|
than Intel and AMD.
|
||||||
|
|
||||||
|
[1] https://source.android.com/devices/tech/debug/scudo
|
||||||
|
[2] https://lore.kernel.org/r/20220128131006.67712-1-michel@lespinasse.org/
|
||||||
|
[3] https://lore.kernel.org/r/20220426150616.3937571-1-Liam.Howlett@oracle.com/
|
||||||
|
|
||||||
|
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
||||||
|
Acked-by: Brian Geffon <bgeffon@google.com>
|
||||||
|
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
|
||||||
|
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
|
||||||
|
Acked-by: Steven Barrett <steven@liquorix.net>
|
||||||
|
Acked-by: Suleiman Souhlal <suleiman@google.com>
|
||||||
|
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
|
||||||
|
Tested-by: Donald Carr <d@chaos-reins.com>
|
||||||
|
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
|
||||||
|
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
|
||||||
|
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
|
||||||
|
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
|
||||||
|
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
|
||||||
|
Change-Id: I4c909618e8fed7fb1337f6624bbe542ec920a515
|
||||||
|
---
|
||||||
|
include/linux/cgroup.h | 15 ++-
|
||||||
|
include/linux/mm_inline.h | 15 ++-
|
||||||
|
include/linux/mmzone.h | 9 ++
|
||||||
|
kernel/cgroup/cgroup-internal.h | 1 -
|
||||||
|
mm/Kconfig | 6 +
|
||||||
|
mm/vmscan.c | 228 +++++++++++++++++++++++++++++++-
|
||||||
|
6 files changed, 265 insertions(+), 9 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
|
||||||
|
index ac5d0515680e..9179463c3c9f 100644
|
||||||
|
--- a/include/linux/cgroup.h
|
||||||
|
+++ b/include/linux/cgroup.h
|
||||||
|
@@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgroup *cgrp)
|
||||||
|
css_put(&cgrp->self);
|
||||||
|
}
|
||||||
|
|
||||||
|
+extern struct mutex cgroup_mutex;
|
||||||
|
+
|
||||||
|
+static inline void cgroup_lock(void)
|
||||||
|
+{
|
||||||
|
+ mutex_lock(&cgroup_mutex);
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+static inline void cgroup_unlock(void)
|
||||||
|
+{
|
||||||
|
+ mutex_unlock(&cgroup_mutex);
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
/**
|
||||||
|
* task_css_set_check - obtain a task's css_set with extra access conditions
|
||||||
|
* @task: the task to obtain css_set for
|
||||||
|
@@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgroup *cgrp)
|
||||||
|
* as locks used during the cgroup_subsys::attach() methods.
|
||||||
|
*/
|
||||||
|
#ifdef CONFIG_PROVE_RCU
|
||||||
|
-extern struct mutex cgroup_mutex;
|
||||||
|
extern spinlock_t css_set_lock;
|
||||||
|
#define task_css_set_check(task, __c) \
|
||||||
|
rcu_dereference_check((task)->cgroups, \
|
||||||
|
@@ -708,6 +719,8 @@ struct cgroup;
|
||||||
|
static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
|
||||||
|
static inline void css_get(struct cgroup_subsys_state *css) {}
|
||||||
|
static inline void css_put(struct cgroup_subsys_state *css) {}
|
||||||
|
+static inline void cgroup_lock(void) {}
|
||||||
|
+static inline void cgroup_unlock(void) {}
|
||||||
|
static inline int cgroup_attach_task_all(struct task_struct *from,
|
||||||
|
struct task_struct *t) { return 0; }
|
||||||
|
static inline int cgroupstats_build(struct cgroupstats *stats,
|
||||||
|
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
|
||||||
|
index f2b2296a42f9..4949eda9a9a2 100644
|
||||||
|
--- a/include/linux/mm_inline.h
|
||||||
|
+++ b/include/linux/mm_inline.h
|
||||||
|
@@ -106,10 +106,21 @@ static __always_inline enum lru_list folio_lru_list(struct folio *folio)
|
||||||
|
|
||||||
|
#ifdef CONFIG_LRU_GEN
|
||||||
|
|
||||||
|
+#ifdef CONFIG_LRU_GEN_ENABLED
|
||||||
|
static inline bool lru_gen_enabled(void)
|
||||||
|
{
|
||||||
|
- return true;
|
||||||
|
+ DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]);
|
||||||
|
+
|
||||||
|
+ return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]);
|
||||||
|
+}
|
||||||
|
+#else
|
||||||
|
+static inline bool lru_gen_enabled(void)
|
||||||
|
+{
|
||||||
|
+ DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]);
|
||||||
|
+
|
||||||
|
+ return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]);
|
||||||
|
}
|
||||||
|
+#endif
|
||||||
|
|
||||||
|
static inline bool lru_gen_in_fault(void)
|
||||||
|
{
|
||||||
|
@@ -222,7 +233,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio,
|
||||||
|
|
||||||
|
VM_WARN_ON_ONCE_FOLIO(gen != -1, folio);
|
||||||
|
|
||||||
|
- if (folio_test_unevictable(folio))
|
||||||
|
+ if (folio_test_unevictable(folio) || !lrugen->enabled)
|
||||||
|
return false;
|
||||||
|
/*
|
||||||
|
* There are three common cases for this page:
|
||||||
|
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
|
||||||
|
index e5cf37dc41a4..39bca2e420b7 100644
|
||||||
|
--- a/include/linux/mmzone.h
|
||||||
|
+++ b/include/linux/mmzone.h
|
||||||
|
@@ -384,6 +384,13 @@ enum {
|
||||||
|
LRU_GEN_FILE,
|
||||||
|
};
|
||||||
|
|
||||||
|
+enum {
|
||||||
|
+ LRU_GEN_CORE,
|
||||||
|
+ LRU_GEN_MM_WALK,
|
||||||
|
+ LRU_GEN_NONLEAF_YOUNG,
|
||||||
|
+ NR_LRU_GEN_CAPS
|
||||||
|
+};
|
||||||
|
+
|
||||||
|
#define MIN_LRU_BATCH BITS_PER_LONG
|
||||||
|
#define MAX_LRU_BATCH (MIN_LRU_BATCH * 64)
|
||||||
|
|
||||||
|
@@ -425,6 +432,8 @@ struct lru_gen_struct {
|
||||||
|
/* can be modified without holding the LRU lock */
|
||||||
|
atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
|
||||||
|
atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
|
||||||
|
+ /* whether the multi-gen LRU is enabled */
|
||||||
|
+ bool enabled;
|
||||||
|
};
|
||||||
|
|
||||||
|
enum {
|
||||||
|
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
|
||||||
|
index 36b740cb3d59..63dc3e82be4f 100644
|
||||||
|
--- a/kernel/cgroup/cgroup-internal.h
|
||||||
|
+++ b/kernel/cgroup/cgroup-internal.h
|
||||||
|
@@ -164,7 +164,6 @@ struct cgroup_mgctx {
|
||||||
|
#define DEFINE_CGROUP_MGCTX(name) \
|
||||||
|
struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
|
||||||
|
|
||||||
|
-extern struct mutex cgroup_mutex;
|
||||||
|
extern spinlock_t css_set_lock;
|
||||||
|
extern struct cgroup_subsys *cgroup_subsys[];
|
||||||
|
extern struct list_head cgroup_roots;
|
||||||
|
diff --git a/mm/Kconfig b/mm/Kconfig
|
||||||
|
index 5101dca8f21c..6c86849c4db9 100644
|
||||||
|
--- a/mm/Kconfig
|
||||||
|
+++ b/mm/Kconfig
|
||||||
|
@@ -1133,6 +1133,12 @@ config LRU_GEN
|
||||||
|
help
|
||||||
|
A high performance LRU implementation to overcommit memory.
|
||||||
|
|
||||||
|
+config LRU_GEN_ENABLED
|
||||||
|
+ bool "Enable by default"
|
||||||
|
+ depends on LRU_GEN
|
||||||
|
+ help
|
||||||
|
+ This option enables the multi-gen LRU by default.
|
||||||
|
+
|
||||||
|
config LRU_GEN_STATS
|
||||||
|
bool "Full stats for debugging"
|
||||||
|
depends on LRU_GEN
|
||||||
|
diff --git a/mm/vmscan.c b/mm/vmscan.c
|
||||||
|
index e7b74ab67973..ea3d497019ab 100644
|
||||||
|
--- a/mm/vmscan.c
|
||||||
|
+++ b/mm/vmscan.c
|
||||||
|
@@ -51,6 +51,7 @@
|
||||||
|
#include <linux/psi.h>
|
||||||
|
#include <linux/pagewalk.h>
|
||||||
|
#include <linux/shmem_fs.h>
|
||||||
|
+#include <linux/ctype.h>
|
||||||
|
|
||||||
|
#include <asm/tlbflush.h>
|
||||||
|
#include <asm/div64.h>
|
||||||
|
@@ -3070,6 +3071,14 @@ static bool can_age_anon_pages(struct pglist_data *pgdat,
|
||||||
|
|
||||||
|
#ifdef CONFIG_LRU_GEN
|
||||||
|
|
||||||
|
+#ifdef CONFIG_LRU_GEN_ENABLED
|
||||||
|
+DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS);
|
||||||
|
+#define get_cap(cap) static_branch_likely(&lru_gen_caps[cap])
|
||||||
|
+#else
|
||||||
|
+DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
|
||||||
|
+#define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap])
|
||||||
|
+#endif
|
||||||
|
+
|
||||||
|
/******************************************************************************
|
||||||
|
* shorthand helpers
|
||||||
|
******************************************************************************/
|
||||||
|
@@ -3946,7 +3955,8 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area
|
||||||
|
goto next;
|
||||||
|
|
||||||
|
if (!pmd_trans_huge(pmd[i])) {
|
||||||
|
- if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
|
||||||
|
+ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
|
||||||
|
+ get_cap(LRU_GEN_NONLEAF_YOUNG))
|
||||||
|
pmdp_test_and_clear_young(vma, addr, pmd + i);
|
||||||
|
goto next;
|
||||||
|
}
|
||||||
|
@@ -4044,10 +4054,12 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
|
||||||
|
walk->mm_stats[MM_NONLEAF_TOTAL]++;
|
||||||
|
|
||||||
|
#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
|
||||||
|
- if (!pmd_young(val))
|
||||||
|
- continue;
|
||||||
|
+ if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
|
||||||
|
+ if (!pmd_young(val))
|
||||||
|
+ continue;
|
||||||
|
|
||||||
|
- walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
|
||||||
|
+ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
|
||||||
|
+ }
|
||||||
|
#endif
|
||||||
|
if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
|
||||||
|
continue;
|
||||||
|
@@ -4309,7 +4321,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
|
||||||
|
* handful of PTEs. Spreading the work out over a period of time usually
|
||||||
|
* is less efficient, but it avoids bursty page faults.
|
||||||
|
*/
|
||||||
|
- if (!arch_has_hw_pte_young()) {
|
||||||
|
+ if (!(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
|
||||||
|
success = iterate_mm_list_nowalk(lruvec, max_seq);
|
||||||
|
goto done;
|
||||||
|
}
|
||||||
|
@@ -5071,6 +5083,208 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
|
||||||
|
blk_finish_plug(&plug);
|
||||||
|
}
|
||||||
|
|
||||||
|
+/******************************************************************************
|
||||||
|
+ * state change
|
||||||
|
+ ******************************************************************************/
|
||||||
|
+
|
||||||
|
+static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
|
||||||
|
+{
|
||||||
|
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||||
|
+
|
||||||
|
+ if (lrugen->enabled) {
|
||||||
|
+ enum lru_list lru;
|
||||||
|
+
|
||||||
|
+ for_each_evictable_lru(lru) {
|
||||||
|
+ if (!list_empty(&lruvec->lists[lru]))
|
||||||
|
+ return false;
|
||||||
|
+ }
|
||||||
|
+ } else {
|
||||||
|
+ int gen, type, zone;
|
||||||
|
+
|
||||||
|
+ for_each_gen_type_zone(gen, type, zone) {
|
||||||
|
+ if (!list_empty(&lrugen->lists[gen][type][zone]))
|
||||||
|
+ return false;
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ return true;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+static bool fill_evictable(struct lruvec *lruvec)
|
||||||
|
+{
|
||||||
|
+ enum lru_list lru;
|
||||||
|
+ int remaining = MAX_LRU_BATCH;
|
||||||
|
+
|
||||||
|
+ for_each_evictable_lru(lru) {
|
||||||
|
+ int type = is_file_lru(lru);
|
||||||
|
+ bool active = is_active_lru(lru);
|
||||||
|
+ struct list_head *head = &lruvec->lists[lru];
|
||||||
|
+
|
||||||
|
+ while (!list_empty(head)) {
|
||||||
|
+ bool success;
|
||||||
|
+ struct folio *folio = lru_to_folio(head);
|
||||||
|
+
|
||||||
|
+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
|
||||||
|
+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio) != active, folio);
|
||||||
|
+ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
|
||||||
|
+ VM_WARN_ON_ONCE_FOLIO(folio_lru_gen(folio) != -1, folio);
|
||||||
|
+
|
||||||
|
+ lruvec_del_folio(lruvec, folio);
|
||||||
|
+ success = lru_gen_add_folio(lruvec, folio, false);
|
||||||
|
+ VM_WARN_ON_ONCE(!success);
|
||||||
|
+
|
||||||
|
+ if (!--remaining)
|
||||||
|
+ return false;
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ return true;
+}
+
+static bool drain_evictable(struct lruvec *lruvec)
+{
+ int gen, type, zone;
+ int remaining = MAX_LRU_BATCH;
+
+ for_each_gen_type_zone(gen, type, zone) {
+ struct list_head *head = &lruvec->lrugen.lists[gen][type][zone];
+
+ while (!list_empty(head)) {
+ bool success;
+ struct folio *folio = lru_to_folio(head);
+
+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
+
+ success = lru_gen_del_folio(lruvec, folio, false);
+ VM_WARN_ON_ONCE(!success);
+ lruvec_add_folio(lruvec, folio);
+
+ if (!--remaining)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static void lru_gen_change_state(bool enabled)
+{
+ static DEFINE_MUTEX(state_mutex);
+
+ struct mem_cgroup *memcg;
+
+ cgroup_lock();
+ cpus_read_lock();
+ get_online_mems();
+ mutex_lock(&state_mutex);
+
+ if (enabled == lru_gen_enabled())
+ goto unlock;
+
+ if (enabled)
+ static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
+ else
+ static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ int nid;
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ if (!lruvec)
+ continue;
+
+ spin_lock_irq(&lruvec->lru_lock);
+
+ VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
+ VM_WARN_ON_ONCE(!state_is_valid(lruvec));
+
+ lruvec->lrugen.enabled = enabled;
+
+ while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) {
+ spin_unlock_irq(&lruvec->lru_lock);
+ cond_resched();
+ spin_lock_irq(&lruvec->lru_lock);
+ }
+
+ spin_unlock_irq(&lruvec->lru_lock);
+ }
+
+ cond_resched();
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+unlock:
+ mutex_unlock(&state_mutex);
+ put_online_mems();
+ cpus_read_unlock();
+ cgroup_unlock();
+}
+
+/******************************************************************************
+ * sysfs interface
+ ******************************************************************************/
+
+static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ unsigned int caps = 0;
+
+ if (get_cap(LRU_GEN_CORE))
+ caps |= BIT(LRU_GEN_CORE);
+
+ if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
+ caps |= BIT(LRU_GEN_MM_WALK);
+
+ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG))
+ caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
+
+ return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
+}
+
+static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ int i;
+ unsigned int caps;
+
+ if (tolower(*buf) == 'n')
+ caps = 0;
+ else if (tolower(*buf) == 'y')
+ caps = -1;
+ else if (kstrtouint(buf, 0, &caps))
+ return -EINVAL;
+
+ for (i = 0; i < NR_LRU_GEN_CAPS; i++) {
+ bool enabled = caps & BIT(i);
+
+ if (i == LRU_GEN_CORE)
+ lru_gen_change_state(enabled);
+ else if (enabled)
+ static_branch_enable(&lru_gen_caps[i]);
+ else
+ static_branch_disable(&lru_gen_caps[i]);
+ }
+
+ return len;
+}
+
+static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
+ enabled, 0644, show_enabled, store_enabled
+);
+
+static struct attribute *lru_gen_attrs[] = {
+ &lru_gen_enabled_attr.attr,
+ NULL
+};
+
+static struct attribute_group lru_gen_attr_group = {
+ .name = "lru_gen",
+ .attrs = lru_gen_attrs,
+};
+
/******************************************************************************
* initialization
******************************************************************************/
@@ -5081,6 +5295,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
struct lru_gen_struct *lrugen = &lruvec->lrugen;

lrugen->max_seq = MIN_NR_GENS + 1;
+ lrugen->enabled = lru_gen_enabled();

for_each_gen_type_zone(gen, type, zone)
INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
@@ -5120,6 +5335,9 @@ static int __init init_lru_gen(void)
BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);

+ if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
+ pr_err("lru_gen: failed to create sysfs group\n");
+
return 0;
};
late_initcall(init_lru_gen);
--
2.17.1
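
Once a kernel carrying this patch boots, the new knob can be poked from a shell. A minimal sketch, assuming CONFIG_LRU_GEN=y; which bits actually stick depends on the hardware and kernel config, since show_enabled() above masks out unsupported capabilities:

	# read the capability bitmask (0x0007 = core + leaf + non-leaf aging)
	cat /sys/kernel/mm/lru_gen/enabled
	# y/n toggles every capability at once; a number sets an explicit mask
	echo y >/sys/kernel/mm/lru_gen/enabled
	echo 0x0005 >/sys/kernel/mm/lru_gen/enabled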

@ -0,0 +1,209 @@
From 9d92c76fb8ac09ff195024139575d8c4db66b672 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Thu, 27 Jan 2022 20:08:50 -0700
Subject: [PATCH 11/14] mm: multi-gen LRU: thrashing prevention
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add /sys/kernel/mm/lru_gen/min_ttl_ms for thrashing prevention, as
requested by many desktop users [1].

When set to value N, it prevents the working set of N milliseconds
from getting evicted. The OOM killer is triggered if this working set
cannot be kept in memory. Based on the average human detectable lag
(~100ms), N=1000 usually eliminates intolerable lags due to thrashing.
Larger values like N=3000 make lags less noticeable at the risk of
premature OOM kills.

Compared with the size-based approach [2], this time-based approach
has the following advantages:
1. It is easier to configure because it is agnostic to applications
and memory sizes.
2. It is more reliable because it is directly wired to the OOM killer.

[1] https://lore.kernel.org/r/Ydza%2FzXKY9ATRoh6@google.com/
[2] https://lore.kernel.org/r/20101028191523.GA14972@google.com/

Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I007499d7e47374b59fd620e8c3962940bc9f788e
---
include/linux/mmzone.h | 2 ++
mm/vmscan.c | 74 ++++++++++++++++++++++++++++++++++++++++--
2 files changed, 73 insertions(+), 3 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 39bca2e420b7..0c502618b37b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -419,6 +419,8 @@ struct lru_gen_struct {
unsigned long max_seq;
/* the eviction increments the oldest generation numbers */
unsigned long min_seq[ANON_AND_FILE];
+ /* the birth time of each generation in jiffies */
+ unsigned long timestamps[MAX_NR_GENS];
/* the multi-gen LRU lists, lazily sorted on eviction */
struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
/* the multi-gen LRU sizes, eventually consistent */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ea3d497019ab..0df253819edc 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4293,6 +4293,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap)
for (type = 0; type < ANON_AND_FILE; type++)
reset_ctrl_pos(lruvec, type, false);

+ WRITE_ONCE(lrugen->timestamps[next], jiffies);
/* make sure preceding modifications appear */
smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);

@@ -4420,7 +4421,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsig
return false;
}

-static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl)
{
bool need_aging;
unsigned long nr_to_scan;
@@ -4434,16 +4435,36 @@ static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc)
mem_cgroup_calculate_protection(NULL, memcg);

if (mem_cgroup_below_min(memcg))
- return;
+ return false;

need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
+
+ if (min_ttl) {
+ int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
+ unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
+
+ if (time_is_after_jiffies(birth + min_ttl))
+ return false;
+
+ /* the size is likely too small to be helpful */
+ if (!nr_to_scan && sc->priority != DEF_PRIORITY)
+ return false;
+ }
+
if (need_aging)
try_to_inc_max_seq(lruvec, max_seq, sc, swappiness);
+
+ return true;
}

+/* to protect the working set of the last N jiffies */
+static unsigned long lru_gen_min_ttl __read_mostly;
+
static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
{
struct mem_cgroup *memcg;
+ bool success = false;
+ unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);

VM_WARN_ON_ONCE(!current_is_kswapd());

@@ -4466,12 +4487,32 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
do {
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);

- age_lruvec(lruvec, sc);
+ if (age_lruvec(lruvec, sc, min_ttl))
+ success = true;

cond_resched();
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));

clear_mm_walk();
+
+ /* check the order to exclude compaction-induced reclaim */
+ if (success || !min_ttl || sc->order)
+ return;
+
+ /*
+ * The main goal is to OOM kill if every generation from all memcgs is
+ * younger than min_ttl. However, another possibility is all memcgs are
+ * either below min or empty.
+ */
+ if (mutex_trylock(&oom_lock)) {
+ struct oom_control oc = {
+ .gfp_mask = sc->gfp_mask,
+ };
+
+ out_of_memory(&oc);
+
+ mutex_unlock(&oom_lock);
+ }
}

/*
@@ -5228,6 +5269,28 @@ static void lru_gen_change_state(bool enabled)
* sysfs interface
******************************************************************************/

+static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
+}
+
+static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ unsigned int msecs;
+
+ if (kstrtouint(buf, 0, &msecs))
+ return -EINVAL;
+
+ WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs));
+
+ return len;
+}
+
+static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR(
+ min_ttl_ms, 0644, show_min_ttl, store_min_ttl
+);
+
static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
unsigned int caps = 0;
@@ -5276,6 +5339,7 @@ static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
);

static struct attribute *lru_gen_attrs[] = {
+ &lru_gen_min_ttl_attr.attr,
&lru_gen_enabled_attr.attr,
NULL
};
@@ -5291,12 +5355,16 @@ static struct attribute_group lru_gen_attr_group = {

void lru_gen_init_lruvec(struct lruvec *lruvec)
{
+ int i;
int gen, type, zone;
struct lru_gen_struct *lrugen = &lruvec->lrugen;

lrugen->max_seq = MIN_NR_GENS + 1;
lrugen->enabled = lru_gen_enabled();

+ for (i = 0; i <= MIN_NR_GENS + 1; i++)
+ lrugen->timestamps[i] = jiffies;
+
for_each_gen_type_zone(gen, type, zone)
INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);

--
2.17.1
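
A quick way to exercise the knob this patch adds; a sketch using the values from the commit message (N=1000 suits desktops, N=3000 trades lag for earlier OOM kills, 0 disables thrashing prevention):

	# keep the working set of the last second resident
	echo 1000 >/sys/kernel/mm/lru_gen/min_ttl_ms
	# read the current setting back, in milliseconds
	cat /sys/kernel/mm/lru_gen/min_ttl_ms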

@ -0,0 +1,564 @@
From d1e0e5fcdea16d4ceead496a0ea2fdbb6bc5bfe4 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Thu, 27 Jan 2022 20:12:41 -0700
Subject: [PATCH 12/14] mm: multi-gen LRU: debugfs interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add /sys/kernel/debug/lru_gen for working set estimation and proactive
reclaim. These techniques are commonly used to optimize job scheduling
(bin packing) in data centers [1][2].

Compared with the page table-based approach and the PFN-based
approach, this lruvec-based approach has the following advantages:
1. It offers better choices because it is aware of memcgs, NUMA nodes,
shared mappings and unmapped page cache.
2. It is more scalable because it is O(nr_hot_pages), whereas the
PFN-based approach is O(nr_total_pages).

Add /sys/kernel/debug/lru_gen_full for debugging.

[1] https://dl.acm.org/doi/10.1145/3297858.3304053
[2] https://dl.acm.org/doi/10.1145/3503222.3507731

Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Qi Zheng <zhengqi.arch@bytedance.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I7bb06f14e0a94901a076cc3767d0855d4f1ea3ab
---
include/linux/nodemask.h | 1 +
mm/vmscan.c | 411 ++++++++++++++++++++++++++++++++++++++-
2 files changed, 402 insertions(+), 10 deletions(-)

diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 4b71a96190a8..3a0eec9f2faa 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -493,6 +493,7 @@ static inline int num_node_state(enum node_states state)
#define first_online_node 0
#define first_memory_node 0
#define next_online_node(nid) (MAX_NUMNODES)
+#define next_memory_node(nid) (MAX_NUMNODES)
#define nr_node_ids 1U
#define nr_online_nodes 1U

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0df253819edc..3e7aad06299b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -52,6 +52,7 @@
#include <linux/pagewalk.h>
#include <linux/shmem_fs.h>
#include <linux/ctype.h>
+#include <linux/debugfs.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -4197,12 +4198,40 @@ static void clear_mm_walk(void)
kfree(walk);
}

-static void inc_min_seq(struct lruvec *lruvec, int type)
+static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
{
+ int zone;
+ int remaining = MAX_LRU_BATCH;
struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
+
+ if (type == LRU_GEN_ANON && !can_swap)
+ goto done;
+
+ /* prevent cold/hot inversion if force_scan is true */
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+ struct list_head *head = &lrugen->lists[old_gen][type][zone];
+
+ while (!list_empty(head)) {
+ struct folio *folio = lru_to_folio(head);
+
+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);

+ new_gen = folio_inc_gen(lruvec, folio, false);
+ list_move_tail(&folio->lru, &lrugen->lists[new_gen][type][zone]);
+
+ if (!--remaining)
+ return false;
+ }
+ }
+done:
reset_ctrl_pos(lruvec, type, true);
WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
+
+ return true;
}

static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
@@ -4248,7 +4277,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
return success;
}

-static void inc_max_seq(struct lruvec *lruvec, bool can_swap)
+static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
{
int prev, next;
int type, zone;
@@ -4262,9 +4291,13 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap)
if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
continue;

- VM_WARN_ON_ONCE(type == LRU_GEN_FILE || can_swap);
+ VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap));

- inc_min_seq(lruvec, type);
+ while (!inc_min_seq(lruvec, type, can_swap)) {
+ spin_unlock_irq(&lruvec->lru_lock);
+ cond_resched();
+ spin_lock_irq(&lruvec->lru_lock);
+ }
}

/*
@@ -4301,7 +4334,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap)
}

static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
- struct scan_control *sc, bool can_swap)
+ struct scan_control *sc, bool can_swap, bool force_scan)
{
bool success;
struct lru_gen_mm_walk *walk;
@@ -4322,7 +4355,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
* handful of PTEs. Spreading the work out over a period of time usually
* is less efficient, but it avoids bursty page faults.
*/
- if (!(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
+ if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
success = iterate_mm_list_nowalk(lruvec, max_seq);
goto done;
}
@@ -4336,7 +4369,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
walk->lruvec = lruvec;
walk->max_seq = max_seq;
walk->can_swap = can_swap;
- walk->force_scan = false;
+ walk->force_scan = force_scan;

do {
success = iterate_mm_list(lruvec, walk, &mm);
@@ -4356,7 +4389,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,

VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));

- inc_max_seq(lruvec, can_swap);
+ inc_max_seq(lruvec, can_swap, force_scan);
/* either this sees any waiters or they will see updated max_seq */
if (wq_has_sleeper(&lruvec->mm_state.wait))
wake_up_all(&lruvec->mm_state.wait);
@@ -4452,7 +4485,7 @@ static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned
}

if (need_aging)
- try_to_inc_max_seq(lruvec, max_seq, sc, swappiness);
+ try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false);

return true;
}
@@ -5011,7 +5044,7 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
if (current_is_kswapd())
return 0;

- if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap))
+ if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false))
return nr_to_scan;
done:
return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
@@ -5349,6 +5382,361 @@ static struct attribute_group lru_gen_attr_group = {
.attrs = lru_gen_attrs,
};

+/******************************************************************************
+ * debugfs interface
+ ******************************************************************************/
+
+static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos)
+{
+ struct mem_cgroup *memcg;
+ loff_t nr_to_skip = *pos;
+
+ m->private = kvmalloc(PATH_MAX, GFP_KERNEL);
+ if (!m->private)
+ return ERR_PTR(-ENOMEM);
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY) {
+ if (!nr_to_skip--)
+ return get_lruvec(memcg, nid);
+ }
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+
+ return NULL;
+}
+
+static void lru_gen_seq_stop(struct seq_file *m, void *v)
+{
+ if (!IS_ERR_OR_NULL(v))
+ mem_cgroup_iter_break(NULL, lruvec_memcg(v));
+
+ kvfree(m->private);
+ m->private = NULL;
+}
+
+static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ int nid = lruvec_pgdat(v)->node_id;
+ struct mem_cgroup *memcg = lruvec_memcg(v);
+
+ ++*pos;
+
+ nid = next_memory_node(nid);
+ if (nid == MAX_NUMNODES) {
+ memcg = mem_cgroup_iter(NULL, memcg, NULL);
+ if (!memcg)
+ return NULL;
+
+ nid = first_memory_node;
+ }
+
+ return get_lruvec(memcg, nid);
+}
+
+static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
+ unsigned long max_seq, unsigned long *min_seq,
+ unsigned long seq)
+{
+ int i;
+ int type, tier;
+ int hist = lru_hist_from_seq(seq);
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ for (tier = 0; tier < MAX_NR_TIERS; tier++) {
+ seq_printf(m, " %10d", tier);
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ const char *s = " ";
+ unsigned long n[3] = {};
+
+ if (seq == max_seq) {
+ s = "RT ";
+ n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
+ n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
+ } else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
+ s = "rep";
+ n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+ n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
+ if (tier)
+ n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]);
+ }
+
+ for (i = 0; i < 3; i++)
+ seq_printf(m, " %10lu%c", n[i], s[i]);
+ }
+ seq_putc(m, '\n');
+ }
+
+ seq_puts(m, " ");
+ for (i = 0; i < NR_MM_STATS; i++) {
+ const char *s = " ";
+ unsigned long n = 0;
+
+ if (seq == max_seq && NR_HIST_GENS == 1) {
+ s = "LOYNFA";
+ n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
+ } else if (seq != max_seq && NR_HIST_GENS > 1) {
+ s = "loynfa";
+ n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
+ }
+
+ seq_printf(m, " %10lu%c", n, s[i]);
+ }
+ seq_putc(m, '\n');
+}
+
+static int lru_gen_seq_show(struct seq_file *m, void *v)
+{
+ unsigned long seq;
+ bool full = !debugfs_real_fops(m->file)->write;
+ struct lruvec *lruvec = v;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ int nid = lruvec_pgdat(lruvec)->node_id;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MAX_SEQ(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (nid == first_memory_node) {
+ const char *path = memcg ? m->private : "";
+
+#ifdef CONFIG_MEMCG
+ if (memcg)
+ cgroup_path(memcg->css.cgroup, m->private, PATH_MAX);
+#endif
+ seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path);
+ }
+
+ seq_printf(m, " node %5d\n", nid);
+
+ if (!full)
+ seq = min_seq[LRU_GEN_ANON];
+ else if (max_seq >= MAX_NR_GENS)
+ seq = max_seq - MAX_NR_GENS + 1;
+ else
+ seq = 0;
+
+ for (; seq <= max_seq; seq++) {
+ int type, zone;
+ int gen = lru_gen_from_seq(seq);
+ unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
+
+ seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth));
+
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ unsigned long size = 0;
+ char mark = full && seq < min_seq[type] ? 'x' : ' ';
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
+ size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
+
+ seq_printf(m, " %10lu%c", size, mark);
+ }
+
+ seq_putc(m, '\n');
+
+ if (full)
+ lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq);
+ }
+
+ return 0;
+}
+
+static const struct seq_operations lru_gen_seq_ops = {
+ .start = lru_gen_seq_start,
+ .stop = lru_gen_seq_stop,
+ .next = lru_gen_seq_next,
+ .show = lru_gen_seq_show,
+};
+
+static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
+ bool can_swap, bool force_scan)
+{
+ DEFINE_MAX_SEQ(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (seq < max_seq)
+ return 0;
+
+ if (seq > max_seq)
+ return -EINVAL;
+
+ if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq)
+ return -ERANGE;
+
+ try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan);
+
+ return 0;
+}
+
+static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
+ int swappiness, unsigned long nr_to_reclaim)
+{
+ DEFINE_MAX_SEQ(lruvec);
+
+ if (seq + MIN_NR_GENS > max_seq)
+ return -EINVAL;
+
+ sc->nr_reclaimed = 0;
+
+ while (!signal_pending(current)) {
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (seq < min_seq[!swappiness])
+ return 0;
+
+ if (sc->nr_reclaimed >= nr_to_reclaim)
+ return 0;
+
+ if (!evict_folios(lruvec, sc, swappiness, NULL))
+ return 0;
+
+ cond_resched();
+ }
+
+ return -EINTR;
+}
+
+static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
+ struct scan_control *sc, int swappiness, unsigned long opt)
+{
+ struct lruvec *lruvec;
+ int err = -EINVAL;
+ struct mem_cgroup *memcg = NULL;
+
+ if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY))
+ return -EINVAL;
+
+ if (!mem_cgroup_disabled()) {
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(memcg_id);
+#ifdef CONFIG_MEMCG
+ if (memcg && !css_tryget(&memcg->css))
+ memcg = NULL;
+#endif
+ rcu_read_unlock();
+
+ if (!memcg)
+ return -EINVAL;
+ }
+
+ if (memcg_id != mem_cgroup_id(memcg))
+ goto done;
+
+ lruvec = get_lruvec(memcg, nid);
+
+ if (swappiness < 0)
+ swappiness = get_swappiness(lruvec, sc);
+ else if (swappiness > 200)
+ goto done;
+
+ switch (cmd) {
+ case '+':
+ err = run_aging(lruvec, seq, sc, swappiness, opt);
+ break;
+ case '-':
+ err = run_eviction(lruvec, seq, sc, swappiness, opt);
+ break;
+ }
+done:
+ mem_cgroup_put(memcg);
+
+ return err;
+}
+
+static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
+ size_t len, loff_t *pos)
+{
+ void *buf;
+ char *cur, *next;
+ unsigned int flags;
+ struct blk_plug plug;
+ int err = -EINVAL;
+ struct scan_control sc = {
+ .may_writepage = true,
+ .may_unmap = true,
+ .may_swap = true,
+ .reclaim_idx = MAX_NR_ZONES - 1,
+ .gfp_mask = GFP_KERNEL,
+ };
+
+ buf = kvmalloc(len + 1, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ if (copy_from_user(buf, src, len)) {
+ kvfree(buf);
+ return -EFAULT;
+ }
+
+ set_task_reclaim_state(current, &sc.reclaim_state);
+ flags = memalloc_noreclaim_save();
+ blk_start_plug(&plug);
+ if (!set_mm_walk(NULL)) {
+ err = -ENOMEM;
+ goto done;
+ }
+
+ next = buf;
+ next[len] = '\0';
+
+ while ((cur = strsep(&next, ",;\n"))) {
+ int n;
+ int end;
+ char cmd;
+ unsigned int memcg_id;
+ unsigned int nid;
+ unsigned long seq;
+ unsigned int swappiness = -1;
+ unsigned long opt = -1;
+
+ cur = skip_spaces(cur);
+ if (!*cur)
+ continue;
+
+ n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
+ &seq, &end, &swappiness, &end, &opt, &end);
+ if (n < 4 || cur[end]) {
+ err = -EINVAL;
+ break;
+ }
+
+ err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt);
+ if (err)
+ break;
+ }
+done:
+ clear_mm_walk();
+ blk_finish_plug(&plug);
+ memalloc_noreclaim_restore(flags);
+ set_task_reclaim_state(current, NULL);
+
+ kvfree(buf);
+
+ return err ? : len;
+}
+
+static int lru_gen_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &lru_gen_seq_ops);
+}
+
+static const struct file_operations lru_gen_rw_fops = {
+ .open = lru_gen_seq_open,
+ .read = seq_read,
+ .write = lru_gen_seq_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct file_operations lru_gen_ro_fops = {
+ .open = lru_gen_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
/******************************************************************************
* initialization
******************************************************************************/
@@ -5406,6 +5794,9 @@ static int __init init_lru_gen(void)
if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
pr_err("lru_gen: failed to create sysfs group\n");

+ debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
+ debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
+
return 0;
};
late_initcall(init_lru_gen);
--
2.17.1
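
lru_gen_seq_write() above parses lines of the form 'cmd memcg_id nid seq [swappiness [opt]]'. A minimal sketch of both commands, assuming debugfs is mounted at /sys/kernel/debug; memcg ID 1, node 0 and the sequence numbers are placeholders that should be read from the file's own output:

	# dump generations: a memcg/node header, then one line per
	# generation: seq, age_in_ms, nr_anon_pages, nr_file_pages
	cat /sys/kernel/debug/lru_gen
	# '+' runs the aging, creating generation max_gen_nr+1 (seq 4 is a placeholder)
	echo '+ 1 0 4' >/sys/kernel/debug/lru_gen
	# '-' runs the eviction, evicting generations up to min_gen_nr (here 2)
	echo '- 1 0 2' >/sys/kernel/debug/lru_gen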

@ -0,0 +1,265 @@
From 22199c9b30ffcc332be643577709a2af960e6786 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 23 Jan 2022 16:44:43 -0700
Subject: [PATCH 13/14] mm: multi-gen LRU: admin guide
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add an admin guide.

Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I1902178bcbb5adfa0a748c4d284a6456059bdd7e
---
Documentation/admin-guide/mm/index.rst | 1 +
Documentation/admin-guide/mm/multigen_lru.rst | 162 ++++++++++++++++++
mm/Kconfig | 3 +-
mm/vmscan.c | 4 +
4 files changed, 169 insertions(+), 1 deletion(-)
create mode 100644 Documentation/admin-guide/mm/multigen_lru.rst

diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst
index 1bd11118dfb1..d1064e0ba34a 100644
--- a/Documentation/admin-guide/mm/index.rst
+++ b/Documentation/admin-guide/mm/index.rst
@@ -32,6 +32,7 @@ the Linux memory management.
idle_page_tracking
ksm
memory-hotplug
+ multigen_lru
nommu-mmap
numa_memory_policy
numaperf
diff --git a/Documentation/admin-guide/mm/multigen_lru.rst b/Documentation/admin-guide/mm/multigen_lru.rst
new file mode 100644
index 000000000000..33e068830497
--- /dev/null
+++ b/Documentation/admin-guide/mm/multigen_lru.rst
@@ -0,0 +1,162 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=============
+Multi-Gen LRU
+=============
+The multi-gen LRU is an alternative LRU implementation that optimizes
+page reclaim and improves performance under memory pressure. Page
+reclaim decides the kernel's caching policy and ability to overcommit
+memory. It directly impacts the kswapd CPU usage and RAM efficiency.
+
+Quick start
+===========
+Build the kernel with the following configurations.
+
+* ``CONFIG_LRU_GEN=y``
+* ``CONFIG_LRU_GEN_ENABLED=y``
+
+All set!
+
+Runtime options
+===============
+``/sys/kernel/mm/lru_gen/`` contains stable ABIs described in the
+following subsections.
+
+Kill switch
+-----------
+``enabled`` accepts different values to enable or disable the
+following components. Its default value depends on
+``CONFIG_LRU_GEN_ENABLED``. All the components should be enabled
+unless some of them have unforeseen side effects. Writing to
+``enabled`` has no effect when a component is not supported by the
+hardware, and valid values will be accepted even when the main switch
+is off.
+
+====== ===============================================================
+Values Components
+====== ===============================================================
+0x0001 The main switch for the multi-gen LRU.
+0x0002 Clearing the accessed bit in leaf page table entries in large
+ batches, when MMU sets it (e.g., on x86). This behavior can
+ theoretically worsen lock contention (mmap_lock). If it is
+ disabled, the multi-gen LRU will suffer a minor performance
+ degradation for workloads that contiguously map hot pages,
+ whose accessed bits can be otherwise cleared by fewer larger
+ batches.
+0x0004 Clearing the accessed bit in non-leaf page table entries as
+ well, when MMU sets it (e.g., on x86). This behavior was not
+ verified on x86 varieties other than Intel and AMD. If it is
+ disabled, the multi-gen LRU will suffer a negligible
+ performance degradation.
+[yYnN] Apply to all the components above.
+====== ===============================================================
+
+E.g.,
+::
+
+ echo y >/sys/kernel/mm/lru_gen/enabled
+ cat /sys/kernel/mm/lru_gen/enabled
+ 0x0007
+ echo 5 >/sys/kernel/mm/lru_gen/enabled
+ cat /sys/kernel/mm/lru_gen/enabled
+ 0x0005
+
+Thrashing prevention
+--------------------
+Personal computers are more sensitive to thrashing because it can
+cause janks (lags when rendering UI) and negatively impact user
+experience. The multi-gen LRU offers thrashing prevention to the
+majority of laptop and desktop users who do not have ``oomd``.
+
+Users can write ``N`` to ``min_ttl_ms`` to prevent the working set of
+``N`` milliseconds from getting evicted. The OOM killer is triggered
+if this working set cannot be kept in memory. In other words, this
+option works as an adjustable pressure relief valve, and when open, it
+terminates applications that are hopefully not being used.
+
+Based on the average human detectable lag (~100ms), ``N=1000`` usually
+eliminates intolerable janks due to thrashing. Larger values like
+``N=3000`` make janks less noticeable at the risk of premature OOM
+kills.
+
+The default value ``0`` means disabled.
+
+Experimental features
+=====================
+``/sys/kernel/debug/lru_gen`` accepts commands described in the
+following subsections. Multiple command lines are supported, so does
+concatenation with delimiters ``,`` and ``;``.
+
+``/sys/kernel/debug/lru_gen_full`` provides additional stats for
+debugging. ``CONFIG_LRU_GEN_STATS=y`` keeps historical stats from
+evicted generations in this file.
+
+Working set estimation
+----------------------
+Working set estimation measures how much memory an application needs
+in a given time interval, and it is usually done with little impact on
+the performance of the application. E.g., data centers want to
+optimize job scheduling (bin packing) to improve memory utilizations.
+When a new job comes in, the job scheduler needs to find out whether
+each server it manages can allocate a certain amount of memory for
+this new job before it can pick a candidate. To do so, the job
+scheduler needs to estimate the working sets of the existing jobs.
+
+When it is read, ``lru_gen`` returns a histogram of numbers of pages
+accessed over different time intervals for each memcg and node.
+``MAX_NR_GENS`` decides the number of bins for each histogram. The
+histograms are noncumulative.
+::
+
+ memcg memcg_id memcg_path
+ node node_id
+ min_gen_nr age_in_ms nr_anon_pages nr_file_pages
+ ...
+ max_gen_nr age_in_ms nr_anon_pages nr_file_pages
+
+Each bin contains an estimated number of pages that have been accessed
+within ``age_in_ms``. E.g., ``min_gen_nr`` contains the coldest pages
+and ``max_gen_nr`` contains the hottest pages, since ``age_in_ms`` of
+the former is the largest and that of the latter is the smallest.
+
+Users can write the following command to ``lru_gen`` to create a new
+generation ``max_gen_nr+1``:
+
+ ``+ memcg_id node_id max_gen_nr [can_swap [force_scan]]``
+
+``can_swap`` defaults to the swap setting and, if it is set to ``1``,
+it forces the scan of anon pages when swap is off, and vice versa.
+``force_scan`` defaults to ``1`` and, if it is set to ``0``, it
+employs heuristics to reduce the overhead, which is likely to reduce
+the coverage as well.
+
+A typical use case is that a job scheduler runs this command at a
+certain time interval to create new generations, and it ranks the
+servers it manages based on the sizes of their cold pages defined by
+this time interval.
+
+Proactive reclaim
+-----------------
+Proactive reclaim induces page reclaim when there is no memory
+pressure. It usually targets cold pages only. E.g., when a new job
+comes in, the job scheduler wants to proactively reclaim cold pages on
+the server it selected, to improve the chance of successfully landing
+this new job.
+
+Users can write the following command to ``lru_gen`` to evict
+generations less than or equal to ``min_gen_nr``.
+
+ ``- memcg_id node_id min_gen_nr [swappiness [nr_to_reclaim]]``
+
+``min_gen_nr`` should be less than ``max_gen_nr-1``, since
+``max_gen_nr`` and ``max_gen_nr-1`` are not fully aged (equivalent to
+the active list) and therefore cannot be evicted. ``swappiness``
+overrides the default value in ``/proc/sys/vm/swappiness``.
+``nr_to_reclaim`` limits the number of pages to evict.
+
+A typical use case is that a job scheduler runs this command before it
+tries to land a new job on a server. If it fails to materialize enough
+cold pages because of the overestimation, it retries on the next
+server according to the ranking result obtained from the working set
+estimation step. This less forceful approach limits the impacts on the
+existing jobs.
diff --git a/mm/Kconfig b/mm/Kconfig
index 6c86849c4db9..96cd3ae25c6f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1131,7 +1131,8 @@ config LRU_GEN
# make sure folio->flags has enough spare bits
depends on 64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP
help
- A high performance LRU implementation to overcommit memory.
+ A high performance LRU implementation to overcommit memory. See
+ Documentation/admin-guide/mm/multigen_lru.rst for details.

config LRU_GEN_ENABLED
bool "Enable by default"
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3e7aad06299b..146a54cf1bd9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -5307,6 +5307,7 @@ static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, c
return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
}

+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t len)
{
@@ -5340,6 +5341,7 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c
return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
}

+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t len)
{
@@ -5487,6 +5489,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
seq_putc(m, '\n');
}

+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
static int lru_gen_seq_show(struct seq_file *m, void *v)
{
unsigned long seq;
@@ -5645,6 +5648,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
return err;
}

+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
size_t len, loff_t *pos)
{
--
2.17.1
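
Taken together, the aging and eviction commands support the bin-packing workflow the guide sketches; a hypothetical example where memcg 1, node 0 and the generation numbers stand in for values read from /sys/kernel/debug/lru_gen, with the optional arguments spelled out:

	# working set estimation: age with can_swap=1 and force_scan=1
	echo '+ 1 0 8 1 1' >/sys/kernel/debug/lru_gen
	# proactive reclaim: evict up to 1048576 pages from generations
	# <= 6 at swappiness 50 (all values are placeholders)
	echo '- 1 0 6 50 1048576' >/sys/kernel/debug/lru_gen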

@ -0,0 +1,210 @@
From bd82a74f6b5c0a75ef61be5e9be34319bb17328f Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 6 Mar 2022 20:35:00 -0700
Subject: [PATCH 14/14] mm: multi-gen LRU: design doc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a design doc.

Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I958afcabf5abc37b3e58f72638d35a349c31b98d
---
Documentation/mm/index.rst | 1 +
Documentation/mm/multigen_lru.rst | 159 ++++++++++++++++++++++++++++++
2 files changed, 160 insertions(+)
create mode 100644 Documentation/mm/multigen_lru.rst

diff --git a/Documentation/mm/index.rst b/Documentation/mm/index.rst
index 575ccd40e30c..4aa12b8be278 100644
--- a/Documentation/mm/index.rst
+++ b/Documentation/mm/index.rst
@@ -51,6 +51,7 @@ above structured documentation, or deleted if it has served its purpose.
ksm
memory-model
mmu_notifier
+ multigen_lru
numa
overcommit-accounting
page_migration
diff --git a/Documentation/mm/multigen_lru.rst b/Documentation/mm/multigen_lru.rst
new file mode 100644
index 000000000000..d7062c6a8946
--- /dev/null
+++ b/Documentation/mm/multigen_lru.rst
@@ -0,0 +1,159 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=============
+Multi-Gen LRU
+=============
+The multi-gen LRU is an alternative LRU implementation that optimizes
+page reclaim and improves performance under memory pressure. Page
+reclaim decides the kernel's caching policy and ability to overcommit
+memory. It directly impacts the kswapd CPU usage and RAM efficiency.
+
+Design overview
+===============
+Objectives
+----------
+The design objectives are:
+
+* Good representation of access recency
+* Try to profit from spatial locality
+* Fast paths to make obvious choices
+* Simple self-correcting heuristics
+
+The representation of access recency is at the core of all LRU
+implementations. In the multi-gen LRU, each generation represents a
+group of pages with similar access recency. Generations establish a
+(time-based) common frame of reference and therefore help make better
+choices, e.g., between different memcgs on a computer or different
+computers in a data center (for job scheduling).
+
+Exploiting spatial locality improves efficiency when gathering the
+accessed bit. A rmap walk targets a single page and does not try to
+profit from discovering a young PTE. A page table walk can sweep all
+the young PTEs in an address space, but the address space can be too
+sparse to make a profit. The key is to optimize both methods and use
+them in combination.
+
+Fast paths reduce code complexity and runtime overhead. Unmapped pages
+do not require TLB flushes; clean pages do not require writeback.
+These facts are only helpful when other conditions, e.g., access
+recency, are similar. With generations as a common frame of reference,
+additional factors stand out. But obvious choices might not be good
+choices; thus self-correction is necessary.
+
+The benefits of simple self-correcting heuristics are self-evident.
+Again, with generations as a common frame of reference, this becomes
+attainable. Specifically, pages in the same generation can be
+categorized based on additional factors, and a feedback loop can
+statistically compare the refault percentages across those categories
+and infer which of them are better choices.
+
+Assumptions
+-----------
+The protection of hot pages and the selection of cold pages are based
+on page access channels and patterns. There are two access channels:
+
+* Accesses through page tables
+* Accesses through file descriptors
+
+The protection of the former channel is by design stronger because:
+
+1. The uncertainty in determining the access patterns of the former
+ channel is higher due to the approximation of the accessed bit.
+2. The cost of evicting the former channel is higher due to the TLB
+ flushes required and the likelihood of encountering the dirty bit.
+3. The penalty of underprotecting the former channel is higher because
+ applications usually do not prepare themselves for major page
+ faults like they do for blocked I/O. E.g., GUI applications
+ commonly use dedicated I/O threads to avoid blocking rendering
+ threads.
+
+There are also two access patterns:
+
+* Accesses exhibiting temporal locality
+* Accesses not exhibiting temporal locality
+
+For the reasons listed above, the former channel is assumed to follow
+the former pattern unless ``VM_SEQ_READ`` or ``VM_RAND_READ`` is
+present, and the latter channel is assumed to follow the latter
+pattern unless outlying refaults have been observed.
+
+Workflow overview
+=================
+Evictable pages are divided into multiple generations for each
+``lruvec``. The youngest generation number is stored in
+``lrugen->max_seq`` for both anon and file types as they are aged on
+an equal footing. The oldest generation numbers are stored in
+``lrugen->min_seq[]`` separately for anon and file types as clean file
+pages can be evicted regardless of swap constraints. These three
+variables are monotonically increasing.
+
+Generation numbers are truncated into ``order_base_2(MAX_NR_GENS+1)``
+bits in order to fit into the gen counter in ``folio->flags``. Each
+truncated generation number is an index to ``lrugen->lists[]``. The
+sliding window technique is used to track at least ``MIN_NR_GENS`` and
+at most ``MAX_NR_GENS`` generations. The gen counter stores a value
+within ``[1, MAX_NR_GENS]`` while a page is on one of
+``lrugen->lists[]``; otherwise it stores zero.
+
+Each generation is divided into multiple tiers. A page accessed ``N``
+times through file descriptors is in tier ``order_base_2(N)``. Unlike
+generations, tiers do not have dedicated ``lrugen->lists[]``. In
+contrast to moving across generations, which requires the LRU lock,
+moving across tiers only involves atomic operations on
+``folio->flags`` and therefore has a negligible cost. A feedback loop
+modeled after the PID controller monitors refaults over all the tiers
+from anon and file types and decides which tiers from which types to
+evict or protect.
+
+There are two conceptually independent procedures: the aging and the
+eviction. They form a closed-loop system, i.e., the page reclaim.
+
+Aging
+-----
+The aging produces young generations. Given an ``lruvec``, it
+increments ``max_seq`` when ``max_seq-min_seq+1`` approaches
+``MIN_NR_GENS``. The aging promotes hot pages to the youngest
+generation when it finds them accessed through page tables; the
+demotion of cold pages happens consequently when it increments
+``max_seq``. The aging uses page table walks and rmap walks to find
+young PTEs. For the former, it iterates ``lruvec_memcg()->mm_list``
+and calls ``walk_page_range()`` with each ``mm_struct`` on this list
+to scan PTEs, and after each iteration, it increments ``max_seq``. For
+the latter, when the eviction walks the rmap and finds a young PTE,
+the aging scans the adjacent PTEs. For both, on finding a young PTE,
+the aging clears the accessed bit and updates the gen counter of the
+page mapped by this PTE to ``(max_seq%MAX_NR_GENS)+1``.
+
+Eviction
+--------
+The eviction consumes old generations. Given an ``lruvec``, it
+increments ``min_seq`` when ``lrugen->lists[]`` indexed by
+``min_seq%MAX_NR_GENS`` becomes empty. To select a type and a tier to
+evict from, it first compares ``min_seq[]`` to select the older type.
+If both types are equally old, it selects the one whose first tier has
+a lower refault percentage. The first tier contains single-use
+unmapped clean pages, which are the best bet. The eviction sorts a
+page according to its gen counter if the aging has found this page
+accessed through page tables and updated its gen counter. It also
+moves a page to the next generation, i.e., ``min_seq+1``, if this page
+was accessed multiple times through file descriptors and the feedback
+loop has detected outlying refaults from the tier this page is in. To
+this end, the feedback loop uses the first tier as the baseline, for
+the reason stated earlier.
+
+Summary
+-------
+The multi-gen LRU can be disassembled into the following parts:
+
+* Generations
+* Rmap walks
+* Page table walks
+* Bloom filters
+* PID controller
+
+The aging and the eviction form a producer-consumer model;
+specifically, the latter drives the former by the sliding window over
+generations. Within the aging, rmap walks drive page table walks by
+inserting hot densely populated page tables to the Bloom filters.
+Within the eviction, the PID controller uses refaults as the feedback
+to select types to evict and tiers to protect.
--
2.17.1