treewide: remove kernel 6.0 support

AmadeusGhost 2022-12-14 12:10:38 +08:00
parent 2e543c861a
commit 56de4ccd61
169 changed files with 19 additions and 38070 deletions

View File

@@ -1,2 +0,0 @@
LINUX_VERSION-6.0 = .12
LINUX_KERNEL_HASH-6.0.12 = 89b730edf8942b49e02f9894244205886c9a214d629b35b88c4ff06ee9304f01

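For context: per-series stubs like the two deleted lines above drive the kernel source download — the `.12` suffix is joined to the series number to form the full version (6.0.12), and the hash pins the tarball. A minimal Make sketch of how such stubs are typically consumed (simplified and assumed for illustration; not the verbatim OpenWrt include):

# Sketch only — the composition logic below is an assumption for illustration.
KERNEL_PATCHVER ?= 6.1
LINUX_VERSION := $(KERNEL_PATCHVER)$(strip $(LINUX_VERSION-$(KERNEL_PATCHVER)))
# e.g. "6.1" + ".12" -> "6.1.12"
LINUX_KERNEL_HASH := $(LINUX_KERNEL_HASH-$(LINUX_VERSION))
# an empty LINUX_KERNEL_HASH would fail the download's checksum verification
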
View File

@@ -109,9 +109,9 @@ define KernelPackage/fs-cifs
+kmod-crypto-ccm \
+kmod-crypto-ecb \
+kmod-crypto-des \
- +(LINUX_5_15||LINUX_6_0||LINUX_6_1):kmod-asn1-decoder \
- +(LINUX_5_15||LINUX_6_0||LINUX_6_1):kmod-oid-registry \
- +(LINUX_5_15||LINUX_6_0||LINUX_6_1):kmod-dnsresolver
+ +(LINUX_5_15||LINUX_6_1):kmod-asn1-decoder \
+ +(LINUX_5_15||LINUX_6_1):kmod-oid-registry \
+ +(LINUX_5_15||LINUX_6_1):kmod-dnsresolver
endef
define KernelPackage/fs-cifs/description
@@ -530,7 +530,7 @@ $(eval $(call KernelPackage,fs-ntfs))
define KernelPackage/fs-ntfs3
SUBMENU:=$(FS_MENU)
TITLE:=NTFS3 Read-Write file system support
-DEPENDS:=@(LINUX_5_4||LINUX_5_10||LINUX_5_15||LINUX_6_0||LINUX_6_1) +kmod-nls-base
+DEPENDS:= +kmod-nls-base
KCONFIG:= \
CONFIG_NTFS3_FS \
CONFIG_NTFS3_64BIT_CLUSTER=y \

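Aside on the dependency DSL seen in these hunks (and throughout the kmod changes below): in OpenWrt package Makefiles, `@cond` restricts when a package is selectable, `+pkg` auto-selects a dependency, and `+cond:pkg` selects it only when `cond` holds — so dropping LINUX_6_0 from those expressions is all it takes to retire the 6.0 variants. A hypothetical kmod definition illustrating the three operators (all names made up):

define KernelPackage/example
  SUBMENU:=$(OTHER_MENU)
  TITLE:=Example module
  # @(...)    - only selectable when one of these kernel series is configured
  # +pkg      - unconditional dependency, pulled in automatically
  # +cond:pkg - dependency applied only when cond holds
  DEPENDS:=@(LINUX_5_15||LINUX_6_1) +kmod-lib-crc32c +LINUX_6_1:kmod-example-helper
  KCONFIG:=CONFIG_EXAMPLE
  FILES:=$(LINUX_DIR)/drivers/misc/example.ko
  AUTOLOAD:=$(call AutoProbe,example)
endef
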
View File

@@ -142,7 +142,7 @@ $(eval $(call KernelPackage,mii))
define KernelPackage/mdio-devres
SUBMENU:=$(NETWORK_DEVICES_MENU)
TITLE:=Supports MDIO device registration
-DEPENDS:=@(LINUX_5_10||LINUX_5_15||LINUX_6_0||LINUX_6_1) +kmod-libphy +(TARGET_armvirt||TARGET_bcm27xx_bcm2708||TARGET_tegra):kmod-of-mdio
+DEPENDS:=@(LINUX_5_10||LINUX_5_15||LINUX_6_1) +kmod-libphy +(TARGET_armvirt||TARGET_bcm27xx_bcm2708||TARGET_tegra):kmod-of-mdio
KCONFIG:=CONFIG_MDIO_DEVRES
HIDDEN:=1
FILES:=$(LINUX_DIR)/drivers/net/phy/mdio_devres.ko
@@ -597,7 +597,7 @@ $(eval $(call KernelPackage,8139cp))
define KernelPackage/r8169
SUBMENU:=$(NETWORK_DEVICES_MENU)
TITLE:=RealTek RTL-8169 PCI Gigabit Ethernet Adapter kernel support
-DEPENDS:=@PCI_SUPPORT +kmod-mii +r8169-firmware +kmod-phy-realtek +(LINUX_5_10||LINUX_5_15||LINUX_6_0||LINUX_6_1):kmod-mdio-devres
+DEPENDS:=@PCI_SUPPORT +kmod-mii +r8169-firmware +kmod-phy-realtek +(LINUX_5_10||LINUX_5_15||LINUX_6_1):kmod-mdio-devres
KCONFIG:= \
CONFIG_R8169 \
CONFIG_R8169_NAPI=y \
@@ -723,7 +723,7 @@ $(eval $(call KernelPackage,igbvf))
define KernelPackage/ixgbe
SUBMENU:=$(NETWORK_DEVICES_MENU)
TITLE:=Intel(R) 82598/82599 PCI-Express 10 Gigabit Ethernet support
-DEPENDS:=@PCI_SUPPORT +kmod-mdio +kmod-ptp +kmod-hwmon-core +kmod-libphy +(LINUX_5_10||LINUX_5_15||LINUX_6_0||LINUX_6_1):kmod-mdio-devres
+DEPENDS:=@PCI_SUPPORT +kmod-mdio +kmod-ptp +kmod-hwmon-core +kmod-libphy +(LINUX_5_10||LINUX_5_15||LINUX_6_1):kmod-mdio-devres
KCONFIG:=CONFIG_IXGBE \
CONFIG_IXGBE_VXLAN=n \
CONFIG_IXGBE_HWMON=y \

View File

@@ -1356,7 +1356,7 @@ define KernelPackage/qrtr
SUBMENU:=$(NETWORK_SUPPORT_MENU)
TITLE:=Qualcomm IPC Router support
HIDDEN:=1
-DEPENDS:=@(LINUX_5_15||LINUX_6_0||LINUX_6_1)
+DEPENDS:=@(LINUX_5_15||LINUX_6_1)
KCONFIG:=CONFIG_QRTR
FILES:= \
$(LINUX_DIR)/net/qrtr/qrtr.ko \

View File

@@ -1136,8 +1136,8 @@ $(eval $(call KernelPackage,keys-trusted))
define KernelPackage/tpm
SUBMENU:=$(OTHER_MENU)
TITLE:=TPM Hardware Support
-DEPENDS:= +kmod-random-core +(LINUX_5_15||LINUX_6_0||LINUX_6_1):kmod-asn1-decoder \
- +(LINUX_5_15||LINUX_6_0||LINUX_6_1):kmod-asn1-encoder +(LINUX_5_15||LINUX_6_0||LINUX_6_1):kmod-oid-registry
+DEPENDS:= +kmod-random-core +(LINUX_5_15||LINUX_6_1):kmod-asn1-decoder \
+ +(LINUX_5_15||LINUX_6_1):kmod-asn1-encoder +(LINUX_5_15||LINUX_6_1):kmod-oid-registry
KCONFIG:= CONFIG_TCG_TPM
FILES:= $(LINUX_DIR)/drivers/char/tpm/tpm.ko
AUTOLOAD:=$(call AutoLoad,10,tpm,1)
@@ -1283,7 +1283,7 @@ $(eval $(call KernelPackage,qcom-qmi-helpers))
define KernelPackage/mhi
SUBMENU:=$(OTHER_MENU)
TITLE:=Modem Host Interface (MHI) bus
-DEPENDS:=@(LINUX_5_15||LINUX_6_0||LINUX_6_1)
+DEPENDS:=@(LINUX_5_15||LINUX_6_1)
KCONFIG:=CONFIG_MHI_BUS \
CONFIG_MHI_BUS_DEBUG=y \
CONFIG_MHI_BUS_PCI_GENERIC=n \

View File

@@ -1138,7 +1138,8 @@ $(eval $(call KernelPackage,usb-net-aqc111))
define KernelPackage/usb-net-asix
TITLE:=Kernel module for USB-to-Ethernet Asix convertors
-DEPENDS:=+kmod-libphy +(LINUX_5_15||LINUX_6_0||LINUX_6_1):kmod-mdio-devres
+DEPENDS:=+(LINUX_5_4||LINUX_5_10):kmod-libphy \
+ +(LINUX_5_15||LINUX_6_1):kmod-mdio-devres +LINUX_6_1:kmod-phylink
KCONFIG:=CONFIG_USB_NET_AX8817X
FILES:= \
$(LINUX_DIR)/drivers/$(USBNET_DIR)/asix.ko \

View File

@@ -246,7 +246,7 @@ define KernelPackage/drm
TITLE:=Direct Rendering Manager (DRM) support
HIDDEN:=1
DEPENDS:=+kmod-dma-buf +kmod-i2c-core +kmod-i2c-algo-bit +kmod-backlight \
- +(LINUX_5_15||LINUX_6_0||LINUX_6_1):kmod-fb
+ +(LINUX_5_15||LINUX_6_1):kmod-fb
KCONFIG:= \
CONFIG_DRM \
CONFIG_DRM_PANEL_ORIENTATION_QUIRKS=y \
@@ -268,7 +268,7 @@ $(eval $(call KernelPackage,drm))
define KernelPackage/drm-buddy
SUBMENU:=$(VIDEO_MENU)
TITLE:=A page based buddy allocator
-DEPENDS:=@TARGET_x86 @DISPLAY_SUPPORT +kmod-drm @(LINUX_6_0||LINUX_6_1)
+DEPENDS:=@TARGET_x86 @DISPLAY_SUPPORT +kmod-drm @LINUX_6_1
KCONFIG:=CONFIG_DRM_BUDDY
FILES:= $(LINUX_DIR)/drivers/gpu/drm/drm_buddy.ko
AUTOLOAD:=$(call AutoProbe,drm_buddy)
@@ -313,7 +313,7 @@ $(eval $(call KernelPackage,drm-kms-helper))
define KernelPackage/drm-display-helper
SUBMENU:=$(VIDEO_MENU)
TITLE:=DRM helpers for display adapters drivers
-DEPENDS:=@DISPLAY_SUPPORT +kmod-drm +TARGET_x86:kmod-drm-buddy @(LINUX_6_0||LINUX_6_1)
+DEPENDS:=@DISPLAY_SUPPORT +kmod-drm +TARGET_x86:kmod-drm-buddy @LINUX_6_1
KCONFIG:=CONFIG_DRM_DISPLAY_HELPER
FILES:=$(LINUX_DIR)/drivers/gpu/drm/display/drm_display_helper.ko
AUTOLOAD:=$(call AutoProbe,drm_display_helper)
@@ -330,7 +330,7 @@ define KernelPackage/drm-amdgpu
TITLE:=AMDGPU DRM support
DEPENDS:=@TARGET_x86 @DISPLAY_SUPPORT +kmod-backlight +kmod-drm-ttm \
+kmod-drm-kms-helper +kmod-i2c-algo-bit +amdgpu-firmware \
- +(LINUX_6_0||LINUX_6_1):kmod-drm-display-helper
+ +LINUX_6_1:kmod-drm-display-helper
KCONFIG:=CONFIG_DRM_AMDGPU \
CONFIG_DRM_AMDGPU_SI=y \
CONFIG_DRM_AMDGPU_CIK=y \
@@ -1105,7 +1105,7 @@ define KernelPackage/drm-i915
SUBMENU:=$(VIDEO_MENU)
TITLE:=Intel GPU drm support
DEPENDS:=@TARGET_x86 +kmod-drm-ttm +kmod-drm-kms-helper +i915-firmware \
- +(LINUX_6_0||LINUX_6_1):kmod-drm-display-helper
+ +LINUX_6_1:kmod-drm-display-helper
KCONFIG:= \
CONFIG_INTEL_GTT \
CONFIG_DRM_I915 \

View File

@@ -1,75 +0,0 @@
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1386,16 +1386,6 @@ config BOOT_CONFIG_EMBED_FILE
This bootconfig will be used if there is no initrd or no other
bootconfig in the initrd.
-config INITRAMFS_PRESERVE_MTIME
- bool "Preserve cpio archive mtimes in initramfs"
- default y
- help
- Each entry in an initramfs cpio archive carries an mtime value. When
- enabled, extracted cpio items take this mtime, with directory mtime
- setting deferred until after creation of any child entries.
-
- If unsure, say Y.
-
choice
prompt "Compiler optimization level"
default CC_OPTIMIZE_FOR_PERFORMANCE
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -127,17 +127,15 @@ static void __init free_hash(void)
}
}
-#ifdef CONFIG_INITRAMFS_PRESERVE_MTIME
-static void __init do_utime(char *filename, time64_t mtime)
+static long __init do_utime(char *filename, time64_t mtime)
{
- struct timespec64 t[2] = { { .tv_sec = mtime }, { .tv_sec = mtime } };
- init_utimes(filename, t);
-}
+ struct timespec64 t[2];
-static void __init do_utime_path(const struct path *path, time64_t mtime)
-{
- struct timespec64 t[2] = { { .tv_sec = mtime }, { .tv_sec = mtime } };
- vfs_utimes(path, t);
+ t[0].tv_sec = mtime;
+ t[0].tv_nsec = 0;
+ t[1].tv_sec = mtime;
+ t[1].tv_nsec = 0;
+ return init_utimes(filename, t);
}
static __initdata LIST_HEAD(dir_list);
@@ -170,12 +168,6 @@ static void __init dir_utime(void)
kfree(de);
}
}
-#else
-static void __init do_utime(char *filename, time64_t mtime) {}
-static void __init do_utime_path(const struct path *path, time64_t mtime) {}
-static void __init dir_add(const char *name, time64_t mtime) {}
-static void __init dir_utime(void) {}
-#endif
static __initdata time64_t mtime;
@@ -407,10 +399,14 @@ static int __init do_name(void)
static int __init do_copy(void)
{
if (byte_count >= body_len) {
+ struct timespec64 t[2] = { };
if (xwrite(wfile, victim, body_len, &wfile_pos) != body_len)
error("write error");
- do_utime_path(&wfile->f_path, mtime);
+ t[0].tv_sec = mtime;
+ t[1].tv_sec = mtime;
+ vfs_utimes(&wfile->f_path, t);
+
fput(wfile);
if (csum_present && io_csum != hdr_csum)
error("bad data checksum");

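One detail worth noting in the reverted hunks above: the timespec64 pair handed to init_utimes()/vfs_utimes() is ordered { atime, mtime }, and the initramfs code fills both slots with the cpio header's mtime. A self-contained userspace analogue of do_utime() (illustration only, not kernel code; path and timestamp are made up):

#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <time.h>

/* Userspace analogue of do_utime() above: stamp a path with one epoch
 * value for both atime and mtime, the way the extractor applies the
 * cpio header mtime. */
static int set_archive_mtime(const char *path, time_t mtime)
{
	struct timespec t[2] = {
		{ .tv_sec = mtime, .tv_nsec = 0 },	/* t[0]: atime */
		{ .tv_sec = mtime, .tv_nsec = 0 },	/* t[1]: mtime */
	};
	return utimensat(AT_FDCWD, path, t, 0);
}

int main(void)
{
	if (set_archive_mtime("extracted_file", 1600000000) != 0)
		perror("utimensat");
	return 0;
}
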
View File

@@ -1,122 +0,0 @@
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2133,8 +2133,6 @@ struct net_device {
/* Protocol-specific pointers */
- struct in_device __rcu *ip_ptr;
- struct inet6_dev __rcu *ip6_ptr;
#if IS_ENABLED(CONFIG_VLAN_8021Q)
struct vlan_info __rcu *vlan_info;
#endif
@@ -2147,18 +2145,16 @@ struct net_device {
#if IS_ENABLED(CONFIG_ATALK)
void *atalk_ptr;
#endif
+ struct in_device __rcu *ip_ptr;
#if IS_ENABLED(CONFIG_DECNET)
struct dn_dev __rcu *dn_ptr;
#endif
+ struct inet6_dev __rcu *ip6_ptr;
#if IS_ENABLED(CONFIG_AX25)
void *ax25_ptr;
#endif
-#if IS_ENABLED(CONFIG_CFG80211)
struct wireless_dev *ieee80211_ptr;
-#endif
-#if IS_ENABLED(CONFIG_IEEE802154) || IS_ENABLED(CONFIG_6LOWPAN)
struct wpan_dev *ieee802154_ptr;
-#endif
#if IS_ENABLED(CONFIG_MPLS_ROUTING)
struct mpls_dev __rcu *mpls_ptr;
#endif
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -8379,9 +8379,7 @@ int cfg80211_register_netdevice(struct n
*/
static inline void cfg80211_unregister_netdevice(struct net_device *dev)
{
-#if IS_ENABLED(CONFIG_CFG80211)
cfg80211_unregister_wdev(dev->ieee80211_ptr);
-#endif
}
/**
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -373,7 +373,6 @@ struct wpan_dev {
#define to_phy(_dev) container_of(_dev, struct wpan_phy, dev)
-#if IS_ENABLED(CONFIG_IEEE802154) || IS_ENABLED(CONFIG_6LOWPAN)
static inline int
wpan_dev_hard_header(struct sk_buff *skb, struct net_device *dev,
const struct ieee802154_addr *daddr,
@@ -384,7 +383,6 @@ wpan_dev_hard_header(struct sk_buff *skb
return wpan_dev->header_ops->create(skb, dev, daddr, saddr, len);
}
-#endif
struct wpan_phy *
wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size);
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -308,11 +308,9 @@ static bool batadv_is_cfg80211_netdev(st
if (!net_device)
return false;
-#if IS_ENABLED(CONFIG_CFG80211)
/* cfg80211 drivers have to set ieee80211_ptr */
if (net_device->ieee80211_ptr)
return true;
-#endif
return false;
}
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -747,6 +747,7 @@ static const struct attribute_group nets
.attrs = netstat_attrs,
};
+#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211)
static struct attribute *wireless_attrs[] = {
NULL
};
@@ -755,19 +756,7 @@ static const struct attribute_group wire
.name = "wireless",
.attrs = wireless_attrs,
};
-
-static bool wireless_group_needed(struct net_device *ndev)
-{
-#if IS_ENABLED(CONFIG_CFG80211)
- if (ndev->ieee80211_ptr)
- return true;
#endif
-#if IS_ENABLED(CONFIG_WIRELESS_EXT)
- if (ndev->wireless_handlers)
- return true;
-#endif
- return false;
-}
#else /* CONFIG_SYSFS */
#define net_class_groups NULL
@@ -2008,8 +1997,14 @@ int netdev_register_kobject(struct net_d
*groups++ = &netstat_group;
- if (wireless_group_needed(ndev))
+#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211)
+ if (ndev->ieee80211_ptr)
+ *groups++ = &wireless_group;
+#if IS_ENABLED(CONFIG_WIRELESS_EXT)
+ else if (ndev->wireless_handlers)
*groups++ = &wireless_group;
+#endif
+#endif
#endif /* CONFIG_SYSFS */
error = device_add(dev);

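The guards removed above rely on IS_ENABLED(), which evaluates to a constant 1 when an option is built-in or modular and 0 otherwise, so both the struct members and the code touching them drop out at compile time when the option is unset. A minimal sketch of the pattern (hypothetical option and struct, not from the patch):

/* Hypothetical illustration of the pattern the deleted patch applied to
 * struct net_device: the member exists only when CONFIG_EXAMPLE is set,
 * shrinking the structure in builds that don't need it. */
struct demo_device {
#if IS_ENABLED(CONFIG_EXAMPLE)
	void *example_ptr;	/* compiled out when CONFIG_EXAMPLE is unset */
#endif
};
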
View File

@@ -1,21 +0,0 @@
From 173019b66dcc9d68ad9333aa744dad1e369b5aa8 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Sun, 9 Jul 2017 00:26:53 +0200
Subject: [PATCH 34/34] kernel: add compile fix for linux 4.9 on x86
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
Makefile | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/Makefile
+++ b/Makefile
@@ -537,7 +537,7 @@ KBUILD_LDFLAGS_MODULE :=
KBUILD_LDFLAGS :=
CLANG_FLAGS :=
-export ARCH SRCARCH CONFIG_SHELL BASH HOSTCC KBUILD_HOSTCFLAGS CROSS_COMPILE LD CC HOSTPKG_CONFIG
+export ARCH SRCARCH SUBARCH CONFIG_SHELL BASH HOSTCC KBUILD_HOSTCFLAGS CROSS_COMPILE LD CC HOSTPKG_CONFIG
export CPP AR NM STRIP OBJCOPY OBJDUMP READELF PAHOLE RESOLVE_BTFIDS LEX YACC AWK INSTALLKERNEL
export PERL PYTHON3 CHECK CHECKFLAGS MAKE UTS_MACHINE HOSTCXX
export KGZIP KBZIP2 KLZOP LZMA LZ4 XZ ZSTD

View File

@@ -1,143 +0,0 @@
From e3264035bdac67898d685423ffb2f3a9c3a5964a Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 4 Aug 2021 01:31:34 -0600
Subject: [PATCH 01/14] mm: x86, arm64: add arch_has_hw_pte_young()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Some architectures automatically set the accessed bit in PTEs, e.g.,
x86 and arm64 v8.2. On architectures that do not have this capability,
clearing the accessed bit in a PTE usually triggers a page fault
following the TLB miss of this PTE (to emulate the accessed bit).
Being aware of this capability can help make better decisions, e.g.,
whether to spread the work out over a period of time to reduce bursty
page faults when trying to clear the accessed bit in many PTEs.
Note that theoretically this capability can be unreliable, e.g.,
hotplugged CPUs might be different from builtin ones. Therefore it
should not be used in architecture-independent code that involves
correctness, e.g., to determine whether TLB flushes are required (in
combination with the accessed bit).
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Acked-by: Will Deacon <will@kernel.org>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: Ib49b44fb56df3333a2ff1fcc496fb1980b976e7a
---
arch/arm64/include/asm/pgtable.h | 15 ++-------------
arch/x86/include/asm/pgtable.h | 6 +++---
include/linux/pgtable.h | 13 +++++++++++++
mm/memory.c | 14 +-------------
4 files changed, 19 insertions(+), 29 deletions(-)
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1082,24 +1082,13 @@ static inline void update_mmu_cache(stru
* page after fork() + CoW for pfn mappings. We don't always have a
* hardware-managed access flag on arm64.
*/
-static inline bool arch_faults_on_old_pte(void)
-{
- /* The register read below requires a stable CPU to make any sense */
- cant_migrate();
-
- return !cpu_has_hw_af();
-}
-#define arch_faults_on_old_pte arch_faults_on_old_pte
+#define arch_has_hw_pte_young cpu_has_hw_af
/*
* Experimentally, it's cheap to set the access flag in hardware and we
* benefit from prefaulting mappings as 'old' to start with.
*/
-static inline bool arch_wants_old_prefaulted_pte(void)
-{
- return !arch_faults_on_old_pte();
-}
-#define arch_wants_old_prefaulted_pte arch_wants_old_prefaulted_pte
+#define arch_wants_old_prefaulted_pte cpu_has_hw_af
static inline bool pud_sect_supported(void)
{
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1431,10 +1431,10 @@ static inline bool arch_has_pfn_modify_c
return boot_cpu_has_bug(X86_BUG_L1TF);
}
-#define arch_faults_on_old_pte arch_faults_on_old_pte
-static inline bool arch_faults_on_old_pte(void)
+#define arch_has_hw_pte_young arch_has_hw_pte_young
+static inline bool arch_has_hw_pte_young(void)
{
- return false;
+ return true;
}
#ifdef CONFIG_PAGE_TABLE_CHECK
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -260,6 +260,19 @@ static inline int pmdp_clear_flush_young
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
+#ifndef arch_has_hw_pte_young
+/*
+ * Return whether the accessed bit is supported on the local CPU.
+ *
+ * This stub assumes accessing through an old PTE triggers a page fault.
+ * Architectures that automatically set the access bit should overwrite it.
+ */
+static inline bool arch_has_hw_pte_young(void)
+{
+ return false;
+}
+#endif
+
#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
unsigned long address,
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -125,18 +125,6 @@ int randomize_va_space __read_mostly =
2;
#endif
-#ifndef arch_faults_on_old_pte
-static inline bool arch_faults_on_old_pte(void)
-{
- /*
- * Those arches which don't have hw access flag feature need to
- * implement their own helper. By default, "true" means pagefault
- * will be hit on old pte.
- */
- return true;
-}
-#endif
-
#ifndef arch_wants_old_prefaulted_pte
static inline bool arch_wants_old_prefaulted_pte(void)
{
@@ -2872,7 +2860,7 @@ static inline bool __wp_page_copy_user(s
* On architectures with software "accessed" bits, we would
* take a double page fault, so mark it accessed here.
*/
- if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
+ if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) {
pte_t entry;
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);

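The cost model this capability flips is easy to state in code: with a hardware-managed accessed bit, clearing pte_young() is essentially free; without one, the next access to that page takes a minor fault. A sketch of the decision the commit message describes (hypothetical helper, not part of the patch):

/* Hypothetical consumer of arch_has_hw_pte_young(): decide whether to
 * spread accessed-bit clearing over time to avoid bursty emulated-access
 * page faults, per the commit message above. */
static bool should_throttle_aging(void)
{
	/* No hardware-managed accessed bit: each cleared PTE costs a
	 * page fault on its next access, so pace the work. */
	return !arch_has_hw_pte_young();
}
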
View File

@@ -1,132 +0,0 @@
From 0c0016e6f53b52166fe4da61c81fa6b27f4650cd Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sat, 26 Sep 2020 21:17:18 -0600
Subject: [PATCH 02/14] mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Some architectures support the accessed bit in non-leaf PMD entries,
e.g., x86 sets the accessed bit in a non-leaf PMD entry when using it
as part of linear address translation [1]. Page table walkers that
clear the accessed bit may use this capability to reduce their search
space.
Note that:
1. Although an inline function is preferable, this capability is added
as a configuration option for consistency with the existing macros.
2. Due to the little interest in other varieties, this capability was
only tested on Intel and AMD CPUs.
Thanks to the following developers for their efforts [2][3].
Randy Dunlap <rdunlap@infradead.org>
Stephen Rothwell <sfr@canb.auug.org.au>
[1]: Intel 64 and IA-32 Architectures Software Developer's Manual
Volume 3 (June 2021), section 4.8
[2] https://lore.kernel.org/r/bfdcc7c8-922f-61a9-aa15-7e7250f04af7@infradead.org/
[3] https://lore.kernel.org/r/20220413151513.5a0d7a7e@canb.auug.org.au/
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I1a17be3ae926f721f7b17ea1539e5c39e8c4f9a8
---
arch/Kconfig | 8 ++++++++
arch/x86/Kconfig | 1 +
arch/x86/include/asm/pgtable.h | 3 ++-
arch/x86/mm/pgtable.c | 5 ++++-
include/linux/pgtable.h | 4 ++--
5 files changed, 17 insertions(+), 4 deletions(-)
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1418,6 +1418,14 @@ config DYNAMIC_SIGFRAME
config HAVE_ARCH_NODE_DEV_GROUP
bool
+config ARCH_HAS_NONLEAF_PMD_YOUNG
+ bool
+ help
+ Architectures that select this option are capable of setting the
+ accessed bit in non-leaf PMD entries when using them as part of linear
+ address translations. Page table walkers that clear the accessed bit
+ may use this capability to reduce their search space.
+
source "kernel/gcov/Kconfig"
source "scripts/gcc-plugins/Kconfig"
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -85,6 +85,7 @@ config X86
select ARCH_HAS_PMEM_API if X86_64
select ARCH_HAS_PTE_DEVMAP if X86_64
select ARCH_HAS_PTE_SPECIAL
+ select ARCH_HAS_NONLEAF_PMD_YOUNG if PGTABLE_LEVELS > 2
select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
select ARCH_HAS_COPY_MC if X86_64
select ARCH_HAS_SET_MEMORY
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -815,7 +815,8 @@ static inline unsigned long pmd_page_vad
static inline int pmd_bad(pmd_t pmd)
{
- return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
+ return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) !=
+ (_KERNPG_TABLE & ~_PAGE_ACCESSED);
}
static inline unsigned long pages_to_mb(unsigned long npg)
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -550,7 +550,7 @@ int ptep_test_and_clear_young(struct vm_
return ret;
}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
@@ -562,6 +562,9 @@ int pmdp_test_and_clear_young(struct vm_
return ret;
}
+#endif
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pudp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pud_t *pudp)
{
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -213,7 +213,7 @@ static inline int ptep_test_and_clear_yo
#endif
#ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long address,
pmd_t *pmdp)
@@ -234,7 +234,7 @@ static inline int pmdp_test_and_clear_yo
BUILD_BUG();
return 0;
}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */
#endif
#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH

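The payoff of ARCH_HAS_NONLEAF_PMD_YOUNG is pruning: if the CPU sets the accessed bit in non-leaf PMD entries during translation, a clear bit proves that none of the PTEs underneath were used. A hypothetical walker fragment showing the pruning step (illustration only; age_pmd_range() is made up):

/* Skip an entire PMD's worth of PTEs when its non-leaf accessed bit is
 * clear — on x86 that prunes up to 512 PTE checks per PMD. */
static void age_pmd_range(pmd_t *pmd, unsigned long addr, unsigned long end)
{
	if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && !pmd_young(*pmd))
		return;	/* no PTE below this PMD was accessed */

	/* ...otherwise fall through and scan the PTEs one by one... */
}
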
View File

@@ -1,254 +0,0 @@
From d8e0edcddc441574410a047ede56f79c849a6d37 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 27 Sep 2020 20:49:08 -0600
Subject: [PATCH 03/14] mm/vmscan.c: refactor shrink_node()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This patch refactors shrink_node() to improve readability for the
upcoming changes to mm/vmscan.c.
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: Iae734b5b4030205b7db6e8c841f747b6f6ae1a04
---
mm/vmscan.c | 198 +++++++++++++++++++++++++++-------------------------
1 file changed, 104 insertions(+), 94 deletions(-)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2740,6 +2740,109 @@ enum scan_balance {
SCAN_FILE,
};
+static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
+{
+ unsigned long file;
+ struct lruvec *target_lruvec;
+
+ target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
+
+ /*
+ * Flush the memory cgroup stats, so that we read accurate per-memcg
+ * lruvec stats for heuristics.
+ */
+ mem_cgroup_flush_stats();
+
+ /*
+ * Determine the scan balance between anon and file LRUs.
+ */
+ spin_lock_irq(&target_lruvec->lru_lock);
+ sc->anon_cost = target_lruvec->anon_cost;
+ sc->file_cost = target_lruvec->file_cost;
+ spin_unlock_irq(&target_lruvec->lru_lock);
+
+ /*
+ * Target desirable inactive:active list ratios for the anon
+ * and file LRU lists.
+ */
+ if (!sc->force_deactivate) {
+ unsigned long refaults;
+
+ refaults = lruvec_page_state(target_lruvec,
+ WORKINGSET_ACTIVATE_ANON);
+ if (refaults != target_lruvec->refaults[0] ||
+ inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
+ sc->may_deactivate |= DEACTIVATE_ANON;
+ else
+ sc->may_deactivate &= ~DEACTIVATE_ANON;
+
+ /*
+ * When refaults are being observed, it means a new
+ * workingset is being established. Deactivate to get
+ * rid of any stale active pages quickly.
+ */
+ refaults = lruvec_page_state(target_lruvec,
+ WORKINGSET_ACTIVATE_FILE);
+ if (refaults != target_lruvec->refaults[1] ||
+ inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
+ sc->may_deactivate |= DEACTIVATE_FILE;
+ else
+ sc->may_deactivate &= ~DEACTIVATE_FILE;
+ } else
+ sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
+
+ /*
+ * If we have plenty of inactive file pages that aren't
+ * thrashing, try to reclaim those first before touching
+ * anonymous pages.
+ */
+ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
+ if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
+ sc->cache_trim_mode = 1;
+ else
+ sc->cache_trim_mode = 0;
+
+ /*
+ * Prevent the reclaimer from falling into the cache trap: as
+ * cache pages start out inactive, every cache fault will tip
+ * the scan balance towards the file LRU. And as the file LRU
+ * shrinks, so does the window for rotation from references.
+ * This means we have a runaway feedback loop where a tiny
+ * thrashing file LRU becomes infinitely more attractive than
+ * anon pages. Try to detect this based on file LRU size.
+ */
+ if (!cgroup_reclaim(sc)) {
+ unsigned long total_high_wmark = 0;
+ unsigned long free, anon;
+ int z;
+
+ free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
+ file = node_page_state(pgdat, NR_ACTIVE_FILE) +
+ node_page_state(pgdat, NR_INACTIVE_FILE);
+
+ for (z = 0; z < MAX_NR_ZONES; z++) {
+ struct zone *zone = &pgdat->node_zones[z];
+
+ if (!managed_zone(zone))
+ continue;
+
+ total_high_wmark += high_wmark_pages(zone);
+ }
+
+ /*
+ * Consider anon: if that's low too, this isn't a
+ * runaway file reclaim problem, but rather just
+ * extreme pressure. Reclaim as per usual then.
+ */
+ anon = node_page_state(pgdat, NR_INACTIVE_ANON);
+
+ sc->file_is_tiny =
+ file + free <= total_high_wmark &&
+ !(sc->may_deactivate & DEACTIVATE_ANON) &&
+ anon >> sc->priority;
+ }
+}
+
/*
* Determine how aggressively the anon and file LRU lists should be
* scanned.
@@ -3207,109 +3310,16 @@ static void shrink_node(pg_data_t *pgdat
unsigned long nr_reclaimed, nr_scanned;
struct lruvec *target_lruvec;
bool reclaimable = false;
- unsigned long file;
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
again:
- /*
- * Flush the memory cgroup stats, so that we read accurate per-memcg
- * lruvec stats for heuristics.
- */
- mem_cgroup_flush_stats();
-
memset(&sc->nr, 0, sizeof(sc->nr));
nr_reclaimed = sc->nr_reclaimed;
nr_scanned = sc->nr_scanned;
- /*
- * Determine the scan balance between anon and file LRUs.
- */
- spin_lock_irq(&target_lruvec->lru_lock);
- sc->anon_cost = target_lruvec->anon_cost;
- sc->file_cost = target_lruvec->file_cost;
- spin_unlock_irq(&target_lruvec->lru_lock);
-
- /*
- * Target desirable inactive:active list ratios for the anon
- * and file LRU lists.
- */
- if (!sc->force_deactivate) {
- unsigned long refaults;
-
- refaults = lruvec_page_state(target_lruvec,
- WORKINGSET_ACTIVATE_ANON);
- if (refaults != target_lruvec->refaults[0] ||
- inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
- sc->may_deactivate |= DEACTIVATE_ANON;
- else
- sc->may_deactivate &= ~DEACTIVATE_ANON;
-
- /*
- * When refaults are being observed, it means a new
- * workingset is being established. Deactivate to get
- * rid of any stale active pages quickly.
- */
- refaults = lruvec_page_state(target_lruvec,
- WORKINGSET_ACTIVATE_FILE);
- if (refaults != target_lruvec->refaults[1] ||
- inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
- sc->may_deactivate |= DEACTIVATE_FILE;
- else
- sc->may_deactivate &= ~DEACTIVATE_FILE;
- } else
- sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
-
- /*
- * If we have plenty of inactive file pages that aren't
- * thrashing, try to reclaim those first before touching
- * anonymous pages.
- */
- file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
- if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
- sc->cache_trim_mode = 1;
- else
- sc->cache_trim_mode = 0;
-
- /*
- * Prevent the reclaimer from falling into the cache trap: as
- * cache pages start out inactive, every cache fault will tip
- * the scan balance towards the file LRU. And as the file LRU
- * shrinks, so does the window for rotation from references.
- * This means we have a runaway feedback loop where a tiny
- * thrashing file LRU becomes infinitely more attractive than
- * anon pages. Try to detect this based on file LRU size.
- */
- if (!cgroup_reclaim(sc)) {
- unsigned long total_high_wmark = 0;
- unsigned long free, anon;
- int z;
-
- free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
- file = node_page_state(pgdat, NR_ACTIVE_FILE) +
- node_page_state(pgdat, NR_INACTIVE_FILE);
-
- for (z = 0; z < MAX_NR_ZONES; z++) {
- struct zone *zone = &pgdat->node_zones[z];
- if (!managed_zone(zone))
- continue;
-
- total_high_wmark += high_wmark_pages(zone);
- }
-
- /*
- * Consider anon: if that's low too, this isn't a
- * runaway file reclaim problem, but rather just
- * extreme pressure. Reclaim as per usual then.
- */
- anon = node_page_state(pgdat, NR_INACTIVE_ANON);
-
- sc->file_is_tiny =
- file + free <= total_high_wmark &&
- !(sc->may_deactivate & DEACTIVATE_ANON) &&
- anon >> sc->priority;
- }
+ prepare_scan_count(pgdat, sc);
shrink_node_memcgs(pgdat, sc);

View File

@@ -1,59 +0,0 @@
From bc14d2c7c6d0fb8c79ad0fc5eab488b977cbcccf Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 6 Mar 2022 20:22:40 -0700
Subject: [PATCH 04/14] Revert "include/linux/mm_inline.h: fold
__update_lru_size() into its sole caller"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This patch undoes the following refactor:
commit 289ccba18af4 ("include/linux/mm_inline.h: fold __update_lru_size() into its sole caller")
The upcoming changes to include/linux/mm_inline.h will reuse
__update_lru_size().
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I6155c407d50199a43b179c7f45904d4b7c052118
---
include/linux/mm_inline.h | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -34,7 +34,7 @@ static inline int page_is_file_lru(struc
return folio_is_file_lru(page_folio(page));
}
-static __always_inline void update_lru_size(struct lruvec *lruvec,
+static __always_inline void __update_lru_size(struct lruvec *lruvec,
enum lru_list lru, enum zone_type zid,
long nr_pages)
{
@@ -43,6 +43,13 @@ static __always_inline void update_lru_s
__mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
__mod_zone_page_state(&pgdat->node_zones[zid],
NR_ZONE_LRU_BASE + lru, nr_pages);
+}
+
+static __always_inline void update_lru_size(struct lruvec *lruvec,
+ enum lru_list lru, enum zone_type zid,
+ long nr_pages)
+{
+ __update_lru_size(lruvec, lru, zid, nr_pages);
#ifdef CONFIG_MEMCG
mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
#endif

View File

@@ -1,777 +0,0 @@
From 8c6beb4548c216da9dae5e1a7612a108396e3f9e Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Mon, 25 Jan 2021 21:12:33 -0700
Subject: [PATCH 05/14] mm: multi-gen LRU: groundwork
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Evictable pages are divided into multiple generations for each lruvec.
The youngest generation number is stored in lrugen->max_seq for both
anon and file types as they are aged on an equal footing. The oldest
generation numbers are stored in lrugen->min_seq[] separately for anon
and file types as clean file pages can be evicted regardless of swap
constraints. These three variables are monotonically increasing.
Generation numbers are truncated into order_base_2(MAX_NR_GENS+1) bits
in order to fit into the gen counter in folio->flags. Each truncated
generation number is an index to lrugen->lists[]. The sliding window
technique is used to track at least MIN_NR_GENS and at most
MAX_NR_GENS generations. The gen counter stores a value within [1,
MAX_NR_GENS] while a page is on one of lrugen->lists[]. Otherwise it
stores 0.
There are two conceptually independent procedures: "the aging", which
produces young generations, and "the eviction", which consumes old
generations. They form a closed-loop system, i.e., "the page reclaim".
Both procedures can be invoked from userspace for the purposes of
working set estimation and proactive reclaim. These techniques are
commonly used to optimize job scheduling (bin packing) in data
centers [1][2].
To avoid confusion, the terms "hot" and "cold" will be applied to the
multi-gen LRU, as a new convention; the terms "active" and "inactive"
will be applied to the active/inactive LRU, as usual.
The protection of hot pages and the selection of cold pages are based
on page access channels and patterns. There are two access channels:
one through page tables and the other through file descriptors. The
protection of the former channel is by design stronger because:
1. The uncertainty in determining the access patterns of the former
channel is higher due to the approximation of the accessed bit.
2. The cost of evicting the former channel is higher due to the TLB
flushes required and the likelihood of encountering the dirty bit.
3. The penalty of underprotecting the former channel is higher because
applications usually do not prepare themselves for major page
faults like they do for blocked I/O. E.g., GUI applications
commonly use dedicated I/O threads to avoid blocking rendering
threads.
There are also two access patterns: one with temporal locality and the
other without. For the reasons listed above, the former channel is
assumed to follow the former pattern unless VM_SEQ_READ or
VM_RAND_READ is present; the latter channel is assumed to follow the
latter pattern unless outlying refaults have been observed [3][4].
The next patch will address the "outlying refaults". Three macros,
i.e., LRU_REFS_WIDTH, LRU_REFS_PGOFF and LRU_REFS_MASK, used later are
added in this patch to make the entire patchset less diffy.
A page is added to the youngest generation on faulting. The aging
needs to check the accessed bit at least twice before handing this
page over to the eviction. The first check takes care of the accessed
bit set on the initial fault; the second check makes sure this page
has not been used since then. This protocol, AKA second chance,
requires a minimum of two generations, hence MIN_NR_GENS.
[1] https://dl.acm.org/doi/10.1145/3297858.3304053
[2] https://dl.acm.org/doi/10.1145/3503222.3507731
[3] https://lwn.net/Articles/495543/
[4] https://lwn.net/Articles/815342/
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I71de7cd15b8dfa6f9fdd838023474693c4fee0a7
---
fs/fuse/dev.c | 3 +-
include/linux/mm_inline.h | 175 ++++++++++++++++++++++++++++++
include/linux/mmzone.h | 102 +++++++++++++++++
include/linux/page-flags-layout.h | 13 ++-
include/linux/page-flags.h | 4 +-
include/linux/sched.h | 4 +
kernel/bounds.c | 5 +
mm/Kconfig | 8 ++
mm/huge_memory.c | 3 +-
mm/memcontrol.c | 2 +
mm/memory.c | 25 +++++
mm/mm_init.c | 6 +-
mm/mmzone.c | 2 +
mm/swap.c | 11 +-
mm/vmscan.c | 75 +++++++++++++
15 files changed, 424 insertions(+), 14 deletions(-)
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -776,7 +776,8 @@ static int fuse_check_page(struct page *
1 << PG_active |
1 << PG_workingset |
1 << PG_reclaim |
- 1 << PG_waiters))) {
+ 1 << PG_waiters |
+ LRU_GEN_MASK | LRU_REFS_MASK))) {
dump_page(page, "fuse: trying to steal weird page");
return 1;
}
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -40,6 +40,9 @@ static __always_inline void __update_lru
{
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ lockdep_assert_held(&lruvec->lru_lock);
+ WARN_ON_ONCE(nr_pages != (int)nr_pages);
+
__mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
__mod_zone_page_state(&pgdat->node_zones[zid],
NR_ZONE_LRU_BASE + lru, nr_pages);
@@ -101,11 +104,177 @@ static __always_inline enum lru_list fol
return lru;
}
+#ifdef CONFIG_LRU_GEN
+
+static inline bool lru_gen_enabled(void)
+{
+ return true;
+}
+
+static inline bool lru_gen_in_fault(void)
+{
+ return current->in_lru_fault;
+}
+
+static inline int lru_gen_from_seq(unsigned long seq)
+{
+ return seq % MAX_NR_GENS;
+}
+
+static inline int folio_lru_gen(struct folio *folio)
+{
+ unsigned long flags = READ_ONCE(folio->flags);
+
+ return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+}
+
+static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
+{
+ unsigned long max_seq = lruvec->lrugen.max_seq;
+
+ VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
+
+ /* see the comment on MIN_NR_GENS */
+ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
+}
+
+static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *folio,
+ int old_gen, int new_gen)
+{
+ int type = folio_is_file_lru(folio);
+ int zone = folio_zonenum(folio);
+ int delta = folio_nr_pages(folio);
+ enum lru_list lru = type * LRU_INACTIVE_FILE;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
+ VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
+ VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1);
+
+ if (old_gen >= 0)
+ WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone],
+ lrugen->nr_pages[old_gen][type][zone] - delta);
+ if (new_gen >= 0)
+ WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone],
+ lrugen->nr_pages[new_gen][type][zone] + delta);
+
+ /* addition */
+ if (old_gen < 0) {
+ if (lru_gen_is_active(lruvec, new_gen))
+ lru += LRU_ACTIVE;
+ __update_lru_size(lruvec, lru, zone, delta);
+ return;
+ }
+
+ /* deletion */
+ if (new_gen < 0) {
+ if (lru_gen_is_active(lruvec, old_gen))
+ lru += LRU_ACTIVE;
+ __update_lru_size(lruvec, lru, zone, -delta);
+ return;
+ }
+}
+
+static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
+{
+ unsigned long seq;
+ unsigned long flags;
+ int gen = folio_lru_gen(folio);
+ int type = folio_is_file_lru(folio);
+ int zone = folio_zonenum(folio);
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ VM_WARN_ON_ONCE_FOLIO(gen != -1, folio);
+
+ if (folio_test_unevictable(folio))
+ return false;
+ /*
+ * There are three common cases for this page:
+ * 1. If it's hot, e.g., freshly faulted in or previously hot and
+ * migrated, add it to the youngest generation.
+ * 2. If it's cold but can't be evicted immediately, i.e., an anon page
+ * not in swapcache or a dirty page pending writeback, add it to the
+ * second oldest generation.
+ * 3. Everything else (clean, cold) is added to the oldest generation.
+ */
+ if (folio_test_active(folio))
+ seq = lrugen->max_seq;
+ else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) ||
+ (folio_test_reclaim(folio) &&
+ (folio_test_dirty(folio) || folio_test_writeback(folio))))
+ seq = lrugen->min_seq[type] + 1;
+ else
+ seq = lrugen->min_seq[type];
+
+ gen = lru_gen_from_seq(seq);
+ flags = (gen + 1UL) << LRU_GEN_PGOFF;
+ /* see the comment on MIN_NR_GENS about PG_active */
+ set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags);
+
+ lru_gen_update_size(lruvec, folio, -1, gen);
+ /* for folio_rotate_reclaimable() */
+ if (reclaiming)
+ list_add_tail(&folio->lru, &lrugen->lists[gen][type][zone]);
+ else
+ list_add(&folio->lru, &lrugen->lists[gen][type][zone]);
+
+ return true;
+}
+
+static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
+{
+ unsigned long flags;
+ int gen = folio_lru_gen(folio);
+
+ if (gen < 0)
+ return false;
+
+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
+
+ /* for folio_migrate_flags() */
+ flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0;
+ flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags);
+ gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+
+ lru_gen_update_size(lruvec, folio, gen, -1);
+ list_del(&folio->lru);
+
+ return true;
+}
+
+#else /* !CONFIG_LRU_GEN */
+
+static inline bool lru_gen_enabled(void)
+{
+ return false;
+}
+
+static inline bool lru_gen_in_fault(void)
+{
+ return false;
+}
+
+static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
+{
+ return false;
+}
+
+static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
+{
+ return false;
+}
+
+#endif /* CONFIG_LRU_GEN */
+
static __always_inline
void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio)
{
enum lru_list lru = folio_lru_list(folio);
+ if (lru_gen_add_folio(lruvec, folio, false))
+ return;
+
update_lru_size(lruvec, lru, folio_zonenum(folio),
folio_nr_pages(folio));
if (lru != LRU_UNEVICTABLE)
@@ -123,6 +292,9 @@ void lruvec_add_folio_tail(struct lruvec
{
enum lru_list lru = folio_lru_list(folio);
+ if (lru_gen_add_folio(lruvec, folio, true))
+ return;
+
update_lru_size(lruvec, lru, folio_zonenum(folio),
folio_nr_pages(folio));
/* This is not expected to be used on LRU_UNEVICTABLE */
@@ -140,6 +312,9 @@ void lruvec_del_folio(struct lruvec *lru
{
enum lru_list lru = folio_lru_list(folio);
+ if (lru_gen_del_folio(lruvec, folio, false))
+ return;
+
if (lru != LRU_UNEVICTABLE)
list_del(&folio->lru);
update_lru_size(lruvec, lru, folio_zonenum(folio),
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -314,6 +314,102 @@ enum lruvec_flags {
*/
};
+#endif /* !__GENERATING_BOUNDS_H */
+
+/*
+ * Evictable pages are divided into multiple generations. The youngest and the
+ * oldest generation numbers, max_seq and min_seq, are monotonically increasing.
+ * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An
+ * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the
+ * corresponding generation. The gen counter in folio->flags stores gen+1 while
+ * a page is on one of lrugen->lists[]. Otherwise it stores 0.
+ *
+ * A page is added to the youngest generation on faulting. The aging needs to
+ * check the accessed bit at least twice before handing this page over to the
+ * eviction. The first check takes care of the accessed bit set on the initial
+ * fault; the second check makes sure this page hasn't been used since then.
+ * This process, AKA second chance, requires a minimum of two generations,
+ * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive
+ * LRU, e.g., /proc/vmstat, these two generations are considered active; the
+ * rest of generations, if they exist, are considered inactive. See
+ * lru_gen_is_active().
+ *
+ * PG_active is always cleared while a page is on one of lrugen->lists[] so that
+ * the aging needs not to worry about it. And it's set again when a page
+ * considered active is isolated for non-reclaiming purposes, e.g., migration.
+ * See lru_gen_add_folio() and lru_gen_del_folio().
+ *
+ * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the
+ * number of categories of the active/inactive LRU when keeping track of
+ * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits
+ * in folio->flags.
+ */
+#define MIN_NR_GENS 2U
+#define MAX_NR_GENS 4U
+
+#ifndef __GENERATING_BOUNDS_H
+
+struct lruvec;
+
+#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
+#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
+
+#ifdef CONFIG_LRU_GEN
+
+enum {
+ LRU_GEN_ANON,
+ LRU_GEN_FILE,
+};
+
+/*
+ * The youngest generation number is stored in max_seq for both anon and file
+ * types as they are aged on an equal footing. The oldest generation numbers are
+ * stored in min_seq[] separately for anon and file types as clean file pages
+ * can be evicted regardless of swap constraints.
+ *
+ * Normally anon and file min_seq are in sync. But if swapping is constrained,
+ * e.g., out of swap space, file min_seq is allowed to advance and leave anon
+ * min_seq behind.
+ *
+ * The number of pages in each generation is eventually consistent and therefore
+ * can be transiently negative.
+ */
+struct lru_gen_struct {
+ /* the aging increments the youngest generation number */
+ unsigned long max_seq;
+ /* the eviction increments the oldest generation numbers */
+ unsigned long min_seq[ANON_AND_FILE];
+ /* the multi-gen LRU lists, lazily sorted on eviction */
+ struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+ /* the multi-gen LRU sizes, eventually consistent */
+ long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+};
+
+void lru_gen_init_lruvec(struct lruvec *lruvec);
+
+#ifdef CONFIG_MEMCG
+void lru_gen_init_memcg(struct mem_cgroup *memcg);
+void lru_gen_exit_memcg(struct mem_cgroup *memcg);
+#endif
+
+#else /* !CONFIG_LRU_GEN */
+
+static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
+{
+}
+
+#ifdef CONFIG_MEMCG
+static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
+{
+}
+#endif
+
+#endif /* CONFIG_LRU_GEN */
+
struct lruvec {
struct list_head lists[NR_LRU_LISTS];
/* per lruvec lru_lock for memcg */
@@ -331,6 +427,10 @@ struct lruvec {
unsigned long refaults[ANON_AND_FILE];
/* Various lruvec state flags (enum lruvec_flags) */
unsigned long flags;
+#ifdef CONFIG_LRU_GEN
+ /* evictable pages divided into generations */
+ struct lru_gen_struct lrugen;
+#endif
#ifdef CONFIG_MEMCG
struct pglist_data *pgdat;
#endif
@@ -746,6 +846,8 @@ static inline bool zone_is_empty(struct
#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH)
#define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
+#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
+#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH)
/*
* Define the bit shifts to access each section. For non-existent
--- a/include/linux/page-flags-layout.h
+++ b/include/linux/page-flags-layout.h
@@ -55,7 +55,8 @@
#define SECTIONS_WIDTH 0
#endif
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \
+ <= BITS_PER_LONG - NR_PAGEFLAGS
#define NODES_WIDTH NODES_SHIFT
#elif defined(CONFIG_SPARSEMEM_VMEMMAP)
#error "Vmemmap: No space for nodes field in page flags"
@@ -89,8 +90,8 @@
#define LAST_CPUPID_SHIFT 0
#endif
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \
- <= BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
+ KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
#define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
#else
#define LAST_CPUPID_WIDTH 0
@@ -100,10 +101,12 @@
#define LAST_CPUPID_NOT_IN_PAGE_FLAGS
#endif
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \
- > BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
+ KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
#error "Not enough bits in page flags"
#endif
+#define LRU_REFS_WIDTH 0
+
#endif
#endif /* _LINUX_PAGE_FLAGS_LAYOUT */
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -1058,7 +1058,7 @@ static __always_inline void __ClearPageA
1UL << PG_private | 1UL << PG_private_2 | \
1UL << PG_writeback | 1UL << PG_reserved | \
1UL << PG_slab | 1UL << PG_active | \
- 1UL << PG_unevictable | __PG_MLOCKED)
+ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK)
/*
* Flags checked when a page is prepped for return by the page allocator.
@@ -1069,7 +1069,7 @@ static __always_inline void __ClearPageA
* alloc-free cycle to prevent from reusing the page.
*/
#define PAGE_FLAGS_CHECK_AT_PREP \
- (PAGEFLAGS_MASK & ~__PG_HWPOISON)
+ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)
#define PAGE_FLAGS_PRIVATE \
(1UL << PG_private | 1UL << PG_private_2)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -914,6 +914,10 @@ struct task_struct {
#ifdef CONFIG_MEMCG
unsigned in_user_fault:1;
#endif
+#ifdef CONFIG_LRU_GEN
+ /* whether the LRU algorithm may apply to this access */
+ unsigned in_lru_fault:1;
+#endif
#ifdef CONFIG_COMPAT_BRK
unsigned brk_randomized:1;
#endif
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -22,6 +22,11 @@ int main(void)
DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
#endif
DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
+#ifdef CONFIG_LRU_GEN
+ DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1));
+#else
+ DEFINE(LRU_GEN_WIDTH, 0);
+#endif
/* End of constants */
return 0;
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1124,6 +1124,14 @@ config PTE_MARKER_UFFD_WP
purposes. It is required to enable userfaultfd write protection on
file-backed memory types like shmem and hugetlbfs.
+config LRU_GEN
+ bool "Multi-Gen LRU"
+ depends on MMU
+ # make sure folio->flags has enough spare bits
+ depends on 64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP
+ help
+ A high performance LRU implementation to overcommit memory.
+
source "mm/damon/Kconfig"
endmenu
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2438,7 +2438,8 @@ static void __split_huge_page_tail(struc
#ifdef CONFIG_64BIT
(1L << PG_arch_2) |
#endif
- (1L << PG_dirty)));
+ (1L << PG_dirty) |
+ LRU_GEN_MASK | LRU_REFS_MASK));
/* ->mapping in first tail page is compound_mapcount */
VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5170,6 +5170,7 @@ static void __mem_cgroup_free(struct mem
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
+ lru_gen_exit_memcg(memcg);
memcg_wb_domain_exit(memcg);
__mem_cgroup_free(memcg);
}
@@ -5228,6 +5229,7 @@ static struct mem_cgroup *mem_cgroup_all
memcg->deferred_split_queue.split_queue_len = 0;
#endif
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
+ lru_gen_init_memcg(memcg);
return memcg;
fail:
mem_cgroup_id_remove(memcg);
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5110,6 +5110,27 @@ static inline void mm_account_fault(stru
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
}
+#ifdef CONFIG_LRU_GEN
+static void lru_gen_enter_fault(struct vm_area_struct *vma)
+{
+ /* the LRU algorithm doesn't apply to sequential or random reads */
+ current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ));
+}
+
+static void lru_gen_exit_fault(void)
+{
+ current->in_lru_fault = false;
+}
+#else
+static void lru_gen_enter_fault(struct vm_area_struct *vma)
+{
+}
+
+static void lru_gen_exit_fault(void)
+{
+}
+#endif /* CONFIG_LRU_GEN */
+
/*
* By the time we get here, we already hold the mm semaphore
*
@@ -5141,11 +5162,15 @@ vm_fault_t handle_mm_fault(struct vm_are
if (flags & FAULT_FLAG_USER)
mem_cgroup_enter_user_fault();
+ lru_gen_enter_fault(vma);
+
if (unlikely(is_vm_hugetlb_page(vma)))
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
else
ret = __handle_mm_fault(vma, address, flags);
+ lru_gen_exit_fault();
+
if (flags & FAULT_FLAG_USER) {
mem_cgroup_exit_user_fault();
/*
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layo
shift = 8 * sizeof(unsigned long);
width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
+ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
SECTIONS_WIDTH,
NODES_WIDTH,
ZONES_WIDTH,
LAST_CPUPID_WIDTH,
KASAN_TAG_WIDTH,
+ LRU_GEN_WIDTH,
+ LRU_REFS_WIDTH,
NR_PAGEFLAGS);
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -88,6 +88,8 @@ void lruvec_init(struct lruvec *lruvec)
* Poison its list head, so that any operations on it would crash.
*/
list_del(&lruvec->lists[LRU_UNEVICTABLE]);
+
+ lru_gen_init_lruvec(lruvec);
}
#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -484,6 +484,11 @@ void folio_add_lru(struct folio *folio)
folio_test_unevictable(folio), folio);
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+ /* see the comment in lru_gen_add_folio() */
+ if (lru_gen_enabled() && !folio_test_unevictable(folio) &&
+ lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
+ folio_set_active(folio);
+
folio_get(folio);
local_lock(&cpu_fbatches.lock);
fbatch = this_cpu_ptr(&cpu_fbatches.lru_add);
@@ -575,7 +580,7 @@ static void lru_deactivate_file_fn(struc
static void lru_deactivate_fn(struct lruvec *lruvec, struct folio *folio)
{
- if (folio_test_active(folio) && !folio_test_unevictable(folio)) {
+ if (!folio_test_unevictable(folio) && (folio_test_active(folio) || lru_gen_enabled())) {
long nr_pages = folio_nr_pages(folio);
lruvec_del_folio(lruvec, folio);
@@ -688,8 +693,8 @@ void deactivate_page(struct page *page)
{
struct folio *folio = page_folio(page);
- if (folio_test_lru(folio) && folio_test_active(folio) &&
- !folio_test_unevictable(folio)) {
+ if (folio_test_lru(folio) && !folio_test_unevictable(folio) &&
+ (folio_test_active(folio) || lru_gen_enabled())) {
struct folio_batch *fbatch;
folio_get(folio);
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3062,6 +3062,81 @@ static bool can_age_anon_pages(struct pg
return can_demote(pgdat->node_id, sc);
}
+#ifdef CONFIG_LRU_GEN
+
+/******************************************************************************
+ * shorthand helpers
+ ******************************************************************************/
+
+#define for_each_gen_type_zone(gen, type, zone) \
+ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
+ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
+ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
+
+static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int nid)
+{
+ struct pglist_data *pgdat = NODE_DATA(nid);
+
+#ifdef CONFIG_MEMCG
+ if (memcg) {
+ struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
+
+ /* for hotadd_new_pgdat() */
+ if (!lruvec->pgdat)
+ lruvec->pgdat = pgdat;
+
+ return lruvec;
+ }
+#endif
+ VM_WARN_ON_ONCE(!mem_cgroup_disabled());
+
+ return pgdat ? &pgdat->__lruvec : NULL;
+}
+
+/******************************************************************************
+ * initialization
+ ******************************************************************************/
+
+void lru_gen_init_lruvec(struct lruvec *lruvec)
+{
+ int gen, type, zone;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ lrugen->max_seq = MIN_NR_GENS + 1;
+
+ for_each_gen_type_zone(gen, type, zone)
+ INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
+}
+
+#ifdef CONFIG_MEMCG
+void lru_gen_init_memcg(struct mem_cgroup *memcg)
+{
+}
+
+void lru_gen_exit_memcg(struct mem_cgroup *memcg)
+{
+ int nid;
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
+ sizeof(lruvec->lrugen.nr_pages)));
+ }
+}
+#endif
+
+static int __init init_lru_gen(void)
+{
+ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
+ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
+
+ return 0;
+};
+late_initcall(init_lru_gen);
+
+#endif /* CONFIG_LRU_GEN */
+
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];

View File

@ -1,476 +0,0 @@
From 93fa87bdef9e7fa9977355c4712c000f31639231 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Thu, 27 Jan 2022 20:43:22 -0700
Subject: [PATCH 07/14] mm: multi-gen LRU: exploit locality in rmap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Searching the rmap for PTEs mapping each page on an LRU list (to test
and clear the accessed bit) can be expensive because pages from
different VMAs (PA space) are not cache friendly to the rmap (VA
space). For workloads mostly using mapped pages, searching the rmap
can incur the highest CPU cost in the reclaim path.
This patch exploits spatial locality to reduce the trips into the
rmap. When shrink_page_list() walks the rmap and finds a young PTE, a
new function lru_gen_look_around() scans at most BITS_PER_LONG-1
adjacent PTEs. On finding another young PTE, it clears the accessed
bit and updates the gen counter of the page mapped by this PTE to
(max_seq%MAX_NR_GENS)+1.
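For reference, the seq-to-gen mapping and the page-flag encoding behind that formula come from helpers defined earlier in the series (not shown in this hunk); a minimal sketch:

static inline int lru_gen_from_seq(unsigned long seq)
{
        /* generations are a sliding window over seq */
        return seq % MAX_NR_GENS;
}

static inline int folio_lru_gen(struct folio *folio)
{
        unsigned long flags = READ_ONCE(folio->flags);

        /* gen is stored as gen + 1 in the flags, so -1 means "not tracked" */
        return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
}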
Server benchmark results:
  Single workload:
    fio (buffered I/O): no change
  Single workload:
    memcached (anon): +[3, 5]%
                Ops/sec      KB/sec
      patch1-6: 1106168.46   43025.04
      patch1-7: 1147696.57   44640.29
  Configurations:
    no change
Client benchmark results:
  kswapd profiles:
    patch1-6
      39.03% lzo1x_1_do_compress (real work)
      18.47% page_vma_mapped_walk (overhead)
       6.74% _raw_spin_unlock_irq
       3.97% do_raw_spin_lock
       2.49% ptep_clear_flush
       2.48% anon_vma_interval_tree_iter_first
       1.92% folio_referenced_one
       1.88% __zram_bvec_write
       1.48% memmove
       1.31% vma_interval_tree_iter_next
    patch1-7
      48.16% lzo1x_1_do_compress (real work)
       8.20% page_vma_mapped_walk (overhead)
       7.06% _raw_spin_unlock_irq
       2.92% ptep_clear_flush
       2.53% __zram_bvec_write
       2.11% do_raw_spin_lock
       2.02% memmove
       1.93% lru_gen_look_around
       1.56% free_unref_page_list
       1.40% memset
  Configurations:
    no change
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Barry Song <baohua@kernel.org>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I4b9ca0fd20f566ce554e703f14cee3fe0048c2fd
---
include/linux/memcontrol.h | 31 +++++++
include/linux/mm.h | 5 +
include/linux/mmzone.h | 6 ++
mm/internal.h | 1 +
mm/memcontrol.c | 1 +
mm/rmap.c | 6 ++
mm/swap.c | 4 +-
mm/vmscan.c | 184 +++++++++++++++++++++++++++++++++++++
8 files changed, 236 insertions(+), 2 deletions(-)
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -444,6 +444,7 @@ static inline struct obj_cgroup *__folio
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
+ * - mem_cgroup_trylock_pages()
*
* For a kmem folio a caller should hold an rcu read lock to protect memcg
* associated with a kmem folio from being released.
@@ -505,6 +506,7 @@ static inline struct mem_cgroup *folio_m
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
+ * - mem_cgroup_trylock_pages()
*
* For a kmem page a caller should hold an rcu read lock to protect memcg
* associated with a kmem page from being released.
@@ -959,6 +961,23 @@ void unlock_page_memcg(struct page *page
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val);
+/* try to stabilize folio_memcg() for all the pages in a memcg */
+static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
+{
+ rcu_read_lock();
+
+ if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account))
+ return true;
+
+ rcu_read_unlock();
+ return false;
+}
+
+static inline void mem_cgroup_unlock_pages(void)
+{
+ rcu_read_unlock();
+}
+
/* idx can be of type enum memcg_stat_item or node_stat_item */
static inline void mod_memcg_state(struct mem_cgroup *memcg,
int idx, int val)
@@ -1433,6 +1452,18 @@ static inline void folio_memcg_unlock(st
{
}
+static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
+{
+ /* to match folio_memcg_rcu() */
+ rcu_read_lock();
+ return true;
+}
+
+static inline void mem_cgroup_unlock_pages(void)
+{
+ rcu_read_unlock();
+}
+
static inline void mem_cgroup_handle_over_high(void)
{
}
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1465,6 +1465,11 @@ static inline unsigned long folio_pfn(st
return page_to_pfn(&folio->page);
}
+static inline struct folio *pfn_folio(unsigned long pfn)
+{
+ return page_folio(pfn_to_page(pfn));
+}
+
static inline atomic_t *folio_pincount_ptr(struct folio *folio)
{
return &folio_page(folio, 1)->compound_pincount;
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -372,6 +372,7 @@ enum lruvec_flags {
#ifndef __GENERATING_BOUNDS_H
struct lruvec;
+struct page_vma_mapped_walk;
#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
@@ -427,6 +428,7 @@ struct lru_gen_struct {
};
void lru_gen_init_lruvec(struct lruvec *lruvec);
+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
#ifdef CONFIG_MEMCG
void lru_gen_init_memcg(struct mem_cgroup *memcg);
@@ -439,6 +441,10 @@ static inline void lru_gen_init_lruvec(s
{
}
+static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+{
+}
+
#ifdef CONFIG_MEMCG
static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -83,6 +83,7 @@ vm_fault_t do_swap_page(struct vm_fault
void folio_rotate_reclaimable(struct folio *folio);
bool __folio_end_writeback(struct folio *folio);
void deactivate_file_folio(struct folio *folio);
+void folio_activate(struct folio *folio);
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2789,6 +2789,7 @@ static void commit_charge(struct folio *
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
+ * - mem_cgroup_trylock_pages()
*/
folio->memcg_data = (unsigned long)memcg;
}
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -833,6 +833,12 @@ static bool folio_referenced_one(struct
}
if (pvmw.pte) {
+ if (lru_gen_enabled() && pte_young(*pvmw.pte) &&
+ !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) {
+ lru_gen_look_around(&pvmw);
+ referenced++;
+ }
+
if (ptep_clear_flush_young_notify(vma, address,
pvmw.pte)) {
/*
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -366,7 +366,7 @@ static void folio_activate_drain(int cpu
folio_batch_move_lru(fbatch, folio_activate_fn);
}
-static void folio_activate(struct folio *folio)
+void folio_activate(struct folio *folio)
{
if (folio_test_lru(folio) && !folio_test_active(folio) &&
!folio_test_unevictable(folio)) {
@@ -385,7 +385,7 @@ static inline void folio_activate_drain(
{
}
-static void folio_activate(struct folio *folio)
+void folio_activate(struct folio *folio)
{
struct lruvec *lruvec;
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1635,6 +1635,11 @@ retry:
if (!sc->may_unmap && folio_mapped(folio))
goto keep_locked;
+ /* folio_update_gen() tried to promote this page? */
+ if (lru_gen_enabled() && !ignore_references &&
+ folio_mapped(folio) && folio_test_referenced(folio))
+ goto keep_locked;
+
/*
* The number of dirty pages determines if a node is marked
* reclaim_congested. kswapd will stall and start writing
@@ -3231,6 +3236,29 @@ static bool positive_ctrl_err(struct ctr
* the aging
******************************************************************************/
+/* promote pages accessed through page tables */
+static int folio_update_gen(struct folio *folio, int gen)
+{
+ unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
+
+ VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
+ VM_WARN_ON_ONCE(!rcu_read_lock_held());
+
+ do {
+ /* lru_gen_del_folio() has isolated this page? */
+ if (!(old_flags & LRU_GEN_MASK)) {
+ /* for shrink_page_list() */
+ new_flags = old_flags | BIT(PG_referenced);
+ continue;
+ }
+
+ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
+ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
+ } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
+
+ return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+}
+
/* protect pages accessed multiple times through file descriptors */
static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
@@ -3242,6 +3270,11 @@ static int folio_inc_gen(struct lruvec *
VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio);
do {
+ new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+ /* folio_update_gen() has promoted this page? */
+ if (new_gen >= 0 && new_gen != old_gen)
+ return new_gen;
+
new_gen = (old_gen + 1) % MAX_NR_GENS;
new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
@@ -3256,6 +3289,43 @@ static int folio_inc_gen(struct lruvec *
return new_gen;
}
+static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
+{
+ unsigned long pfn = pte_pfn(pte);
+
+ VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
+
+ if (!pte_present(pte) || is_zero_pfn(pfn))
+ return -1;
+
+ if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
+ return -1;
+
+ if (WARN_ON_ONCE(!pfn_valid(pfn)))
+ return -1;
+
+ return pfn;
+}
+
+static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
+ struct pglist_data *pgdat)
+{
+ struct folio *folio;
+
+ /* try to avoid unnecessary memory loads */
+ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+ return NULL;
+
+ folio = pfn_folio(pfn);
+ if (folio_nid(folio) != pgdat->node_id)
+ return NULL;
+
+ if (folio_memcg_rcu(folio) != memcg)
+ return NULL;
+
+ return folio;
+}
+
static void inc_min_seq(struct lruvec *lruvec, int type)
{
struct lru_gen_struct *lrugen = &lruvec->lrugen;
@@ -3455,6 +3525,114 @@ static void lru_gen_age_node(struct pgli
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
}
+/*
+ * This function exploits spatial locality when shrink_page_list() walks the
+ * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages.
+ */
+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+{
+ int i;
+ pte_t *pte;
+ unsigned long start;
+ unsigned long end;
+ unsigned long addr;
+ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
+ struct folio *folio = pfn_folio(pvmw->pfn);
+ struct mem_cgroup *memcg = folio_memcg(folio);
+ struct pglist_data *pgdat = folio_pgdat(folio);
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ DEFINE_MAX_SEQ(lruvec);
+ int old_gen, new_gen = lru_gen_from_seq(max_seq);
+
+ lockdep_assert_held(pvmw->ptl);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
+
+ if (spin_is_contended(pvmw->ptl))
+ return;
+
+ start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start);
+ end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
+
+ if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
+ if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
+ end = start + MIN_LRU_BATCH * PAGE_SIZE;
+ else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2)
+ start = end - MIN_LRU_BATCH * PAGE_SIZE;
+ else {
+ start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2;
+ end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2;
+ }
+ }
+
+ pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE;
+
+ rcu_read_lock();
+ arch_enter_lazy_mmu_mode();
+
+ for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
+ unsigned long pfn;
+
+ pfn = get_pte_pfn(pte[i], pvmw->vma, addr);
+ if (pfn == -1)
+ continue;
+
+ if (!pte_young(pte[i]))
+ continue;
+
+ folio = get_pfn_folio(pfn, memcg, pgdat);
+ if (!folio)
+ continue;
+
+ if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
+ VM_WARN_ON_ONCE(true);
+
+ if (pte_dirty(pte[i]) && !folio_test_dirty(folio) &&
+ !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_test_swapcache(folio)))
+ folio_mark_dirty(folio);
+
+ old_gen = folio_lru_gen(folio);
+ if (old_gen < 0)
+ folio_set_referenced(folio);
+ else if (old_gen != new_gen)
+ __set_bit(i, bitmap);
+ }
+
+ arch_leave_lazy_mmu_mode();
+ rcu_read_unlock();
+
+ if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
+ for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
+ folio = pfn_folio(pte_pfn(pte[i]));
+ folio_activate(folio);
+ }
+ return;
+ }
+
+ /* folio_update_gen() requires stable folio_memcg() */
+ if (!mem_cgroup_trylock_pages(memcg))
+ return;
+
+ spin_lock_irq(&lruvec->lru_lock);
+ new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
+
+ for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
+ folio = pfn_folio(pte_pfn(pte[i]));
+ if (folio_memcg_rcu(folio) != memcg)
+ continue;
+
+ old_gen = folio_update_gen(folio, new_gen);
+ if (old_gen < 0 || old_gen == new_gen)
+ continue;
+
+ lru_gen_update_size(lruvec, folio, old_gen, new_gen);
+ }
+
+ spin_unlock_irq(&lruvec->lru_lock);
+
+ mem_cgroup_unlock_pages();
+}
+
/******************************************************************************
* the eviction
******************************************************************************/
@@ -3491,6 +3669,12 @@ static bool sort_folio(struct lruvec *lr
return true;
}
+ /* promoted */
+ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
+ list_move(&folio->lru, &lrugen->lists[gen][type][zone]);
+ return true;
+ }
+
/* protected */
if (tier > tier_idx) {
int hist = lru_hist_from_seq(lrugen->min_seq[type]);

View File

@ -1,290 +0,0 @@
From 6b9670b94ba2b49b289b997121062500e32fc3e4 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Thu, 27 Jan 2022 19:59:54 -0700
Subject: [PATCH 09/14] mm: multi-gen LRU: optimize multiple memcgs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
When multiple memcgs are available, it is possible to use generations
as a frame of reference to make better choices and improve overall
performance under global memory pressure. This patch adds a basic
optimization to select memcgs that can drop single-use unmapped clean
pages first. Doing so reduces the chance of going into the aging path
or swapping, which can be costly.
A typical example that benefits from this optimization is a server
running mixed types of workloads, e.g., heavy anon workload in one
memcg and heavy buffered I/O workload in the other.
Though this optimization can be applied to both kswapd and direct
reclaim, it is only added to kswapd to keep the patchset manageable.
Later improvements may cover the direct reclaim path.
While ensuring certain fairness to all eligible memcgs, proportional
scans of individual memcgs also require proper backoff to avoid
overshooting their aggregate reclaim target by too much. Otherwise it
can cause high direct reclaim latency. The conditions for backoff are:
1. At low priorities, for direct reclaim, if aging fairness or direct
reclaim latency is at risk, i.e., aging one memcg multiple times or
swapping after the target is met.
2. At high priorities, for global reclaim, if per-zone free pages are
above respective watermarks.
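A condensed sketch of the direct-reclaim side of this backoff; the complete logic, including the per-zone watermark check, is should_abort_scan() in the hunk below, and DEFINE_MAX_SEQ() is a helper macro from earlier in the series:

static bool sketch_abort_direct_reclaim(struct lruvec *lruvec, unsigned long seq,
                                        struct scan_control *sc, bool need_swapping)
{
        DEFINE_MAX_SEQ(lruvec);

        /* fairness: age each memcg at most once per direct reclaim pass */
        if (max_seq - seq > 1)
                return true;

        /* latency: stop swapping once the reclaim target has been met */
        if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
                return true;

        return false;
}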
Server benchmark results:
  Mixed workloads:
    fio (buffered I/O): +[19, 21]%
                IOPS         BW
      patch1-8: 1880k        7343MiB/s
      patch1-9: 2252k        8796MiB/s
    memcached (anon): +[119, 123]%
                Ops/sec      KB/sec
      patch1-8: 862768.65    33514.68
      patch1-9: 1911022.12   74234.54
  Mixed workloads:
    fio (buffered I/O): +[75, 77]%
                IOPS         BW
      5.19-rc1: 1279k        4996MiB/s
      patch1-9: 2252k        8796MiB/s
    memcached (anon): +[13, 15]%
                Ops/sec      KB/sec
      5.19-rc1: 1673524.04   65008.87
      patch1-9: 1911022.12   74234.54
  Configurations:
    (changes since patch 6)
    cat mixed.sh
    modprobe brd rd_nr=2 rd_size=56623104
    swapoff -a
    mkswap /dev/ram0
    swapon /dev/ram0
    mkfs.ext4 /dev/ram1
    mount -t ext4 /dev/ram1 /mnt
    memtier_benchmark -S /var/run/memcached/memcached.sock \
      -P memcache_binary -n allkeys --key-minimum=1 \
      --key-maximum=50000000 --key-pattern=P:P -c 1 -t 36 \
      --ratio 1:0 --pipeline 8 -d 2000
    fio -name=mglru --numjobs=36 --directory=/mnt --size=1408m \
      --buffered=1 --ioengine=io_uring --iodepth=128 \
      --iodepth_batch_submit=32 --iodepth_batch_complete=32 \
      --rw=randread --random_distribution=random --norandommap \
      --time_based --ramp_time=10m --runtime=90m --group_reporting &
    pid=$!
    sleep 200
    memtier_benchmark -S /var/run/memcached/memcached.sock \
      -P memcache_binary -n allkeys --key-minimum=1 \
      --key-maximum=50000000 --key-pattern=R:R -c 1 -t 36 \
      --ratio 0:1 --pipeline 8 --randomize --distinct-client-seed
    kill -INT $pid
    wait
Client benchmark results:
  no change (CONFIG_MEMCG=n)
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I7e00e0c733437e534ac98031cf8154a681becc00
---
mm/vmscan.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 95 insertions(+), 9 deletions(-)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -131,6 +131,12 @@ struct scan_control {
/* Always discard instead of demoting to lower tier memory */
unsigned int no_demotion:1;
+#ifdef CONFIG_LRU_GEN
+ /* help kswapd make better choices among multiple memcgs */
+ unsigned int memcgs_need_aging:1;
+ unsigned long last_reclaimed;
+#endif
+
/* Allocation order */
s8 order;
@@ -4441,6 +4447,19 @@ static void lru_gen_age_node(struct pgli
VM_WARN_ON_ONCE(!current_is_kswapd());
+ sc->last_reclaimed = sc->nr_reclaimed;
+
+ /*
+ * To reduce the chance of going into the aging path, which can be
+ * costly, optimistically skip it if the flag below was cleared in the
+ * eviction path. This improves the overall performance when multiple
+ * memcgs are available.
+ */
+ if (!sc->memcgs_need_aging) {
+ sc->memcgs_need_aging = true;
+ return;
+ }
+
set_mm_walk(pgdat);
memcg = mem_cgroup_iter(NULL, NULL, NULL);
@@ -4852,7 +4871,8 @@ static int isolate_folios(struct lruvec
return scanned;
}
-static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
+static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+ bool *need_swapping)
{
int type;
int scanned;
@@ -4915,6 +4935,9 @@ static int evict_folios(struct lruvec *l
sc->nr_reclaimed += reclaimed;
+ if (need_swapping && type == LRU_GEN_ANON)
+ *need_swapping = true;
+
return scanned;
}
@@ -4924,9 +4947,8 @@ static int evict_folios(struct lruvec *l
* reclaim.
*/
static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
- bool can_swap)
+ bool can_swap, bool *need_aging)
{
- bool need_aging;
unsigned long nr_to_scan;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
@@ -4936,8 +4958,8 @@ static unsigned long get_nr_to_scan(stru
(mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
return 0;
- need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
- if (!need_aging)
+ *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
+ if (!*need_aging)
return nr_to_scan;
/* skip the aging path at the default priority */
@@ -4954,10 +4976,67 @@ done:
return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
}
+static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq,
+ struct scan_control *sc, bool need_swapping)
+{
+ int i;
+ DEFINE_MAX_SEQ(lruvec);
+
+ if (!current_is_kswapd()) {
+ /* age each memcg at most once to ensure fairness */
+ if (max_seq - seq > 1)
+ return true;
+
+ /* over-swapping can increase allocation latency */
+ if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
+ return true;
+
+ /* give this thread a chance to exit and free its memory */
+ if (fatal_signal_pending(current)) {
+ sc->nr_reclaimed += MIN_LRU_BATCH;
+ return true;
+ }
+
+ if (cgroup_reclaim(sc))
+ return false;
+ } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim)
+ return false;
+
+ /* keep scanning at low priorities to ensure fairness */
+ if (sc->priority > DEF_PRIORITY - 2)
+ return false;
+
+ /*
+ * A minimum amount of work was done under global memory pressure. For
+ * kswapd, it may be overshooting. For direct reclaim, the allocation
+ * may succeed if all suitable zones are somewhat safe. In either case,
+ * it's better to stop now, and restart later if necessary.
+ */
+ for (i = 0; i <= sc->reclaim_idx; i++) {
+ unsigned long wmark;
+ struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
+
+ if (!managed_zone(zone))
+ continue;
+
+ wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone);
+ if (wmark > zone_page_state(zone, NR_FREE_PAGES))
+ return false;
+ }
+
+ sc->nr_reclaimed += MIN_LRU_BATCH;
+
+ return true;
+}
+
static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
struct blk_plug plug;
+ bool need_aging = false;
+ bool need_swapping = false;
unsigned long scanned = 0;
+ unsigned long reclaimed = sc->nr_reclaimed;
+ DEFINE_MAX_SEQ(lruvec);
lru_add_drain();
@@ -4977,21 +5056,28 @@ static void lru_gen_shrink_lruvec(struct
else
swappiness = 0;
- nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
if (!nr_to_scan)
- break;
+ goto done;
- delta = evict_folios(lruvec, sc, swappiness);
+ delta = evict_folios(lruvec, sc, swappiness, &need_swapping);
if (!delta)
- break;
+ goto done;
scanned += delta;
if (scanned >= nr_to_scan)
break;
+ if (should_abort_scan(lruvec, max_seq, sc, need_swapping))
+ break;
+
cond_resched();
}
+ /* see the comment in lru_gen_age_node() */
+ if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
+ sc->memcgs_need_aging = false;
+done:
clear_mm_walk();
blk_finish_plug(&plug);

View File

@ -1,475 +0,0 @@
From ef61bb3622ee0f36e055dfd5006badff08f5ce61 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Thu, 27 Jan 2022 19:52:09 -0700
Subject: [PATCH 10/14] mm: multi-gen LRU: kill switch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Add /sys/kernel/mm/lru_gen/enabled as a kill switch. Components that
can be disabled include:
  0x0001: the multi-gen LRU core
  0x0002: walking page tables, when arch_has_hw_pte_young() returns true
  0x0004: clearing the accessed bit in non-leaf PMD entries, when
          CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y
  [yYnN]: apply to all the components above
E.g.,
  echo y >/sys/kernel/mm/lru_gen/enabled
  cat /sys/kernel/mm/lru_gen/enabled
  0x0007
  echo 5 >/sys/kernel/mm/lru_gen/enabled
  cat /sys/kernel/mm/lru_gen/enabled
  0x0005
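To make the hex values concrete: each component is one bit of a capability mask, per the enum this patch adds to mmzone.h. A short illustration (the caps variable is only for demonstration):

enum {
        LRU_GEN_CORE,           /* BIT(0) = 0x0001 */
        LRU_GEN_MM_WALK,        /* BIT(1) = 0x0002 */
        LRU_GEN_NONLEAF_YOUNG,  /* BIT(2) = 0x0004 */
        NR_LRU_GEN_CAPS
};

/* the "echo 5" example above: core + non-leaf clearing, page table walks off */
unsigned int caps = BIT(LRU_GEN_CORE) | BIT(LRU_GEN_NONLEAF_YOUNG);     /* 0x0005 */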
NB: the page table walks happen on the scale of seconds under heavy
memory pressure, in which case the mmap_lock contention is a lesser
concern, compared with the LRU lock contention and the I/O congestion.
So far the only well-known case of the mmap_lock contention happens on
Android, due to Scudo [1] which allocates several thousand VMAs for
merely a few hundred MBs. The SPF and the Maple Tree also have
provided their own assessments [2][3]. However, if walking page tables
does worsen the mmap_lock contention, the kill switch can be used to
disable it. In this case the multi-gen LRU will suffer a minor
performance degradation, as shown previously.
Clearing the accessed bit in non-leaf PMD entries can also be
disabled, since this behavior was not tested on x86 varieties other
than Intel and AMD.
[1] https://source.android.com/devices/tech/debug/scudo
[2] https://lore.kernel.org/r/20220128131006.67712-1-michel@lespinasse.org/
[3] https://lore.kernel.org/r/20220426150616.3937571-1-Liam.Howlett@oracle.com/
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I4c909618e8fed7fb1337f6624bbe542ec920a515
---
include/linux/cgroup.h | 15 ++-
include/linux/mm_inline.h | 15 ++-
include/linux/mmzone.h | 9 ++
kernel/cgroup/cgroup-internal.h | 1 -
mm/Kconfig | 6 +
mm/vmscan.c | 228 +++++++++++++++++++++++++++++++-
6 files changed, 265 insertions(+), 9 deletions(-)
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgr
css_put(&cgrp->self);
}
+extern struct mutex cgroup_mutex;
+
+static inline void cgroup_lock(void)
+{
+ mutex_lock(&cgroup_mutex);
+}
+
+static inline void cgroup_unlock(void)
+{
+ mutex_unlock(&cgroup_mutex);
+}
+
/**
* task_css_set_check - obtain a task's css_set with extra access conditions
* @task: the task to obtain css_set for
@@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgr
* as locks used during the cgroup_subsys::attach() methods.
*/
#ifdef CONFIG_PROVE_RCU
-extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
#define task_css_set_check(task, __c) \
rcu_dereference_check((task)->cgroups, \
@@ -708,6 +719,8 @@ struct cgroup;
static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
static inline void css_get(struct cgroup_subsys_state *css) {}
static inline void css_put(struct cgroup_subsys_state *css) {}
+static inline void cgroup_lock(void) {}
+static inline void cgroup_unlock(void) {}
static inline int cgroup_attach_task_all(struct task_struct *from,
struct task_struct *t) { return 0; }
static inline int cgroupstats_build(struct cgroupstats *stats,
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -106,10 +106,21 @@ static __always_inline enum lru_list fol
#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_GEN_ENABLED
static inline bool lru_gen_enabled(void)
{
- return true;
+ DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]);
+
+ return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]);
+}
+#else
+static inline bool lru_gen_enabled(void)
+{
+ DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]);
+
+ return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]);
}
+#endif
static inline bool lru_gen_in_fault(void)
{
@@ -222,7 +233,7 @@ static inline bool lru_gen_add_folio(str
VM_WARN_ON_ONCE_FOLIO(gen != -1, folio);
- if (folio_test_unevictable(folio))
+ if (folio_test_unevictable(folio) || !lrugen->enabled)
return false;
/*
* There are three common cases for this page:
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -384,6 +384,13 @@ enum {
LRU_GEN_FILE,
};
+enum {
+ LRU_GEN_CORE,
+ LRU_GEN_MM_WALK,
+ LRU_GEN_NONLEAF_YOUNG,
+ NR_LRU_GEN_CAPS
+};
+
#define MIN_LRU_BATCH BITS_PER_LONG
#define MAX_LRU_BATCH (MIN_LRU_BATCH * 64)
@@ -425,6 +432,8 @@ struct lru_gen_struct {
/* can be modified without holding the LRU lock */
atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
+ /* whether the multi-gen LRU is enabled */
+ bool enabled;
};
enum {
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -164,7 +164,6 @@ struct cgroup_mgctx {
#define DEFINE_CGROUP_MGCTX(name) \
struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
-extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
extern struct cgroup_subsys *cgroup_subsys[];
extern struct list_head cgroup_roots;
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1133,6 +1133,12 @@ config LRU_GEN
help
A high performance LRU implementation to overcommit memory.
+config LRU_GEN_ENABLED
+ bool "Enable by default"
+ depends on LRU_GEN
+ help
+ This option enables the multi-gen LRU by default.
+
config LRU_GEN_STATS
bool "Full stats for debugging"
depends on LRU_GEN
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -51,6 +51,7 @@
#include <linux/psi.h>
#include <linux/pagewalk.h>
#include <linux/shmem_fs.h>
+#include <linux/ctype.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -3082,6 +3083,14 @@ static bool can_age_anon_pages(struct pg
#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_GEN_ENABLED
+DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS);
+#define get_cap(cap) static_branch_likely(&lru_gen_caps[cap])
+#else
+DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
+#define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap])
+#endif
+
/******************************************************************************
* shorthand helpers
******************************************************************************/
@@ -3958,7 +3967,8 @@ static void walk_pmd_range_locked(pud_t
goto next;
if (!pmd_trans_huge(pmd[i])) {
- if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
+ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
+ get_cap(LRU_GEN_NONLEAF_YOUNG))
pmdp_test_and_clear_young(vma, addr, pmd + i);
goto next;
}
@@ -4056,10 +4066,12 @@ restart:
walk->mm_stats[MM_NONLEAF_TOTAL]++;
#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
- if (!pmd_young(val))
- continue;
+ if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
+ if (!pmd_young(val))
+ continue;
- walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
+ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
+ }
#endif
if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
continue;
@@ -4321,7 +4333,7 @@ static bool try_to_inc_max_seq(struct lr
* handful of PTEs. Spreading the work out over a period of time usually
* is less efficient, but it avoids bursty page faults.
*/
- if (!arch_has_hw_pte_young()) {
+ if (!(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
success = iterate_mm_list_nowalk(lruvec, max_seq);
goto done;
}
@@ -5084,6 +5096,208 @@ done:
}
/******************************************************************************
+ * state change
+ ******************************************************************************/
+
+static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
+{
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ if (lrugen->enabled) {
+ enum lru_list lru;
+
+ for_each_evictable_lru(lru) {
+ if (!list_empty(&lruvec->lists[lru]))
+ return false;
+ }
+ } else {
+ int gen, type, zone;
+
+ for_each_gen_type_zone(gen, type, zone) {
+ if (!list_empty(&lrugen->lists[gen][type][zone]))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static bool fill_evictable(struct lruvec *lruvec)
+{
+ enum lru_list lru;
+ int remaining = MAX_LRU_BATCH;
+
+ for_each_evictable_lru(lru) {
+ int type = is_file_lru(lru);
+ bool active = is_active_lru(lru);
+ struct list_head *head = &lruvec->lists[lru];
+
+ while (!list_empty(head)) {
+ bool success;
+ struct folio *folio = lru_to_folio(head);
+
+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio) != active, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_lru_gen(folio) != -1, folio);
+
+ lruvec_del_folio(lruvec, folio);
+ success = lru_gen_add_folio(lruvec, folio, false);
+ VM_WARN_ON_ONCE(!success);
+
+ if (!--remaining)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static bool drain_evictable(struct lruvec *lruvec)
+{
+ int gen, type, zone;
+ int remaining = MAX_LRU_BATCH;
+
+ for_each_gen_type_zone(gen, type, zone) {
+ struct list_head *head = &lruvec->lrugen.lists[gen][type][zone];
+
+ while (!list_empty(head)) {
+ bool success;
+ struct folio *folio = lru_to_folio(head);
+
+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
+
+ success = lru_gen_del_folio(lruvec, folio, false);
+ VM_WARN_ON_ONCE(!success);
+ lruvec_add_folio(lruvec, folio);
+
+ if (!--remaining)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static void lru_gen_change_state(bool enabled)
+{
+ static DEFINE_MUTEX(state_mutex);
+
+ struct mem_cgroup *memcg;
+
+ cgroup_lock();
+ cpus_read_lock();
+ get_online_mems();
+ mutex_lock(&state_mutex);
+
+ if (enabled == lru_gen_enabled())
+ goto unlock;
+
+ if (enabled)
+ static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
+ else
+ static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ int nid;
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ if (!lruvec)
+ continue;
+
+ spin_lock_irq(&lruvec->lru_lock);
+
+ VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
+ VM_WARN_ON_ONCE(!state_is_valid(lruvec));
+
+ lruvec->lrugen.enabled = enabled;
+
+ while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) {
+ spin_unlock_irq(&lruvec->lru_lock);
+ cond_resched();
+ spin_lock_irq(&lruvec->lru_lock);
+ }
+
+ spin_unlock_irq(&lruvec->lru_lock);
+ }
+
+ cond_resched();
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+unlock:
+ mutex_unlock(&state_mutex);
+ put_online_mems();
+ cpus_read_unlock();
+ cgroup_unlock();
+}
+
+/******************************************************************************
+ * sysfs interface
+ ******************************************************************************/
+
+static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ unsigned int caps = 0;
+
+ if (get_cap(LRU_GEN_CORE))
+ caps |= BIT(LRU_GEN_CORE);
+
+ if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
+ caps |= BIT(LRU_GEN_MM_WALK);
+
+ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG))
+ caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
+
+ return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
+}
+
+static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ int i;
+ unsigned int caps;
+
+ if (tolower(*buf) == 'n')
+ caps = 0;
+ else if (tolower(*buf) == 'y')
+ caps = -1;
+ else if (kstrtouint(buf, 0, &caps))
+ return -EINVAL;
+
+ for (i = 0; i < NR_LRU_GEN_CAPS; i++) {
+ bool enabled = caps & BIT(i);
+
+ if (i == LRU_GEN_CORE)
+ lru_gen_change_state(enabled);
+ else if (enabled)
+ static_branch_enable(&lru_gen_caps[i]);
+ else
+ static_branch_disable(&lru_gen_caps[i]);
+ }
+
+ return len;
+}
+
+static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
+ enabled, 0644, show_enabled, store_enabled
+);
+
+static struct attribute *lru_gen_attrs[] = {
+ &lru_gen_enabled_attr.attr,
+ NULL
+};
+
+static struct attribute_group lru_gen_attr_group = {
+ .name = "lru_gen",
+ .attrs = lru_gen_attrs,
+};
+
+/******************************************************************************
* initialization
******************************************************************************/
@@ -5093,6 +5307,7 @@ void lru_gen_init_lruvec(struct lruvec *
struct lru_gen_struct *lrugen = &lruvec->lrugen;
lrugen->max_seq = MIN_NR_GENS + 1;
+ lrugen->enabled = lru_gen_enabled();
for_each_gen_type_zone(gen, type, zone)
INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
@@ -5132,6 +5347,9 @@ static int __init init_lru_gen(void)
BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
+ if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
+ pr_err("lru_gen: failed to create sysfs group\n");
+
return 0;
};
late_initcall(init_lru_gen);

View File

@ -1,202 +0,0 @@
From 9d92c76fb8ac09ff195024139575d8c4db66b672 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Thu, 27 Jan 2022 20:08:50 -0700
Subject: [PATCH 11/14] mm: multi-gen LRU: thrashing prevention
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Add /sys/kernel/mm/lru_gen/min_ttl_ms for thrashing prevention, as
requested by many desktop users [1].
When set to value N, it prevents the working set of N milliseconds
from getting evicted. The OOM killer is triggered if this working set
cannot be kept in memory. Based on the average human detectable lag
(~100ms), N=1000 usually eliminates intolerable lags due to thrashing.
Larger values like N=3000 make lags less noticeable at the risk of
premature OOM kills.
Compared with the size-based approach [2], this time-based approach
has the following advantages:
1. It is easier to configure because it is agnostic to applications
and memory sizes.
2. It is more reliable because it is directly wired to the OOM killer.
[1] https://lore.kernel.org/r/Ydza%2FzXKY9ATRoh6@google.com/
[2] https://lore.kernel.org/r/20101028191523.GA14972@google.com/
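Condensed from age_lruvec() in the hunk below, the protection test compares the birth time of the oldest file generation against min_ttl (a sketch, not the full function):

/* inside age_lruvec(); min_ttl and the timestamps are in jiffies */
if (min_ttl) {
        int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
        unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);

        /* the oldest generation is still younger than min_ttl: keep it */
        if (time_is_after_jiffies(birth + min_ttl))
                return false;
}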
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I007499d7e47374b59fd620e8c3962940bc9f788e
---
include/linux/mmzone.h | 2 ++
mm/vmscan.c | 74 ++++++++++++++++++++++++++++++++++++++++--
2 files changed, 73 insertions(+), 3 deletions(-)
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -419,6 +419,8 @@ struct lru_gen_struct {
unsigned long max_seq;
/* the eviction increments the oldest generation numbers */
unsigned long min_seq[ANON_AND_FILE];
+ /* the birth time of each generation in jiffies */
+ unsigned long timestamps[MAX_NR_GENS];
/* the multi-gen LRU lists, lazily sorted on eviction */
struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
/* the multi-gen LRU sizes, eventually consistent */
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4305,6 +4305,7 @@ static void inc_max_seq(struct lruvec *l
for (type = 0; type < ANON_AND_FILE; type++)
reset_ctrl_pos(lruvec, type, false);
+ WRITE_ONCE(lrugen->timestamps[next], jiffies);
/* make sure preceding modifications appear */
smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
@@ -4432,7 +4433,7 @@ static bool should_run_aging(struct lruv
return false;
}
-static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl)
{
bool need_aging;
unsigned long nr_to_scan;
@@ -4446,16 +4447,36 @@ static void age_lruvec(struct lruvec *lr
mem_cgroup_calculate_protection(NULL, memcg);
if (mem_cgroup_below_min(memcg))
- return;
+ return false;
need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
+
+ if (min_ttl) {
+ int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
+ unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
+
+ if (time_is_after_jiffies(birth + min_ttl))
+ return false;
+
+ /* the size is likely too small to be helpful */
+ if (!nr_to_scan && sc->priority != DEF_PRIORITY)
+ return false;
+ }
+
if (need_aging)
try_to_inc_max_seq(lruvec, max_seq, sc, swappiness);
+
+ return true;
}
+/* to protect the working set of the last N jiffies */
+static unsigned long lru_gen_min_ttl __read_mostly;
+
static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
{
struct mem_cgroup *memcg;
+ bool success = false;
+ unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
VM_WARN_ON_ONCE(!current_is_kswapd());
@@ -4478,12 +4499,32 @@ static void lru_gen_age_node(struct pgli
do {
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
- age_lruvec(lruvec, sc);
+ if (age_lruvec(lruvec, sc, min_ttl))
+ success = true;
cond_resched();
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
clear_mm_walk();
+
+ /* check the order to exclude compaction-induced reclaim */
+ if (success || !min_ttl || sc->order)
+ return;
+
+ /*
+ * The main goal is to OOM kill if every generation from all memcgs is
+ * younger than min_ttl. However, another possibility is all memcgs are
+ * either below min or empty.
+ */
+ if (mutex_trylock(&oom_lock)) {
+ struct oom_control oc = {
+ .gfp_mask = sc->gfp_mask,
+ };
+
+ out_of_memory(&oc);
+
+ mutex_unlock(&oom_lock);
+ }
}
/*
@@ -5240,6 +5281,28 @@ unlock:
* sysfs interface
******************************************************************************/
+static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
+}
+
+static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ unsigned int msecs;
+
+ if (kstrtouint(buf, 0, &msecs))
+ return -EINVAL;
+
+ WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs));
+
+ return len;
+}
+
+static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR(
+ min_ttl_ms, 0644, show_min_ttl, store_min_ttl
+);
+
static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
unsigned int caps = 0;
@@ -5288,6 +5351,7 @@ static struct kobj_attribute lru_gen_ena
);
static struct attribute *lru_gen_attrs[] = {
+ &lru_gen_min_ttl_attr.attr,
&lru_gen_enabled_attr.attr,
NULL
};
@@ -5303,12 +5367,16 @@ static struct attribute_group lru_gen_at
void lru_gen_init_lruvec(struct lruvec *lruvec)
{
+ int i;
int gen, type, zone;
struct lru_gen_struct *lrugen = &lruvec->lrugen;
lrugen->max_seq = MIN_NR_GENS + 1;
lrugen->enabled = lru_gen_enabled();
+ for (i = 0; i <= MIN_NR_GENS + 1; i++)
+ lrugen->timestamps[i] = jiffies;
+
for_each_gen_type_zone(gen, type, zone)
INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);

View File

@ -1,557 +0,0 @@
From d1e0e5fcdea16d4ceead496a0ea2fdbb6bc5bfe4 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Thu, 27 Jan 2022 20:12:41 -0700
Subject: [PATCH 12/14] mm: multi-gen LRU: debugfs interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Add /sys/kernel/debug/lru_gen for working set estimation and proactive
reclaim. These techniques are commonly used to optimize job scheduling
(bin packing) in data centers [1][2].
Compared with the page table-based approach and the PFN-based
approach, this lruvec-based approach has the following advantages:
1. It offers better choices because it is aware of memcgs, NUMA nodes,
shared mappings and unmapped page cache.
2. It is more scalable because it is O(nr_hot_pages), whereas the
PFN-based approach is O(nr_total_pages).
Add /sys/kernel/debug/lru_gen_full for debugging.
[1] https://dl.acm.org/doi/10.1145/3297858.3304053
[2] https://dl.acm.org/doi/10.1145/3503222.3507731
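A user-space sketch of driving the aging path through this interface; the memcg ID, node ID and sequence number below are placeholders that should be read from /sys/kernel/debug/lru_gen on the target system first, and the field order follows the sscanf() in lru_gen_seq_write() in the hunk below:

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/sys/kernel/debug/lru_gen", "w");

        if (!f) {
                perror("lru_gen");
                return 1;
        }
        /* '+' runs aging: "+ memcg_id nid max_seq [can_swap [force_scan]]" */
        if (fprintf(f, "+ 1 0 7 1 1\n") < 0)
                perror("write");
        return fclose(f) ? 1 : 0;
}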
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Qi Zheng <zhengqi.arch@bytedance.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I7bb06f14e0a94901a076cc3767d0855d4f1ea3ab
---
include/linux/nodemask.h | 1 +
mm/vmscan.c | 411 ++++++++++++++++++++++++++++++++++++++-
2 files changed, 402 insertions(+), 10 deletions(-)
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -493,6 +493,7 @@ static inline int num_node_state(enum no
#define first_online_node 0
#define first_memory_node 0
#define next_online_node(nid) (MAX_NUMNODES)
+#define next_memory_node(nid) (MAX_NUMNODES)
#define nr_node_ids 1U
#define nr_online_nodes 1U
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -52,6 +52,7 @@
#include <linux/pagewalk.h>
#include <linux/shmem_fs.h>
#include <linux/ctype.h>
+#include <linux/debugfs.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -4209,12 +4210,40 @@ static void clear_mm_walk(void)
kfree(walk);
}
-static void inc_min_seq(struct lruvec *lruvec, int type)
+static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
{
+ int zone;
+ int remaining = MAX_LRU_BATCH;
struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
+
+ if (type == LRU_GEN_ANON && !can_swap)
+ goto done;
+
+ /* prevent cold/hot inversion if force_scan is true */
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+ struct list_head *head = &lrugen->lists[old_gen][type][zone];
+
+ while (!list_empty(head)) {
+ struct folio *folio = lru_to_folio(head);
+
+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
+ new_gen = folio_inc_gen(lruvec, folio, false);
+ list_move_tail(&folio->lru, &lrugen->lists[new_gen][type][zone]);
+
+ if (!--remaining)
+ return false;
+ }
+ }
+done:
reset_ctrl_pos(lruvec, type, true);
WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
+
+ return true;
}
static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
@@ -4260,7 +4289,7 @@ next:
return success;
}
-static void inc_max_seq(struct lruvec *lruvec, bool can_swap)
+static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
{
int prev, next;
int type, zone;
@@ -4274,9 +4303,13 @@ static void inc_max_seq(struct lruvec *l
if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
continue;
- VM_WARN_ON_ONCE(type == LRU_GEN_FILE || can_swap);
+ VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap));
- inc_min_seq(lruvec, type);
+ while (!inc_min_seq(lruvec, type, can_swap)) {
+ spin_unlock_irq(&lruvec->lru_lock);
+ cond_resched();
+ spin_lock_irq(&lruvec->lru_lock);
+ }
}
/*
@@ -4313,7 +4346,7 @@ static void inc_max_seq(struct lruvec *l
}
static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
- struct scan_control *sc, bool can_swap)
+ struct scan_control *sc, bool can_swap, bool force_scan)
{
bool success;
struct lru_gen_mm_walk *walk;
@@ -4334,7 +4367,7 @@ static bool try_to_inc_max_seq(struct lr
* handful of PTEs. Spreading the work out over a period of time usually
* is less efficient, but it avoids bursty page faults.
*/
- if (!(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
+ if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
success = iterate_mm_list_nowalk(lruvec, max_seq);
goto done;
}
@@ -4348,7 +4381,7 @@ static bool try_to_inc_max_seq(struct lr
walk->lruvec = lruvec;
walk->max_seq = max_seq;
walk->can_swap = can_swap;
- walk->force_scan = false;
+ walk->force_scan = force_scan;
do {
success = iterate_mm_list(lruvec, walk, &mm);
@@ -4368,7 +4401,7 @@ done:
VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
- inc_max_seq(lruvec, can_swap);
+ inc_max_seq(lruvec, can_swap, force_scan);
/* either this sees any waiters or they will see updated max_seq */
if (wq_has_sleeper(&lruvec->mm_state.wait))
wake_up_all(&lruvec->mm_state.wait);
@@ -4464,7 +4497,7 @@ static bool age_lruvec(struct lruvec *lr
}
if (need_aging)
- try_to_inc_max_seq(lruvec, max_seq, sc, swappiness);
+ try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false);
return true;
}
@@ -5023,7 +5056,7 @@ static unsigned long get_nr_to_scan(stru
if (current_is_kswapd())
return 0;
- if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap))
+ if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false))
return nr_to_scan;
done:
return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
@@ -5362,6 +5395,361 @@ static struct attribute_group lru_gen_at
};
/******************************************************************************
+ * debugfs interface
+ ******************************************************************************/
+
+static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos)
+{
+ struct mem_cgroup *memcg;
+ loff_t nr_to_skip = *pos;
+
+ m->private = kvmalloc(PATH_MAX, GFP_KERNEL);
+ if (!m->private)
+ return ERR_PTR(-ENOMEM);
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY) {
+ if (!nr_to_skip--)
+ return get_lruvec(memcg, nid);
+ }
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+
+ return NULL;
+}
+
+static void lru_gen_seq_stop(struct seq_file *m, void *v)
+{
+ if (!IS_ERR_OR_NULL(v))
+ mem_cgroup_iter_break(NULL, lruvec_memcg(v));
+
+ kvfree(m->private);
+ m->private = NULL;
+}
+
+static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ int nid = lruvec_pgdat(v)->node_id;
+ struct mem_cgroup *memcg = lruvec_memcg(v);
+
+ ++*pos;
+
+ nid = next_memory_node(nid);
+ if (nid == MAX_NUMNODES) {
+ memcg = mem_cgroup_iter(NULL, memcg, NULL);
+ if (!memcg)
+ return NULL;
+
+ nid = first_memory_node;
+ }
+
+ return get_lruvec(memcg, nid);
+}
+
+static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
+ unsigned long max_seq, unsigned long *min_seq,
+ unsigned long seq)
+{
+ int i;
+ int type, tier;
+ int hist = lru_hist_from_seq(seq);
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ for (tier = 0; tier < MAX_NR_TIERS; tier++) {
+ seq_printf(m, " %10d", tier);
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ const char *s = " ";
+ unsigned long n[3] = {};
+
+ if (seq == max_seq) {
+ s = "RT ";
+ n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
+ n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
+ } else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
+ s = "rep";
+ n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+ n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
+ if (tier)
+ n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]);
+ }
+
+ for (i = 0; i < 3; i++)
+ seq_printf(m, " %10lu%c", n[i], s[i]);
+ }
+ seq_putc(m, '\n');
+ }
+
+ seq_puts(m, " ");
+ for (i = 0; i < NR_MM_STATS; i++) {
+ const char *s = " ";
+ unsigned long n = 0;
+
+ if (seq == max_seq && NR_HIST_GENS == 1) {
+ s = "LOYNFA";
+ n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
+ } else if (seq != max_seq && NR_HIST_GENS > 1) {
+ s = "loynfa";
+ n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
+ }
+
+ seq_printf(m, " %10lu%c", n, s[i]);
+ }
+ seq_putc(m, '\n');
+}
+
+static int lru_gen_seq_show(struct seq_file *m, void *v)
+{
+ unsigned long seq;
+ bool full = !debugfs_real_fops(m->file)->write;
+ struct lruvec *lruvec = v;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ int nid = lruvec_pgdat(lruvec)->node_id;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MAX_SEQ(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (nid == first_memory_node) {
+ const char *path = memcg ? m->private : "";
+
+#ifdef CONFIG_MEMCG
+ if (memcg)
+ cgroup_path(memcg->css.cgroup, m->private, PATH_MAX);
+#endif
+ seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path);
+ }
+
+ seq_printf(m, " node %5d\n", nid);
+
+ if (!full)
+ seq = min_seq[LRU_GEN_ANON];
+ else if (max_seq >= MAX_NR_GENS)
+ seq = max_seq - MAX_NR_GENS + 1;
+ else
+ seq = 0;
+
+ for (; seq <= max_seq; seq++) {
+ int type, zone;
+ int gen = lru_gen_from_seq(seq);
+ unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
+
+ seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth));
+
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ unsigned long size = 0;
+ char mark = full && seq < min_seq[type] ? 'x' : ' ';
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
+ size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
+
+ seq_printf(m, " %10lu%c", size, mark);
+ }
+
+ seq_putc(m, '\n');
+
+ if (full)
+ lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq);
+ }
+
+ return 0;
+}
+
+static const struct seq_operations lru_gen_seq_ops = {
+ .start = lru_gen_seq_start,
+ .stop = lru_gen_seq_stop,
+ .next = lru_gen_seq_next,
+ .show = lru_gen_seq_show,
+};
+
+static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
+ bool can_swap, bool force_scan)
+{
+ DEFINE_MAX_SEQ(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (seq < max_seq)
+ return 0;
+
+ if (seq > max_seq)
+ return -EINVAL;
+
+ if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq)
+ return -ERANGE;
+
+ try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan);
+
+ return 0;
+}
+
+static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
+ int swappiness, unsigned long nr_to_reclaim)
+{
+ DEFINE_MAX_SEQ(lruvec);
+
+ if (seq + MIN_NR_GENS > max_seq)
+ return -EINVAL;
+
+ sc->nr_reclaimed = 0;
+
+ while (!signal_pending(current)) {
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (seq < min_seq[!swappiness])
+ return 0;
+
+ if (sc->nr_reclaimed >= nr_to_reclaim)
+ return 0;
+
+ if (!evict_folios(lruvec, sc, swappiness, NULL))
+ return 0;
+
+ cond_resched();
+ }
+
+ return -EINTR;
+}
+
+static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
+ struct scan_control *sc, int swappiness, unsigned long opt)
+{
+ struct lruvec *lruvec;
+ int err = -EINVAL;
+ struct mem_cgroup *memcg = NULL;
+
+ if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY))
+ return -EINVAL;
+
+ if (!mem_cgroup_disabled()) {
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(memcg_id);
+#ifdef CONFIG_MEMCG
+ if (memcg && !css_tryget(&memcg->css))
+ memcg = NULL;
+#endif
+ rcu_read_unlock();
+
+ if (!memcg)
+ return -EINVAL;
+ }
+
+ if (memcg_id != mem_cgroup_id(memcg))
+ goto done;
+
+ lruvec = get_lruvec(memcg, nid);
+
+ if (swappiness < 0)
+ swappiness = get_swappiness(lruvec, sc);
+ else if (swappiness > 200)
+ goto done;
+
+ switch (cmd) {
+ case '+':
+ err = run_aging(lruvec, seq, sc, swappiness, opt);
+ break;
+ case '-':
+ err = run_eviction(lruvec, seq, sc, swappiness, opt);
+ break;
+ }
+done:
+ mem_cgroup_put(memcg);
+
+ return err;
+}
+
+static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
+ size_t len, loff_t *pos)
+{
+ void *buf;
+ char *cur, *next;
+ unsigned int flags;
+ struct blk_plug plug;
+ int err = -EINVAL;
+ struct scan_control sc = {
+ .may_writepage = true,
+ .may_unmap = true,
+ .may_swap = true,
+ .reclaim_idx = MAX_NR_ZONES - 1,
+ .gfp_mask = GFP_KERNEL,
+ };
+
+ buf = kvmalloc(len + 1, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ if (copy_from_user(buf, src, len)) {
+ kvfree(buf);
+ return -EFAULT;
+ }
+
+ set_task_reclaim_state(current, &sc.reclaim_state);
+ flags = memalloc_noreclaim_save();
+ blk_start_plug(&plug);
+ if (!set_mm_walk(NULL)) {
+ err = -ENOMEM;
+ goto done;
+ }
+
+ next = buf;
+ next[len] = '\0';
+
+ while ((cur = strsep(&next, ",;\n"))) {
+ int n;
+ int end;
+ char cmd;
+ unsigned int memcg_id;
+ unsigned int nid;
+ unsigned long seq;
+ unsigned int swappiness = -1;
+ unsigned long opt = -1;
+
+ cur = skip_spaces(cur);
+ if (!*cur)
+ continue;
+
+ n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
+ &seq, &end, &swappiness, &end, &opt, &end);
+ if (n < 4 || cur[end]) {
+ err = -EINVAL;
+ break;
+ }
+
+ err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt);
+ if (err)
+ break;
+ }
+done:
+ clear_mm_walk();
+ blk_finish_plug(&plug);
+ memalloc_noreclaim_restore(flags);
+ set_task_reclaim_state(current, NULL);
+
+ kvfree(buf);
+
+ return err ? : len;
+}
+
+static int lru_gen_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &lru_gen_seq_ops);
+}
+
+static const struct file_operations lru_gen_rw_fops = {
+ .open = lru_gen_seq_open,
+ .read = seq_read,
+ .write = lru_gen_seq_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct file_operations lru_gen_ro_fops = {
+ .open = lru_gen_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/******************************************************************************
* initialization
******************************************************************************/
@@ -5418,6 +5806,9 @@ static int __init init_lru_gen(void)
if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
pr_err("lru_gen: failed to create sysfs group\n");
+ debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
+ debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
+
return 0;
};
late_initcall(init_lru_gen);
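
For reference, the write handler above parses commands of the form "+ memcg_id node_id max_gen_nr [can_swap [force_scan]]" (aging) and "- memcg_id node_id min_gen_nr [swappiness [nr_to_reclaim]]" (eviction). A minimal user-space sketch that drives it, assuming a kernel built with CONFIG_LRU_GEN and debugfs mounted at /sys/kernel/debug; the memcg ID, node ID, sequence numbers and counts are illustrative:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <fcntl.h>
    #include <unistd.h>

    static void lru_gen_cmd(const char *cmd)
    {
        int fd = open("/sys/kernel/debug/lru_gen", O_WRONLY);

        if (fd < 0 || write(fd, cmd, strlen(cmd)) < 0) {
            perror("lru_gen");
            exit(1);
        }
        close(fd);
    }

    int main(void)
    {
        /* '+': create a new generation, assuming max_gen_nr is currently 5 */
        lru_gen_cmd("+ 0 0 5 1 1");
        /* '-': evict up to 1024 pages from generations <= 4, swappiness 100 */
        lru_gen_cmd("- 0 0 4 100 1024");
        return 0;
    }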

View File

@ -1,253 +0,0 @@
From 22199c9b30ffcc332be643577709a2af960e6786 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 23 Jan 2022 16:44:43 -0700
Subject: [PATCH 13/14] mm: multi-gen LRU: admin guide
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Add an admin guide.
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I1902178bcbb5adfa0a748c4d284a6456059bdd7e
---
Documentation/admin-guide/mm/index.rst | 1 +
Documentation/admin-guide/mm/multigen_lru.rst | 162 ++++++++++++++++++
mm/Kconfig | 3 +-
mm/vmscan.c | 4 +
4 files changed, 169 insertions(+), 1 deletion(-)
create mode 100644 Documentation/admin-guide/mm/multigen_lru.rst
--- a/Documentation/admin-guide/mm/index.rst
+++ b/Documentation/admin-guide/mm/index.rst
@@ -32,6 +32,7 @@ the Linux memory management.
idle_page_tracking
ksm
memory-hotplug
+ multigen_lru
nommu-mmap
numa_memory_policy
numaperf
--- /dev/null
+++ b/Documentation/admin-guide/mm/multigen_lru.rst
@@ -0,0 +1,162 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=============
+Multi-Gen LRU
+=============
+The multi-gen LRU is an alternative LRU implementation that optimizes
+page reclaim and improves performance under memory pressure. Page
+reclaim decides the kernel's caching policy and ability to overcommit
+memory. It directly impacts the kswapd CPU usage and RAM efficiency.
+
+Quick start
+===========
+Build the kernel with the following configurations.
+
+* ``CONFIG_LRU_GEN=y``
+* ``CONFIG_LRU_GEN_ENABLED=y``
+
+All set!
+
+Runtime options
+===============
+``/sys/kernel/mm/lru_gen/`` contains stable ABIs described in the
+following subsections.
+
+Kill switch
+-----------
+``enabled`` accepts different values to enable or disable the
+following components. Its default value depends on
+``CONFIG_LRU_GEN_ENABLED``. All the components should be enabled
+unless some of them have unforeseen side effects. Writing to
+``enabled`` has no effect when a component is not supported by the
+hardware, and valid values will be accepted even when the main switch
+is off.
+
+====== ===============================================================
+Values Components
+====== ===============================================================
+0x0001 The main switch for the multi-gen LRU.
+0x0002 Clearing the accessed bit in leaf page table entries in large
+ batches, when MMU sets it (e.g., on x86). This behavior can
+ theoretically worsen lock contention (mmap_lock). If it is
+ disabled, the multi-gen LRU will suffer a minor performance
+ degradation for workloads that contiguously map hot pages,
+ whose accessed bits can be otherwise cleared by fewer larger
+ batches.
+0x0004 Clearing the accessed bit in non-leaf page table entries as
+ well, when MMU sets it (e.g., on x86). This behavior was not
+ verified on x86 varieties other than Intel and AMD. If it is
+ disabled, the multi-gen LRU will suffer a negligible
+ performance degradation.
+[yYnN] Apply to all the components above.
+====== ===============================================================
+
+E.g.,
+::
+
+ echo y >/sys/kernel/mm/lru_gen/enabled
+ cat /sys/kernel/mm/lru_gen/enabled
+ 0x0007
+ echo 5 >/sys/kernel/mm/lru_gen/enabled
+ cat /sys/kernel/mm/lru_gen/enabled
+ 0x0005
+
+Thrashing prevention
+--------------------
+Personal computers are more sensitive to thrashing because it can
+cause janks (lags when rendering UI) and negatively impact user
+experience. The multi-gen LRU offers thrashing prevention to the
+majority of laptop and desktop users who do not have ``oomd``.
+
+Users can write ``N`` to ``min_ttl_ms`` to prevent the working set of
+``N`` milliseconds from getting evicted. The OOM killer is triggered
+if this working set cannot be kept in memory. In other words, this
+option works as an adjustable pressure relief valve, and when open, it
+terminates applications that are hopefully not being used.
+
+Based on the average human detectable lag (~100ms), ``N=1000`` usually
+eliminates intolerable janks due to thrashing. Larger values like
+``N=3000`` make janks less noticeable at the risk of premature OOM
+kills.
+
+The default value ``0`` means disabled.
+
+Experimental features
+=====================
+``/sys/kernel/debug/lru_gen`` accepts commands described in the
+following subsections. Multiple command lines are supported, as is
+concatenation with the delimiters ``,`` and ``;``.
+
+``/sys/kernel/debug/lru_gen_full`` provides additional stats for
+debugging. ``CONFIG_LRU_GEN_STATS=y`` keeps historical stats from
+evicted generations in this file.
+
+Working set estimation
+----------------------
+Working set estimation measures how much memory an application needs
+in a given time interval, and it is usually done with little impact on
+the performance of the application. E.g., data centers want to
+optimize job scheduling (bin packing) to improve memory utilization.
+When a new job comes in, the job scheduler needs to find out whether
+each server it manages can allocate a certain amount of memory for
+this new job before it can pick a candidate. To do so, the job
+scheduler needs to estimate the working sets of the existing jobs.
+
+When it is read, ``lru_gen`` returns a histogram of numbers of pages
+accessed over different time intervals for each memcg and node.
+``MAX_NR_GENS`` decides the number of bins for each histogram. The
+histograms are noncumulative.
+::
+
+ memcg memcg_id memcg_path
+ node node_id
+ min_gen_nr age_in_ms nr_anon_pages nr_file_pages
+ ...
+ max_gen_nr age_in_ms nr_anon_pages nr_file_pages
+
+Each bin contains an estimated number of pages that have been accessed
+within ``age_in_ms``. E.g., ``min_gen_nr`` contains the coldest pages
+and ``max_gen_nr`` contains the hottest pages, since ``age_in_ms`` of
+the former is the largest and that of the latter is the smallest.
+
+Users can write the following command to ``lru_gen`` to create a new
+generation ``max_gen_nr+1``:
+
+ ``+ memcg_id node_id max_gen_nr [can_swap [force_scan]]``
+
+``can_swap`` defaults to the swap setting and, if it is set to ``1``,
+it forces the scan of anon pages when swap is off, and vice versa.
+``force_scan`` defaults to ``1`` and, if it is set to ``0``, it
+employs heuristics to reduce the overhead, which is likely to reduce
+the coverage as well.
+
+A typical use case is that a job scheduler runs this command at a
+certain time interval to create new generations, and it ranks the
+servers it manages based on the sizes of their cold pages defined by
+this time interval.
+
+Proactive reclaim
+-----------------
+Proactive reclaim induces page reclaim when there is no memory
+pressure. It usually targets cold pages only. E.g., when a new job
+comes in, the job scheduler wants to proactively reclaim cold pages on
+the server it selected, to improve the chance of successfully landing
+this new job.
+
+Users can write the following command to ``lru_gen`` to evict
+generations less than or equal to ``min_gen_nr``.
+
+ ``- memcg_id node_id min_gen_nr [swappiness [nr_to_reclaim]]``
+
+``min_gen_nr`` should be less than ``max_gen_nr-1``, since
+``max_gen_nr`` and ``max_gen_nr-1`` are not fully aged (equivalent to
+the active list) and therefore cannot be evicted. ``swappiness``
+overrides the default value in ``/proc/sys/vm/swappiness``.
+``nr_to_reclaim`` limits the number of pages to evict.
+
+A typical use case is that a job scheduler runs this command before it
+tries to land a new job on a server. If it fails to materialize enough
+cold pages because of the overestimation, it retries on the next
+server according to the ranking result obtained from the working set
+estimation step. This less forceful approach limits the impacts on the
+existing jobs.
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1131,7 +1131,8 @@ config LRU_GEN
# make sure folio->flags has enough spare bits
depends on 64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP
help
- A high performance LRU implementation to overcommit memory.
+ A high performance LRU implementation to overcommit memory. See
+ Documentation/admin-guide/mm/multigen_lru.rst for details.
config LRU_GEN_ENABLED
bool "Enable by default"
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -5319,6 +5319,7 @@ static ssize_t show_min_ttl(struct kobje
return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
}
+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t len)
{
@@ -5352,6 +5353,7 @@ static ssize_t show_enabled(struct kobje
return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
}
+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t len)
{
@@ -5499,6 +5501,7 @@ static void lru_gen_seq_show_full(struct
seq_putc(m, '\n');
}
+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
static int lru_gen_seq_show(struct seq_file *m, void *v)
{
unsigned long seq;
@@ -5657,6 +5660,7 @@ done:
return err;
}
+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
size_t len, loff_t *pos)
{
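
To make the histogram format above concrete: a sketch of the working-set estimation step, summing pages idle longer than a cutoff from ``/sys/kernel/debug/lru_gen``. The row layout follows the documented ``gen_nr age_in_ms nr_anon_pages nr_file_pages`` format; the cutoff is an arbitrary example value:

    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("/sys/kernel/debug/lru_gen", "r");
        char line[256];
        unsigned long gen, age_ms, anon, file;
        unsigned long cold = 0;
        const unsigned long cutoff_ms = 60000;    /* "cold" threshold, arbitrary */

        if (!f) {
            perror("lru_gen");
            return 1;
        }
        while (fgets(line, sizeof(line), f)) {
            /* histogram rows: gen_nr age_in_ms nr_anon_pages nr_file_pages;
             * the "memcg ..." and "node ..." header lines fail this scan */
            if (sscanf(line, " %lu %lu %lu %lu", &gen, &age_ms, &anon, &file) == 4 &&
                age_ms >= cutoff_ms)
                cold += anon + file;
        }
        fclose(f);
        printf("pages idle >= %lums: %lu\n", cutoff_ms, cold);
        return 0;
    }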

View File

@ -1,202 +0,0 @@
From bd82a74f6b5c0a75ef61be5e9be34319bb17328f Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 6 Mar 2022 20:35:00 -0700
Subject: [PATCH 14/14] mm: multi-gen LRU: design doc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Add a design doc.
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Change-Id: I958afcabf5abc37b3e58f72638d35a349c31b98d
---
Documentation/mm/index.rst | 1 +
Documentation/mm/multigen_lru.rst | 159 ++++++++++++++++++++++++++++++
2 files changed, 160 insertions(+)
create mode 100644 Documentation/mm/multigen_lru.rst
--- a/Documentation/mm/index.rst
+++ b/Documentation/mm/index.rst
@@ -51,6 +51,7 @@ above structured documentation, or delet
ksm
memory-model
mmu_notifier
+ multigen_lru
numa
overcommit-accounting
page_migration
--- /dev/null
+++ b/Documentation/mm/multigen_lru.rst
@@ -0,0 +1,159 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=============
+Multi-Gen LRU
+=============
+The multi-gen LRU is an alternative LRU implementation that optimizes
+page reclaim and improves performance under memory pressure. Page
+reclaim decides the kernel's caching policy and ability to overcommit
+memory. It directly impacts the kswapd CPU usage and RAM efficiency.
+
+Design overview
+===============
+Objectives
+----------
+The design objectives are:
+
+* Good representation of access recency
+* Try to profit from spatial locality
+* Fast paths to make obvious choices
+* Simple self-correcting heuristics
+
+The representation of access recency is at the core of all LRU
+implementations. In the multi-gen LRU, each generation represents a
+group of pages with similar access recency. Generations establish a
+(time-based) common frame of reference and therefore help make better
+choices, e.g., between different memcgs on a computer or different
+computers in a data center (for job scheduling).
+
+Exploiting spatial locality improves efficiency when gathering the
+accessed bit. A rmap walk targets a single page and does not try to
+profit from discovering a young PTE. A page table walk can sweep all
+the young PTEs in an address space, but the address space can be too
+sparse to make a profit. The key is to optimize both methods and use
+them in combination.
+
+Fast paths reduce code complexity and runtime overhead. Unmapped pages
+do not require TLB flushes; clean pages do not require writeback.
+These facts are only helpful when other conditions, e.g., access
+recency, are similar. With generations as a common frame of reference,
+additional factors stand out. But obvious choices might not be good
+choices; thus self-correction is necessary.
+
+The benefits of simple self-correcting heuristics are self-evident.
+Again, with generations as a common frame of reference, this becomes
+attainable. Specifically, pages in the same generation can be
+categorized based on additional factors, and a feedback loop can
+statistically compare the refault percentages across those categories
+and infer which of them are better choices.
+
+Assumptions
+-----------
+The protection of hot pages and the selection of cold pages are based
+on page access channels and patterns. There are two access channels:
+
+* Accesses through page tables
+* Accesses through file descriptors
+
+The protection of the former channel is by design stronger because:
+
+1. The uncertainty in determining the access patterns of the former
+ channel is higher due to the approximation of the accessed bit.
+2. The cost of evicting the former channel is higher due to the TLB
+ flushes required and the likelihood of encountering the dirty bit.
+3. The penalty of underprotecting the former channel is higher because
+ applications usually do not prepare themselves for major page
+ faults like they do for blocked I/O. E.g., GUI applications
+ commonly use dedicated I/O threads to avoid blocking rendering
+ threads.
+
+There are also two access patterns:
+
+* Accesses exhibiting temporal locality
+* Accesses not exhibiting temporal locality
+
+For the reasons listed above, the former channel is assumed to follow
+the former pattern unless ``VM_SEQ_READ`` or ``VM_RAND_READ`` is
+present, and the latter channel is assumed to follow the latter
+pattern unless outlying refaults have been observed.
+
+Workflow overview
+=================
+Evictable pages are divided into multiple generations for each
+``lruvec``. The youngest generation number is stored in
+``lrugen->max_seq`` for both anon and file types as they are aged on
+an equal footing. The oldest generation numbers are stored in
+``lrugen->min_seq[]`` separately for anon and file types as clean file
+pages can be evicted regardless of swap constraints. These three
+variables are monotonically increasing.
+
+Generation numbers are truncated into ``order_base_2(MAX_NR_GENS+1)``
+bits in order to fit into the gen counter in ``folio->flags``. Each
+truncated generation number is an index to ``lrugen->lists[]``. The
+sliding window technique is used to track at least ``MIN_NR_GENS`` and
+at most ``MAX_NR_GENS`` generations. The gen counter stores a value
+within ``[1, MAX_NR_GENS]`` while a page is on one of
+``lrugen->lists[]``; otherwise it stores zero.
+
+Each generation is divided into multiple tiers. A page accessed ``N``
+times through file descriptors is in tier ``order_base_2(N)``. Unlike
+generations, tiers do not have dedicated ``lrugen->lists[]``. In
+contrast to moving across generations, which requires the LRU lock,
+moving across tiers only involves atomic operations on
+``folio->flags`` and therefore has a negligible cost. A feedback loop
+modeled after the PID controller monitors refaults over all the tiers
+from anon and file types and decides which tiers from which types to
+evict or protect.
+
+There are two conceptually independent procedures: the aging and the
+eviction. They form a closed-loop system, i.e., the page reclaim.
+
+Aging
+-----
+The aging produces young generations. Given an ``lruvec``, it
+increments ``max_seq`` when ``max_seq-min_seq+1`` approaches
+``MIN_NR_GENS``. The aging promotes hot pages to the youngest
+generation when it finds them accessed through page tables; the
+demotion of cold pages happens consequently when it increments
+``max_seq``. The aging uses page table walks and rmap walks to find
+young PTEs. For the former, it iterates ``lruvec_memcg()->mm_list``
+and calls ``walk_page_range()`` with each ``mm_struct`` on this list
+to scan PTEs, and after each iteration, it increments ``max_seq``. For
+the latter, when the eviction walks the rmap and finds a young PTE,
+the aging scans the adjacent PTEs. For both, on finding a young PTE,
+the aging clears the accessed bit and updates the gen counter of the
+page mapped by this PTE to ``(max_seq%MAX_NR_GENS)+1``.
+
+Eviction
+--------
+The eviction consumes old generations. Given an ``lruvec``, it
+increments ``min_seq`` when ``lrugen->lists[]`` indexed by
+``min_seq%MAX_NR_GENS`` becomes empty. To select a type and a tier to
+evict from, it first compares ``min_seq[]`` to select the older type.
+If both types are equally old, it selects the one whose first tier has
+a lower refault percentage. The first tier contains single-use
+unmapped clean pages, which are the best bet. The eviction sorts a
+page according to its gen counter if the aging has found this page
+accessed through page tables and updated its gen counter. It also
+moves a page to the next generation, i.e., ``min_seq+1``, if this page
+was accessed multiple times through file descriptors and the feedback
+loop has detected outlying refaults from the tier this page is in. To
+this end, the feedback loop uses the first tier as the baseline, for
+the reason stated earlier.
+
+Summary
+-------
+The multi-gen LRU can be disassembled into the following parts:
+
+* Generations
+* Rmap walks
+* Page table walks
+* Bloom filters
+* PID controller
+
+The aging and the eviction form a producer-consumer model;
+specifically, the latter drives the former by the sliding window over
+generations. Within the aging, rmap walks drive page table walks by
+inserting hot densely populated page tables to the Bloom filters.
+Within the eviction, the PID controller uses refaults as the feedback
+to select types to evict and tiers to protect.
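
The generation and tier arithmetic described above is compact enough to spell out. A standalone sketch, assuming the kernel's default ``MAX_NR_GENS`` of 4; ``order_base_2()`` is reimplemented here rather than taken from kernel headers:

    #include <stdio.h>

    #define MAX_NR_GENS 4UL    /* kernel default; an assumption of this sketch */

    /* order_base_2(n): smallest k with 2^k >= n */
    static unsigned long order_base_2(unsigned long n)
    {
        unsigned long k = 0;

        while ((1UL << k) < n)
            k++;
        return k;
    }

    int main(void)
    {
        unsigned long max_seq = 42;    /* arbitrary example sequence number */

        /* gen counter stored in folio->flags for a just-aged hot page */
        printf("gen counter: %lu\n", (max_seq % MAX_NR_GENS) + 1);

        /* tier of a page accessed N times through file descriptors */
        for (unsigned long n = 1; n <= 8; n++)
            printf("N=%lu -> tier %lu\n", n, order_base_2(n));
        return 0;
    }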

File diff suppressed because it is too large

View File

@ -1,174 +0,0 @@
From a779a482fb9b9f8fcdf8b2519c789b4b9bb5dd05 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Fri, 7 Jul 2017 16:56:48 +0200
Subject: build: add a hack for removing non-essential module info
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
include/linux/module.h | 13 ++++++++-----
include/linux/moduleparam.h | 15 ++++++++++++---
init/Kconfig | 7 +++++++
kernel/module.c | 5 ++++-
scripts/mod/modpost.c | 12 ++++++++++++
5 files changed, 43 insertions(+), 9 deletions(-)
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -164,6 +164,7 @@ extern void cleanup_module(void);
/* Generic info of form tag = "info" */
#define MODULE_INFO(tag, info) __MODULE_INFO(tag, tag, info)
+#define MODULE_INFO_STRIP(tag, info) __MODULE_INFO_STRIP(tag, tag, info)
/* For userspace: you can also call me... */
#define MODULE_ALIAS(_alias) MODULE_INFO(alias, _alias)
@@ -233,12 +234,12 @@ extern void cleanup_module(void);
* Author(s), use "Name <email>" or just "Name", for multiple
* authors use multiple MODULE_AUTHOR() statements/lines.
*/
-#define MODULE_AUTHOR(_author) MODULE_INFO(author, _author)
+#define MODULE_AUTHOR(_author) MODULE_INFO_STRIP(author, _author)
/* What your module does. */
-#define MODULE_DESCRIPTION(_description) MODULE_INFO(description, _description)
+#define MODULE_DESCRIPTION(_description) MODULE_INFO_STRIP(description, _description)
-#ifdef MODULE
+#if defined(MODULE) && !defined(CONFIG_MODULE_STRIPPED)
/* Creates an alias so file2alias.c can find device table. */
#define MODULE_DEVICE_TABLE(type, name) \
extern typeof(name) __mod_##type##__##name##_device_table \
@@ -265,7 +266,9 @@ extern typeof(name) __mod_##type##__##na
*/
#if defined(MODULE) || !defined(CONFIG_SYSFS)
-#define MODULE_VERSION(_version) MODULE_INFO(version, _version)
+#define MODULE_VERSION(_version) MODULE_INFO_STRIP(version, _version)
+#elif defined(CONFIG_MODULE_STRIPPED)
+#define MODULE_VERSION(_version) __MODULE_INFO_DISABLED(version)
#else
#define MODULE_VERSION(_version) \
MODULE_INFO(version, _version); \
@@ -288,7 +291,7 @@ extern typeof(name) __mod_##type##__##na
/* Optional firmware file (or files) needed by the module
* format is simply firmware file name. Multiple firmware
* files require multiple MODULE_FIRMWARE() specifiers */
-#define MODULE_FIRMWARE(_firmware) MODULE_INFO(firmware, _firmware)
+#define MODULE_FIRMWARE(_firmware) MODULE_INFO_STRIP(firmware, _firmware)
#define MODULE_IMPORT_NS(ns) MODULE_INFO(import_ns, __stringify(ns))
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -20,6 +20,16 @@
/* Chosen so that structs with an unsigned long line up. */
#define MAX_PARAM_PREFIX_LEN (64 - sizeof(unsigned long))
+/* This struct is here for syntactic coherency, it is not used */
+#define __MODULE_INFO_DISABLED(name) \
+ struct __UNIQUE_ID(name) {}
+
+#ifdef CONFIG_MODULE_STRIPPED
+#define __MODULE_INFO_STRIP(tag, name, info) __MODULE_INFO_DISABLED(name)
+#else
+#define __MODULE_INFO_STRIP(tag, name, info) __MODULE_INFO(tag, name, info)
+#endif
+
#define __MODULE_INFO(tag, name, info) \
static const char __UNIQUE_ID(name)[] \
__used __section(".modinfo") __aligned(1) \
@@ -31,7 +41,7 @@
/* One for each parameter, describing how to use it. Some files do
multiple of these per line, so can't just use MODULE_INFO. */
#define MODULE_PARM_DESC(_parm, desc) \
- __MODULE_INFO(parm, _parm, #_parm ":" desc)
+ __MODULE_INFO_STRIP(parm, _parm, #_parm ":" desc)
struct kernel_param;
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -286,6 +286,13 @@ config UNUSED_KSYMS_WHITELIST
one per line. The path can be absolute, or relative to the kernel
source tree.
+config MODULE_STRIPPED
+ bool "Reduce module size"
+ depends on MODULES
+ help
+ Remove module parameter descriptions, author info, version, aliases,
+ device tables, etc.
+
config MODULES_TREE_LOOKUP
def_bool y
depends on PERF_EVENTS || TRACING || CFI_CLANG
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1954,9 +1954,11 @@ static int setup_load_info(struct load_i
static int check_modinfo(struct module *mod, struct load_info *info, int flags)
{
- const char *modmagic = get_modinfo(info, "vermagic");
int err;
+#ifndef CONFIG_MODULE_STRIPPED
+ const char *modmagic = get_modinfo(info, "vermagic");
+
if (flags & MODULE_INIT_IGNORE_VERMAGIC)
modmagic = NULL;
@@ -1977,6 +1979,7 @@ static int check_modinfo(struct module *
mod->name);
add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
}
+#endif
check_modinfo_retpoline(mod, info);
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -1817,7 +1817,9 @@ static void read_symbols(const char *mod
symname = remove_dot(info.strtab + sym->st_name);
handle_symbol(mod, &info, sym, symname);
+#ifndef CONFIG_MODULE_STRIPPED
handle_moddevtable(mod, &info, sym, symname);
+#endif
}
for (sym = info.symtab_start; sym < info.symtab_stop; sym++) {
@@ -1980,8 +1982,10 @@ static void add_header(struct buffer *b,
buf_printf(b, "BUILD_SALT;\n");
buf_printf(b, "BUILD_LTO_INFO;\n");
buf_printf(b, "\n");
+#ifndef CONFIG_MODULE_STRIPPED
buf_printf(b, "MODULE_INFO(vermagic, VERMAGIC_STRING);\n");
buf_printf(b, "MODULE_INFO(name, KBUILD_MODNAME);\n");
+#endif
buf_printf(b, "\n");
buf_printf(b, "__visible struct module __this_module\n");
buf_printf(b, "__section(\".gnu.linkonce.this_module\") = {\n");
@@ -2101,11 +2105,13 @@ static void add_depends(struct buffer *b
static void add_srcversion(struct buffer *b, struct module *mod)
{
+#ifndef CONFIG_MODULE_STRIPPED
if (mod->srcversion[0]) {
buf_printf(b, "\n");
buf_printf(b, "MODULE_INFO(srcversion, \"%s\");\n",
mod->srcversion);
}
+#endif
}
static void write_buf(struct buffer *b, const char *fname)
@@ -2191,7 +2197,9 @@ static void write_mod_c_file(struct modu
add_exported_symbols(&buf, mod);
add_versions(&buf, mod);
add_depends(&buf, mod);
+#ifndef CONFIG_MODULE_STRIPPED
add_moddevtable(&buf, mod);
+#endif
add_srcversion(&buf, mod);
ret = snprintf(fname, sizeof(fname), "%s.mod.c", mod->name);
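
The core of the hack is the compile-time switch between a string-emitting macro and an empty, uniquely named struct declaration, which keeps call sites unchanged while emitting zero bytes. A standalone sketch of that pattern, with invented macro names and illustrative tag values (build with -DSTRIPPED to see the .modinfo strings disappear):

    #include <stdio.h>

    #define ___PASTE(a, b) a##b
    #define __PASTE(a, b) ___PASTE(a, b)    /* expand __LINE__ before pasting */
    #define UNIQUE_ID(base) __PASTE(base, __LINE__)

    #ifdef STRIPPED
    /* syntactically still a declaration, but emits nothing */
    #define TAG_INFO(tag, info) struct UNIQUE_ID(tag) {}
    #else
    #define TAG_INFO(tag, info) \
        static const char UNIQUE_ID(tag)[] \
        __attribute__((used, section(".modinfo"))) = #tag "=" info
    #endif

    TAG_INFO(author, "Jane Doe <jane@example.com>");    /* illustrative values */
    TAG_INFO(description, "demo module");

    int main(void)
    {
        puts("inspect the strings with: objdump -s -j .modinfo a.out");
        return 0;
    }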

View File

@ -1,11 +0,0 @@
--- a/scripts/kconfig/conf.c
+++ b/scripts/kconfig/conf.c
@@ -432,6 +432,8 @@ static int conf_sym(struct menu *menu)
break;
continue;
case 0:
+ if (!sym_has_value(sym) && !tty_stdio && getenv("FAIL_ON_UNCONFIGURED"))
+ exit(1);
newval = oldval;
break;
case '?':

File diff suppressed because it is too large

View File

@ -1,22 +0,0 @@
From e44fc2af1ddc452b6659d08c16973d65c73b7d0a Mon Sep 17 00:00:00 2001
From: Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>
Date: Wed, 5 Feb 2020 18:36:43 +0000
Subject: [PATCH] file2alias: build on macos
Signed-off-by: Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>
---
scripts/mod/file2alias.c | 3 +++
1 file changed, 3 insertions(+)
--- a/scripts/mod/file2alias.c
+++ b/scripts/mod/file2alias.c
@@ -38,6 +38,9 @@ typedef struct {
__u8 b[16];
} guid_t;
+#ifdef __APPLE__
+#define uuid_t compat_uuid_t
+#endif
/* backwards compatibility, don't use in new code */
typedef struct {
__u8 b[16];

View File

@ -1,83 +0,0 @@
From 48232d3d931c95953ce2ddfe7da7bb164aef6a73 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Fri, 7 Jul 2017 17:03:16 +0200
Subject: fix portability of some includes files in tools/ used on the host
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
tools/include/tools/be_byteshift.h | 4 ++++
tools/include/tools/le_byteshift.h | 4 ++++
tools/include/tools/linux_types.h | 22 ++++++++++++++++++++++
3 files changed, 30 insertions(+)
create mode 100644 tools/include/tools/linux_types.h
--- a/tools/include/tools/be_byteshift.h
+++ b/tools/include/tools/be_byteshift.h
@@ -2,6 +2,10 @@
#ifndef _TOOLS_BE_BYTESHIFT_H
#define _TOOLS_BE_BYTESHIFT_H
+#ifndef __linux__
+#include "linux_types.h"
+#endif
+
#include <stdint.h>
static inline uint16_t __get_unaligned_be16(const uint8_t *p)
--- a/tools/include/tools/le_byteshift.h
+++ b/tools/include/tools/le_byteshift.h
@@ -2,6 +2,10 @@
#ifndef _TOOLS_LE_BYTESHIFT_H
#define _TOOLS_LE_BYTESHIFT_H
+#ifndef __linux__
+#include "linux_types.h"
+#endif
+
#include <stdint.h>
static inline uint16_t __get_unaligned_le16(const uint8_t *p)
--- /dev/null
+++ b/tools/include/tools/linux_types.h
@@ -0,0 +1,26 @@
+#ifndef __LINUX_TYPES_H
+#define __LINUX_TYPES_H
+
+#include <stdint.h>
+
+typedef int8_t __s8;
+typedef uint8_t __u8;
+typedef uint8_t __be8;
+typedef uint8_t __le8;
+
+typedef int16_t __s16;
+typedef uint16_t __u16;
+typedef uint16_t __be16;
+typedef uint16_t __le16;
+
+typedef int32_t __s32;
+typedef uint32_t __u32;
+typedef uint32_t __be32;
+typedef uint32_t __le32;
+
+typedef int64_t __s64;
+typedef uint64_t __u64;
+typedef uint64_t __be64;
+typedef uint64_t __le64;
+
+#endif
--- a/tools/include/linux/types.h
+++ b/tools/include/linux/types.h
@@ -10,8 +10,12 @@
#define __SANE_USERSPACE_TYPES__ /* For PPC64, to get LL64 types */
#endif
+#ifndef __linux__
+#include <tools/linux_types.h>
+#else
#include <asm/types.h>
#include <asm/posix_types.h>
+#endif
struct page;
struct kmem_cache;

View File

@ -1,24 +0,0 @@
From be9be95ff10e16a5b4ad36f903978d0cc5747024 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Fri, 7 Jul 2017 17:04:08 +0200
Subject: kernel: fix linux/spi/spidev.h portability issues with musl
Felix will try to get this define included into musl
lede-commit: 795e7cf60de19e7a076a46874fab7bb88b43bbff
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
include/uapi/linux/spi/spidev.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/include/uapi/linux/spi/spidev.h
+++ b/include/uapi/linux/spi/spidev.h
@@ -93,7 +93,7 @@ struct spi_ioc_transfer {
/* not all platforms use <asm-generic/ioctl.h> or _IOC_TYPECHECK() ... */
#define SPI_MSGSIZE(N) \
- ((((N)*(sizeof (struct spi_ioc_transfer))) < (1 << _IOC_SIZEBITS)) \
+ ((((N)*(sizeof (struct spi_ioc_transfer))) < (1 << 13)) \
? ((N)*(sizeof (struct spi_ioc_transfer))) : 0)
#define SPI_IOC_MESSAGE(N) _IOW(SPI_IOC_MAGIC, 0, char[SPI_MSGSIZE(N)])

View File

@ -1,123 +0,0 @@
From e3d8676f5722b7622685581e06e8f53e6138e3ab Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Sat, 15 Jul 2017 23:42:36 +0200
Subject: use -ffunction-sections, -fdata-sections and --gc-sections
In combination with kernel symbol export stripping this significantly reduces
the kernel image size. Used on both ARM and MIPS architectures.
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Jonas Gorski <jogo@openwrt.org>
Signed-off-by: Gabor Juhos <juhosg@openwrt.org>
---
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -121,6 +121,7 @@ config ARM
select HAVE_UID16
select HAVE_VIRT_CPU_ACCOUNTING_GEN
select IRQ_FORCED_THREADING
+ select HAVE_LD_DEAD_CODE_DATA_ELIMINATION
select MODULES_USE_ELF_REL
select NEED_DMA_MAP_STATE
select OF_EARLY_FLATTREE if OF
--- a/arch/arm/boot/compressed/Makefile
+++ b/arch/arm/boot/compressed/Makefile
@@ -91,6 +91,7 @@ endif
ifeq ($(CONFIG_USE_OF),y)
OBJS += $(libfdt_objs) fdt_check_mem_start.o
endif
+KBUILD_CFLAGS_KERNEL := $(patsubst -f%-sections,,$(KBUILD_CFLAGS_KERNEL))
OBJS += lib1funcs.o ashldi3.o bswapsdi2.o
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -75,7 +75,7 @@ SECTIONS
. = ALIGN(4);
__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
__start___ex_table = .;
- ARM_MMU_KEEP(*(__ex_table))
+ KEEP(*(__ex_table))
__stop___ex_table = .;
}
@@ -100,24 +100,24 @@ SECTIONS
}
.init.arch.info : {
__arch_info_begin = .;
- *(.arch.info.init)
+ KEEP(*(.arch.info.init))
__arch_info_end = .;
}
.init.tagtable : {
__tagtable_begin = .;
- *(.taglist.init)
+ KEEP(*(.taglist.init))
__tagtable_end = .;
}
#ifdef CONFIG_SMP_ON_UP
.init.smpalt : {
__smpalt_begin = .;
- *(.alt.smp.init)
+ KEEP(*(.alt.smp.init))
__smpalt_end = .;
}
#endif
.init.pv_table : {
__pv_table_begin = .;
- *(.pv_table)
+ KEEP(*(.pv_table))
__pv_table_end = .;
}
--- a/arch/arm/include/asm/vmlinux.lds.h
+++ b/arch/arm/include/asm/vmlinux.lds.h
@@ -42,13 +42,13 @@
#define PROC_INFO \
. = ALIGN(4); \
__proc_info_begin = .; \
- *(.proc.info.init) \
+ KEEP(*(.proc.info.init)) \
__proc_info_end = .;
#define IDMAP_TEXT \
ALIGN_FUNCTION(); \
__idmap_text_start = .; \
- *(.idmap.text) \
+ KEEP(*(.idmap.text)) \
__idmap_text_end = .; \
#define ARM_DISCARD \
@@ -109,12 +109,12 @@
. = ALIGN(8); \
.ARM.unwind_idx : { \
__start_unwind_idx = .; \
- *(.ARM.exidx*) \
+ KEEP(*(.ARM.exidx*)) \
__stop_unwind_idx = .; \
} \
.ARM.unwind_tab : { \
__start_unwind_tab = .; \
- *(.ARM.extab*) \
+ KEEP(*(.ARM.extab*)) \
__stop_unwind_tab = .; \
}
@@ -126,7 +126,7 @@
__vectors_lma = .; \
OVERLAY 0xffff0000 : NOCROSSREFS AT(__vectors_lma) { \
.vectors { \
- *(.vectors) \
+ KEEP(*(.vectors)) \
} \
.vectors.bhb.loop8 { \
*(.vectors.bhb.loop8) \
@@ -144,7 +144,7 @@
\
__stubs_lma = .; \
.stubs ADDR(.vectors) + 0x1000 : AT(__stubs_lma) { \
- *(.stubs) \
+ KEEP(*(.stubs)) \
} \
ARM_LMA(__stubs, .stubs); \
. = __stubs_lma + SIZEOF(.stubs); \
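
The KEEP() additions matter because the kernel's linker scripts assign the table boundary symbols by hand (e.g. __proc_info_begin = .;), so the linker sees no reference into those sections and --gc-sections would otherwise drop them. The table-collection pattern itself, as a standalone sketch with invented section and entry names (here GNU ld generates the boundary symbols, so it compiles and runs as-is):

    #include <stdio.h>

    struct init_entry {
        const char *name;
        void (*fn)(void);
    };

    static void hello(void) { puts("hello"); }

    /* placed in a dedicated section, never referenced by name */
    static const struct init_entry entry
        __attribute__((used, section("init_table"))) = { "hello", hello };

    /* GNU ld provides these for output sections named like C identifiers */
    extern const struct init_entry __start_init_table[];
    extern const struct init_entry __stop_init_table[];

    int main(void)
    {
        for (const struct init_entry *e = __start_init_table;
             e < __stop_init_table; e++)
            e->fn();
        return 0;
    }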

View File

@ -1,102 +0,0 @@
From b14784e7883390c20ed3ff904892255404a5914b Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Fri, 7 Jul 2017 17:05:53 +0200
Subject: add an optional config option for stripping all unnecessary symbol exports from the kernel image
lede-commit: bb5a40c64b7c4f4848509fa0a6625055fc9e66cc
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
include/asm-generic/vmlinux.lds.h | 18 +++++++++++++++---
include/linux/export.h | 9 ++++++++-
scripts/Makefile.build | 2 +-
3 files changed, 24 insertions(+), 5 deletions(-)
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -81,6 +81,16 @@
#define RO_EXCEPTION_TABLE
#endif
+#ifndef SYMTAB_KEEP
+#define SYMTAB_KEEP KEEP(*(SORT(___ksymtab+*)))
+#define SYMTAB_KEEP_GPL KEEP(*(SORT(___ksymtab_gpl+*)))
+#endif
+
+#ifndef SYMTAB_DISCARD
+#define SYMTAB_DISCARD
+#define SYMTAB_DISCARD_GPL
+#endif
+
/* Align . to a 8 byte boundary equals to maximum function alignment. */
#define ALIGN_FUNCTION() . = ALIGN(8)
@@ -479,14 +489,14 @@
/* Kernel symbol table: Normal symbols */ \
__ksymtab : AT(ADDR(__ksymtab) - LOAD_OFFSET) { \
__start___ksymtab = .; \
- KEEP(*(SORT(___ksymtab+*))) \
+ SYMTAB_KEEP \
__stop___ksymtab = .; \
} \
\
/* Kernel symbol table: GPL-only symbols */ \
__ksymtab_gpl : AT(ADDR(__ksymtab_gpl) - LOAD_OFFSET) { \
__start___ksymtab_gpl = .; \
- KEEP(*(SORT(___ksymtab_gpl+*))) \
+ SYMTAB_KEEP_GPL \
__stop___ksymtab_gpl = .; \
} \
\
@@ -506,7 +516,7 @@
\
/* Kernel symbol table: strings */ \
__ksymtab_strings : AT(ADDR(__ksymtab_strings) - LOAD_OFFSET) { \
- *(__ksymtab_strings) \
+ *(__ksymtab_strings+*) \
} \
\
/* __*init sections */ \
@@ -1023,6 +1033,8 @@
#define COMMON_DISCARDS \
SANITIZER_DISCARDS \
+ SYMTAB_DISCARD \
+ SYMTAB_DISCARD_GPL \
*(.discard) \
*(.discard.*) \
*(.modinfo) \
--- a/include/linux/export.h
+++ b/include/linux/export.h
@@ -72,6 +72,12 @@ struct kernel_symbol {
#else
+#ifdef MODULE
+#define __EXPORT_SUFFIX(sym)
+#else
+#define __EXPORT_SUFFIX(sym) "+" #sym
+#endif
+
/*
* For every exported symbol, do the following:
*
@@ -87,7 +93,7 @@ struct kernel_symbol {
extern typeof(sym) sym; \
extern const char __kstrtab_##sym[]; \
extern const char __kstrtabns_##sym[]; \
- asm(" .section \"__ksymtab_strings\",\"aMS\",%progbits,1 \n" \
+ asm(" .section \"__ksymtab_strings" __EXPORT_SUFFIX(sym) "\",\"aMS\",%progbits,1 \n" \
"__kstrtab_" #sym ": \n" \
" .asciz \"" #sym "\" \n" \
"__kstrtabns_" #sym ": \n" \
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -328,7 +328,7 @@ targets += $(real-dtb-y) $(lib-y) $(alwa
# Linker scripts preprocessor (.lds.S -> .lds)
# ---------------------------------------------------------------------------
quiet_cmd_cpp_lds_S = LDS $@
- cmd_cpp_lds_S = $(CPP) $(cpp_flags) -P -U$(ARCH) \
+ cmd_cpp_lds_S = $(CPP) $(EXTRA_LDSFLAGS) $(cpp_flags) -P -U$(ARCH) \
-D__ASSEMBLY__ -DLINKER_SCRIPT -o $@ $<
$(obj)/%.lds: $(src)/%.lds.S FORCE
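
The __EXPORT_SUFFIX change above gives every exported symbol's name string its own input section (__ksymtab_strings+<sym>), so a generated linker script passed in via EXTRA_LDSFLAGS can route individual +<sym> sections to /DISCARD/ while *(__ksymtab_strings+*) keeps the rest. The section-per-symbol naming, as a standalone sketch with invented names, using quoted section names in top-level asm just as the patch does (inspect the result with objdump -h):

    #include <stdio.h>

    /* one string per symbol, each in its own quoted section name */
    #define EXPORT_NAME(sym) \
        asm(".section \"demo_strings+" #sym "\",\"a\"\n" \
            "    .asciz \"" #sym "\"\n" \
            "    .previous")

    EXPORT_NAME(foo);
    EXPORT_NAME(bar);

    int main(void)
    {
        puts("objdump -h shows sections demo_strings+foo and demo_strings+bar");
        return 0;
    }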

View File

@ -1,34 +0,0 @@
From b3d00b452467f621317953d9e4c6f9ae8dcfd271 Mon Sep 17 00:00:00 2001
From: Imre Kaloz <kaloz@openwrt.org>
Date: Fri, 7 Jul 2017 17:06:55 +0200
Subject: use the openwrt lzma options for now
lede-commit: 548de949f392049420a6a1feeef118b30ab8ea8c
Signed-off-by: Imre Kaloz <kaloz@openwrt.org>
---
lib/decompress.c | 1 +
scripts/Makefile.lib | 2 +-
usr/gen_initramfs_list.sh | 10 +++++-----
3 files changed, 7 insertions(+), 6 deletions(-)
--- a/lib/decompress.c
+++ b/lib/decompress.c
@@ -53,6 +53,7 @@ static const struct compress_format comp
{ {0x1f, 0x9e}, "gzip", gunzip },
{ {0x42, 0x5a}, "bzip2", bunzip2 },
{ {0x5d, 0x00}, "lzma", unlzma },
+ { {0x6d, 0x00}, "lzma-openwrt", unlzma },
{ {0xfd, 0x37}, "xz", unxz },
{ {0x89, 0x4c}, "lzo", unlzo },
{ {0x02, 0x21}, "lz4", unlz4 },
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -421,7 +421,7 @@ quiet_cmd_bzip2_with_size = BZIP2 $@
# ---------------------------------------------------------------------------
quiet_cmd_lzma = LZMA $@
- cmd_lzma = cat $(real-prereqs) | $(LZMA) -9 > $@
+ cmd_lzma = { cat $(real-prereqs) | $(LZMA) e -d20 -lc1 -lp2 -pb2 -eos -si -so; $(size_append); } > $@
quiet_cmd_lzma_with_size = LZMA $@
cmd_lzma_with_size = { cat $(real-prereqs) | $(LZMA) -9; $(size_append); } > $@

View File

@ -1,11 +0,0 @@
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -315,7 +315,7 @@ config NET_IPVTI
on top.
config NET_UDP_TUNNEL
- tristate
+ tristate "IP: UDP tunneling support"
select NET_IP_TUNNEL
default n

View File

@ -1,27 +0,0 @@
From: Felix Fietkau <nbd@nbd.name>
Subject: hack: net: remove bogus netfilter dependencies
lede-commit: 589d2a377dee27d206fc3725325309cf649e4df6
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
net/netfilter/Kconfig | 2 --
1 file changed, 2 deletions(-)
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -253,7 +253,6 @@ config NF_CONNTRACK_FTP
config NF_CONNTRACK_H323
tristate "H.323 protocol support"
- depends on IPV6 || IPV6=n
depends on NETFILTER_ADVANCED
help
H.323 is a VoIP signalling protocol from ITU-T. As one of the most
@@ -1118,7 +1117,6 @@ config NETFILTER_XT_TARGET_SECMARK
config NETFILTER_XT_TARGET_TCPMSS
tristate '"TCPMSS" target support'
- depends on IPV6 || IPV6=n
default m if NETFILTER_ADVANCED=n
help
This option adds a `TCPMSS' target, which allows you to alter the

View File

@ -1,199 +0,0 @@
From da3c50704f14132f4adf80d48e9a4cd5d46e54c9 Mon Sep 17 00:00:00 2001
From: John Crispin <john@phrozen.org>
Date: Fri, 7 Jul 2017 17:09:21 +0200
Subject: kconfig: owrt specifc dependencies
Signed-off-by: John Crispin <john@phrozen.org>
---
crypto/Kconfig | 10 +++++-----
drivers/bcma/Kconfig | 1 +
drivers/ssb/Kconfig | 3 ++-
lib/Kconfig | 8 ++++----
net/netfilter/Kconfig | 2 +-
net/wireless/Kconfig | 17 ++++++++++-------
sound/core/Kconfig | 4 ++--
7 files changed, 25 insertions(+), 20 deletions(-)
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -55,7 +55,7 @@ config CRYPTO_FIPS_VERSION
By default the KERNELRELEASE value is used.
config CRYPTO_ALGAPI
- tristate
+ tristate "ALGAPI"
select CRYPTO_ALGAPI2
help
This option provides the API for cryptographic algorithms.
@@ -64,7 +64,7 @@ config CRYPTO_ALGAPI2
tristate
config CRYPTO_AEAD
- tristate
+ tristate "AEAD"
select CRYPTO_AEAD2
select CRYPTO_ALGAPI
@@ -75,7 +75,7 @@ config CRYPTO_AEAD2
select CRYPTO_RNG2
config CRYPTO_SKCIPHER
- tristate
+ tristate "SKCIPHER"
select CRYPTO_SKCIPHER2
select CRYPTO_ALGAPI
@@ -85,7 +85,7 @@ config CRYPTO_SKCIPHER2
select CRYPTO_RNG2
config CRYPTO_HASH
- tristate
+ tristate "HASH"
select CRYPTO_HASH2
select CRYPTO_ALGAPI
@@ -94,7 +94,7 @@ config CRYPTO_HASH2
select CRYPTO_ALGAPI2
config CRYPTO_RNG
- tristate
+ tristate "RNG"
select CRYPTO_RNG2
select CRYPTO_ALGAPI
--- a/drivers/bcma/Kconfig
+++ b/drivers/bcma/Kconfig
@@ -16,6 +16,7 @@ if BCMA
# Support for Block-I/O. SELECT this from the driver that needs it.
config BCMA_BLOCKIO
bool
+ default y
config BCMA_HOST_PCI_POSSIBLE
bool
--- a/drivers/ssb/Kconfig
+++ b/drivers/ssb/Kconfig
@@ -29,6 +29,7 @@ config SSB_SPROM
config SSB_BLOCKIO
bool
depends on SSB
+ default y
config SSB_PCIHOST_POSSIBLE
bool
@@ -49,7 +50,7 @@ config SSB_PCIHOST
config SSB_B43_PCI_BRIDGE
bool
depends on SSB_PCIHOST
- default n
+ default y
config SSB_PCMCIAHOST_POSSIBLE
bool
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -456,16 +456,16 @@ config BCH_CONST_T
# Textsearch support is select'ed if needed
#
config TEXTSEARCH
- bool
+ bool "Textsearch support"
config TEXTSEARCH_KMP
- tristate
+ tristate "Textsearch KMP"
config TEXTSEARCH_BM
- tristate
+ tristate "Textsearch BM"
config TEXTSEARCH_FSM
- tristate
+ tristate "Textsearch FSM"
config BTREE
bool
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -22,7 +22,7 @@ config NETFILTER_SKIP_EGRESS
def_bool NETFILTER_EGRESS && (NET_CLS_ACT || IFB)
config NETFILTER_NETLINK
- tristate
+ tristate "Netfilter NFNETLINK interface"
config NETFILTER_FAMILY_BRIDGE
bool
--- a/net/wireless/Kconfig
+++ b/net/wireless/Kconfig
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
config WIRELESS_EXT
- bool
+ bool "Wireless extensions"
config WEXT_CORE
def_bool y
@@ -12,10 +12,10 @@ config WEXT_PROC
depends on WEXT_CORE
config WEXT_SPY
- bool
+ bool "WEXT_SPY"
config WEXT_PRIV
- bool
+ bool "WEXT_PRIV"
config CFG80211
tristate "cfg80211 - wireless configuration API"
@@ -204,7 +204,7 @@ config CFG80211_WEXT_EXPORT
endif # CFG80211
config LIB80211
- tristate
+ tristate "LIB80211"
default n
help
This options enables a library of common routines used
@@ -213,17 +213,17 @@ config LIB80211
Drivers should select this themselves if needed.
config LIB80211_CRYPT_WEP
- tristate
+ tristate "LIB80211_CRYPT_WEP"
select CRYPTO_LIB_ARC4
config LIB80211_CRYPT_CCMP
- tristate
+ tristate "LIB80211_CRYPT_CCMP"
select CRYPTO
select CRYPTO_AES
select CRYPTO_CCM
config LIB80211_CRYPT_TKIP
- tristate
+ tristate "LIB80211_CRYPT_TKIP"
select CRYPTO_LIB_ARC4
config LIB80211_DEBUG
--- a/sound/core/Kconfig
+++ b/sound/core/Kconfig
@@ -17,7 +17,7 @@ config SND_DMAENGINE_PCM
tristate
config SND_HWDEP
- tristate
+ tristate "Sound hardware support"
config SND_SEQ_DEVICE
tristate
@@ -27,7 +27,7 @@ config SND_RAWMIDI
select SND_SEQ_DEVICE if SND_SEQUENCER != n
config SND_COMPRESS_OFFLOAD
- tristate
+ tristate "Compression offloading support"
config SND_JACK
bool

View File

@ -1,23 +0,0 @@
From 8c817e33be829c7249c2cfd59ff48ad5fac6a31d Mon Sep 17 00:00:00 2001
From: Sungbo Eo <mans0n@gorani.run>
Date: Fri, 7 Jul 2017 17:09:21 +0200
Subject: [PATCH] kconfig: solidify SATA_PMP config
The SATA_PMP option disappears from the kernel config file on every kernel_oldconfig refresh.
To prevent this, SATA_HOST is now selected automatically when SATA_PMP is enabled.
This patch can be dropped if SATA_MV is ever re-added into the config.
---
drivers/ata/Kconfig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/ata/Kconfig
+++ b/drivers/ata/Kconfig
@@ -112,7 +112,7 @@ config SATA_ZPODD
config SATA_PMP
bool "SATA Port Multiplier support"
- depends on SATA_HOST
+ select SATA_HOST
default y
help
This option adds support for SATA Port Multipliers

View File

@ -1,22 +0,0 @@
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1954,7 +1954,7 @@ config PADATA
bool
config ASN1
- tristate
+ tristate "ASN1"
help
Build a simple ASN.1 grammar compiler that produces a bytecode output
that can be interpreted by the ASN.1 stream decoder and used to
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -627,7 +627,7 @@ config LIBFDT
bool
config OID_REGISTRY
- tristate
+ tristate "OID"
help
Enable fast lookup object identifier registry.

View File

@ -1,144 +0,0 @@
From 811d9e2268a62b830cfe93cd8bc929afcb8b198b Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Sat, 15 Jul 2017 21:12:38 +0200
Subject: kernel: move regmap bloat out of the kernel image if it is only being used in modules
lede-commit: 96f39119815028073583e4fca3a9c5fe9141e998
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
drivers/base/regmap/Kconfig | 15 ++++++++++-----
drivers/base/regmap/Makefile | 12 ++++++++----
drivers/base/regmap/regmap.c | 3 +++
include/linux/regmap.h | 2 +-
4 files changed, 22 insertions(+), 10 deletions(-)
--- a/drivers/base/regmap/Kconfig
+++ b/drivers/base/regmap/Kconfig
@@ -4,10 +4,9 @@
# subsystems should select the appropriate symbols.
config REGMAP
- default y if (REGMAP_I2C || REGMAP_SPI || REGMAP_SPMI || REGMAP_W1 || REGMAP_AC97 || REGMAP_MMIO || REGMAP_IRQ || REGMAP_SOUNDWIRE || REGMAP_SOUNDWIRE_MBQ || REGMAP_SCCB || REGMAP_I3C || REGMAP_SPI_AVMM || REGMAP_MDIO)
select IRQ_DOMAIN if REGMAP_IRQ
select MDIO_BUS if REGMAP_MDIO
- bool
+ tristate
config REGCACHE_COMPRESSED
select LZO_COMPRESS
@@ -15,53 +14,67 @@ config REGCACHE_COMPRESSED
bool
config REGMAP_AC97
+ select REGMAP
tristate
config REGMAP_I2C
+ select REGMAP
tristate
depends on I2C
config REGMAP_SLIMBUS
+ select REGMAP
tristate
depends on SLIMBUS
config REGMAP_SPI
+ select REGMAP
tristate
depends on SPI
config REGMAP_SPMI
+ select REGMAP
tristate
depends on SPMI
config REGMAP_W1
+ select REGMAP
tristate
depends on W1
config REGMAP_MDIO
+ select REGMAP
tristate
config REGMAP_MMIO
+ select REGMAP
tristate
config REGMAP_IRQ
+ select REGMAP
bool
config REGMAP_SOUNDWIRE
+ select REGMAP
tristate
depends on SOUNDWIRE
config REGMAP_SOUNDWIRE_MBQ
+ select REGMAP
tristate
depends on SOUNDWIRE
config REGMAP_SCCB
+ select REGMAP
tristate
depends on I2C
config REGMAP_I3C
+ select REGMAP
tristate
depends on I3C
config REGMAP_SPI_AVMM
+ select REGMAP
tristate
depends on SPI
--- a/drivers/base/regmap/Makefile
+++ b/drivers/base/regmap/Makefile
@@ -2,10 +2,14 @@
# For include/trace/define_trace.h to include trace.h
CFLAGS_regmap.o := -I$(src)
-obj-$(CONFIG_REGMAP) += regmap.o regcache.o
-obj-$(CONFIG_REGMAP) += regcache-rbtree.o regcache-flat.o
-obj-$(CONFIG_REGCACHE_COMPRESSED) += regcache-lzo.o
-obj-$(CONFIG_DEBUG_FS) += regmap-debugfs.o
+regmap-core-objs = regmap.o regcache.o regcache-rbtree.o regcache-flat.o
+ifdef CONFIG_DEBUG_FS
+regmap-core-objs += regmap-debugfs.o
+endif
+ifdef CONFIG_REGCACHE_COMPRESSED
+regmap-core-objs += regcache-lzo.o
+endif
+obj-$(CONFIG_REGMAP) += regmap-core.o
obj-$(CONFIG_REGMAP_AC97) += regmap-ac97.o
obj-$(CONFIG_REGMAP_I2C) += regmap-i2c.o
obj-$(CONFIG_REGMAP_SLIMBUS) += regmap-slimbus.o
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -9,6 +9,7 @@
#include <linux/device.h>
#include <linux/slab.h>
#include <linux/export.h>
+#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/err.h>
#include <linux/property.h>
@@ -3384,3 +3385,5 @@ static int __init regmap_initcall(void)
return 0;
}
postcore_initcall(regmap_initcall);
+
+MODULE_LICENSE("GPL");
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -180,7 +180,7 @@ struct reg_sequence {
__ret ?: __tmp; \
})
-#ifdef CONFIG_REGMAP
+#if IS_REACHABLE(CONFIG_REGMAP)
enum regmap_endian {
/* Unspecified -> 0 -> Backwards compatible default */
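
The switch from #ifdef CONFIG_REGMAP to IS_REACHABLE() matters once REGMAP can be =m: built-in code cannot call into a module, so it must see the header stubs even though the option is enabled. Here is the condition IS_REACHABLE(CONFIG_REGMAP) reduces to, spelled out with plain preprocessor tests; the CONFIG_ defines emulate kconfig output, so flip them to see the effect:

    #include <stdio.h>

    /* kconfig: =y defines CONFIG_REGMAP, =m defines CONFIG_REGMAP_MODULE,
     * and MODULE is defined when the current file is built as a module */
    #define CONFIG_REGMAP_MODULE 1    /* emulate CONFIG_REGMAP=m */
    /* #define MODULE 1 */            /* emulate building a module */

    #if defined(CONFIG_REGMAP) || (defined(CONFIG_REGMAP_MODULE) && defined(MODULE))
    #define REGMAP_REACHABLE 1        /* real API is linkable from here */
    #else
    #define REGMAP_REACHABLE 0        /* header must fall back to stubs */
    #endif

    int main(void)
    {
        printf("regmap reachable: %d\n", REGMAP_REACHABLE);
        return 0;
    }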

View File

@ -1,52 +0,0 @@
From fd1799b0bf5efa46dd3e6dfbbf3955564807e508 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Fri, 7 Jul 2017 17:12:51 +0200
Subject: kernel: prevent cryptomgr from pulling in useless extra dependencies for tests that are not run
Reduces kernel size after LZMA by about 5k on MIPS
lede-commit: 044c316167e076479a344c59905e5b435b84a77f
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
crypto/Kconfig | 13 ++++++-------
crypto/algboss.c | 4 ++++
2 files changed, 10 insertions(+), 7 deletions(-)
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -142,13 +142,13 @@ config CRYPTO_MANAGER
cbc(aes).
config CRYPTO_MANAGER2
- def_tristate CRYPTO_MANAGER || (CRYPTO_MANAGER!=n && CRYPTO_ALGAPI=y)
- select CRYPTO_AEAD2
- select CRYPTO_HASH2
- select CRYPTO_SKCIPHER2
- select CRYPTO_AKCIPHER2
- select CRYPTO_KPP2
- select CRYPTO_ACOMP2
+ def_tristate CRYPTO_MANAGER || (CRYPTO_MANAGER!=n && CRYPTO_ALGAPI=y && !CRYPTO_MANAGER_DISABLE_TESTS)
+ select CRYPTO_AEAD2 if !CRYPTO_MANAGER_DISABLE_TESTS
+ select CRYPTO_HASH2 if !CRYPTO_MANAGER_DISABLE_TESTS
+ select CRYPTO_SKCIPHER2 if !CRYPTO_MANAGER_DISABLE_TESTS
+ select CRYPTO_AKCIPHER2 if !CRYPTO_MANAGER_DISABLE_TESTS
+ select CRYPTO_KPP2 if !CRYPTO_MANAGER_DISABLE_TESTS
+ select CRYPTO_ACOMP2 if !CRYPTO_MANAGER_DISABLE_TESTS
config CRYPTO_USER
tristate "Userspace cryptographic algorithm configuration"
--- a/crypto/algboss.c
+++ b/crypto/algboss.c
@@ -211,8 +211,12 @@ static int cryptomgr_schedule_test(struc
type = alg->cra_flags;
/* Do not test internal algorithms. */
+#ifdef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS
+ type |= CRYPTO_ALG_TESTED;
+#else
if (type & CRYPTO_ALG_INTERNAL)
type |= CRYPTO_ALG_TESTED;
+#endif
param->type = type;

View File

@ -1,15 +0,0 @@
This makes it possible to select CONFIG_CRYPTO_LIB_ARC4 directly. We
need this to be able to build it into the kernel and make use of it
from backports.
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -6,7 +6,7 @@ config CRYPTO_LIB_AES
tristate
config CRYPTO_LIB_ARC4
- tristate
+ tristate "ARC4 cipher library"
config CRYPTO_ARCH_HAVE_LIB_BLAKE2S
bool

View File

@ -1,84 +0,0 @@
From 236c1acdfef5958010ac9814a9872e0a46fd78ee Mon Sep 17 00:00:00 2001
From: John Crispin <john@phrozen.org>
Date: Fri, 7 Jul 2017 17:13:44 +0200
Subject: rfkill: add fake rfkill support
allow building of modules depending on RFKILL even if RFKILL is not enabled.
Signed-off-by: John Crispin <john@phrozen.org>
---
include/linux/rfkill.h | 2 +-
net/Makefile | 2 +-
net/rfkill/Kconfig | 14 +++++++++-----
net/rfkill/Makefile | 2 +-
4 files changed, 12 insertions(+), 8 deletions(-)
--- a/include/linux/rfkill.h
+++ b/include/linux/rfkill.h
@@ -64,7 +64,7 @@ struct rfkill_ops {
int (*set_block)(void *data, bool blocked);
};
-#if defined(CONFIG_RFKILL) || defined(CONFIG_RFKILL_MODULE)
+#if defined(CONFIG_RFKILL_FULL) || defined(CONFIG_RFKILL_FULL_MODULE)
/**
* rfkill_alloc - Allocate rfkill structure
* @name: name of the struct -- the string is not copied internally
--- a/net/Makefile
+++ b/net/Makefile
@@ -52,7 +52,7 @@ obj-$(CONFIG_TIPC) += tipc/
obj-$(CONFIG_NETLABEL) += netlabel/
obj-$(CONFIG_IUCV) += iucv/
obj-$(CONFIG_SMC) += smc/
-obj-$(CONFIG_RFKILL) += rfkill/
+obj-$(CONFIG_RFKILL_FULL) += rfkill/
obj-$(CONFIG_NET_9P) += 9p/
obj-$(CONFIG_CAIF) += caif/
obj-$(CONFIG_DCB) += dcb/
--- a/net/rfkill/Kconfig
+++ b/net/rfkill/Kconfig
@@ -2,7 +2,11 @@
#
# RF switch subsystem configuration
#
-menuconfig RFKILL
+config RFKILL
+ bool
+ default y
+
+menuconfig RFKILL_FULL
tristate "RF switch subsystem support"
help
Say Y here if you want to have control over RF switches
@@ -14,19 +18,19 @@ menuconfig RFKILL
# LED trigger support
config RFKILL_LEDS
bool
- depends on RFKILL
+ depends on RFKILL_FULL
depends on LEDS_TRIGGERS = y || RFKILL = LEDS_TRIGGERS
default y
config RFKILL_INPUT
bool "RF switch input support" if EXPERT
- depends on RFKILL
+ depends on RFKILL_FULL
depends on INPUT = y || RFKILL = INPUT
default y if !EXPERT
config RFKILL_GPIO
tristate "GPIO RFKILL driver"
- depends on RFKILL
+ depends on RFKILL_FULL
depends on GPIOLIB || COMPILE_TEST
default n
help
--- a/net/rfkill/Makefile
+++ b/net/rfkill/Makefile
@@ -5,5 +5,5 @@
rfkill-y += core.o
rfkill-$(CONFIG_RFKILL_INPUT) += input.o
-obj-$(CONFIG_RFKILL) += rfkill.o
+obj-$(CONFIG_RFKILL_FULL) += rfkill.o
obj-$(CONFIG_RFKILL_GPIO) += rfkill-gpio.o
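
What the renamed symbol buys: with RFKILL reduced to an always-on bool, rfkill.h provides the real declarations only under RFKILL_FULL, and dependent drivers build against stub inlines otherwise. The guard pattern as a standalone sketch, with invented stub behavior (the real stubs live in include/linux/rfkill.h):

    #include <stdio.h>

    /* #define CONFIG_RFKILL_FULL 1 */    /* flip to emulate the full build */

    struct rfkill { const char *name; };

    #if defined(CONFIG_RFKILL_FULL) || defined(CONFIG_RFKILL_FULL_MODULE)
    /* stand-in for the real implementation in net/rfkill/ */
    static struct rfkill *rfkill_alloc(const char *name)
    {
        static struct rfkill rf;

        rf.name = name;
        return &rf;
    }
    #else
    /* stub: keeps dependent drivers compiling when the subsystem is off */
    static inline struct rfkill *rfkill_alloc(const char *name)
    {
        (void)name;
        return NULL;
    }
    #endif

    int main(void)
    {
        struct rfkill *rf = rfkill_alloc("wifi");

        printf("rfkill %savailable\n", rf ? "" : "not ");
        return 0;
    }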

View File

@ -1,64 +0,0 @@
From: Ben Menchaca <ben.menchaca@qca.qualcomm.com>
Date: Fri, 7 Jun 2013 18:35:22 -0500
Subject: MIPS: r4k_cache: use more efficient cache blast
Optimize the compiler output for larger cache blast cases that are
common for DMA-based networking.
Signed-off-by: Ben Menchaca <ben.menchaca@qca.qualcomm.com>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
--- a/arch/mips/include/asm/r4kcache.h
+++ b/arch/mips/include/asm/r4kcache.h
@@ -286,14 +286,46 @@ static inline void prot##extra##blast_##
unsigned long end) \
{ \
unsigned long lsize = cpu_##desc##_line_size(); \
+ unsigned long lsize_2 = lsize * 2; \
+ unsigned long lsize_3 = lsize * 3; \
+ unsigned long lsize_4 = lsize * 4; \
+ unsigned long lsize_5 = lsize * 5; \
+ unsigned long lsize_6 = lsize * 6; \
+ unsigned long lsize_7 = lsize * 7; \
+ unsigned long lsize_8 = lsize * 8; \
unsigned long addr = start & ~(lsize - 1); \
- unsigned long aend = (end - 1) & ~(lsize - 1); \
+ unsigned long aend = (end + lsize - 1) & ~(lsize - 1); \
+ int lines = (aend - addr) / lsize; \
\
- while (1) { \
+ while (lines >= 8) { \
+ prot##cache_op(hitop, addr); \
+ prot##cache_op(hitop, addr + lsize); \
+ prot##cache_op(hitop, addr + lsize_2); \
+ prot##cache_op(hitop, addr + lsize_3); \
+ prot##cache_op(hitop, addr + lsize_4); \
+ prot##cache_op(hitop, addr + lsize_5); \
+ prot##cache_op(hitop, addr + lsize_6); \
+ prot##cache_op(hitop, addr + lsize_7); \
+ addr += lsize_8; \
+ lines -= 8; \
+ } \
+ \
+ if (lines & 0x4) { \
+ prot##cache_op(hitop, addr); \
+ prot##cache_op(hitop, addr + lsize); \
+ prot##cache_op(hitop, addr + lsize_2); \
+ prot##cache_op(hitop, addr + lsize_3); \
+ addr += lsize_4; \
+ } \
+ \
+ if (lines & 0x2) { \
+ prot##cache_op(hitop, addr); \
+ prot##cache_op(hitop, addr + lsize); \
+ addr += lsize_2; \
+ } \
+ \
+ if (lines & 0x1) { \
prot##cache_op(hitop, addr); \
- if (addr == aend) \
- break; \
- addr += lsize; \
} \
}
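
The transformation above is a plain unroll-by-8 with a bit-tested remainder; the kernel macro writes each cache op out by hand so the compiler emits straight-line code, but the control flow is easier to see folded back into loops. A standalone sketch with a stubbed cache_op() and an illustrative line size:

    #include <stdio.h>

    /* stand-in for the MIPS cache instruction */
    static void cache_op(unsigned long addr)
    {
        printf("op @%#lx\n", addr);
    }

    static void blast_range(unsigned long start, unsigned long end,
                            unsigned long lsize)
    {
        unsigned long addr = start & ~(lsize - 1);
        unsigned long aend = (end + lsize - 1) & ~(lsize - 1);
        unsigned long lines = (aend - addr) / lsize;

        while (lines >= 8) {    /* unrolled by hand in the kernel macro */
            for (int i = 0; i < 8; i++)
                cache_op(addr + i * lsize);
            addr += 8 * lsize;
            lines -= 8;
        }
        if (lines & 4) {
            for (int i = 0; i < 4; i++)
                cache_op(addr + i * lsize);
            addr += 4 * lsize;
        }
        if (lines & 2) {
            cache_op(addr);
            cache_op(addr + lsize);
            addr += 2 * lsize;
        }
        if (lines & 1)
            cache_op(addr);
    }

    int main(void)
    {
        blast_range(0x1000, 0x10f5, 32);    /* illustrative range */
        return 0;
    }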

View File

@ -1,38 +0,0 @@
From: John Crispin <john@phrozen.org>
Subject: hack: kernel: add generic image_cmdline hack to MIPS targets
lede-commit: d59f5b3a987a48508257a0ddbaeadc7909f9f976
Signed-off-by: Gabor Juhos <juhosg@openwrt.org>
---
arch/mips/Kconfig | 4 ++++
arch/mips/kernel/head.S | 6 ++++++
2 files changed, 10 insertions(+)
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -1112,6 +1112,10 @@ config MIPS_MSC
config SYNC_R4K
bool
+config IMAGE_CMDLINE_HACK
+ bool "OpenWrt specific image command line hack"
+ default n
+
config NO_IOPORT_MAP
def_bool n
--- a/arch/mips/kernel/head.S
+++ b/arch/mips/kernel/head.S
@@ -79,6 +79,12 @@ FEXPORT(__kernel_entry)
j kernel_entry
#endif /* CONFIG_BOOT_RAW */
+#ifdef CONFIG_IMAGE_CMDLINE_HACK
+ .ascii "CMDLINE:"
+EXPORT(__image_cmdline)
+ .fill 0x400
+#endif /* CONFIG_IMAGE_CMDLINE_HACK */
+
__REF
NESTED(kernel_entry, 16, sp) # kernel entry point
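
The reserved block is consumed outside the kernel: a host-side build step locates the CMDLINE: marker in the image and writes boot arguments into the 0x400 bytes that follow it. A simplified stand-in for that image-patching step, operating on an in-memory buffer with illustrative contents:

    #include <stdio.h>
    #include <string.h>

    #define MARKER "CMDLINE:"
    #define CMDLINE_SPACE 0x400

    static int patch_cmdline(unsigned char *image, size_t size, const char *cmdline)
    {
        size_t need = strlen(cmdline) + 1;

        if (need > CMDLINE_SPACE)
            return -1;
        /* find the marker, then fill the reserved area after it */
        for (size_t i = 0; i + sizeof(MARKER) - 1 + CMDLINE_SPACE <= size; i++) {
            if (!memcmp(image + i, MARKER, sizeof(MARKER) - 1)) {
                memcpy(image + i + sizeof(MARKER) - 1, cmdline, need);
                return 0;
            }
        }
        return -1;
    }

    int main(void)
    {
        unsigned char image[2048] = "....CMDLINE:";    /* stand-in for a kernel image */
        int ret = patch_cmdline(image, sizeof(image),
                                "console=ttyS0,115200 rootfstype=squashfs");

        printf("patched: %s\n", ret ? "no marker or cmdline too long" : "ok");
        return ret ? 1 : 0;
    }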

View File

@ -1,38 +0,0 @@
From 107c0964cb8db7ca28ac5199426414fdab3c274d Mon Sep 17 00:00:00 2001
From: "Alexandros C. Couloumbis" <alex@ozo.com>
Date: Fri, 7 Jul 2017 17:14:51 +0200
Subject: hack: arch: powerpc: drop register save/restore library from modules
Upstream GCC uses a libgcc function for saving/restoring registers. This
makes the code bigger, and upstream kernels need to carry that function
for every single kernel module. Our GCC is patched to avoid those
references, so we can drop the extra bloat for modules.
lede-commit: e8e1084654f50904e6bf77b70b2de3f137d7b3ec
Signed-off-by: Alexandros C. Couloumbis <alex@ozo.com>
---
arch/powerpc/Makefile | 1 -
1 file changed, 1 deletion(-)
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -42,19 +42,6 @@ machine-$(CONFIG_PPC64) += 64
machine-$(CONFIG_CPU_LITTLE_ENDIAN) += le
UTS_MACHINE := $(subst $(space),,$(machine-y))
-# XXX This needs to be before we override LD below
-ifdef CONFIG_PPC32
-KBUILD_LDFLAGS_MODULE += arch/powerpc/lib/crtsavres.o
-else
-ifeq ($(call ld-ifversion, -ge, 22500, y),y)
-# Have the linker provide sfpr if possible.
-# There is a corresponding test in arch/powerpc/lib/Makefile
-KBUILD_LDFLAGS_MODULE += --save-restore-funcs
-else
-KBUILD_LDFLAGS_MODULE += arch/powerpc/lib/crtsavres.o
-endif
-endif
-
ifdef CONFIG_CPU_LITTLE_ENDIAN
KBUILD_CFLAGS += -mlittle-endian
KBUILD_LDFLAGS += -EL

View File

@ -1,196 +0,0 @@
--- a/block/blk.h
+++ b/block/blk.h
@@ -407,6 +407,8 @@ void blk_free_ext_minor(unsigned int min
#define ADDPART_FLAG_NONE 0
#define ADDPART_FLAG_RAID 1
#define ADDPART_FLAG_WHOLEDISK 2
+#define ADDPART_FLAG_READONLY 4
+#define ADDPART_FLAG_ROOTDEV 8
int bdev_add_partition(struct gendisk *disk, int partno, sector_t start,
sector_t length);
int bdev_del_partition(struct gendisk *disk, int partno);
--- a/block/partitions/Kconfig
+++ b/block/partitions/Kconfig
@@ -103,6 +103,13 @@ config ATARI_PARTITION
Say Y here if you would like to use hard disks under Linux which
were partitioned under the Atari OS.
+config FIT_PARTITION
+ bool "Flattened-Image-Tree (FIT) partition support" if PARTITION_ADVANCED
+ default n
+ help
+ Say Y here if your system needs to mount the filesystem part of
+ a Flattened-Image-Tree (FIT) image commonly used with Das U-Boot.
+
config IBM_PARTITION
bool "IBM disk label and partition support"
depends on PARTITION_ADVANCED && S390
--- a/block/partitions/Makefile
+++ b/block/partitions/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_ACORN_PARTITION) += acorn.o
obj-$(CONFIG_AMIGA_PARTITION) += amiga.o
obj-$(CONFIG_ATARI_PARTITION) += atari.o
obj-$(CONFIG_AIX_PARTITION) += aix.o
+obj-$(CONFIG_FIT_PARTITION) += fit.o
obj-$(CONFIG_CMDLINE_PARTITION) += cmdline.o
obj-$(CONFIG_MAC_PARTITION) += mac.o
obj-$(CONFIG_LDM_PARTITION) += ldm.o
--- a/block/partitions/check.h
+++ b/block/partitions/check.h
@@ -57,6 +57,7 @@ int amiga_partition(struct parsed_partit
int atari_partition(struct parsed_partitions *state);
int cmdline_partition(struct parsed_partitions *state);
int efi_partition(struct parsed_partitions *state);
+int fit_partition(struct parsed_partitions *state);
int ibm_partition(struct parsed_partitions *);
int karma_partition(struct parsed_partitions *state);
int ldm_partition(struct parsed_partitions *state);
@@ -67,3 +68,5 @@ int sgi_partition(struct parsed_partitio
int sun_partition(struct parsed_partitions *state);
int sysv68_partition(struct parsed_partitions *state);
int ultrix_partition(struct parsed_partitions *state);
+
+int parse_fit_partitions(struct parsed_partitions *state, u64 start_sector, u64 nr_sectors, int *slot, int add_remain);
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -10,6 +10,10 @@
#include <linux/ctype.h>
#include <linux/vmalloc.h>
#include <linux/raid/detect.h>
+#ifdef CONFIG_FIT_PARTITION
+#include <linux/root_dev.h>
+#endif
+
#include "check.h"
static int (*check_part[])(struct parsed_partitions *) = {
@@ -46,6 +50,9 @@ static int (*check_part[])(struct parsed
#ifdef CONFIG_EFI_PARTITION
efi_partition, /* this must come before msdos */
#endif
+#ifdef CONFIG_FIT_PARTITION
+ fit_partition,
+#endif
#ifdef CONFIG_SGI_PARTITION
sgi_partition,
#endif
@@ -398,6 +405,11 @@ static struct block_device *add_partitio
goto out_del;
}
+#ifdef CONFIG_FIT_PARTITION
+ if (flags & ADDPART_FLAG_READONLY)
+ bdev->bd_read_only = true;
+#endif
+
/* everything is up and running, commence */
err = xa_insert(&disk->part_tbl, partno, bdev, GFP_KERNEL);
if (err)
@@ -585,6 +597,11 @@ static bool blk_add_partition(struct gen
(state->parts[p].flags & ADDPART_FLAG_RAID))
md_autodetect_dev(part->bd_dev);
+#ifdef CONFIG_FIT_PARTITION
+ if ((state->parts[p].flags & ADDPART_FLAG_ROOTDEV) && ROOT_DEV == 0)
+ ROOT_DEV = part->bd_dev;
+#endif
+
return true;
}
--- a/drivers/mtd/ubi/block.c
+++ b/drivers/mtd/ubi/block.c
@@ -433,6 +433,9 @@ int ubiblock_create(struct ubi_volume_in
}
gd->flags |= GENHD_FL_NO_PART;
gd->private_data = dev;
+#ifdef CONFIG_FIT_PARTITION
+ gd->flags |= GENHD_FL_EXT_DEVT;
+#endif
sprintf(gd->disk_name, "ubiblock%d_%d", dev->ubi_num, dev->vol_id);
set_capacity(gd, disk_capacity);
dev->gd = gd;
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -346,6 +346,9 @@ int add_mtd_blktrans_dev(struct mtd_blkt
gd->first_minor = (new->devnum) << tr->part_bits;
gd->minors = 1 << tr->part_bits;
gd->fops = &mtd_block_ops;
+#ifdef CONFIG_FIT_PARTITION
+ gd->flags |= GENHD_FL_EXT_DEVT;
+#endif
if (tr->part_bits) {
if (new->devnum < 26)
--- a/block/partitions/efi.c
+++ b/block/partitions/efi.c
@@ -716,6 +716,9 @@ int efi_partition(struct parsed_partitio
gpt_entry *ptes = NULL;
u32 i;
unsigned ssz = queue_logical_block_size(state->disk->queue) / 512;
+#ifdef CONFIG_FIT_PARTITION
+ u32 extra_slot = 64;
+#endif
if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
kfree(gpt);
@@ -749,6 +752,11 @@ int efi_partition(struct parsed_partitio
ARRAY_SIZE(ptes[i].partition_name));
utf16_le_to_7bit(ptes[i].partition_name, label_max, info->volname);
state->parts[i + 1].has_info = true;
+#ifdef CONFIG_FIT_PARTITION
+ /* If this is a U-Boot FIT volume it may have subpartitions */
+ if (!efi_guidcmp(ptes[i].partition_type_guid, PARTITION_LINUX_FIT_GUID))
+ (void) parse_fit_partitions(state, start * ssz, size * ssz, &extra_slot, 1);
+#endif
}
kfree(ptes);
kfree(gpt);
--- a/block/partitions/efi.h
+++ b/block/partitions/efi.h
@@ -51,6 +51,9 @@
#define PARTITION_LINUX_LVM_GUID \
EFI_GUID( 0xe6d6d379, 0xf507, 0x44c2, \
0xa2, 0x3c, 0x23, 0x8f, 0x2a, 0x3d, 0xf9, 0x28)
+#define PARTITION_LINUX_FIT_GUID \
+ EFI_GUID( 0xcae9be83, 0xb15f, 0x49cc, \
+ 0x86, 0x3f, 0x08, 0x1b, 0x74, 0x4a, 0x2d, 0x93)
typedef struct _gpt_header {
__le64 signature;
--- a/block/partitions/msdos.c
+++ b/block/partitions/msdos.c
@@ -564,6 +564,15 @@ static void parse_minix(struct parsed_pa
#endif /* CONFIG_MINIX_SUBPARTITION */
}
+static void parse_fit_mbr(struct parsed_partitions *state,
+ sector_t offset, sector_t size, int origin)
+{
+#ifdef CONFIG_FIT_PARTITION
+ u32 extra_slot = 64;
+ (void) parse_fit_partitions(state, offset, size, &extra_slot, 1);
+#endif /* CONFIG_FIT_PARTITION */
+}
+
static struct {
unsigned char id;
void (*parse)(struct parsed_partitions *, sector_t, sector_t, int);
@@ -575,6 +584,7 @@ static struct {
{UNIXWARE_PARTITION, parse_unixware},
{SOLARIS_X86_PARTITION, parse_solaris_x86},
{NEW_SOLARIS_X86_PARTITION, parse_solaris_x86},
+ {FIT_PARTITION, parse_fit_mbr},
{0, NULL},
};
--- a/include/linux/msdos_partition.h
+++ b/include/linux/msdos_partition.h
@@ -31,6 +31,7 @@ enum msdos_sys_ind {
LINUX_LVM_PARTITION = 0x8e,
LINUX_RAID_PARTITION = 0xfd, /* autodetect RAID partition */
+ FIT_PARTITION = 0x2e, /* U-Boot uImage.FIT */
SOLARIS_X86_PARTITION = 0x82, /* also Linux swap partitions */
NEW_SOLARIS_X86_PARTITION = 0xbf,
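
The fit.c parser wired up above is not included in this excerpt, but conceptually its first step is to probe the start of the partition for a flattened-device-tree header before walking the FIT sub-images. A minimal illustrative sketch of that detection step in plain C — assuming only the standard FDT header layout; the function name is hypothetical and this is not the actual fit.c code:

    #include <stdint.h>

    #define FDT_MAGIC 0xd00dfeedU /* stored big-endian at the start of any FIT/FDT */

    struct fdt_header {
            uint32_t magic;     /* big-endian on disk */
            uint32_t totalsize; /* big-endian, size of the whole FIT blob */
    };

    /* Return nonzero if the first sector of a partition looks like a FIT
     * image. Assumes a little-endian host for the byte swap. */
    int looks_like_fit(const void *first_sector)
    {
            const struct fdt_header *h = first_sector;

            return __builtin_bswap32(h->magic) == FDT_MAGIC;
    }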

View File

@ -1,39 +0,0 @@
From: Gabor Juhos <juhosg@openwrt.org>
Subject: kernel/3.1[02]: move MTD root device setup code to mtdcore
The current code only allows the root device to be set automatically
on MTD partitions. Move the code to the MTD core so it can be used
with all MTD devices.
Signed-off-by: Gabor Juhos <juhosg@openwrt.org>
---
drivers/mtd/mtdcore.c | 10 ++++++++++
1 file changed, 10 insertions(+)
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -27,6 +27,7 @@
#include <linux/reboot.h>
#include <linux/leds.h>
#include <linux/debugfs.h>
+#include <linux/root_dev.h>
#include <linux/nvmem-provider.h>
#include <linux/mtd/mtd.h>
@@ -748,6 +749,16 @@ int add_mtd_device(struct mtd_info *mtd)
of this try_ nonsense, and no bitching about it
either. :) */
__module_get(THIS_MODULE);
+
+ if (!strcmp(mtd->name, "rootfs") &&
+ IS_ENABLED(CONFIG_MTD_ROOTFS_ROOT_DEV) &&
+ ROOT_DEV == 0) {
+ unsigned int index = mtd->index;
+ pr_notice("mtd: device %d (%s) set to be root filesystem\n",
+ mtd->index, mtd->name);
+ ROOT_DEV = MKDEV(MTD_BLOCK_MAJOR, index);
+ }
+
return 0;
fail_nvmem_add:
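
With this applied, any MTD device named "rootfs" — not merely a partition — becomes the default root device: the ROOT_DEV == 0 check avoids clobbering a value earlier code has already chosen, and an explicit root= on the kernel command line still takes precedence.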

View File

@ -1,120 +0,0 @@
From 6fa9e3678eb002246df1280322b6a024853950a5 Mon Sep 17 00:00:00 2001
From: Ansuel Smith <ansuelsmth@gmail.com>
Date: Mon, 11 Oct 2021 00:53:14 +0200
Subject: [PATCH] drivers: mtd: parsers: add nvmem support to cmdlinepart
Assuming cmdlinepart is a one-level-deep partition scheme and that
static partitions are also defined in the DTS, we can assign an of_node
to partitions declared from bootargs. cmdlinepart has priority over the
fixed-partitions parser, so in this specific case that parser doesn't
assign an of_node. Fix this by searching for a matching of_node with
logic similar to the fixed-partitions parser: if a partition with the
same label is found, check that it has the same offset and size and
return the DT of_node so NVMEM cells can be used correctly.
Signed-off-by: Ansuel Smith <ansuelsmth@gmail.com>
---
drivers/mtd/parsers/cmdlinepart.c | 71 +++++++++++++++++++++++++++++++
1 file changed, 71 insertions(+)
--- a/drivers/mtd/parsers/cmdlinepart.c
+++ b/drivers/mtd/parsers/cmdlinepart.c
@@ -43,6 +43,7 @@
#include <linux/mtd/partitions.h>
#include <linux/module.h>
#include <linux/err.h>
+#include <linux/of.h>
/* debug macro */
#if 0
@@ -323,6 +324,68 @@ static int mtdpart_setup_real(char *s)
return 0;
}
+static int search_fixed_partition(struct mtd_info *master,
+ struct mtd_partition *target_part,
+ struct mtd_partition *fixed_part)
+{
+ struct device_node *mtd_node;
+ struct device_node *ofpart_node;
+ struct device_node *pp;
+ struct mtd_partition part;
+ const char *partname;
+
+ mtd_node = mtd_get_of_node(master);
+ if (!mtd_node)
+ return -EINVAL;
+
+ ofpart_node = of_get_child_by_name(mtd_node, "partitions");
+
+ for_each_child_of_node(ofpart_node, pp) {
+ const __be32 *reg;
+ int len;
+ int a_cells, s_cells;
+
+ reg = of_get_property(pp, "reg", &len);
+ if (!reg) {
+ pr_debug("%s: ofpart partition %pOF (%pOF) missing reg property.\n",
+ master->name, pp,
+ mtd_node);
+ continue;
+ }
+
+ a_cells = of_n_addr_cells(pp);
+ s_cells = of_n_size_cells(pp);
+ if (len / 4 != a_cells + s_cells) {
+ pr_debug("%s: ofpart partition %pOF (%pOF) error parsing reg property.\n",
+ master->name, pp,
+ mtd_node);
+ continue;
+ }
+
+ part.offset = of_read_number(reg, a_cells);
+ part.size = of_read_number(reg + a_cells, s_cells);
+ part.of_node = pp;
+
+ partname = of_get_property(pp, "label", &len);
+ if (!partname)
+ partname = of_get_property(pp, "name", &len);
+ part.name = partname;
+
+ if (!strncmp(target_part->name, part.name, len)) {
+ if (part.offset != target_part->offset)
+ return -EINVAL;
+
+ if (part.size != target_part->size)
+ return -EINVAL;
+
+ memcpy(fixed_part, &part, sizeof(struct mtd_partition));
+ return 0;
+ }
+ }
+
+ return -EINVAL;
+}
+
/*
* Main function to be called from the MTD mapping driver/device to
* obtain the partitioning information. At this point the command line
@@ -338,6 +401,7 @@ static int parse_cmdline_partitions(stru
int i, err;
struct cmdline_mtd_partition *part;
const char *mtd_id = master->name;
+ struct mtd_partition fixed_part;
/* parse command line */
if (!cmdline_parsed) {
@@ -382,6 +446,13 @@ static int parse_cmdline_partitions(stru
sizeof(*part->parts) * (part->num_parts - i));
i--;
}
+
+ err = search_fixed_partition(master, &part->parts[i], &fixed_part);
+ if (!err) {
+ part->parts[i].of_node = fixed_part.of_node;
+ pr_info("Found partition defined in DT for %s. Assigning OF node to support nvmem.",
+ part->parts[i].name);
+ }
}
*pparts = kmemdup(part->parts, sizeof(*part->parts) * part->num_parts,
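
As an illustrative example (not taken from the patch): with mtdparts=spi0.0:256k(u-boot),-(firmware) in the bootargs and a fixed-partitions node in the DTS containing a child labelled "firmware" with the same offset and size, the cmdline-defined "firmware" partition now inherits that child's of_node, so NVMEM cells defined beneath it keep resolving.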

View File

@ -1,23 +0,0 @@
--- a/drivers/mtd/nand/Kconfig
+++ b/drivers/mtd/nand/Kconfig
@@ -61,6 +61,10 @@ config MTD_NAND_ECC_MEDIATEK
help
This enables support for the hardware ECC engine from Mediatek.
+config MTD_NAND_MTK_BMT
+ bool "Support MediaTek NAND Bad-block Management Table"
+ default n
+
endmenu
endmenu
--- a/drivers/mtd/nand/Makefile
+++ b/drivers/mtd/nand/Makefile
@@ -3,6 +3,7 @@
nandcore-objs := core.o bbt.o
obj-$(CONFIG_MTD_NAND_CORE) += nandcore.o
obj-$(CONFIG_MTD_NAND_ECC_MEDIATEK) += ecc-mtk.o
+obj-$(CONFIG_MTD_NAND_MTK_BMT) += mtk_bmt.o mtk_bmt_v2.o mtk_bmt_bbt.o mtk_bmt_nmbm.o
obj-y += onenand/
obj-y += raw/

File diff suppressed because it is too large

View File

@ -1,41 +0,0 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Fri, 7 Jul 2017 17:18:54 +0200
Subject: bridge: only accept EAP locally
When bridging, do not forward EAP frames to other ports, only deliver
them locally, regardless of the state.
Signed-off-by: Felix Fietkau <nbd@nbd.name>
[add disable_eap_hack sysfs attribute]
Signed-off-by: Etienne Champetier <champetier.etienne@gmail.com>
---
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -133,10 +133,14 @@ int br_handle_frame_finish(struct net *n
}
}
+ BR_INPUT_SKB_CB(skb)->brdev = br->dev;
+
+ if (skb->protocol == htons(ETH_P_PAE) && !br->disable_eap_hack)
+ return br_pass_frame_up(skb);
+
if (state == BR_STATE_LEARNING)
goto drop;
- BR_INPUT_SKB_CB(skb)->brdev = br->dev;
BR_INPUT_SKB_CB(skb)->src_port_isolated = !!(p->flags & BR_ISOLATED);
if (IS_ENABLED(CONFIG_INET) &&
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -482,6 +482,8 @@ struct net_bridge {
u16 group_fwd_mask;
u16 group_fwd_mask_required;
+ bool disable_eap_hack;
+
/* STP */
bridge_id designated_root;
bridge_id bridge_id;
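
The hunks above only add the flag and the early local delivery; the full patch also exposes disable_eap_hack as a bridge sysfs attribute — presumably toggled via something like /sys/class/net/<bridge>/bridge/disable_eap_hack — to restore normal forwarding of EAP frames when set to 1.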

View File

@ -1,214 +0,0 @@
From eda40b8c8c82e0f2789d6bc8bf63846dce2e8f32 Mon Sep 17 00:00:00 2001
From: Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>
Date: Sat, 23 Mar 2019 09:29:49 +0000
Subject: [PATCH] netfilter: connmark: introduce set-dscpmark
set-dscpmark is a method of storing the DSCP of an IP packet in the
conntrack mark. In combination with a suitable tc filter action
(act_ctinfo), DSCP values can be stored in the mark on egress and
restored on ingress across links that otherwise alter or bleach DSCP.
This is useful for qdiscs such as CAKE which can shape according to
policies based on DSCP.
Ingress classification is traditionally challenging: iptables rules
have not yet run, and tc filter/eBPF programs perform pre-NAT lookups,
so they cannot see the internal IPv4 addresses used on a typical home
masquerading gateway.
The x_tables CONNMARK set-dscpmark target solves the problem of storing
the DSCP in the conntrack mark in a way suitable for the new act_ctinfo
tc action to restore.
The set-dscpmark option accepts two parameters, a 32-bit 'dscpmask' and
a 32-bit 'statemask'. The dscpmask must be 6 contiguous bits and selects
the area of the connmark where the DSCP will be stored. The statemask is
a mask of at least 1 bit that must not overlap with the dscpmask; it
represents a flag which is set once the DSCP has been stored in the
conntrack mark. This is useful to implement a 'one shot' iptables-based
classification where the 'complicated' iptables rules run only once, to
classify the connection on the initial (egress) packet, and all
subsequent packets are marked/restored with the same DSCP. A statemask
of zero disables the setting of the status bit(s).
example syntax with a suitably modified iptables user space application:
iptables -A QOS_MARK_eth0 -t mangle -j CONNMARK --set-dscpmark 0xfc000000/0x01000000
Would store the DSCP in the top 6 bits of the 32bit mark field, and use
the LSB of the top byte as the 'DSCP has been stored' marker.
|----0xFC----conntrack mark----000000---|
| Bits 31-26 | bit 25 | bit24 |~~~ Bit 0|
| DSCP       | unused | flag  |unused   |
|-----------------------0x01---000000---|
      ^                   ^
      |                   |
   ---|                Conditional flag
   |                   set this when dscp
|-ip diffserv-|        stored in mark
|   6 bits    |
|-------------|
an identically configured tc action to restore looks like:
tc filter show dev eth0 ingress
filter parent ffff: protocol all pref 10 u32 chain 0
filter parent ffff: protocol all pref 10 u32 chain 0 fh 800: ht divisor 1
filter parent ffff: protocol all pref 10 u32 chain 0 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1: not_in_hw
match 00000000/00000000 at 0
action order 1: ctinfo zone 0 pipe
index 2 ref 1 bind 1 dscp 0xfc000000/0x1000000
action order 2: mirred (Egress Redirect to device ifb4eth0) stolen
index 1 ref 1 bind 1
|----0xFC----conntrack mark----000000---|
| Bits 31-26 | bit 25 | bit24 |~~~ Bit 0|
| DSCP       | unused | flag  |unused   |
|-----------------------0x01---000000---|
      |                   |
      |                   |
   ---|                Conditional flag
   v                   only restore if set
|-ip diffserv-|
|   6 bits    |
|-------------|
Signed-off-by: Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>
---
include/uapi/linux/netfilter/xt_connmark.h | 10 ++++
net/netfilter/xt_connmark.c | 55 ++++++++++++++++++----
2 files changed, 57 insertions(+), 8 deletions(-)
--- a/include/uapi/linux/netfilter/xt_connmark.h
+++ b/include/uapi/linux/netfilter/xt_connmark.h
@@ -20,6 +20,11 @@ enum {
};
enum {
+ XT_CONNMARK_VALUE = (1 << 0),
+ XT_CONNMARK_DSCP = (1 << 1)
+};
+
+enum {
D_SHIFT_LEFT = 0,
D_SHIFT_RIGHT,
};
@@ -34,6 +39,11 @@ struct xt_connmark_tginfo2 {
__u8 shift_dir, shift_bits, mode;
};
+struct xt_connmark_tginfo3 {
+ __u32 ctmark, ctmask, nfmask;
+ __u8 shift_dir, shift_bits, mode, func;
+};
+
struct xt_connmark_mtinfo1 {
__u32 mark, mask;
__u8 invert;
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -24,13 +24,13 @@ MODULE_ALIAS("ipt_connmark");
MODULE_ALIAS("ip6t_connmark");
static unsigned int
-connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo2 *info)
+connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo3 *info)
{
enum ip_conntrack_info ctinfo;
u_int32_t new_targetmark;
struct nf_conn *ct;
u_int32_t newmark;
- u_int32_t oldmark;
+ u_int8_t dscp;
ct = nf_ct_get(skb, &ctinfo);
if (ct == NULL)
@@ -38,13 +38,24 @@ connmark_tg_shift(struct sk_buff *skb, c
switch (info->mode) {
case XT_CONNMARK_SET:
- oldmark = READ_ONCE(ct->mark);
- newmark = (oldmark & ~info->ctmask) ^ info->ctmark;
- if (info->shift_dir == D_SHIFT_RIGHT)
- newmark >>= info->shift_bits;
- else
- newmark <<= info->shift_bits;
+ newmark = READ_ONCE(ct->mark);
+ if (info->func & XT_CONNMARK_VALUE) {
+ newmark = (newmark & ~info->ctmask) ^ info->ctmark;
+ if (info->shift_dir == D_SHIFT_RIGHT)
+ newmark >>= info->shift_bits;
+ else
+ newmark <<= info->shift_bits;
+ } else if (info->func & XT_CONNMARK_DSCP) {
+ if (skb->protocol == htons(ETH_P_IP))
+ dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
+ else /* protocol doesn't have diffserv */
+ break;
+ newmark = (newmark & ~info->ctmark) |
+ (info->ctmask | (dscp << info->shift_bits));
+ }
if (READ_ONCE(ct->mark) != newmark) {
WRITE_ONCE(ct->mark, newmark);
nf_conntrack_event_cache(IPCT_MARK, ct);
@@ -83,20 +94,36 @@ static unsigned int
connmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_connmark_tginfo1 *info = par->targinfo;
- const struct xt_connmark_tginfo2 info2 = {
+ const struct xt_connmark_tginfo3 info3 = {
.ctmark = info->ctmark,
.ctmask = info->ctmask,
.nfmask = info->nfmask,
.mode = info->mode,
+ .func = XT_CONNMARK_VALUE
};
- return connmark_tg_shift(skb, &info2);
+ return connmark_tg_shift(skb, &info3);
}
static unsigned int
connmark_tg_v2(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_connmark_tginfo2 *info = par->targinfo;
+ const struct xt_connmark_tginfo3 info3 = {
+ .ctmark = info->ctmark,
+ .ctmask = info->ctmask,
+ .nfmask = info->nfmask,
+ .mode = info->mode,
+ .func = XT_CONNMARK_VALUE
+ };
+
+ return connmark_tg_shift(skb, &info3);
+}
+
+static unsigned int
+connmark_tg_v3(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_connmark_tginfo3 *info = par->targinfo;
return connmark_tg_shift(skb, info);
}
@@ -167,6 +194,16 @@ static struct xt_target connmark_tg_reg[
.targetsize = sizeof(struct xt_connmark_tginfo2),
.destroy = connmark_tg_destroy,
.me = THIS_MODULE,
+ },
+ {
+ .name = "CONNMARK",
+ .revision = 3,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = connmark_tg_check,
+ .target = connmark_tg_v3,
+ .targetsize = sizeof(struct xt_connmark_tginfo3),
+ .destroy = connmark_tg_destroy,
+ .me = THIS_MODULE,
}
};
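
To make the dscpmask/statemask arithmetic described above concrete, here is a minimal user-space sketch (not kernel code; the function name is illustrative) of how a DSCP value lands in the connmark under the masks from the example rule:

    #include <stdint.h>
    #include <stdio.h>

    /* Store a 6-bit DSCP into a connmark under 'dscpmask' and set the
     * 'statemask' flag, mirroring the set-dscpmark description above.
     * Assumes dscpmask is 6 contiguous bits not overlapping statemask. */
    uint32_t set_dscpmark(uint32_t mark, uint8_t dscp,
                          uint32_t dscpmask, uint32_t statemask)
    {
            int shift = __builtin_ctz(dscpmask); /* lowest bit of the DSCP field */

            mark &= ~(dscpmask | statemask);     /* clear old DSCP and flag */
            mark |= ((uint32_t)dscp << shift) & dscpmask;
            mark |= statemask;                   /* flag: DSCP has been stored */
            return mark;
    }

    int main(void)
    {
            /* CS5 (DSCP 40) under the masks from the iptables example above */
            printf("0x%08x\n", set_dscpmark(0, 40, 0xfc000000, 0x01000000));
            /* prints 0xa1000000: DSCP in bits 31-26, flag in bit 24 */
            return 0;
    }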

View File

@ -1,776 +0,0 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Tue, 20 Feb 2018 15:56:02 +0100
Subject: [PATCH] netfilter: add xt_FLOWOFFLOAD target
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
create mode 100644 net/netfilter/xt_OFFLOAD.c
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -712,8 +712,6 @@ config NFT_REJECT_NETDEV
endif # NF_TABLES_NETDEV
-endif # NF_TABLES
-
config NF_FLOW_TABLE_INET
tristate "Netfilter flow table mixed IPv4/IPv6 module"
depends on NF_FLOW_TABLE
@@ -722,11 +720,12 @@ config NF_FLOW_TABLE_INET
To compile it as a module, choose M here.
+endif # NF_TABLES
+
config NF_FLOW_TABLE
tristate "Netfilter flow table module"
depends on NETFILTER_INGRESS
depends on NF_CONNTRACK
- depends on NF_TABLES
help
This option adds the flow table core infrastructure.
@@ -1023,6 +1022,15 @@ config NETFILTER_XT_TARGET_NOTRACK
depends on NETFILTER_ADVANCED
select NETFILTER_XT_TARGET_CT
+config NETFILTER_XT_TARGET_FLOWOFFLOAD
+ tristate '"FLOWOFFLOAD" target support'
+ depends on NF_FLOW_TABLE
+ depends on NETFILTER_INGRESS
+ help
+ This option adds a `FLOWOFFLOAD' target, which uses the nf_flow_offload
+ module to speed up processing of packets by bypassing the usual
+ netfilter chains
+
config NETFILTER_XT_TARGET_RATEEST
tristate '"RATEEST" target support'
depends on NETFILTER_ADVANCED
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -148,6 +148,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIF
obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o
obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o
obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_FLOWOFFLOAD) += xt_FLOWOFFLOAD.o
obj-$(CONFIG_NETFILTER_XT_TARGET_HL) += xt_HL.o
obj-$(CONFIG_NETFILTER_XT_TARGET_HMARK) += xt_HMARK.o
obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o
--- /dev/null
+++ b/net/netfilter/xt_FLOWOFFLOAD.c
@@ -0,0 +1,694 @@
+/*
+ * Copyright (C) 2018-2021 Felix Fietkau <nbd@nbd.name>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/xt_FLOWOFFLOAD.h>
+#include <linux/if_vlan.h>
+#include <net/ip.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_flow_table.h>
+
+struct xt_flowoffload_hook {
+ struct hlist_node list;
+ struct nf_hook_ops ops;
+ struct net *net;
+ bool registered;
+ bool used;
+};
+
+struct xt_flowoffload_table {
+ struct nf_flowtable ft;
+ struct hlist_head hooks;
+ struct delayed_work work;
+};
+
+struct nf_forward_info {
+ const struct net_device *indev;
+ const struct net_device *outdev;
+ const struct net_device *hw_outdev;
+ struct id {
+ __u16 id;
+ __be16 proto;
+ } encap[NF_FLOW_TABLE_ENCAP_MAX];
+ u8 num_encaps;
+ u8 ingress_vlans;
+ u8 h_source[ETH_ALEN];
+ u8 h_dest[ETH_ALEN];
+ enum flow_offload_xmit_type xmit_type;
+};
+
+static DEFINE_SPINLOCK(hooks_lock);
+
+struct xt_flowoffload_table flowtable[2];
+
+static unsigned int
+xt_flowoffload_net_hook(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct vlan_ethhdr *veth;
+ __be16 proto;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_8021Q):
+ veth = (struct vlan_ethhdr *)skb_mac_header(skb);
+ proto = veth->h_vlan_encapsulated_proto;
+ break;
+ case htons(ETH_P_PPP_SES):
+ proto = nf_flow_pppoe_proto(skb);
+ break;
+ default:
+ proto = skb->protocol;
+ break;
+ }
+
+ switch (proto) {
+ case htons(ETH_P_IP):
+ return nf_flow_offload_ip_hook(priv, skb, state);
+ case htons(ETH_P_IPV6):
+ return nf_flow_offload_ipv6_hook(priv, skb, state);
+ }
+
+ return NF_ACCEPT;
+}
+
+static int
+xt_flowoffload_create_hook(struct xt_flowoffload_table *table,
+ struct net_device *dev)
+{
+ struct xt_flowoffload_hook *hook;
+ struct nf_hook_ops *ops;
+
+ hook = kzalloc(sizeof(*hook), GFP_ATOMIC);
+ if (!hook)
+ return -ENOMEM;
+
+ ops = &hook->ops;
+ ops->pf = NFPROTO_NETDEV;
+ ops->hooknum = NF_NETDEV_INGRESS;
+ ops->priority = 10;
+ ops->priv = &table->ft;
+ ops->hook = xt_flowoffload_net_hook;
+ ops->dev = dev;
+
+ hlist_add_head(&hook->list, &table->hooks);
+ mod_delayed_work(system_power_efficient_wq, &table->work, 0);
+
+ return 0;
+}
+
+static struct xt_flowoffload_hook *
+flow_offload_lookup_hook(struct xt_flowoffload_table *table,
+ struct net_device *dev)
+{
+ struct xt_flowoffload_hook *hook;
+
+ hlist_for_each_entry(hook, &table->hooks, list) {
+ if (hook->ops.dev == dev)
+ return hook;
+ }
+
+ return NULL;
+}
+
+static void
+xt_flowoffload_check_device(struct xt_flowoffload_table *table,
+ struct net_device *dev)
+{
+ struct xt_flowoffload_hook *hook;
+
+ if (!dev)
+ return;
+
+ spin_lock_bh(&hooks_lock);
+ hook = flow_offload_lookup_hook(table, dev);
+ if (hook)
+ hook->used = true;
+ else
+ xt_flowoffload_create_hook(table, dev);
+ spin_unlock_bh(&hooks_lock);
+}
+
+static void
+xt_flowoffload_register_hooks(struct xt_flowoffload_table *table)
+{
+ struct xt_flowoffload_hook *hook;
+
+restart:
+ hlist_for_each_entry(hook, &table->hooks, list) {
+ if (hook->registered)
+ continue;
+
+ hook->registered = true;
+ hook->net = dev_net(hook->ops.dev);
+ spin_unlock_bh(&hooks_lock);
+ nf_register_net_hook(hook->net, &hook->ops);
+ if (table->ft.flags & NF_FLOWTABLE_HW_OFFLOAD)
+ table->ft.type->setup(&table->ft, hook->ops.dev,
+ FLOW_BLOCK_BIND);
+ spin_lock_bh(&hooks_lock);
+ goto restart;
+ }
+
+}
+
+static bool
+xt_flowoffload_cleanup_hooks(struct xt_flowoffload_table *table)
+{
+ struct xt_flowoffload_hook *hook;
+ bool active = false;
+
+restart:
+ spin_lock_bh(&hooks_lock);
+ hlist_for_each_entry(hook, &table->hooks, list) {
+ if (hook->used || !hook->registered) {
+ active = true;
+ continue;
+ }
+
+ hlist_del(&hook->list);
+ spin_unlock_bh(&hooks_lock);
+ if (table->ft.flags & NF_FLOWTABLE_HW_OFFLOAD)
+ table->ft.type->setup(&table->ft, hook->ops.dev,
+ FLOW_BLOCK_UNBIND);
+ nf_unregister_net_hook(hook->net, &hook->ops);
+ kfree(hook);
+ goto restart;
+ }
+ spin_unlock_bh(&hooks_lock);
+
+ return active;
+}
+
+static void
+xt_flowoffload_check_hook(struct flow_offload *flow, void *data)
+{
+ struct xt_flowoffload_table *table = data;
+ struct flow_offload_tuple *tuple0 = &flow->tuplehash[0].tuple;
+ struct flow_offload_tuple *tuple1 = &flow->tuplehash[1].tuple;
+ struct xt_flowoffload_hook *hook;
+
+ spin_lock_bh(&hooks_lock);
+ hlist_for_each_entry(hook, &table->hooks, list) {
+ if (hook->ops.dev->ifindex != tuple0->iifidx &&
+ hook->ops.dev->ifindex != tuple1->iifidx)
+ continue;
+
+ hook->used = true;
+ }
+ spin_unlock_bh(&hooks_lock);
+}
+
+static void
+xt_flowoffload_hook_work(struct work_struct *work)
+{
+ struct xt_flowoffload_table *table;
+ struct xt_flowoffload_hook *hook;
+ int err;
+
+ table = container_of(work, struct xt_flowoffload_table, work.work);
+
+ spin_lock_bh(&hooks_lock);
+ xt_flowoffload_register_hooks(table);
+ hlist_for_each_entry(hook, &table->hooks, list)
+ hook->used = false;
+ spin_unlock_bh(&hooks_lock);
+
+ err = nf_flow_table_iterate(&table->ft, xt_flowoffload_check_hook,
+ table);
+ if (err && err != -EAGAIN)
+ goto out;
+
+ if (!xt_flowoffload_cleanup_hooks(table))
+ return;
+
+out:
+ queue_delayed_work(system_power_efficient_wq, &table->work, HZ);
+}
+
+static bool
+xt_flowoffload_skip(struct sk_buff *skb, int family)
+{
+ if (skb_sec_path(skb))
+ return true;
+
+ if (family == NFPROTO_IPV4) {
+ const struct ip_options *opt = &(IPCB(skb)->opt);
+
+ if (unlikely(opt->optlen))
+ return true;
+ }
+
+ return false;
+}
+
+static enum flow_offload_xmit_type nf_xmit_type(struct dst_entry *dst)
+{
+ if (dst_xfrm(dst))
+ return FLOW_OFFLOAD_XMIT_XFRM;
+
+ return FLOW_OFFLOAD_XMIT_NEIGH;
+}
+
+static void nf_default_forward_path(struct nf_flow_route *route,
+ struct dst_entry *dst_cache,
+ enum ip_conntrack_dir dir,
+ struct net_device **dev)
+{
+ dev[!dir] = dst_cache->dev;
+ route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex;
+ route->tuple[dir].dst = dst_cache;
+ route->tuple[dir].xmit_type = nf_xmit_type(dst_cache);
+}
+
+static bool nf_is_valid_ether_device(const struct net_device *dev)
+{
+ if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER ||
+ dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr))
+ return false;
+
+ return true;
+}
+
+static void nf_dev_path_info(const struct net_device_path_stack *stack,
+ struct nf_forward_info *info,
+ unsigned char *ha)
+{
+ const struct net_device_path *path;
+ int i;
+
+ memcpy(info->h_dest, ha, ETH_ALEN);
+
+ for (i = 0; i < stack->num_paths; i++) {
+ path = &stack->path[i];
+ switch (path->type) {
+ case DEV_PATH_ETHERNET:
+ case DEV_PATH_DSA:
+ case DEV_PATH_VLAN:
+ case DEV_PATH_PPPOE:
+ info->indev = path->dev;
+ if (is_zero_ether_addr(info->h_source))
+ memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
+
+ if (path->type == DEV_PATH_ETHERNET)
+ break;
+ if (path->type == DEV_PATH_DSA) {
+ i = stack->num_paths;
+ break;
+ }
+
+ /* DEV_PATH_VLAN and DEV_PATH_PPPOE */
+ if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
+ info->indev = NULL;
+ break;
+ }
+ if (!info->outdev)
+ info->outdev = path->dev;
+ info->encap[info->num_encaps].id = path->encap.id;
+ info->encap[info->num_encaps].proto = path->encap.proto;
+ info->num_encaps++;
+ if (path->type == DEV_PATH_PPPOE)
+ memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN);
+ break;
+ case DEV_PATH_BRIDGE:
+ if (is_zero_ether_addr(info->h_source))
+ memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
+
+ switch (path->bridge.vlan_mode) {
+ case DEV_PATH_BR_VLAN_UNTAG_HW:
+ info->ingress_vlans |= BIT(info->num_encaps - 1);
+ break;
+ case DEV_PATH_BR_VLAN_TAG:
+ info->encap[info->num_encaps].id = path->bridge.vlan_id;
+ info->encap[info->num_encaps].proto = path->bridge.vlan_proto;
+ info->num_encaps++;
+ break;
+ case DEV_PATH_BR_VLAN_UNTAG:
+ info->num_encaps--;
+ break;
+ case DEV_PATH_BR_VLAN_KEEP:
+ break;
+ }
+ break;
+ default:
+ info->indev = NULL;
+ break;
+ }
+ }
+ if (!info->outdev)
+ info->outdev = info->indev;
+
+ info->hw_outdev = info->indev;
+
+ if (nf_is_valid_ether_device(info->indev))
+ info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
+}
+
+static int nf_dev_fill_forward_path(const struct nf_flow_route *route,
+ const struct dst_entry *dst_cache,
+ const struct nf_conn *ct,
+ enum ip_conntrack_dir dir, u8 *ha,
+ struct net_device_path_stack *stack)
+{
+ const void *daddr = &ct->tuplehash[!dir].tuple.src.u3;
+ struct net_device *dev = dst_cache->dev;
+ struct neighbour *n;
+ u8 nud_state;
+
+ if (!nf_is_valid_ether_device(dev))
+ goto out;
+
+ n = dst_neigh_lookup(dst_cache, daddr);
+ if (!n)
+ return -1;
+
+ read_lock_bh(&n->lock);
+ nud_state = n->nud_state;
+ ether_addr_copy(ha, n->ha);
+ read_unlock_bh(&n->lock);
+ neigh_release(n);
+
+ if (!(nud_state & NUD_VALID))
+ return -1;
+
+out:
+ return dev_fill_forward_path(dev, ha, stack);
+}
+
+static void nf_dev_forward_path(struct nf_flow_route *route,
+ const struct nf_conn *ct,
+ enum ip_conntrack_dir dir,
+ struct net_device **devs)
+{
+ const struct dst_entry *dst = route->tuple[dir].dst;
+ struct net_device_path_stack stack;
+ struct nf_forward_info info = {};
+ unsigned char ha[ETH_ALEN];
+ int i;
+
+ if (nf_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0)
+ nf_dev_path_info(&stack, &info, ha);
+
+ devs[!dir] = (struct net_device *)info.indev;
+ if (!info.indev)
+ return;
+
+ route->tuple[!dir].in.ifindex = info.indev->ifindex;
+ for (i = 0; i < info.num_encaps; i++) {
+ route->tuple[!dir].in.encap[i].id = info.encap[i].id;
+ route->tuple[!dir].in.encap[i].proto = info.encap[i].proto;
+ }
+ route->tuple[!dir].in.num_encaps = info.num_encaps;
+ route->tuple[!dir].in.ingress_vlans = info.ingress_vlans;
+
+ if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) {
+ memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN);
+ memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN);
+ route->tuple[dir].out.ifindex = info.outdev->ifindex;
+ route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex;
+ route->tuple[dir].xmit_type = info.xmit_type;
+ }
+}
+
+static int
+xt_flowoffload_route(struct sk_buff *skb, const struct nf_conn *ct,
+ const struct xt_action_param *par,
+ struct nf_flow_route *route, enum ip_conntrack_dir dir,
+ struct net_device **devs)
+{
+ struct dst_entry *this_dst = skb_dst(skb);
+ struct dst_entry *other_dst = NULL;
+ struct flowi fl;
+
+ memset(&fl, 0, sizeof(fl));
+ switch (xt_family(par)) {
+ case NFPROTO_IPV4:
+ fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip;
+ fl.u.ip4.flowi4_oif = xt_in(par)->ifindex;
+ break;
+ case NFPROTO_IPV6:
+ fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.dst.u3.in6;
+ fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6;
+ fl.u.ip6.flowi6_oif = xt_in(par)->ifindex;
+ break;
+ }
+
+ nf_route(xt_net(par), &other_dst, &fl, false, xt_family(par));
+ if (!other_dst)
+ return -ENOENT;
+
+ nf_default_forward_path(route, this_dst, dir, devs);
+ nf_default_forward_path(route, other_dst, !dir, devs);
+
+ if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH &&
+ route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) {
+ nf_dev_forward_path(route, ct, dir, devs);
+ nf_dev_forward_path(route, ct, !dir, devs);
+ }
+
+ return 0;
+}
+
+static unsigned int
+flowoffload_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ struct xt_flowoffload_table *table;
+ const struct xt_flowoffload_target_info *info = par->targinfo;
+ struct tcphdr _tcph, *tcph = NULL;
+ enum ip_conntrack_info ctinfo;
+ enum ip_conntrack_dir dir;
+ struct nf_flow_route route = {};
+ struct flow_offload *flow = NULL;
+ struct net_device *devs[2] = {};
+ struct nf_conn *ct;
+ struct net *net;
+
+ if (xt_flowoffload_skip(skb, xt_family(par)))
+ return XT_CONTINUE;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct == NULL)
+ return XT_CONTINUE;
+
+ switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) {
+ case IPPROTO_TCP:
+ if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED)
+ return XT_CONTINUE;
+
+ tcph = skb_header_pointer(skb, par->thoff,
+ sizeof(_tcph), &_tcph);
+ if (unlikely(!tcph || tcph->fin || tcph->rst))
+ return XT_CONTINUE;
+ break;
+ case IPPROTO_UDP:
+ break;
+ default:
+ return XT_CONTINUE;
+ }
+
+ if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) ||
+ ct->status & (IPS_SEQ_ADJUST | IPS_NAT_CLASH))
+ return XT_CONTINUE;
+
+ if (!nf_ct_is_confirmed(ct))
+ return XT_CONTINUE;
+
+ dir = CTINFO2DIR(ctinfo);
+
+ devs[dir] = xt_out(par);
+ devs[!dir] = xt_in(par);
+
+ if (!devs[dir] || !devs[!dir])
+ return XT_CONTINUE;
+
+ if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status))
+ return XT_CONTINUE;
+
+ if (xt_flowoffload_route(skb, ct, par, &route, dir, devs) < 0)
+ goto err_flow_route;
+
+ flow = flow_offload_alloc(ct);
+ if (!flow)
+ goto err_flow_alloc;
+
+ if (flow_offload_route_init(flow, &route) < 0)
+ goto err_flow_add;
+
+ if (tcph) {
+ ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
+ ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
+ }
+
+ table = &flowtable[!!(info->flags & XT_FLOWOFFLOAD_HW)];
+
+ net = read_pnet(&table->ft.net);
+ if (!net)
+ write_pnet(&table->ft.net, xt_net(par));
+
+ if (flow_offload_add(&table->ft, flow) < 0)
+ goto err_flow_add;
+
+ xt_flowoffload_check_device(table, devs[0]);
+ xt_flowoffload_check_device(table, devs[1]);
+
+ dst_release(route.tuple[!dir].dst);
+
+ return XT_CONTINUE;
+
+err_flow_add:
+ flow_offload_free(flow);
+err_flow_alloc:
+ dst_release(route.tuple[!dir].dst);
+err_flow_route:
+ clear_bit(IPS_OFFLOAD_BIT, &ct->status);
+
+ return XT_CONTINUE;
+}
+
+static int flowoffload_chk(const struct xt_tgchk_param *par)
+{
+ struct xt_flowoffload_target_info *info = par->targinfo;
+
+ if (info->flags & ~XT_FLOWOFFLOAD_MASK)
+ return -EINVAL;
+
+ return 0;
+}
+
+static struct xt_target offload_tg_reg __read_mostly = {
+ .family = NFPROTO_UNSPEC,
+ .name = "FLOWOFFLOAD",
+ .revision = 0,
+ .targetsize = sizeof(struct xt_flowoffload_target_info),
+ .usersize = sizeof(struct xt_flowoffload_target_info),
+ .checkentry = flowoffload_chk,
+ .target = flowoffload_tg,
+ .me = THIS_MODULE,
+};
+
+static int flow_offload_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct xt_flowoffload_hook *hook0, *hook1;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ if (event != NETDEV_UNREGISTER)
+ return NOTIFY_DONE;
+
+ spin_lock_bh(&hooks_lock);
+ hook0 = flow_offload_lookup_hook(&flowtable[0], dev);
+ if (hook0)
+ hlist_del(&hook0->list);
+
+ hook1 = flow_offload_lookup_hook(&flowtable[1], dev);
+ if (hook1)
+ hlist_del(&hook1->list);
+ spin_unlock_bh(&hooks_lock);
+
+ if (hook0) {
+ nf_unregister_net_hook(hook0->net, &hook0->ops);
+ kfree(hook0);
+ }
+
+ if (hook1) {
+ nf_unregister_net_hook(hook1->net, &hook1->ops);
+ kfree(hook1);
+ }
+
+ nf_flow_table_cleanup(dev);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block flow_offload_netdev_notifier = {
+ .notifier_call = flow_offload_netdev_event,
+};
+
+static int nf_flow_rule_route_inet(struct net *net,
+ const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ const struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple;
+ int err;
+
+ switch (flow_tuple->l3proto) {
+ case NFPROTO_IPV4:
+ err = nf_flow_rule_route_ipv4(net, flow, dir, flow_rule);
+ break;
+ case NFPROTO_IPV6:
+ err = nf_flow_rule_route_ipv6(net, flow, dir, flow_rule);
+ break;
+ default:
+ err = -1;
+ break;
+ }
+
+ return err;
+}
+
+static struct nf_flowtable_type flowtable_inet = {
+ .family = NFPROTO_INET,
+ .init = nf_flow_table_init,
+ .setup = nf_flow_table_offload_setup,
+ .action = nf_flow_rule_route_inet,
+ .free = nf_flow_table_free,
+ .hook = xt_flowoffload_net_hook,
+ .owner = THIS_MODULE,
+};
+
+static int init_flowtable(struct xt_flowoffload_table *tbl)
+{
+ INIT_DELAYED_WORK(&tbl->work, xt_flowoffload_hook_work);
+ tbl->ft.type = &flowtable_inet;
+
+ return nf_flow_table_init(&tbl->ft);
+}
+
+static int __init xt_flowoffload_tg_init(void)
+{
+ int ret;
+
+ register_netdevice_notifier(&flow_offload_netdev_notifier);
+
+ ret = init_flowtable(&flowtable[0]);
+ if (ret)
+ return ret;
+
+ ret = init_flowtable(&flowtable[1]);
+ if (ret)
+ goto cleanup;
+
+ flowtable[1].ft.flags = NF_FLOWTABLE_HW_OFFLOAD;
+
+ ret = xt_register_target(&offload_tg_reg);
+ if (ret)
+ goto cleanup2;
+
+ return 0;
+
+cleanup2:
+ nf_flow_table_free(&flowtable[1].ft);
+cleanup:
+ nf_flow_table_free(&flowtable[0].ft);
+ return ret;
+}
+
+static void __exit xt_flowoffload_tg_exit(void)
+{
+ xt_unregister_target(&offload_tg_reg);
+ unregister_netdevice_notifier(&flow_offload_netdev_notifier);
+ nf_flow_table_free(&flowtable[0].ft);
+ nf_flow_table_free(&flowtable[1].ft);
+}
+
+MODULE_LICENSE("GPL");
+module_init(xt_flowoffload_tg_init);
+module_exit(xt_flowoffload_tg_exit);
--- /dev/null
+++ b/include/uapi/linux/netfilter/xt_FLOWOFFLOAD.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _XT_FLOWOFFLOAD_H
+#define _XT_FLOWOFFLOAD_H
+
+#include <linux/types.h>
+
+enum {
+ XT_FLOWOFFLOAD_HW = 1 << 0,
+
+ XT_FLOWOFFLOAD_MASK = XT_FLOWOFFLOAD_HW
+};
+
+struct xt_flowoffload_target_info {
+ __u32 flags;
+};
+
+#endif /* _XT_FLOWOFFLOAD_H */
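
For reference, the matching userspace extension (carried separately in OpenWrt's iptables packaging) is typically used along the lines of: iptables -I FORWARD -m conntrack --ctstate RELATED,ESTABLISHED -j FLOWOFFLOAD, with an additional --hw flag selecting the second, hardware-offload flowtable. Treat the exact option spelling as an assumption, since the userspace side is not part of this patch.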

View File

@ -1,24 +0,0 @@
From 6d3bc769657b0ee7c7506dad9911111c4226a7ea Mon Sep 17 00:00:00 2001
From: Imre Kaloz <kaloz@openwrt.org>
Date: Fri, 7 Jul 2017 17:21:05 +0200
Subject: mac80211: increase wireless mesh header size
lede-commit 3d4466cfd8f75f717efdb1f96fdde3c70d865fc1
Signed-off-by: Imre Kaloz <kaloz@openwrt.org>
---
include/linux/netdevice.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -149,8 +149,8 @@ static inline bool dev_xmit_complete(int
#if defined(CONFIG_HYPERV_NET)
# define LL_MAX_HEADER 128
-#elif defined(CONFIG_WLAN) || IS_ENABLED(CONFIG_AX25)
-# if defined(CONFIG_MAC80211_MESH)
+#elif defined(CONFIG_WLAN) || IS_ENABLED(CONFIG_AX25) || 1
+# if defined(CONFIG_MAC80211_MESH) || 1
# define LL_MAX_HEADER 128
# else
# define LL_MAX_HEADER 96

View File

@ -1,27 +0,0 @@
From a6ccb238939b25851474a279b20367fd24a0e816 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Fri, 7 Jul 2017 17:21:53 +0200
Subject: hack: net: fq_codel: tune defaults for small devices
Assume that x86_64 devices always have plenty of memory and do not need
this optimization, unlike devices with only 32 MB or 64 MB of RAM.
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
net/sched/sch_fq_codel.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -474,7 +474,11 @@ static int fq_codel_init(struct Qdisc *s
sch->limit = 10*1024;
q->flows_cnt = 1024;
+#ifdef CONFIG_X86_64
q->memory_limit = 32 << 20; /* 32 MBytes */
+#else
+ q->memory_limit = 4 << 20; /* 4 MBytes */
+#endif
q->drop_batch_size = 64;
q->quantum = psched_mtu(qdisc_dev(sch));
INIT_LIST_HEAD(&q->new_flows);
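
For scale: 32 << 20 is 32 MiB, which would let the fq_codel backlog claim half the RAM of a 64 MB router; the 4 << 20 (4 MiB) fallback caps it at a few percent instead, while x86_64 keeps the upstream default.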

View File

@ -1,100 +0,0 @@
From 1d418f7e88035ed7a94073f6354246c66e9193e9 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Fri, 7 Jul 2017 17:22:58 +0200
Subject: fq_codel: switch default qdisc from pfifo_fast to fq_codel and remove pfifo_fast
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
include/net/sch_generic.h | 3 ++-
net/sched/Kconfig | 3 ++-
net/sched/sch_api.c | 2 +-
net/sched/sch_fq_codel.c | 3 ++-
net/sched/sch_generic.c | 4 ++--
5 files changed, 9 insertions(+), 6 deletions(-)
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -585,12 +585,13 @@ extern struct Qdisc_ops noop_qdisc_ops;
extern struct Qdisc_ops pfifo_fast_ops;
extern struct Qdisc_ops mq_qdisc_ops;
extern struct Qdisc_ops noqueue_qdisc_ops;
+extern struct Qdisc_ops fq_codel_qdisc_ops;
extern const struct Qdisc_ops *default_qdisc_ops;
static inline const struct Qdisc_ops *
get_default_qdisc_ops(const struct net_device *dev, int ntx)
{
return ntx < dev->real_num_tx_queues ?
- default_qdisc_ops : &pfifo_fast_ops;
+ default_qdisc_ops : &fq_codel_qdisc_ops;
}
struct Qdisc_class_common {
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -4,8 +4,9 @@
#
menuconfig NET_SCHED
- bool "QoS and/or fair queueing"
+ def_bool y
select NET_SCH_FIFO
+ select NET_SCH_FQ_CODEL
help
When the kernel has several packets to send out over a network
device, it has to decide which ones to send first, which ones to
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -2277,7 +2277,7 @@ static int __init pktsched_init(void)
return err;
}
- register_qdisc(&pfifo_fast_ops);
+ register_qdisc(&fq_codel_qdisc_ops);
register_qdisc(&pfifo_qdisc_ops);
register_qdisc(&bfifo_qdisc_ops);
register_qdisc(&pfifo_head_drop_qdisc_ops);
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -719,7 +719,7 @@ static const struct Qdisc_class_ops fq_c
.walk = fq_codel_walk,
};
-static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = {
+struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = {
.cl_ops = &fq_codel_class_ops,
.id = "fq_codel",
.priv_size = sizeof(struct fq_codel_sched_data),
@@ -734,6 +734,7 @@ static struct Qdisc_ops fq_codel_qdisc_o
.dump_stats = fq_codel_dump_stats,
.owner = THIS_MODULE,
};
+EXPORT_SYMBOL(fq_codel_qdisc_ops);
static int __init fq_codel_module_init(void)
{
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -32,7 +32,7 @@
#include <net/xfrm.h>
/* Qdisc to use by default */
-const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
+const struct Qdisc_ops *default_qdisc_ops = &fq_codel_qdisc_ops;
EXPORT_SYMBOL(default_qdisc_ops);
static void qdisc_maybe_clear_missed(struct Qdisc *q,
@@ -1142,12 +1142,12 @@ static void attach_one_default_qdisc(str
void *_unused)
{
struct Qdisc *qdisc;
- const struct Qdisc_ops *ops = default_qdisc_ops;
+ const struct Qdisc_ops *ops = &fq_codel_qdisc_ops;
if (dev->priv_flags & IFF_NO_QUEUE)
ops = &noqueue_qdisc_ops;
else if(dev->type == ARPHRD_CAN)
- ops = &pfifo_fast_ops;
+ ops = &fq_codel_qdisc_ops;
qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL);
if (!qdisc)
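
The net effect is that every queueing interface now comes up with fq_codel by default (visible via tc qdisc show dev eth0) without having to set net.core.default_qdisc; only IFF_NO_QUEUE devices keep noqueue, and even the CAN special case above now falls through to fq_codel.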

View File

@ -1,129 +0,0 @@
From 36e516290611e613aa92996cb4339561452695b4 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Fri, 7 Jul 2017 17:24:23 +0200
Subject: net: swconfig: adds openwrt switch layer
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
drivers/net/phy/Kconfig | 83 +++++++++++++++++++++++++++++++++++++++++++++++
drivers/net/phy/Makefile | 15 +++++++++
include/uapi/linux/Kbuild | 1 +
3 files changed, 99 insertions(+)
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -61,6 +61,80 @@ config SFP
depends on HWMON || HWMON=n
select MDIO_I2C
+comment "Switch configuration API + drivers"
+
+config SWCONFIG
+ tristate "Switch configuration API"
+ help
+ Switch configuration API using netlink. This allows
+ you to configure the VLAN features of certain switches.
+
+config SWCONFIG_LEDS
+ bool "Switch LED trigger support"
+ depends on (SWCONFIG && LEDS_TRIGGERS)
+
+config ADM6996_PHY
+ tristate "Driver for ADM6996 switches"
+ select SWCONFIG
+ help
+ Currently supports the ADM6996FC and ADM6996M switches.
+ Support for FC is very limited.
+
+config AR8216_PHY
+ tristate "Driver for Atheros AR8216 switches"
+ select SWCONFIG
+
+config AR8216_PHY_LEDS
+ bool "Atheros AR8216 switch LED support"
+ depends on (AR8216_PHY && LEDS_CLASS)
+
+source "drivers/net/phy/b53/Kconfig"
+
+config IP17XX_PHY
+ tristate "Driver for IC+ IP17xx switches"
+ select SWCONFIG
+
+config PSB6970_PHY
+ tristate "Lantiq XWAY Tantos (PSB6970) Ethernet switch"
+ select SWCONFIG
+ select ETHERNET_PACKET_MANGLE
+
+config RTL8306_PHY
+ tristate "Driver for Realtek RTL8306S switches"
+ select SWCONFIG
+
+config RTL8366_SMI
+ tristate "Driver for the RTL8366 SMI interface"
+ depends on GPIOLIB
+ help
+ This module implements the SMI interface protocol which is used
+ by some RTL8366 ethernet switch devices via the generic GPIO API.
+
+if RTL8366_SMI
+
+config RTL8366_SMI_DEBUG_FS
+ bool "RTL8366 SMI interface debugfs support"
+ depends on DEBUG_FS
+ default n
+
+config RTL8366S_PHY
+ tristate "Driver for the Realtek RTL8366S switch"
+ select SWCONFIG
+
+config RTL8366RB_PHY
+ tristate "Driver for the Realtek RTL8366RB switch"
+ select SWCONFIG
+
+config RTL8367_PHY
+ tristate "Driver for the Realtek RTL8367R/M switches"
+ select SWCONFIG
+
+config RTL8367B_PHY
+ tristate "Driver fot the Realtek RTL8367R-VB switch"
+ select SWCONFIG
+
+endif # RTL8366_SMI
+
comment "MII PHY device drivers"
config AMD_PHY
--- a/drivers/net/phy/Makefile
+++ b/drivers/net/phy/Makefile
@@ -24,6 +24,19 @@ libphy-$(CONFIG_LED_TRIGGER_PHY) += phy_
obj-$(CONFIG_PHYLINK) += phylink.o
obj-$(CONFIG_PHYLIB) += libphy.o
+obj-$(CONFIG_SWCONFIG) += swconfig.o
+obj-$(CONFIG_ADM6996_PHY) += adm6996.o
+obj-$(CONFIG_AR8216_PHY) += ar8216.o ar8327.o
+obj-$(CONFIG_SWCONFIG_B53) += b53/
+obj-$(CONFIG_IP17XX_PHY) += ip17xx.o
+obj-$(CONFIG_PSB6970_PHY) += psb6970.o
+obj-$(CONFIG_RTL8306_PHY) += rtl8306.o
+obj-$(CONFIG_RTL8366_SMI) += rtl8366_smi.o
+obj-$(CONFIG_RTL8366S_PHY) += rtl8366s.o
+obj-$(CONFIG_RTL8366RB_PHY) += rtl8366rb.o
+obj-$(CONFIG_RTL8367_PHY) += rtl8367.o
+obj-$(CONFIG_RTL8367B_PHY) += rtl8367b.o
+
obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += mii_timestamper.o
obj-$(CONFIG_SFP) += sfp.o
--- a/include/linux/platform_data/b53.h
+++ b/include/linux/platform_data/b53.h
@@ -29,6 +29,9 @@ struct b53_platform_data {
u32 chip_id;
u16 enabled_ports;
+ /* allow to specify an ethX alias */
+ const char *alias;
+
/* only used by MMAP'd driver */
unsigned big_endian:1;
void __iomem *regs;
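
Userspace drives this API through OpenWrt's swconfig tool, e.g. swconfig list to enumerate registered switches and swconfig dev switch0 show to dump port and VLAN state; the tool speaks the netlink protocol introduced by this patch.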

View File

@ -1,74 +0,0 @@
From 82985725e071f2a5735052f18e109a32aeac3a0b Mon Sep 17 00:00:00 2001
From: David Bauer <mail@david-bauer.net>
Date: Sun, 26 Jul 2020 02:38:31 +0200
Subject: [PATCH] net: usb: r8152: add LED configuration from OF
This adds the ability to configure the LED configuration register using
OF. This way, the correct value for a board-specific LED configuration
can be applied.
Signed-off-by: David Bauer <mail@david-bauer.net>
---
drivers/net/usb/r8152.c | 23 +++++++++++++++++++++++
1 file changed, 23 insertions(+)
--- a/drivers/net/usb/r8152.c
+++ b/drivers/net/usb/r8152.c
@@ -11,6 +11,7 @@
#include <linux/mii.h>
#include <linux/ethtool.h>
#include <linux/usb.h>
+#include <linux/of.h>
#include <linux/crc32.h>
#include <linux/if_vlan.h>
#include <linux/uaccess.h>
@@ -6866,6 +6867,22 @@ static void rtl_tally_reset(struct r8152
ocp_write_word(tp, MCU_TYPE_PLA, PLA_RSTTALLY, ocp_data);
}
+static int r8152_led_configuration(struct r8152 *tp)
+{
+ u32 led_data;
+ int ret;
+
+ ret = of_property_read_u32(tp->udev->dev.of_node, "realtek,led-data",
+ &led_data);
+
+ if (ret)
+ return ret;
+
+ ocp_write_word(tp, MCU_TYPE_PLA, PLA_LEDSEL, led_data);
+
+ return 0;
+}
+
static void r8152b_init(struct r8152 *tp)
{
u32 ocp_data;
@@ -6907,6 +6924,8 @@ static void r8152b_init(struct r8152 *tp
ocp_data = ocp_read_word(tp, MCU_TYPE_USB, USB_USB_CTRL);
ocp_data &= ~(RX_AGG_DISABLE | RX_ZERO_EN);
ocp_write_word(tp, MCU_TYPE_USB, USB_USB_CTRL, ocp_data);
+
+ r8152_led_configuration(tp);
}
static void r8153_init(struct r8152 *tp)
@@ -7047,6 +7066,8 @@ static void r8153_init(struct r8152 *tp)
tp->coalesce = COALESCE_SLOW;
break;
}
+
+ r8152_led_configuration(tp);
}
static void r8153b_init(struct r8152 *tp)
@@ -7129,6 +7150,8 @@ static void r8153b_init(struct r8152 *tp
rtl_tally_reset(tp);
tp->coalesce = 15000; /* 15 us */
+
+ r8152_led_configuration(tp);
}
static void r8153c_init(struct r8152 *tp)

View File

@ -1,54 +0,0 @@
From 3ee05f4aa64fc86af3be5bc176ba5808de9260a7 Mon Sep 17 00:00:00 2001
From: David Bauer <mail@david-bauer.net>
Date: Sun, 26 Jul 2020 15:30:33 +0200
Subject: [PATCH] dt-bindings: net: add RTL8152 binding documentation
Add binding documentation for the Realtek RTL8152 / RTL8153 USB ethernet
adapters.
Signed-off-by: David Bauer <mail@david-bauer.net>
---
.../bindings/net/realtek,rtl8152.yaml | 36 +++++++++++++++++++
1 file changed, 36 insertions(+)
create mode 100644 Documentation/devicetree/bindings/net/realtek,rtl8152.yaml
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/realtek,rtl8152.yaml
@@ -0,0 +1,36 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/realtek,rtl8152.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Realtek RTL8152/RTL8153 series USB ethernet
+
+maintainers:
+ - David Bauer <mail@david-bauer.net>
+
+properties:
+ compatible:
+ oneOf:
+ - items:
+ - enum:
+ - realtek,rtl8152
+ - realtek,rtl8153
+
+ reg:
+ description: The device number on the USB bus
+
+ realtek,led-data:
+ description: Value to be written to the LED configuration register.
+
+required:
+ - compatible
+ - reg
+
+examples:
+ - |
+ usb-eth@2 {
+ compatible = "realtek,rtl8153";
+ reg = <2>;
+ realtek,led-data = <0x87>;
+ };
\ No newline at end of file

View File

@ -1,98 +0,0 @@
From 3cb240533ab787899dc7f17aa7d6c5b4810e2e58 Mon Sep 17 00:00:00 2001
From: Hauke Mehrtens <hauke@hauke-m.de>
Date: Fri, 7 Jul 2017 17:26:01 +0200
Subject: bcm53xx: bgmac: use srab switch driver
Use the srab switch driver on these SoCs.
Signed-off-by: Hauke Mehrtens <hauke@hauke-m.de>
---
drivers/net/ethernet/broadcom/bgmac-bcma.c | 1 +
drivers/net/ethernet/broadcom/bgmac.c | 24 ++++++++++++++++++++++++
drivers/net/ethernet/broadcom/bgmac.h | 4 ++++
3 files changed, 29 insertions(+)
--- a/drivers/net/ethernet/broadcom/bgmac-bcma.c
+++ b/drivers/net/ethernet/broadcom/bgmac-bcma.c
@@ -280,6 +280,7 @@ static int bgmac_probe(struct bcma_devic
bgmac->feature_flags |= BGMAC_FEAT_CLKCTLST;
bgmac->feature_flags |= BGMAC_FEAT_NO_RESET;
bgmac->feature_flags |= BGMAC_FEAT_FORCE_SPEED_2500;
+ bgmac->feature_flags |= BGMAC_FEAT_SRAB;
break;
default:
bgmac->feature_flags |= BGMAC_FEAT_CLKCTLST;
--- a/drivers/net/ethernet/broadcom/bgmac.c
+++ b/drivers/net/ethernet/broadcom/bgmac.c
@@ -12,6 +12,7 @@
#include <linux/bcma/bcma.h>
#include <linux/etherdevice.h>
#include <linux/interrupt.h>
+#include <linux/platform_data/b53.h>
#include <linux/bcm47xx_nvram.h>
#include <linux/phy.h>
#include <linux/phy_fixed.h>
@@ -1408,6 +1409,17 @@ static const struct ethtool_ops bgmac_et
.set_link_ksettings = phy_ethtool_set_link_ksettings,
};
+static struct b53_platform_data bgmac_b53_pdata = {
+};
+
+static struct platform_device bgmac_b53_dev = {
+ .name = "b53-srab-switch",
+ .id = -1,
+ .dev = {
+ .platform_data = &bgmac_b53_pdata,
+ },
+};
+
/**************************************************
* MII
**************************************************/
@@ -1542,6 +1554,14 @@ int bgmac_enet_probe(struct bgmac *bgmac
/* Omit FCS from max MTU size */
net_dev->max_mtu = BGMAC_RX_MAX_FRAME_SIZE - ETH_FCS_LEN;
+ if ((bgmac->feature_flags & BGMAC_FEAT_SRAB) && !bgmac_b53_pdata.regs) {
+ bgmac_b53_pdata.regs = ioremap(0x18007000, 0x1000);
+
+ err = platform_device_register(&bgmac_b53_dev);
+ if (!err)
+ bgmac->b53_device = &bgmac_b53_dev;
+ }
+
err = register_netdev(bgmac->net_dev);
if (err) {
dev_err(bgmac->dev, "Cannot register net device\n");
@@ -1564,6 +1584,10 @@ EXPORT_SYMBOL_GPL(bgmac_enet_probe);
void bgmac_enet_remove(struct bgmac *bgmac)
{
+ if (bgmac->b53_device)
+ platform_device_unregister(&bgmac_b53_dev);
+ bgmac->b53_device = NULL;
+
unregister_netdev(bgmac->net_dev);
phy_disconnect(bgmac->net_dev->phydev);
netif_napi_del(&bgmac->napi);
--- a/drivers/net/ethernet/broadcom/bgmac.h
+++ b/drivers/net/ethernet/broadcom/bgmac.h
@@ -388,6 +388,7 @@
#define BGMAC_FEAT_CC4_IF_SW_TYPE_RGMII BIT(18)
#define BGMAC_FEAT_CC7_IF_TYPE_RGMII BIT(19)
#define BGMAC_FEAT_IDM_MASK BIT(20)
+#define BGMAC_FEAT_SRAB BIT(21)
struct bgmac_slot_info {
union {
@@ -493,6 +494,9 @@ struct bgmac {
void (*cmn_maskset32)(struct bgmac *bgmac, u16 offset, u32 mask,
u32 set);
int (*phy_connect)(struct bgmac *bgmac);
+
+ /* platform device for associated switch */
+ struct platform_device *b53_device;
};
struct bgmac *bgmac_alloc(struct device *dev);

View File

@ -1,33 +0,0 @@
--- a/drivers/net/usb/qmi_wwan.c
+++ b/drivers/net/usb/qmi_wwan.c
@@ -1088,6 +1088,7 @@ static const struct usb_device_id produc
{QMI_MATCH_FF_FF_FF(0x2c7c, 0x0620)}, /* Quectel EM160R-GL */
{QMI_MATCH_FF_FF_FF(0x2c7c, 0x0800)}, /* Quectel RM500Q-GL */
{QMI_MATCH_FF_FF_FF(0x2c7c, 0x0801)}, /* Quectel RM520N */
+ {QMI_MATCH_FF_FF_FF(0x05c6, 0xf601)}, /* MeigLink SLM750 */
/* 3. Combined interface devices matching on interface number */
{QMI_FIXED_INTF(0x0408, 0xea42, 4)}, /* Yota / Megafon M100-1 */
--- a/drivers/usb/serial/option.c
+++ b/drivers/usb/serial/option.c
@@ -244,6 +244,8 @@ static void option_instat_callback(struc
#define UBLOX_PRODUCT_R410M 0x90b2
/* These Yuga products use Qualcomm's vendor ID */
#define YUGA_PRODUCT_CLM920_NC5 0x9625
+/* These MeigLink products use Qualcomm's vendor ID */
+#define MEIGLINK_PRODUCT_SLM750 0xf601
#define QUECTEL_VENDOR_ID 0x2c7c
/* These Quectel products use Quectel's vendor ID */
@@ -1155,6 +1157,11 @@ static const struct usb_device_id option
.driver_info = ZLP },
{ USB_DEVICE(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_BG96),
.driver_info = RSVD(4) },
+ /* Meiglink products using Qualcomm vendor ID */
+ // Works OK. In case of issues, check the macros used by the Quectel products above.
+ { USB_DEVICE_AND_INTERFACE_INFO(QUALCOMM_VENDOR_ID, MEIGLINK_PRODUCT_SLM750, 0xff, 0xff, 0xff),
+ .driver_info = NUMEP2 },
+ { USB_DEVICE_AND_INTERFACE_INFO(QUALCOMM_VENDOR_ID, MEIGLINK_PRODUCT_SLM750, 0xff, 0, 0) },
{ USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EP06, 0xff, 0xff, 0xff),
.driver_info = RSVD(1) | RSVD(2) | RSVD(3) | RSVD(4) | NUMEP2 },
{ USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EP06, 0xff, 0, 0) },

View File

@ -1,162 +0,0 @@
From cc809a441d8f2924f785eb863dfa6aef47a25b0b Mon Sep 17 00:00:00 2001
From: John Crispin <blogic@openwrt.org>
Date: Tue, 12 Aug 2014 20:49:27 +0200
Subject: [PATCH 30/36] GPIO: add named gpio exports
Signed-off-by: John Crispin <blogic@openwrt.org>
--- a/drivers/gpio/gpiolib-of.c
+++ b/drivers/gpio/gpiolib-of.c
@@ -19,6 +19,8 @@
#include <linux/pinctrl/pinctrl.h>
#include <linux/slab.h>
#include <linux/gpio/machine.h>
+#include <linux/init.h>
+#include <linux/platform_device.h>
#include "gpiolib.h"
#include "gpiolib-of.h"
@@ -1066,3 +1068,72 @@ void of_gpio_dev_init(struct gpio_chip *
else
gc->of_node = gdev->dev.of_node;
}
+
+#ifdef CONFIG_GPIO_SYSFS
+
+static struct of_device_id gpio_export_ids[] = {
+ { .compatible = "gpio-export" },
+ { /* sentinel */ }
+};
+
+static int of_gpio_export_probe(struct platform_device *pdev)
+{
+ struct device_node *np = pdev->dev.of_node;
+ struct device_node *cnp;
+ u32 val;
+ int nb = 0;
+
+ for_each_child_of_node(np, cnp) {
+ const char *name = NULL;
+ int gpio;
+ bool dmc;
+ int max_gpio = 1;
+ int i;
+
+ of_property_read_string(cnp, "gpio-export,name", &name);
+
+ if (!name)
+ max_gpio = of_gpio_count(cnp);
+
+ for (i = 0; i < max_gpio; i++) {
+ unsigned flags = 0;
+ enum of_gpio_flags of_flags;
+
+ gpio = of_get_gpio_flags(cnp, i, &of_flags);
+ if (!gpio_is_valid(gpio))
+ return gpio;
+
+ if (of_flags == OF_GPIO_ACTIVE_LOW)
+ flags |= GPIOF_ACTIVE_LOW;
+
+ if (!of_property_read_u32(cnp, "gpio-export,output", &val))
+ flags |= val ? GPIOF_OUT_INIT_HIGH : GPIOF_OUT_INIT_LOW;
+ else
+ flags |= GPIOF_IN;
+
+ if (devm_gpio_request_one(&pdev->dev, gpio, flags, name ? name : of_node_full_name(np)))
+ continue;
+
+ dmc = of_property_read_bool(cnp, "gpio-export,direction_may_change");
+ gpio_export_with_name(gpio, dmc, name);
+ nb++;
+ }
+ }
+
+ dev_info(&pdev->dev, "%d gpio(s) exported\n", nb);
+
+ return 0;
+}
+
+static struct platform_driver gpio_export_driver = {
+ .driver = {
+ .name = "gpio-export",
+ .owner = THIS_MODULE,
+ .of_match_table = of_match_ptr(gpio_export_ids),
+ },
+ .probe = of_gpio_export_probe,
+};
+
+module_platform_driver(gpio_export_driver);
+
+#endif
--- a/include/asm-generic/gpio.h
+++ b/include/asm-generic/gpio.h
@@ -125,6 +125,12 @@ static inline int gpio_export(unsigned g
return gpiod_export(gpio_to_desc(gpio), direction_may_change);
}
+int __gpiod_export(struct gpio_desc *desc, bool direction_may_change, const char *name);
+static inline int gpio_export_with_name(unsigned gpio, bool direction_may_change, const char *name)
+{
+ return __gpiod_export(gpio_to_desc(gpio), direction_may_change, name);
+}
+
static inline int gpio_export_link(struct device *dev, const char *name,
unsigned gpio)
{
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -728,6 +728,7 @@ static inline struct gpio_desc *acpi_get
#if IS_ENABLED(CONFIG_GPIOLIB) && IS_ENABLED(CONFIG_GPIO_SYSFS)
+int __gpiod_export(struct gpio_desc *desc, bool direction_may_change, const char *name);
int gpiod_export(struct gpio_desc *desc, bool direction_may_change);
int gpiod_export_link(struct device *dev, const char *name,
struct gpio_desc *desc);
@@ -735,6 +736,13 @@ void gpiod_unexport(struct gpio_desc *de
#else /* CONFIG_GPIOLIB && CONFIG_GPIO_SYSFS */
+static inline int __gpiod_export(struct gpio_desc *desc,
+ bool direction_may_change,
+ const char *name)
+{
+ return -ENOSYS;
+}
+
static inline int gpiod_export(struct gpio_desc *desc,
bool direction_may_change)
{
--- a/drivers/gpio/gpiolib-sysfs.c
+++ b/drivers/gpio/gpiolib-sysfs.c
@@ -544,7 +544,7 @@ static struct class gpio_class = {
*
* Returns zero on success, else an error.
*/
-int gpiod_export(struct gpio_desc *desc, bool direction_may_change)
+int __gpiod_export(struct gpio_desc *desc, bool direction_may_change, const char *name)
{
struct gpio_chip *chip;
struct gpio_device *gdev;
@@ -606,6 +606,8 @@ int gpiod_export(struct gpio_desc *desc,
offset = gpio_chip_hwgpio(desc);
if (chip->names && chip->names[offset])
ioname = chip->names[offset];
+ if (name)
+ ioname = name;
dev = device_create_with_groups(&gpio_class, &gdev->dev,
MKDEV(0, 0), data, gpio_groups,
@@ -627,6 +629,12 @@ err_unlock:
gpiod_dbg(desc, "%s: status %d\n", __func__, status);
return status;
}
+EXPORT_SYMBOL_GPL(__gpiod_export);
+
+int gpiod_export(struct gpio_desc *desc, bool direction_may_change)
+{
+ return __gpiod_export(desc, direction_may_change, NULL);
+}
EXPORT_SYMBOL_GPL(gpiod_export);
static int match_export(struct device *dev, const void *desc)
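For illustration only (not part of the patch): a minimal, hypothetical driver using the gpio_export_with_name() helper added above. The GPIO number and the "power-led" label are made-up values.

#include <linux/gpio.h>
#include <linux/init.h>
#include <linux/module.h>

/* Hypothetical consumer: request GPIO 17, drive it low, then export it
 * to sysfs under a custom name so it appears as
 * /sys/class/gpio/power-led instead of gpio17.
 */
static int __init demo_named_export_init(void)
{
	int err;

	err = gpio_request_one(17, GPIOF_OUT_INIT_LOW, "power-led");
	if (err)
		return err;

	/* true: allow userspace to change the direction later */
	return gpio_export_with_name(17, true, "power-led");
}
module_init(demo_named_export_init);

MODULE_LICENSE("GPL");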

View File

@ -1,408 +0,0 @@
From 9e3f1d0805b2d919904dd9a4ff0d956314cc3cba Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Sat, 8 Jul 2017 08:20:09 +0200
Subject: debloat: procfs
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
fs/locks.c | 2 ++
fs/proc/Kconfig | 5 +++++
fs/proc/consoles.c | 3 +++
fs/proc/proc_tty.c | 11 ++++++++++-
include/net/snmp.h | 18 +++++++++++++++++-
ipc/msg.c | 3 +++
ipc/sem.c | 2 ++
ipc/shm.c | 2 ++
ipc/util.c | 3 +++
kernel/exec_domain.c | 2 ++
kernel/irq/proc.c | 9 +++++++++
kernel/time/timer_list.c | 2 ++
mm/vmalloc.c | 2 ++
mm/vmstat.c | 8 +++++---
net/8021q/vlanproc.c | 6 ++++++
net/core/net-procfs.c | 18 ++++++++++++------
net/core/sock.c | 2 ++
net/ipv4/fib_trie.c | 18 ++++++++++++------
net/ipv4/proc.c | 3 +++
net/ipv4/route.c | 3 +++
20 files changed, 105 insertions(+), 17 deletions(-)
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2885,6 +2885,8 @@ static const struct seq_operations locks
static int __init proc_locks_init(void)
{
+ if (IS_ENABLED(CONFIG_PROC_STRIPPED))
+ return 0;
proc_create_seq_private("locks", 0, NULL, &locks_seq_operations,
sizeof(struct locks_iterator), NULL);
return 0;
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -100,6 +100,11 @@ config PROC_CHILDREN
Say Y if you are running any user-space software which takes benefit from
this interface. For example, rkt is such a piece of software.
+config PROC_STRIPPED
+ default n
+ depends on EXPERT
+ bool "Strip non-essential /proc functionality to reduce code size"
+
config PROC_PID_ARCH_STATUS
def_bool n
depends on PROC_FS
--- a/fs/proc/consoles.c
+++ b/fs/proc/consoles.c
@@ -92,6 +92,9 @@ static const struct seq_operations conso
static int __init proc_consoles_init(void)
{
+ if (IS_ENABLED(CONFIG_PROC_STRIPPED))
+ return 0;
+
proc_create_seq("consoles", 0, NULL, &consoles_op);
return 0;
}
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -131,7 +131,10 @@ static const struct seq_operations tty_d
void proc_tty_register_driver(struct tty_driver *driver)
{
struct proc_dir_entry *ent;
-
+
+ if (IS_ENABLED(CONFIG_PROC_STRIPPED))
+ return;
+
if (!driver->driver_name || driver->proc_entry ||
!driver->ops->proc_show)
return;
@@ -148,6 +151,9 @@ void proc_tty_unregister_driver(struct t
{
struct proc_dir_entry *ent;
+ if (IS_ENABLED(CONFIG_PROC_STRIPPED))
+ return;
+
ent = driver->proc_entry;
if (!ent)
return;
@@ -162,6 +168,9 @@ void proc_tty_unregister_driver(struct t
*/
void __init proc_tty_init(void)
{
+ if (IS_ENABLED(CONFIG_PROC_STRIPPED))
+ return;
+
if (!proc_mkdir("tty", NULL))
return;
proc_mkdir("tty/ldisc", NULL); /* Preserved: it's userspace visible */
--- a/include/net/snmp.h
+++ b/include/net/snmp.h
@@ -124,6 +124,21 @@ struct linux_tls_mib {
#define DECLARE_SNMP_STAT(type, name) \
extern __typeof__(type) __percpu *name
+#ifdef CONFIG_PROC_STRIPPED
+#define __SNMP_STATS_DUMMY(mib) \
+ do { (void) mib->mibs[0]; } while(0)
+
+#define __SNMP_INC_STATS(mib, field) __SNMP_STATS_DUMMY(mib)
+#define SNMP_INC_STATS_ATOMIC_LONG(mib, field) __SNMP_STATS_DUMMY(mib)
+#define SNMP_INC_STATS(mib, field) __SNMP_STATS_DUMMY(mib)
+#define SNMP_DEC_STATS(mib, field) __SNMP_STATS_DUMMY(mib)
+#define __SNMP_ADD_STATS(mib, field, addend) __SNMP_STATS_DUMMY(mib)
+#define SNMP_ADD_STATS(mib, field, addend) __SNMP_STATS_DUMMY(mib)
+#define SNMP_UPD_PO_STATS(mib, basefield, addend) __SNMP_STATS_DUMMY(mib)
+#define __SNMP_UPD_PO_STATS(mib, basefield, addend) __SNMP_STATS_DUMMY(mib)
+
+#else
+
#define __SNMP_INC_STATS(mib, field) \
__this_cpu_inc(mib->mibs[field])
@@ -154,8 +169,9 @@ struct linux_tls_mib {
__this_cpu_add(ptr[basefield##OCTETS], addend); \
} while (0)
+#endif
-#if BITS_PER_LONG==32
+#if (BITS_PER_LONG==32) && !defined(CONFIG_PROC_STRIPPED)
#define __SNMP_ADD_STATS64(mib, field, addend) \
do { \
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -1350,6 +1350,9 @@ void __init msg_init(void)
{
msg_init_ns(&init_ipc_ns);
+ if (IS_ENABLED(CONFIG_PROC_STRIPPED))
+ return;
+
ipc_init_proc_interface("sysvipc/msg",
" key msqid perms cbytes qnum lspid lrpid uid gid cuid cgid stime rtime ctime\n",
IPC_MSG_IDS, sysvipc_msg_proc_show);
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -268,6 +268,8 @@ void sem_exit_ns(struct ipc_namespace *n
void __init sem_init(void)
{
sem_init_ns(&init_ipc_ns);
+ if (IS_ENABLED(CONFIG_PROC_STRIPPED))
+ return;
ipc_init_proc_interface("sysvipc/sem",
" key semid perms nsems uid gid cuid cgid otime ctime\n",
IPC_SEM_IDS, sysvipc_sem_proc_show);
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -154,6 +154,8 @@ pure_initcall(ipc_ns_init);
void __init shm_init(void)
{
+ if (IS_ENABLED(CONFIG_PROC_STRIPPED))
+ return;
ipc_init_proc_interface("sysvipc/shm",
#if BITS_PER_LONG <= 32
" key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime rss swap\n",
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -141,6 +141,9 @@ void __init ipc_init_proc_interface(cons
struct proc_dir_entry *pde;
struct ipc_proc_iface *iface;
+ if (IS_ENABLED(CONFIG_PROC_STRIPPED))
+ return;
+
iface = kmalloc(sizeof(*iface), GFP_KERNEL);
if (!iface)
return;
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -29,6 +29,8 @@ static int execdomains_proc_show(struct
static int __init proc_execdomains_init(void)
{
+ if (IS_ENABLED(CONFIG_PROC_STRIPPED))
+ return 0;
proc_create_single("execdomains", 0, NULL, execdomains_proc_show);
return 0;
}
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -341,6 +341,9 @@ void register_irq_proc(unsigned int irq,
void __maybe_unused *irqp = (void *)(unsigned long) irq;
char name [MAX_NAMELEN];
+ if (IS_ENABLED(CONFIG_PROC_STRIPPED) && !IS_ENABLED(CONFIG_SMP))
+ return;
+
if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip))
return;
@@ -394,6 +397,9 @@ void unregister_irq_proc(unsigned int ir
{
char name [MAX_NAMELEN];
+ if (IS_ENABLED(CONFIG_PROC_STRIPPED) && !IS_ENABLED(CONFIG_SMP))
+ return;
+
if (!root_irq_dir || !desc->dir)
return;
#ifdef CONFIG_SMP
@@ -432,6 +438,9 @@ void init_irq_proc(void)
unsigned int irq;
struct irq_desc *desc;
+ if (IS_ENABLED(CONFIG_PROC_STRIPPED) && !IS_ENABLED(CONFIG_SMP))
+ return;
+
/* create /proc/irq */
root_irq_dir = proc_mkdir("irq", NULL);
if (!root_irq_dir)
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -350,6 +350,8 @@ static int __init init_timer_list_procfs
{
struct proc_dir_entry *pe;
+ if (IS_ENABLED(CONFIG_PROC_STRIPPED))
+ return 0;
pe = proc_create_seq_private("timer_list", 0400, NULL, &timer_list_sops,
sizeof(struct timer_list_iter), NULL);
if (!pe)
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -4160,6 +4160,8 @@ static const struct seq_operations vmall
static int __init proc_vmalloc_init(void)
{
+ if (IS_ENABLED(CONFIG_PROC_STRIPPED))
+ return 0;
if (IS_ENABLED(CONFIG_NUMA))
proc_create_seq_private("vmallocinfo", 0400, NULL,
&vmalloc_op,
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -2127,10 +2127,12 @@ void __init init_mm_internals(void)
#endif
migrate_on_reclaim_init();
#ifdef CONFIG_PROC_FS
- proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
- proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED)) {
+ proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
+ proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
+ proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
+ }
proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
- proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
#endif
}
--- a/net/8021q/vlanproc.c
+++ b/net/8021q/vlanproc.c
@@ -93,6 +93,9 @@ void vlan_proc_cleanup(struct net *net)
{
struct vlan_net *vn = net_generic(net, vlan_net_id);
+ if (IS_ENABLED(CONFIG_PROC_STRIPPED))
+ return;
+
if (vn->proc_vlan_conf)
remove_proc_entry(name_conf, vn->proc_vlan_dir);
@@ -112,6 +115,9 @@ int __net_init vlan_proc_init(struct net
{
struct vlan_net *vn = net_generic(net, vlan_net_id);
+ if (IS_ENABLED(CONFIG_PROC_STRIPPED))
+ return 0;
+
vn->proc_vlan_dir = proc_net_mkdir(net, name_root, net->proc_net);
if (!vn->proc_vlan_dir)
goto err;
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -319,10 +319,12 @@ static int __net_init dev_proc_net_init(
if (!proc_create_net("dev", 0444, net->proc_net, &dev_seq_ops,
sizeof(struct seq_net_private)))
goto out;
- if (!proc_create_seq("softnet_stat", 0444, net->proc_net,
+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED) &&
+ !proc_create_seq("softnet_stat", 0444, net->proc_net,
&softnet_seq_ops))
goto out_dev;
- if (!proc_create_net("ptype", 0444, net->proc_net, &ptype_seq_ops,
+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED) &&
+ !proc_create_net("ptype", 0444, net->proc_net, &ptype_seq_ops,
sizeof(struct seq_net_private)))
goto out_softnet;
@@ -332,9 +334,11 @@ static int __net_init dev_proc_net_init(
out:
return rc;
out_ptype:
- remove_proc_entry("ptype", net->proc_net);
+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED))
+ remove_proc_entry("ptype", net->proc_net);
out_softnet:
- remove_proc_entry("softnet_stat", net->proc_net);
+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED))
+ remove_proc_entry("softnet_stat", net->proc_net);
out_dev:
remove_proc_entry("dev", net->proc_net);
goto out;
@@ -344,8 +348,10 @@ static void __net_exit dev_proc_net_exit
{
wext_proc_exit(net);
- remove_proc_entry("ptype", net->proc_net);
- remove_proc_entry("softnet_stat", net->proc_net);
+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED)) {
+ remove_proc_entry("ptype", net->proc_net);
+ remove_proc_entry("softnet_stat", net->proc_net);
+ }
remove_proc_entry("dev", net->proc_net);
}
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -4005,6 +4005,8 @@ static __net_initdata struct pernet_oper
static int __init proto_init(void)
{
+ if (IS_ENABLED(CONFIG_PROC_STRIPPED))
+ return 0;
return register_pernet_subsys(&proto_net_ops);
}
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -3031,11 +3031,13 @@ static const struct seq_operations fib_r
int __net_init fib_proc_init(struct net *net)
{
- if (!proc_create_net("fib_trie", 0444, net->proc_net, &fib_trie_seq_ops,
+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED) &&
+ !proc_create_net("fib_trie", 0444, net->proc_net, &fib_trie_seq_ops,
sizeof(struct fib_trie_iter)))
goto out1;
- if (!proc_create_net_single("fib_triestat", 0444, net->proc_net,
+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED) &&
+ !proc_create_net_single("fib_triestat", 0444, net->proc_net,
fib_triestat_seq_show, NULL))
goto out2;
@@ -3046,17 +3048,21 @@ int __net_init fib_proc_init(struct net
return 0;
out3:
- remove_proc_entry("fib_triestat", net->proc_net);
+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED))
+ remove_proc_entry("fib_triestat", net->proc_net);
out2:
- remove_proc_entry("fib_trie", net->proc_net);
+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED))
+ remove_proc_entry("fib_trie", net->proc_net);
out1:
return -ENOMEM;
}
void __net_exit fib_proc_exit(struct net *net)
{
- remove_proc_entry("fib_trie", net->proc_net);
- remove_proc_entry("fib_triestat", net->proc_net);
+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED)) {
+ remove_proc_entry("fib_trie", net->proc_net);
+ remove_proc_entry("fib_triestat", net->proc_net);
+ }
remove_proc_entry("route", net->proc_net);
}
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -553,5 +553,8 @@ static __net_initdata struct pernet_oper
int __init ip_misc_proc_init(void)
{
+ if (IS_ENABLED(CONFIG_PROC_STRIPPED))
+ return 0;
+
return register_pernet_subsys(&ip_proc_ops);
}
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -380,6 +380,9 @@ static struct pernet_operations ip_rt_pr
static int __init ip_rt_proc_init(void)
{
+ if (IS_ENABLED(CONFIG_PROC_STRIPPED))
+ return 0;
+
return register_pernet_subsys(&ip_rt_proc_ops);
}
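Every hunk in this patch applies the same idiom: the registration function returns early when PROC_STRIPPED is set, and because IS_ENABLED() is a compile-time constant the proc/seq_file code behind it can be discarded. A hypothetical sketch of the pattern (my_stats and my_seq_ops are stand-ins, not names from the patch):

#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

/* hypothetical seq_file implementation, elided */
extern const struct seq_operations my_seq_ops;

static int __init my_proc_init(void)
{
	/* compile-time constant: with CONFIG_PROC_STRIPPED=y the entry
	 * below is never created and the dead code is optimized away
	 */
	if (IS_ENABLED(CONFIG_PROC_STRIPPED))
		return 0;

	proc_create_seq("my_stats", 0444, NULL, &my_seq_ops);
	return 0;
}
fs_initcall(my_proc_init);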

View File

@ -1,93 +0,0 @@
From e3692cb2fcd5ba1244512a0f43b8118f65f1c375 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Sat, 8 Jul 2017 08:20:43 +0200
Subject: debloat: dmabuf
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
drivers/base/Kconfig | 2 +-
drivers/dma-buf/Makefile | 10 +++++++---
drivers/dma-buf/dma-buf.c | 4 +++-
kernel/sched/core.c | 1 +
4 files changed, 12 insertions(+), 5 deletions(-)
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -198,7 +198,7 @@ config SOC_BUS
source "drivers/base/regmap/Kconfig"
config DMA_SHARED_BUFFER
- bool
+ tristate
default n
select IRQ_WORK
help
--- a/drivers/dma-buf/heaps/Makefile
+++ b/drivers/dma-buf/heaps/Makefile
@@ -1,3 +1,3 @@
# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_DMABUF_HEAPS_SYSTEM) += system_heap.o
-obj-$(CONFIG_DMABUF_HEAPS_CMA) += cma_heap.o
+dma-buf-objs-$(CONFIG_DMABUF_HEAPS_SYSTEM) += system_heap.o
+dma-buf-objs-$(CONFIG_DMABUF_HEAPS_CMA) += cma_heap.o
--- a/drivers/dma-buf/Makefile
+++ b/drivers/dma-buf/Makefile
@@ -1,12 +1,14 @@
# SPDX-License-Identifier: GPL-2.0-only
-obj-y := dma-buf.o dma-fence.o dma-fence-array.o dma-fence-chain.o \
+obj-$(CONFIG_DMA_SHARED_BUFFER) := dma-shared-buffer.o
+
+dma-buf-objs-y := dma-buf.o dma-fence.o dma-fence-array.o dma-fence-chain.o \
dma-fence-unwrap.o dma-resv.o
-obj-$(CONFIG_DMABUF_HEAPS) += dma-heap.o
-obj-$(CONFIG_DMABUF_HEAPS) += heaps/
-obj-$(CONFIG_SYNC_FILE) += sync_file.o
-obj-$(CONFIG_SW_SYNC) += sw_sync.o sync_debug.o
-obj-$(CONFIG_UDMABUF) += udmabuf.o
-obj-$(CONFIG_DMABUF_SYSFS_STATS) += dma-buf-sysfs-stats.o
+dma-buf-objs-$(CONFIG_DMABUF_HEAPS) += dma-heap.o
+obj-$(CONFIG_DMABUF_HEAPS) += heaps/
+dma-buf-objs-$(CONFIG_SYNC_FILE) += sync_file.o
+dma-buf-objs-$(CONFIG_SW_SYNC) += sw_sync.o sync_debug.o
+dma-buf-objs-$(CONFIG_UDMABUF) += udmabuf.o
+dma-buf-objs-$(CONFIG_DMABUF_SYSFS_STATS) += dma-buf-sysfs-stats.o
dmabuf_selftests-y := \
selftest.o \
@@ -15,4 +17,6 @@ dmabuf_selftests-y := \
st-dma-fence-unwrap.o \
st-dma-resv.o
-obj-$(CONFIG_DMABUF_SELFTESTS) += dmabuf_selftests.o
+dma-buf-objs-$(CONFIG_DMABUF_SELFTESTS) += dmabuf_selftests.o
+
+dma-shared-buffer-objs := $(dma-buf-objs-y)
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -1589,4 +1589,5 @@ static void __exit dma_buf_deinit(void)
kern_unmount(dma_buf_mnt);
dma_buf_uninit_sysfs_statistics();
}
-__exitcall(dma_buf_deinit);
+module_exit(dma_buf_deinit);
+MODULE_LICENSE("GPL");
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4319,6 +4319,7 @@ int wake_up_state(struct task_struct *p,
{
return try_to_wake_up(p, state, 0);
}
+EXPORT_SYMBOL_GPL(wake_up_state);
/*
* Perform scheduler related setup for a newly forked process p.
--- a/fs/d_path.c
+++ b/fs/d_path.c
@@ -314,6 +314,7 @@ char *dynamic_dname(struct dentry *dentr
buffer += buflen - sz;
return memcpy(buffer, temp, sz);
}
+EXPORT_SYMBOL_GPL(dynamic_dname);
char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
{

View File

@ -1,32 +0,0 @@
From 0d37e6edc09c99e683dd91ca0e83bbc0df8477b3 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Sun, 16 Jul 2017 16:56:10 +0200
Subject: lib: add uevent_next_seqnum()
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
include/linux/kobject.h | 5 +++++
lib/kobject_uevent.c | 37 +++++++++++++++++++++++++++++++++++++
2 files changed, 42 insertions(+)
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -179,6 +179,18 @@ out:
return r;
}
+u64 uevent_next_seqnum(void)
+{
+ u64 seq;
+
+ mutex_lock(&uevent_sock_mutex);
+ seq = ++uevent_seqnum;
+ mutex_unlock(&uevent_sock_mutex);
+
+ return seq;
+}
+EXPORT_SYMBOL_GPL(uevent_next_seqnum);
+
/**
* kobject_synth_uevent - send synthetic uevent with arguments
*
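A hypothetical caller of the new export (the matching declaration is the include/linux/kobject.h hunk listed in the diffstat); the message content is illustrative:

#include <linux/kobject.h>
#include <linux/printk.h>
#include <linux/types.h>

/* stamp a driver-private notification with the next kernel uevent
 * sequence number so userspace can order it against regular uevents
 */
static void demo_emit_event(void)
{
	u64 seq = uevent_next_seqnum();

	pr_info("demo: SEQNUM=%llu\n", (unsigned long long)seq);
}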

View File

@ -1,76 +0,0 @@
From 0d37e6edc09c99e683dd91ca0e83bbc0df8477b3 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Sun, 16 Jul 2017 16:56:10 +0200
Subject: lib: add uevent_next_seqnum()
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
include/linux/kobject.h | 5 +++++
lib/kobject_uevent.c | 37 +++++++++++++++++++++++++++++++++++++
2 files changed, 42 insertions(+)
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -32,6 +32,8 @@
#define UEVENT_NUM_ENVP 64 /* number of env pointers */
#define UEVENT_BUFFER_SIZE 2048 /* buffer for the variables */
+struct sk_buff;
+
#ifdef CONFIG_UEVENT_HELPER
/* path to the userspace helper executed on an event */
extern char uevent_helper[];
@@ -224,4 +226,7 @@ int kobject_synth_uevent(struct kobject
__printf(2, 3)
int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...);
+int broadcast_uevent(struct sk_buff *skb, __u32 pid, __u32 group,
+ gfp_t allocation);
+
#endif /* _KOBJECT_H_ */
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -691,6 +691,43 @@ int add_uevent_var(struct kobj_uevent_en
EXPORT_SYMBOL_GPL(add_uevent_var);
#if defined(CONFIG_NET)
+int broadcast_uevent(struct sk_buff *skb, __u32 pid, __u32 group,
+ gfp_t allocation)
+{
+ struct uevent_sock *ue_sk;
+ int err = 0;
+
+ /* send netlink message */
+ mutex_lock(&uevent_sock_mutex);
+ list_for_each_entry(ue_sk, &uevent_sock_list, list) {
+ struct sock *uevent_sock = ue_sk->sk;
+ struct sk_buff *skb2;
+
+ skb2 = skb_clone(skb, allocation);
+ if (!skb2)
+ break;
+
+ err = netlink_broadcast(uevent_sock, skb2, pid, group,
+ allocation);
+ if (err)
+ break;
+ }
+ mutex_unlock(&uevent_sock_mutex);
+
+ kfree_skb(skb);
+ return err;
+}
+#else
+int broadcast_uevent(struct sk_buff *skb, __u32 pid, __u32 group,
+ gfp_t allocation)
+{
+ kfree_skb(skb);
+ return 0;
+}
+#endif
+EXPORT_SYMBOL_GPL(broadcast_uevent);
+
+#if defined(CONFIG_NET)
static int uevent_net_broadcast(struct sock *usk, struct sk_buff *skb,
struct netlink_ext_ack *extack)
{
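A sketch of how the new export might be used; the payload and netlink group are hypothetical. Note that broadcast_uevent() consumes the skb on every path, so the caller must not free it afterwards:

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/kobject.h>
#include <linux/skbuff.h>
#include <linux/string.h>

static int demo_broadcast(const char *msg)
{
	struct sk_buff *skb = alloc_skb(strlen(msg) + 1, GFP_KERNEL);

	if (!skb)
		return -ENOMEM;

	skb_put_data(skb, msg, strlen(msg) + 1);

	/* pid 0, listener group 1; the skb is freed by the callee */
	return broadcast_uevent(skb, 0, 1, GFP_KERNEL);
}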

View File

@ -1,12 +0,0 @@
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -1179,6 +1179,9 @@ int __init early_init_dt_scan_chosen(cha
p = of_get_flat_dt_prop(node, "bootargs", &l);
if (p != NULL && l > 0)
strlcpy(cmdline, p, min(l, COMMAND_LINE_SIZE));
+ p = of_get_flat_dt_prop(node, "bootargs-append", &l);
+ if (p != NULL && l > 0)
+ strlcat(cmdline, p, min_t(int, strlen(cmdline) + (int)l, COMMAND_LINE_SIZE));
/*
* CONFIG_CMDLINE is meant to be a default in case nothing else

View File

@ -1,352 +0,0 @@
From 42824d4b753f84ccf885eca602c5037338b546c8 Mon Sep 17 00:00:00 2001
From: Zhi Chen <zhichen@codeaurora.org>
Date: Tue, 13 Jan 2015 14:28:18 -0800
Subject: [PATCH 3/3] net: conntrack events, support multiple registrant
Merging this patch from kernel 3.4:
This was supported by old (.28) kernel versions but removed
because of its overhead.
But we need this feature for the NA connection manager. Both the ipv4
and ipv6 modules need to register themselves to ct events.
Change-Id: Iebfb254590fb594f5baf232f849d1b7ae45ef757
Signed-off-by: Zhi Chen <zhichen@codeaurora.org>
---
include/net/netfilter/nf_conntrack_ecache.h | 15 ++-
include/net/netns/conntrack.h | 3 +
net/netfilter/Kconfig | 8 ++
net/netfilter/nf_conntrack_core.c | 4 +
net/netfilter/nf_conntrack_ecache.c | 103 +++++++++++++++++++-
net/netfilter/nf_conntrack_netlink.c | 17 ++++
6 files changed, 146 insertions(+), 4 deletions(-)
--- a/include/net/netfilter/nf_conntrack_ecache.h
+++ b/include/net/netfilter/nf_conntrack_ecache.h
@@ -65,9 +65,14 @@ struct nf_ct_event_notifier {
int (*exp_event)(unsigned int events, const struct nf_exp_event *item);
};
-void nf_conntrack_register_notifier(struct net *net,
+#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
+extern int nf_conntrack_register_notifier(struct net *net, struct notifier_block *nb);
+extern int nf_conntrack_unregister_notifier(struct net *net, struct notifier_block *nb);
+#else
+int nf_conntrack_register_notifier(struct net *net,
const struct nf_ct_event_notifier *nb);
void nf_conntrack_unregister_notifier(struct net *net);
+#endif
void nf_ct_deliver_cached_events(struct nf_conn *ct);
int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct,
@@ -98,11 +103,13 @@ static inline void
nf_conntrack_event_cache(enum ip_conntrack_events event, struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
- struct net *net = nf_ct_net(ct);
struct nf_conntrack_ecache *e;
+#ifndef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
+ struct net *net = nf_ct_net(ct);
if (!rcu_access_pointer(net->ct.nf_conntrack_event_cb))
return;
+#endif
e = nf_ct_ecache_find(ct);
if (e == NULL)
@@ -117,20 +124,34 @@ nf_conntrack_event_report(enum ip_conntr
u32 portid, int report)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
- if (nf_ct_ecache_exist(ct))
- return nf_conntrack_eventmask_report(1 << event, ct, portid, report);
+#ifndef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
+ const struct net *net = nf_ct_net(ct);
+
+ if (!rcu_access_pointer(net->ct.nf_conntrack_event_cb))
+ return 0;
#endif
+
+ return nf_conntrack_eventmask_report(1 << event, ct, portid, report);
+#else
return 0;
+#endif
}
static inline int
nf_conntrack_event(enum ip_conntrack_events event, struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
- if (nf_ct_ecache_exist(ct))
- return nf_conntrack_eventmask_report(1 << event, ct, 0, 0);
+#ifndef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
+ const struct net *net = nf_ct_net(ct);
+
+ if (!rcu_access_pointer(net->ct.nf_conntrack_event_cb))
+ return 0;
#endif
+
+ return nf_conntrack_eventmask_report(1 << event, ct, 0, 0);
+#else
return 0;
+#endif
}
#ifdef CONFIG_NF_CONNTRACK_EVENTS
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -106,6 +106,9 @@ struct netns_ct {
u8 sysctl_checksum;
struct ip_conntrack_stat __percpu *stat;
+#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
+ struct atomic_notifier_head nf_conntrack_chain;
+#endif
struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
struct nf_ip_net nf_ct_proto;
#if defined(CONFIG_NF_CONNTRACK_LABELS)
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -161,6 +161,14 @@ config NF_CONNTRACK_EVENTS
If unsure, say `N'.
+config NF_CONNTRACK_CHAIN_EVENTS
+ bool "Register multiple callbacks to ct events"
+ depends on NF_CONNTRACK_EVENTS
+ help
+ Support multiple registrations.
+
+ If unsure, say `N'.
+
config NF_CONNTRACK_TIMEOUT
bool 'Connection tracking timeout'
depends on NETFILTER_ADVANCED
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -2803,6 +2803,10 @@ int nf_conntrack_init_net(struct net *ne
nf_conntrack_ecache_pernet_init(net);
nf_conntrack_proto_pernet_init(net);
+#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
+ ATOMIC_INIT_NOTIFIER_HEAD(&net->ct.nf_conntrack_chain);
+#endif
+
return 0;
err_expect:
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -17,6 +17,9 @@
#include <linux/stddef.h>
#include <linux/err.h>
#include <linux/kernel.h>
+#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
+#include <linux/notifier.h>
+#endif
#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/export.h>
@@ -162,6 +165,35 @@ static int __nf_conntrack_eventmask_repo
return ret;
}
+#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
+int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct,
+ u32 portid, int report)
+{
+ struct nf_conntrack_ecache *e;
+ struct net *net = nf_ct_net(ct);
+
+ e = nf_ct_ecache_find(ct);
+ if (e == NULL)
+ return 0;
+
+ if (nf_ct_is_confirmed(ct)) {
+ struct nf_ct_event item = {
+ .ct = ct,
+ .portid = e->portid ? e->portid : portid,
+ .report = report
+ };
+ /* This is a resent of a destroy event? If so, skip missed */
+ unsigned long missed = e->portid ? 0 : e->missed;
+
+ if (!((eventmask | missed) & e->ctmask))
+ return 0;
+
+ atomic_notifier_call_chain(&net->ct.nf_conntrack_chain, eventmask | missed, &item);
+ }
+
+ return 0;
+}
+#else
int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct,
u32 portid, int report)
{
@@ -197,10 +229,52 @@ int nf_conntrack_eventmask_report(unsign
return ret;
}
+#endif
EXPORT_SYMBOL_GPL(nf_conntrack_eventmask_report);
/* deliver cached events and clear cache entry - must be called with locally
* disabled softirqs */
+#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
+void nf_ct_deliver_cached_events(struct nf_conn *ct)
+{
+ unsigned long events, missed;
+ struct nf_conntrack_ecache *e;
+ struct nf_ct_event item;
+ struct net *net = nf_ct_net(ct);
+
+ e = nf_ct_ecache_find(ct);
+ if (e == NULL)
+ return;
+
+ events = xchg(&e->cache, 0);
+
+ if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct) || !events)
+ return;
+
+ /* We make a copy of the missed event cache without taking
+ * the lock, thus we may send missed events twice. However,
+ * this does not harm and it happens very rarely. */
+ missed = e->missed;
+
+ if (!((events | missed) & e->ctmask))
+ return;
+
+ item.ct = ct;
+ item.portid = 0;
+ item.report = 0;
+
+ atomic_notifier_call_chain(&net->ct.nf_conntrack_chain,
+ events | missed,
+ &item);
+
+ if (likely(!missed))
+ return;
+
+ spin_lock_bh(&ct->lock);
+ e->missed &= ~missed;
+ spin_unlock_bh(&ct->lock);
+}
+#else
void nf_ct_deliver_cached_events(struct nf_conn *ct)
{
struct nf_conntrack_ecache *e;
@@ -226,6 +300,7 @@ void nf_ct_deliver_cached_events(struct
*/
__nf_conntrack_eventmask_report(e, events, e->missed, &item);
}
+#endif
EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);
void nf_ct_expect_event_report(enum ip_conntrack_expect_events event,
@@ -258,20 +333,43 @@ out_unlock:
rcu_read_unlock();
}
-void nf_conntrack_register_notifier(struct net *net,
+#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
+int nf_conntrack_register_notifier(struct net *net,
+ struct notifier_block *nb)
+{
+ return atomic_notifier_chain_register(&net->ct.nf_conntrack_chain, nb);
+}
+#else
+int nf_conntrack_register_notifier(struct net *net,
const struct nf_ct_event_notifier *new)
{
+ int ret;
struct nf_ct_event_notifier *notify;
mutex_lock(&nf_ct_ecache_mutex);
notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb,
lockdep_is_held(&nf_ct_ecache_mutex));
WARN_ON_ONCE(notify);
+ if (notify != NULL) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
rcu_assign_pointer(net->ct.nf_conntrack_event_cb, new);
- mutex_unlock(&nf_ct_ecache_mutex);
+ ret = 0;
+out_unlock:
+ mutex_unlock(&nf_ct_ecache_mutex);
+ return ret;
}
+#endif
EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier);
+#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
+int nf_conntrack_unregister_notifier(struct net *net, struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&net->ct.nf_conntrack_chain, nb);
+}
+#else
void nf_conntrack_unregister_notifier(struct net *net)
{
mutex_lock(&nf_ct_ecache_mutex);
@@ -279,6 +377,7 @@ void nf_conntrack_unregister_notifier(st
mutex_unlock(&nf_ct_ecache_mutex);
/* synchronize_rcu() is called after netns pre_exit */
}
+#endif
EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier);
void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state)
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -712,12 +712,19 @@ static size_t ctnetlink_nlmsg_size(const
}
static int
+#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
+ctnetlink_conntrack_event(struct notifier_block *this, unsigned long events, void *ptr)
+#else
ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item)
+#endif
{
const struct nf_conntrack_zone *zone;
struct net *net;
struct nlmsghdr *nlh;
struct nlattr *nest_parms;
+#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
+ struct nf_ct_event *item = (struct nf_ct_event *)ptr;
+#endif
struct nf_conn *ct = item->ct;
struct sk_buff *skb;
unsigned int type;
@@ -3749,11 +3756,17 @@ static int ctnetlink_stat_exp_cpu(struct
}
#ifdef CONFIG_NF_CONNTRACK_EVENTS
+#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
+static struct notifier_block ctnl_notifier = {
+ .notifier_call = ctnetlink_conntrack_event
+};
+#else
static struct nf_ct_event_notifier ctnl_notifier = {
.ct_event = ctnetlink_conntrack_event,
.exp_event = ctnetlink_expect_event,
};
#endif
+#endif
static const struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = {
[IPCTNL_MSG_CT_NEW] = {
@@ -3852,8 +3865,12 @@ static int __net_init ctnetlink_net_init
static void ctnetlink_net_pre_exit(struct net *net)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
+#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
+ nf_conntrack_unregister_notifier(net,&ctnl_notifier);
+#else
nf_conntrack_unregister_notifier(net);
#endif
+#endif
}
static struct pernet_operations ctnetlink_net_ops = {
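With CONFIG_NF_CONNTRACK_CHAIN_EVENTS set, a second consumer can now listen alongside ctnetlink. A hypothetical registrant module might look like this:

#include <linux/module.h>
#include <linux/notifier.h>
#include <net/net_namespace.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_ecache.h>

static int demo_ct_event(struct notifier_block *nb,
			 unsigned long events, void *ptr)
{
	struct nf_ct_event *item = ptr;

	if (events & (1 << IPCT_DESTROY))
		pr_debug("conntrack entry %p destroyed\n", item->ct);

	return NOTIFY_DONE;
}

static struct notifier_block demo_ct_nb = {
	.notifier_call = demo_ct_event,
};

static int __init demo_init(void)
{
	return nf_conntrack_register_notifier(&init_net, &demo_ct_nb);
}

static void __exit demo_exit(void)
{
	nf_conntrack_unregister_notifier(&init_net, &demo_ct_nb);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");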

View File

@ -1,204 +0,0 @@
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -69,6 +69,9 @@ void brioctl_set(int (*hook)(struct net
int br_ioctl_call(struct net *net, struct net_bridge *br, unsigned int cmd,
struct ifreq *ifr, void __user *uarg);
+extern void br_dev_update_stats(struct net_device *dev,
+ struct rtnl_link_stats64 *nlstats);
+
#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING)
int br_multicast_list_adjacent(struct net_device *dev,
struct list_head *br_ip_list);
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -979,6 +979,10 @@ struct sk_buff {
__u8 csum_not_inet:1;
__u8 scm_io_uring:1;
+#ifdef CONFIG_SHORTCUT_FE
+ __u8 fast_forwarded:1;
+#endif
+
#ifdef CONFIG_NET_SCHED
__u16 tc_index; /* traffic control index */
#endif
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -18,6 +18,10 @@ struct timer_list {
void (*function)(struct timer_list *);
u32 flags;
+#ifdef CONFIG_SHORTCUT_FE
+ unsigned long cust_data;
+#endif
+
#ifdef CONFIG_LOCKDEP
struct lockdep_map lockdep_map;
#endif
--- a/include/net/netfilter/nf_conntrack_ecache.h
+++ b/include/net/netfilter/nf_conntrack_ecache.h
@@ -68,6 +68,8 @@ struct nf_ct_event_notifier {
#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
extern int nf_conntrack_register_notifier(struct net *net, struct notifier_block *nb);
extern int nf_conntrack_unregister_notifier(struct net *net, struct notifier_block *nb);
+extern int nf_conntrack_register_chain_notifier(struct net *net, struct notifier_block *nb);
+extern int nf_conntrack_unregister_chain_notifier(struct net *net, struct notifier_block *nb);
#else
int nf_conntrack_register_notifier(struct net *net,
const struct nf_ct_event_notifier *nb);
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -460,6 +460,9 @@ config FAILOVER
migration of VMs with direct attached VFs by failing over to the
paravirtual datapath when the VF is unplugged.
+config SHORTCUT_FE
+ bool "Enables kernel network stack path for Shortcut Forwarding Engine"
+
config ETHTOOL_NETLINK
bool "Netlink interface for ethtool"
default y
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -774,6 +774,28 @@ void br_port_flags_change(struct net_bri
br_recalculate_neigh_suppress_enabled(br);
}
+void br_dev_update_stats(struct net_device *dev,
+ struct rtnl_link_stats64 *nlstats)
+{
+
+ struct pcpu_sw_netstats *stats;
+
+ /* Is this a bridge? */
+ if (!(dev->priv_flags & IFF_EBRIDGE))
+ return;
+
+
+ stats = this_cpu_ptr(dev->tstats);
+
+ u64_stats_update_begin(&stats->syncp);
+ u64_stats_add(&stats->rx_packets, nlstats->rx_packets);
+ u64_stats_add(&stats->rx_bytes, nlstats->rx_bytes);
+ u64_stats_add(&stats->tx_packets, nlstats->tx_packets);
+ u64_stats_add(&stats->tx_bytes, nlstats->tx_bytes);
+ u64_stats_update_end(&stats->syncp);
+}
+EXPORT_SYMBOL_GPL(br_dev_update_stats);
+
bool br_port_flag_is_set(const struct net_device *dev, unsigned long flag)
{
struct net_bridge_port *p;
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3581,9 +3581,17 @@ static int xmit_one(struct sk_buff *skb,
{
unsigned int len;
int rc;
-
+#ifdef CONFIG_SHORTCUT_FE
+ /* If this skb has been fast forwarded then we don't want it to
+ * go to any taps (by definition we're trying to bypass them).
+ */
+ if (!skb->fast_forwarded) {
+#endif
if (dev_nit_active(dev))
dev_queue_xmit_nit(skb, dev);
+#ifdef CONFIG_SHORTCUT_FE
+ }
+#endif
len = skb->len;
trace_net_dev_start_xmit(skb, dev);
@@ -5237,6 +5245,11 @@ void netdev_rx_handler_unregister(struct
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
+#ifdef CONFIG_SHORTCUT_FE
+int (*athrs_fast_nat_recv)(struct sk_buff *skb) __rcu __read_mostly;
+EXPORT_SYMBOL_GPL(athrs_fast_nat_recv);
+#endif
+
/*
* Limit the use of PFMEMALLOC reserves to those protocols that implement
* the special handling of PFMEMALLOC skbs.
@@ -5285,6 +5298,10 @@ static int __netif_receive_skb_core(stru
int ret = NET_RX_DROP;
__be16 type;
+#ifdef CONFIG_SHORTCUT_FE
+ int (*fast_recv)(struct sk_buff *skb);
+#endif
+
net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb);
trace_netif_receive_skb(skb);
@@ -5322,6 +5339,15 @@ another_round:
goto out;
}
+#ifdef CONFIG_SHORTCUT_FE
+ fast_recv = rcu_dereference(athrs_fast_nat_recv);
+ if (fast_recv) {
+ if (fast_recv(skb)) {
+ ret = NET_RX_SUCCESS;
+ goto out;
+ }
+ }
+#endif
if (skb_skip_tc_classify(skb))
goto skip_classify;
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -143,12 +143,23 @@ static int __nf_conntrack_eventmask_repo
rcu_read_lock();
notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
- if (!notify) {
+#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
+ if (!notify && !rcu_dereference_raw(net->ct.nf_conntrack_chain.head))
+#else
+ if (!notify)
+#endif
+ {
rcu_read_unlock();
return 0;
}
-
+#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
+ ret = atomic_notifier_call_chain(&net->ct.nf_conntrack_chain,
+ events | missed, &item);
+ if (notify)
+ ret = notify->ct_event(events | missed, item);
+#else
ret = notify->ct_event(events | missed, item);
+#endif
rcu_read_unlock();
if (likely(ret >= 0 && missed == 0))
@@ -339,6 +350,11 @@ int nf_conntrack_register_notifier(struc
{
return atomic_notifier_chain_register(&net->ct.nf_conntrack_chain, nb);
}
+int nf_conntrack_register_chain_notifier(struct net *net, struct notifier_block *nb)
+{
+ return atomic_notifier_chain_register(&net->ct.nf_conntrack_chain, nb);
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_register_chain_notifier);
#else
int nf_conntrack_register_notifier(struct net *net,
const struct nf_ct_event_notifier *new)
@@ -369,6 +385,11 @@ int nf_conntrack_unregister_notifier(str
{
return atomic_notifier_chain_unregister(&net->ct.nf_conntrack_chain, nb);
}
+int nf_conntrack_unregister_chain_notifier(struct net *net, struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&net->ct.nf_conntrack_chain, nb);
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_unregister_chain_notifier);
#else
void nf_conntrack_unregister_notifier(struct net *net)
{
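A hypothetical forwarding engine attaches to the new hook roughly as below; demo_fast_recv() stands in for the real connection lookup, and an skb it re-injects via dev_queue_xmit() would have skb->fast_forwarded set so xmit_one() skips the taps:

#include <linux/netdevice.h>
#include <linux/rcupdate.h>
#include <linux/skbuff.h>

/* declared (as an __rcu pointer) in net/core/dev.c by this patch */
extern int (*athrs_fast_nat_recv)(struct sk_buff *skb);

static int demo_fast_recv(struct sk_buff *skb)
{
	/* real code would look up a connection entry here and, on a
	 * hit, forward the skb directly and return 1 (consumed)
	 */
	return 0;	/* miss: hand the skb back to the normal stack */
}

static void demo_attach(void)
{
	rcu_assign_pointer(athrs_fast_nat_recv, demo_fast_recv);
}

static void demo_detach(void)
{
	rcu_assign_pointer(athrs_fast_nat_recv, NULL);
	synchronize_rcu();
}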

View File

@ -1,235 +0,0 @@
--- a/net/netfilter/nf_nat_masquerade.c
+++ b/net/netfilter/nf_nat_masquerade.c
@@ -8,6 +8,9 @@
#include <linux/netfilter_ipv6.h>
#include <net/netfilter/nf_nat_masquerade.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_core.h>
struct masq_dev_work {
struct work_struct work;
@@ -24,6 +27,129 @@ static DEFINE_MUTEX(masq_mutex);
static unsigned int masq_refcnt __read_mostly;
static atomic_t masq_worker_count __read_mostly;
+static void bcm_nat_expect(struct nf_conn *ct,
+ struct nf_conntrack_expect *exp)
+{
+ struct nf_nat_range2 range;
+
+ /* This must be a fresh one. */
+ BUG_ON(ct->status & IPS_NAT_DONE_MASK);
+
+ /* Change src to where new ct comes from */
+ range.flags = NF_NAT_RANGE_MAP_IPS;
+ range.min_addr = range.max_addr =
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3;
+ nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
+
+ /* For DST manip, map port here to where it's expected. */
+ range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
+ range.min_proto = range.max_proto = exp->saved_proto;
+ range.min_addr = range.max_addr = exp->saved_addr;
+ nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
+}
+
+/****************************************************************************/
+static int bcm_nat_help(struct sk_buff *skb, unsigned int protoff,
+ struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+ int dir = CTINFO2DIR(ctinfo);
+ struct nf_conn_help *help = nfct_help(ct);
+ struct nf_conntrack_expect *exp;
+
+ if (dir != IP_CT_DIR_ORIGINAL ||
+ help->expecting[NF_CT_EXPECT_CLASS_DEFAULT])
+ return NF_ACCEPT;
+
+ pr_debug("bcm_nat: packet[%d bytes] ", skb->len);
+ nf_ct_dump_tuple(&ct->tuplehash[dir].tuple);
+ pr_debug("reply: ");
+ nf_ct_dump_tuple(&ct->tuplehash[!dir].tuple);
+
+ /* Create expect */
+ if ((exp = nf_ct_expect_alloc(ct)) == NULL)
+ return NF_ACCEPT;
+
+ nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, AF_INET, NULL,
+ &ct->tuplehash[!dir].tuple.dst.u3, IPPROTO_UDP,
+ NULL, &ct->tuplehash[!dir].tuple.dst.u.udp.port);
+ exp->flags = NF_CT_EXPECT_PERMANENT;
+ exp->saved_addr = ct->tuplehash[dir].tuple.src.u3;
+ exp->saved_proto.udp.port = ct->tuplehash[dir].tuple.src.u.udp.port;
+ exp->dir = !dir;
+ exp->expectfn = bcm_nat_expect;
+
+ /* Setup expect */
+ nf_ct_expect_related(exp, 0);
+ nf_ct_expect_put(exp);
+ pr_debug("bcm_nat: expect setup\n");
+
+ return NF_ACCEPT;
+}
+
+/****************************************************************************/
+static struct nf_conntrack_expect_policy bcm_nat_exp_policy __read_mostly = {
+ .max_expected = 1000,
+ .timeout = 240,
+};
+
+/****************************************************************************/
+static struct nf_conntrack_helper nf_conntrack_helper_bcm_nat __read_mostly = {
+ .name = "BCM-NAT",
+ .me = THIS_MODULE,
+ .tuple.src.l3num = AF_INET,
+ .tuple.dst.protonum = IPPROTO_UDP,
+ .expect_policy = &bcm_nat_exp_policy,
+ .expect_class_max = 1,
+ .help = bcm_nat_help,
+};
+
+/****************************************************************************/
+static inline int find_exp(__be32 ip, __be16 port, struct nf_conn *ct)
+{
+ struct nf_conntrack_tuple tuple;
+ struct nf_conntrack_expect *i = NULL;
+
+
+ memset(&tuple, 0, sizeof(tuple));
+ tuple.src.l3num = AF_INET;
+ tuple.dst.protonum = IPPROTO_UDP;
+ tuple.dst.u3.ip = ip;
+ tuple.dst.u.udp.port = port;
+
+ rcu_read_lock();
+ i = __nf_ct_expect_find(nf_ct_net(ct), nf_ct_zone(ct), &tuple);
+ rcu_read_unlock();
+
+ return i != NULL;
+}
+
+/****************************************************************************/
+static inline struct nf_conntrack_expect *find_fullcone_exp(struct nf_conn *ct)
+{
+ struct nf_conntrack_tuple * tp =
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ struct nf_conntrack_expect * exp = NULL;
+ struct nf_conntrack_expect * i;
+ unsigned int h;
+
+ rcu_read_lock();
+ for (h = 0; h < nf_ct_expect_hsize; h++) {
+ hlist_for_each_entry_rcu(i, &nf_ct_expect_hash[h], hnode) {
+ if (nf_inet_addr_cmp(&i->saved_addr, &tp->src.u3) &&
+ i->saved_proto.all == tp->src.u.all &&
+ i->tuple.dst.protonum == tp->dst.protonum &&
+ i->tuple.src.u3.ip == 0 &&
+ i->tuple.src.u.udp.port == 0) {
+ exp = i;
+ break;
+ }
+ }
+ }
+ rcu_read_unlock();
+
+ return exp;
+}
+
unsigned int
nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
const struct nf_nat_range2 *range,
@@ -61,6 +187,72 @@ nf_nat_masquerade_ipv4(struct sk_buff *s
if (nat)
nat->masq_index = out->ifindex;
+/* RFC 4787 - 4.2.2. Port Parity
+ i.e., an even port will be mapped to an even port, and an odd port will be mapped to an odd port.
+*/
+#define CHECK_PORT_PARITY(a, b) ((a%2)==(b%2))
+ if (range->min_addr.ip != 0 /* nat_mode == full cone */
+ && (nfct_help(ct) == NULL || nfct_help(ct)->helper == NULL)
+ && nf_ct_protonum(ct) == IPPROTO_UDP) {
+ unsigned int ret;
+ u_int16_t minport;
+ u_int16_t maxport;
+ struct nf_conntrack_expect *exp;
+
+ pr_debug("bcm_nat: need full cone NAT\n");
+
+ /* Choose port */
+ spin_lock_bh(&nf_conntrack_expect_lock);
+ /* Look for existing expectation */
+ exp = find_fullcone_exp(ct);
+ if (exp) {
+ minport = maxport = exp->tuple.dst.u.udp.port;
+ pr_debug("bcm_nat: existing mapped port = %hu\n",
+ ntohs(minport));
+ } else { /* no previous expect */
+ u_int16_t newport, tmpport, orgport;
+
+ minport = range->min_proto.all == 0?
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.
+ u.udp.port : range->min_proto.all;
+ maxport = range->max_proto.all == 0?
+ htons(65535) : range->max_proto.all;
+ orgport = ntohs(minport);
+ for (newport = ntohs(minport),tmpport = ntohs(maxport);
+ newport <= tmpport; newport++) {
+ if (CHECK_PORT_PARITY(orgport, newport) && !find_exp(newsrc, htons(newport), ct)) {
+ pr_debug("bcm_nat: new mapped port = "
+ "%hu\n", newport);
+ minport = maxport = htons(newport);
+ break;
+ }
+ }
+ }
+ spin_unlock_bh(&nf_conntrack_expect_lock);
+
+
+ memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
+ memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
+
+ newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS |
+ NF_NAT_RANGE_PROTO_SPECIFIED;
+ newrange.max_addr.ip = newrange.min_addr.ip = newsrc;
+ newrange.min_proto.udp.port = newrange.max_proto.udp.port = minport;
+
+ /* Set ct helper */
+ ret = nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
+ if (ret == NF_ACCEPT) {
+ struct nf_conn_help *help = nfct_help(ct);
+ if (help == NULL)
+ help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
+ if (help != NULL) {
+ help->helper = &nf_conntrack_helper_bcm_nat;
+ pr_debug("bcm_nat: helper set\n");
+ }
+ }
+ return ret;
+ }
+
/* Transfer from original range. */
memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
@@ -352,6 +544,7 @@ EXPORT_SYMBOL_GPL(nf_nat_masquerade_inet
void nf_nat_masquerade_inet_unregister_notifiers(void)
{
+ nf_conntrack_helper_unregister(&nf_conntrack_helper_bcm_nat);
mutex_lock(&masq_mutex);
/* check if the notifiers still have clients */
if (--masq_refcnt > 0)
--- a/net/netfilter/xt_MASQUERADE.c
+++ b/net/netfilter/xt_MASQUERADE.c
@@ -42,6 +42,9 @@ masquerade_tg(struct sk_buff *skb, const
range.min_proto = mr->range[0].min;
range.max_proto = mr->range[0].max;
+ range.min_addr.ip = mr->range[0].min_ip;
+ range.max_addr.ip = mr->range[0].max_ip;
+
return nf_nat_masquerade_ipv4(skb, xt_hooknum(par), &range,
xt_out(par));
}
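The port-selection loop above implements the RFC 4787 parity rule. A standalone, hypothetical userspace illustration of the same search (in_use() stands in for the patch's find_exp() lookup):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* hypothetical stand-in for the patch's expectation lookup */
static bool in_use(uint16_t port)
{
	return port == 5000;	/* pretend port 5000 is already mapped */
}

/* pick the first free port in [lo, hi] with the same parity as orig */
static uint16_t pick_port(uint16_t orig, uint16_t lo, uint16_t hi)
{
	for (uint32_t p = lo; p <= hi; p++)
		if ((p % 2) == (orig % 2) && !in_use((uint16_t)p))
			return (uint16_t)p;
	return 0;	/* no candidate found */
}

int main(void)
{
	/* even source port 5000 is taken, so the next even port wins */
	printf("mapped to %u\n", pick_port(5000, 5000, 5010));	/* 5002 */
	return 0;
}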

View File

@ -1,12 +0,0 @@
--- a/net/wireless/wext-core.c
+++ b/net/wireless/wext-core.c
@@ -959,6 +959,9 @@ static int wireless_process_ioctl(struct
else if (private)
return private(dev, iwr, cmd, info, handler);
}
+ /* Old driver API : call driver ioctl handler */
+ if (dev->netdev_ops->ndo_do_ioctl)
+ return dev->netdev_ops->ndo_do_ioctl(dev, (struct ifreq *) iwr, cmd);
return -EOPNOTSUPP;
}

View File

@ -1,29 +0,0 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Thu, 22 Oct 2020 22:00:03 +0200
Subject: [PATCH] compiler.h: only include asm/rwonce.h for kernel code
This header file is not in uapi, which makes any user space code that includes
linux/compiler.h fail with the error 'asm/rwonce.h: No such file or directory'
Fixes: e506ea451254 ("compiler.h: Split {READ,WRITE}_ONCE definitions out into rwonce.h")
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -213,6 +213,8 @@ void ftrace_likely_update(struct ftrace_
#define function_nocfi(x) (x)
#endif
+#include <asm/rwonce.h>
+
#endif /* __KERNEL__ */
/*
@@ -251,6 +253,4 @@ static inline void *offset_to_ptr(const
*/
#define prevent_tail_call_optimization() mb()
-#include <asm/rwonce.h>
-
#endif /* __LINUX_COMPILER_H */

View File

@ -1,11 +0,0 @@
--- a/include/uapi/linux/swab.h
+++ b/include/uapi/linux/swab.h
@@ -3,7 +3,7 @@
#define _UAPI_LINUX_SWAB_H
#include <linux/types.h>
-#include <linux/compiler.h>
+#include <linux/stddef.h>
#include <asm/bitsperlong.h>
#include <asm/swab.h>

View File

@ -1,57 +0,0 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Wed, 18 Apr 2018 10:50:05 +0200
Subject: [PATCH] MIPS: only process negative stack offsets on stack traces
Fixes endless back traces in cases where the compiler emits a stack
pointer increase in a branch delay slot (probably for some form of
function return).
[ 3.475442] BUG: MAX_STACK_TRACE_ENTRIES too low!
[ 3.480070] turning off the locking correctness validator.
[ 3.485521] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.14.34 #0
[ 3.491475] Stack : 00000000 00000000 00000000 00000000 80e0fce2 00000034 00000000 00000000
[ 3.499764] 87c3838c 80696377 8061047c 00000000 00000001 00000001 87c2d850 6534689f
[ 3.508059] 00000000 00000000 80e10000 00000000 00000000 000000cf 0000000f 00000000
[ 3.516353] 00000000 806a0000 00076891 00000000 00000000 00000000 ffffffff 00000000
[ 3.524648] 806c0000 00000004 80e10000 806a0000 00000003 80690000 00000000 80700000
[ 3.532942] ...
[ 3.535362] Call Trace:
[ 3.537818] [<80010a48>] show_stack+0x58/0x100
[ 3.542207] [<804c2f78>] dump_stack+0xe8/0x170
[ 3.546613] [<80079f90>] save_trace+0xf0/0x110
[ 3.551010] [<8007b1ec>] mark_lock+0x33c/0x78c
[ 3.555413] [<8007bf48>] __lock_acquire+0x2ac/0x1a08
[ 3.560337] [<8007de60>] lock_acquire+0x64/0x8c
[ 3.564846] [<804e1570>] _raw_spin_lock_irqsave+0x54/0x78
[ 3.570186] [<801b618c>] kernfs_notify+0x94/0xac
[ 3.574770] [<801b7b10>] sysfs_notify+0x74/0xa0
[ 3.579257] [<801b618c>] kernfs_notify+0x94/0xac
[ 3.583839] [<801b7b10>] sysfs_notify+0x74/0xa0
[ 3.588329] [<801b618c>] kernfs_notify+0x94/0xac
[ 3.592911] [<801b7b10>] sysfs_notify+0x74/0xa0
[ 3.597401] [<801b618c>] kernfs_notify+0x94/0xac
[ 3.601983] [<801b7b10>] sysfs_notify+0x74/0xa0
[ 3.606473] [<801b618c>] kernfs_notify+0x94/0xac
[ 3.611055] [<801b7b10>] sysfs_notify+0x74/0xa0
[ 3.615545] [<801b618c>] kernfs_notify+0x94/0xac
[ 3.620125] [<801b7b10>] sysfs_notify+0x74/0xa0
[ 3.624619] [<801b618c>] kernfs_notify+0x94/0xac
[ 3.629197] [<801b7b10>] sysfs_notify+0x74/0xa0
[ 3.633691] [<801b618c>] kernfs_notify+0x94/0xac
[ 3.638269] [<801b7b10>] sysfs_notify+0x74/0xa0
[ 3.642763] [<801b618c>] kernfs_notify+0x94/0xac
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -394,6 +394,8 @@ static inline int is_sp_move_ins(union m
if (ip->i_format.opcode == addiu_op ||
ip->i_format.opcode == daddiu_op) {
+ if (ip->i_format.simmediate > 0)
+ return 0;
*frame_size = -ip->i_format.simmediate;
return 1;
}

View File

@ -1,82 +0,0 @@
From: Tobias Wolf <dev-NTEO@vplace.de>
Subject: mm: Fix alloc_node_mem_map with ARCH_PFN_OFFSET calculation
An rt288x (ralink) based router (Belkin F5D8235 v1) does not boot with any
kernel beyond version 4.3 resulting in:
BUG: Bad page state in process swapper pfn:086ac
bisect resulted in:
a1c34a3bf00af2cede839879502e12dc68491ad5 is the first bad commit
commit a1c34a3bf00af2cede839879502e12dc68491ad5
Author: Laura Abbott <laura@labbott.name>
Date: Thu Nov 5 18:48:46 2015 -0800
mm: Don't offset memmap for flatmem
Srinivas Kandagatla reported bad page messages when trying to remove the
bottom 2MB on an ARM based IFC6410 board
BUG: Bad page state in process swapper pfn:fffa8
page:ef7fb500 count:0 mapcount:0 mapping: (null) index:0x0
flags: 0x96640253(locked|error|dirty|active|arch_1|reclaim|mlocked)
page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set
bad because of flags:
flags: 0x200041(locked|active|mlocked)
Modules linked in:
CPU: 0 PID: 0 Comm: swapper Not tainted 3.19.0-rc3-00007-g412f9ba-dirty
#816
Hardware name: Qualcomm (Flattened Device Tree)
unwind_backtrace
show_stack
dump_stack
bad_page
free_pages_prepare
free_hot_cold_page
__free_pages
free_highmem_page
mem_init
start_kernel
Disabling lock debugging due to kernel taint
[...]
:040000 040000 2de013c372345fd471cd58f0553c9b38b0ef1cc4
0a8156f848733dfa21e16c196dfb6c0a76290709 M mm
This fix for ARM no longer accounts for ARCH_PFN_OFFSET in mem_map, which is
later used by page_to_pfn.
The following output was generated with two hacked in printk statements:
printk("before %p vs. %p or %p\n", mem_map, mem_map - offset, mem_map -
(pgdat->node_start_pfn - ARCH_PFN_OFFSET));
if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
mem_map -= offset + (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
printk("after %p\n", mem_map);
Output:
[ 0.000000] before 8861b280 vs. 8861b280 or 8851b280
[ 0.000000] after 8851b280
As seen in the first line mem_map with subtraction of offset does not equal the
mem_map after subtraction of ARCH_PFN_OFFSET.
After also adding the ARCH_PFN_OFFSET offset to mem_map (the previously
calculated offset is zero for the named platform), it is able to boot
4.4 and 4.9-rc7 again.
Signed-off-by: Tobias Wolf <dev-NTEO@vplace.de>
---
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7841,7 +7841,7 @@ static void __init alloc_node_mem_map(st
if (pgdat == NODE_DATA(0)) {
mem_map = NODE_DATA(0)->node_mem_map;
if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
- mem_map -= offset;
+ mem_map -= offset + (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
}
#endif
}

View File

@ -1,81 +0,0 @@
From: Felix Fietkau <nbd@nbd.name>
Subject: jffs2: use .rename2 and add RENAME_WHITEOUT support
It is required for renames on overlayfs
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -614,8 +614,8 @@ static int jffs2_rmdir (struct inode *di
return ret;
}
-static int jffs2_mknod (struct user_namespace *mnt_userns, struct inode *dir_i,
- struct dentry *dentry, umode_t mode, dev_t rdev)
+static int __jffs2_mknod (struct user_namespace *mnt_userns, struct inode *dir_i,
+ struct dentry *dentry, umode_t mode, dev_t rdev, bool whiteout)
{
struct jffs2_inode_info *f, *dir_f;
struct jffs2_sb_info *c;
@@ -754,7 +754,11 @@ static int jffs2_mknod (struct user_name
mutex_unlock(&dir_f->sem);
jffs2_complete_reservation(c);
- d_instantiate_new(dentry, inode);
+ if (!whiteout)
+ d_instantiate_new(dentry, inode);
+ else
+ unlock_new_inode(inode);
+
return 0;
fail:
@@ -762,6 +766,19 @@ static int jffs2_mknod (struct user_name
return ret;
}
+static int jffs2_mknod (struct user_namespace *mnt_userns, struct inode *dir_i,
+ struct dentry *dentry, umode_t mode, dev_t rdev)
+{
+ return __jffs2_mknod(mnt_userns, dir_i, dentry, mode, rdev, false);
+}
+
+static int jffs2_whiteout (struct user_namespace *mnt_userns, struct inode *old_dir,
+ struct dentry *old_dentry)
+{
+ return __jffs2_mknod(mnt_userns, old_dir, old_dentry, S_IFCHR | WHITEOUT_MODE,
+ WHITEOUT_DEV, true);
+}
+
static int jffs2_rename (struct user_namespace *mnt_userns,
struct inode *old_dir_i, struct dentry *old_dentry,
struct inode *new_dir_i, struct dentry *new_dentry,
@@ -773,7 +790,7 @@ static int jffs2_rename (struct user_nam
uint8_t type;
uint32_t now;
- if (flags & ~RENAME_NOREPLACE)
+ if (flags & ~(RENAME_NOREPLACE|RENAME_WHITEOUT))
return -EINVAL;
/* The VFS will check for us and prevent trying to rename a
@@ -839,9 +856,14 @@ static int jffs2_rename (struct user_nam
if (d_is_dir(old_dentry) && !victim_f)
inc_nlink(new_dir_i);
- /* Unlink the original */
- ret = jffs2_do_unlink(c, JFFS2_INODE_INFO(old_dir_i),
- old_dentry->d_name.name, old_dentry->d_name.len, NULL, now);
+ if (flags & RENAME_WHITEOUT)
+ /* Replace with whiteout */
+ ret = jffs2_whiteout(mnt_userns, old_dir_i, old_dentry);
+ else
+ /* Unlink the original */
+ ret = jffs2_do_unlink(c, JFFS2_INODE_INFO(old_dir_i),
+ old_dentry->d_name.name,
+ old_dentry->d_name.len, NULL, now);
/* We don't touch inode->i_nlink */
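From userspace the new behaviour is reached through renameat2(). A hypothetical test (paths are illustrative; the raw syscall is used because older glibc lacks a wrapper):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/fs.h>		/* RENAME_WHITEOUT */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	/* rename and leave a whiteout at the source, as overlayfs does */
	if (syscall(SYS_renameat2, AT_FDCWD, "/mnt/jffs2/old",
		    AT_FDCWD, "/mnt/jffs2/new", RENAME_WHITEOUT) < 0) {
		perror("renameat2");
		return 1;
	}
	return 0;
}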

View File

@ -1,73 +0,0 @@
From: Felix Fietkau <nbd@nbd.name>
Subject: jffs2: add RENAME_EXCHANGE support
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -787,18 +787,31 @@ static int jffs2_rename (struct user_nam
int ret;
struct jffs2_sb_info *c = JFFS2_SB_INFO(old_dir_i->i_sb);
struct jffs2_inode_info *victim_f = NULL;
+ struct inode *fst_inode = d_inode(old_dentry);
+ struct inode *snd_inode = d_inode(new_dentry);
uint8_t type;
uint32_t now;
- if (flags & ~(RENAME_NOREPLACE|RENAME_WHITEOUT))
+ if (flags & ~(RENAME_NOREPLACE|RENAME_WHITEOUT|RENAME_EXCHANGE))
return -EINVAL;
+ if ((flags & RENAME_EXCHANGE) && (old_dir_i != new_dir_i)) {
+ if (S_ISDIR(fst_inode->i_mode) && !S_ISDIR(snd_inode->i_mode)) {
+ inc_nlink(new_dir_i);
+ drop_nlink(old_dir_i);
+ }
+ else if (!S_ISDIR(fst_inode->i_mode) && S_ISDIR(snd_inode->i_mode)) {
+ drop_nlink(new_dir_i);
+ inc_nlink(old_dir_i);
+ }
+ }
+
/* The VFS will check for us and prevent trying to rename a
* file over a directory and vice versa, but if it's a directory,
* the VFS can't check whether the victim is empty. The filesystem
* needs to do that for itself.
*/
- if (d_really_is_positive(new_dentry)) {
+ if (d_really_is_positive(new_dentry) && !(flags & RENAME_EXCHANGE)) {
victim_f = JFFS2_INODE_INFO(d_inode(new_dentry));
if (d_is_dir(new_dentry)) {
struct jffs2_full_dirent *fd;
@@ -833,7 +846,7 @@ static int jffs2_rename (struct user_nam
if (ret)
return ret;
- if (victim_f) {
+ if (victim_f && !(flags & RENAME_EXCHANGE)) {
/* There was a victim. Kill it off nicely */
if (d_is_dir(new_dentry))
clear_nlink(d_inode(new_dentry));
@@ -859,6 +872,12 @@ static int jffs2_rename (struct user_nam
if (flags & RENAME_WHITEOUT)
/* Replace with whiteout */
ret = jffs2_whiteout(mnt_userns, old_dir_i, old_dentry);
+ else if (flags & RENAME_EXCHANGE)
+ /* Replace the original */
+ ret = jffs2_do_link(c, JFFS2_INODE_INFO(old_dir_i),
+ d_inode(new_dentry)->i_ino, type,
+ old_dentry->d_name.name, old_dentry->d_name.len,
+ now);
else
/* Unlink the original */
ret = jffs2_do_unlink(c, JFFS2_INODE_INFO(old_dir_i),
@@ -890,7 +909,7 @@ static int jffs2_rename (struct user_nam
return ret;
}
- if (d_is_dir(old_dentry))
+ if (d_is_dir(old_dentry) && !(flags & RENAME_EXCHANGE))
drop_nlink(old_dir_i);
new_dir_i->i_mtime = new_dir_i->i_ctime = old_dir_i->i_mtime = old_dir_i->i_ctime = ITIME(now);
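Likewise for the exchange path: a hypothetical atomic swap of two entries on a jffs2 mount (paths illustrative, raw syscall as above):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/fs.h>		/* RENAME_EXCHANGE */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	/* atomically swap the two directory entries */
	if (syscall(SYS_renameat2, AT_FDCWD, "/mnt/jffs2/a",
		    AT_FDCWD, "/mnt/jffs2/b", RENAME_EXCHANGE) < 0) {
		perror("renameat2");
		return 1;
	}
	return 0;
}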

View File

@ -1,20 +0,0 @@
From: Felix Fietkau <nbd@nbd.name>
Subject: jffs2: add splice ops
Add splice_read using generic_file_splice_read.
Add splice_write using iter_file_splice_write.
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -53,6 +53,8 @@ const struct file_operations jffs2_file_
.open = generic_file_open,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
+ .splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
.unlocked_ioctl=jffs2_ioctl,
.mmap = generic_file_readonly_mmap,
.fsync = jffs2_fsync,

View File

@ -1,45 +0,0 @@
From: Stephen Hemminger <stephen@networkplumber.org>
Subject: bridge: allow reception on disabled port
When an ethernet device is enslaved to a bridge, and the bridge STP
detects loss of carrier (or operational state down), then normally
packet reception is blocked.
This breaks control applications like WPA which may be expecting to
receive packets to negotiate to bring the link up. The bridge needs to
block forwarding packets from these disabled ports, but there is no
hard requirement to not allow local packet delivery.
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -222,6 +222,9 @@ static void __br_handle_local_finish(str
/* note: already called with rcu_read_lock */
static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
+ struct net_bridge_port *p = br_port_get_rcu(skb->dev);
+
+ if (p->state != BR_STATE_DISABLED)
__br_handle_local_finish(skb);
/* return 1 to signal the okfn() was called so it's ok to use the skb */
@@ -390,6 +393,17 @@ forward:
goto defer_stp_filtering;
switch (p->state) {
+ case BR_STATE_DISABLED:
+ if (ether_addr_equal(p->br->dev->dev_addr, dest))
+ skb->pkt_type = PACKET_HOST;
+
+ if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING,
+ dev_net(skb->dev), NULL, skb, skb->dev, NULL,
+ br_handle_local_finish) == 1) {
+ return RX_HANDLER_PASS;
+ }
+ break;
+
case BR_STATE_FORWARDING:
case BR_STATE_LEARNING:
defer_stp_filtering:

View File

@ -1,94 +0,0 @@
From: Daniel González Cabanelas <dgcbueu@gmail.com>
Subject: [PATCH 1/2] rtc: rs5c372: support alarms up to 1 week
The Ricoh R2221x, R2223x, RS5C372, RV5C387A chips can handle 1 week
alarms.
Read the "wday" alarm register and convert it to a date to support up 1
week in our driver.
Signed-off-by: Daniel González Cabanelas <dgcbueu@gmail.com>
---
drivers/rtc/rtc-rs5c372.c | 48 ++++++++++++++++++++++++++++++++++-----
1 file changed, 42 insertions(+), 6 deletions(-)
--- a/drivers/rtc/rtc-rs5c372.c
+++ b/drivers/rtc/rtc-rs5c372.c
@@ -399,7 +399,9 @@ static int rs5c_read_alarm(struct device
{
struct i2c_client *client = to_i2c_client(dev);
struct rs5c372 *rs5c = i2c_get_clientdata(client);
- int status;
+ int status, wday_offs;
+ struct rtc_time rtc;
+ unsigned long alarm_secs;
status = rs5c_get_regs(rs5c);
if (status < 0)
@@ -409,6 +411,30 @@ static int rs5c_read_alarm(struct device
t->time.tm_sec = 0;
t->time.tm_min = bcd2bin(rs5c->regs[RS5C_REG_ALARM_A_MIN] & 0x7f);
t->time.tm_hour = rs5c_reg2hr(rs5c, rs5c->regs[RS5C_REG_ALARM_A_HOURS]);
+ t->time.tm_wday = ffs(rs5c->regs[RS5C_REG_ALARM_A_WDAY] & 0x7f) - 1;
+
+ /* determine the day, month and year based on alarm wday, taking as a
+ * reference the current time from the rtc
+ */
+ status = rs5c372_rtc_read_time(dev, &rtc);
+ if (status < 0)
+ return status;
+
+ wday_offs = t->time.tm_wday - rtc.tm_wday;
+ alarm_secs = mktime64(rtc.tm_year + 1900,
+ rtc.tm_mon + 1,
+ rtc.tm_mday + wday_offs,
+ t->time.tm_hour,
+ t->time.tm_min,
+ t->time.tm_sec);
+
+ if (wday_offs < 0 || (wday_offs == 0 &&
+ (t->time.tm_hour < rtc.tm_hour ||
+ (t->time.tm_hour == rtc.tm_hour &&
+ t->time.tm_min <= rtc.tm_min))))
+ alarm_secs += 7 * 86400;
+
+ rtc_time64_to_tm(alarm_secs, &t->time);
/* ... and status */
t->enabled = !!(rs5c->regs[RS5C_REG_CTRL1] & RS5C_CTRL1_AALE);
@@ -423,12 +449,20 @@ static int rs5c_set_alarm(struct device
struct rs5c372 *rs5c = i2c_get_clientdata(client);
int status, addr, i;
unsigned char buf[3];
+ struct rtc_time rtc_tm;
+ unsigned long rtc_secs, alarm_secs;
- /* only handle up to 24 hours in the future, like RTC_ALM_SET */
- if (t->time.tm_mday != -1
- || t->time.tm_mon != -1
- || t->time.tm_year != -1)
+ /* chip can only handle alarms up to one week in the future */
+ status = rs5c372_rtc_read_time(dev, &rtc_tm);
+ if (status)
+ return status;
+ rtc_secs = rtc_tm_to_time64(&rtc_tm);
+ alarm_secs = rtc_tm_to_time64(&t->time);
+ if (alarm_secs >= rtc_secs + 7 * 86400) {
+ dev_err(dev, "%s: alarm maximum is one week in the future (%d)\n",
+ __func__, status);
return -EINVAL;
+ }
/* REVISIT: round up tm_sec */
@@ -449,7 +483,9 @@ static int rs5c_set_alarm(struct device
/* set alarm */
buf[0] = bin2bcd(t->time.tm_min);
buf[1] = rs5c_hr2reg(rs5c, t->time.tm_hour);
- buf[2] = 0x7f; /* any/all days */
+ /* each bit is the day of the week, 0x7f means all days */
+ buf[2] = (t->time.tm_wday >= 0 && t->time.tm_wday < 7) ?
+ BIT(t->time.tm_wday) : 0x7f;
for (i = 0; i < sizeof(buf); i++) {
addr = RS5C_ADDR(RS5C_REG_ALARM_A_MIN + i);
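
A user-space analogue of the conversion in rs5c_read_alarm() above may make the arithmetic easier to follow; this sketch assumes local time and uses mktime() in place of the kernel's mktime64()/rtc_time64_to_tm() pair (DST subtleties ignored for brevity).

#include <stdio.h>
#include <time.h>

/* Turn a weekday + hour + minute alarm into an absolute time,
 * wrapping by one week if that slot has already passed; mktime()
 * normalizes an out-of-range tm_mday for us. */
static time_t next_alarm(struct tm now, int wday, int hour, int min)
{
	struct tm t = now;
	int wday_offs = wday - now.tm_wday;

	t.tm_mday += wday_offs;
	t.tm_hour = hour;
	t.tm_min = min;
	t.tm_sec = 0;

	time_t alarm = mktime(&t);
	if (wday_offs < 0 || (wday_offs == 0 &&
	    (hour < now.tm_hour ||
	     (hour == now.tm_hour && min <= now.tm_min))))
		alarm += 7 * 86400;
	return alarm;
}

int main(void)
{
	time_t nowsec = time(NULL);
	struct tm now = *localtime(&nowsec);
	time_t a = next_alarm(now, 1, 7, 30);	/* next Monday, 07:30 */
	printf("%s", ctime(&a));
	return 0;
}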

View File

@ -1,70 +0,0 @@
From: Daniel González Cabanelas <dgcbueu@gmail.com>
Subject: [PATCH 2/2] rtc: rs5c372: let the alarm be used as a wakeup source
Currently there is no use for the interrupts on the rs5c372 RTC and the
wakealarm isn't enabled. There are some devices like NASes which use this
RTC to wake up from the power-off state when the INTR pin is activated by
the alarm clock.
Enable the alarm and let it be used as a wakeup source.
Tested on a Buffalo LS421DE NAS.
Signed-off-by: Daniel González Cabanelas <dgcbueu@gmail.com>
---
drivers/rtc/rtc-rs5c372.c | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
--- a/drivers/rtc/rtc-rs5c372.c
+++ b/drivers/rtc/rtc-rs5c372.c
@@ -833,6 +833,7 @@ static int rs5c372_probe(struct i2c_clie
int err = 0;
int smbus_mode = 0;
struct rs5c372 *rs5c372;
+ bool rs5c372_can_wakeup_device = false;
dev_dbg(&client->dev, "%s\n", __func__);
@@ -868,6 +869,12 @@ static int rs5c372_probe(struct i2c_clie
else
rs5c372->type = id->driver_data;
+#ifdef CONFIG_OF
+ if (of_property_read_bool(client->dev.of_node,
+ "wakeup-source"))
+ rs5c372_can_wakeup_device = true;
+#endif
+
/* we read registers 0x0f then 0x00-0x0f; skip the first one */
rs5c372->regs = &rs5c372->buf[1];
rs5c372->smbus = smbus_mode;
@@ -901,6 +908,8 @@ static int rs5c372_probe(struct i2c_clie
goto exit;
}
+ rs5c372->has_irq = 1;
+
/* if the oscillator lost power and no other software (like
* the bootloader) set it up, do it here.
*
@@ -927,6 +936,10 @@ static int rs5c372_probe(struct i2c_clie
);
/* REVISIT use client->irq to register alarm irq ... */
+ if (rs5c372_can_wakeup_device) {
+ device_init_wakeup(&client->dev, true);
+ }
+
rs5c372->rtc = devm_rtc_device_register(&client->dev,
rs5c372_driver.driver.name,
&rs5c372_rtc_ops, THIS_MODULE);
@@ -940,6 +953,9 @@ static int rs5c372_probe(struct i2c_clie
if (err)
goto exit;
+ /* the rs5c372 alarm only supports one-minute accuracy */
+
+
return 0;
exit:

View File

@ -1,25 +0,0 @@
From: Felix Fietkau <nbd@nbd.name>
Subject: Upgrade to Linux 2.6.19
- Includes large parts of the patch from #1021 by dpalffy
- Includes RB532 NAND driver changes by n0-1
[john@phrozen.org: felix will add this to his upstream queue]
lede-commit: bff468813f78f81e36ebb2a3f4354de7365e640f
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
Makefile | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
--- a/Makefile
+++ b/Makefile
@@ -759,7 +759,7 @@ KBUILD_CFLAGS += $(call cc-disable-warni
ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE
KBUILD_CFLAGS += -O2
else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
-KBUILD_CFLAGS += -Os
+KBUILD_CFLAGS += -Os -fno-reorder-blocks -fno-tree-ch
endif
# Tell gcc to never replace conditional load with a non-conditional one

View File

@ -1,119 +0,0 @@
From: Felix Fietkau <nbd@nbd.name>
Subject: kernel: add a config option for keeping the kallsyms table uncompressed, saving ~9kb kernel size after lzma on ar71xx
[john@phrozen.org: added to my upstream queue 30.12.2016]
lede-commit: e0e3509b5ce2ccf93d4d67ea907613f5f7ec2eed
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
init/Kconfig | 11 +++++++++++
kernel/kallsyms.c | 8 ++++++++
scripts/kallsyms.c | 12 ++++++++++++
scripts/link-vmlinux.sh | 4 ++++
4 files changed, 35 insertions(+)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1464,6 +1464,17 @@ config SYSCTL_ARCH_UNALIGN_ALLOW
the unaligned access emulation.
see arch/parisc/kernel/unaligned.c for reference
+config KALLSYMS_UNCOMPRESSED
+ bool "Keep kallsyms uncompressed"
+ depends on KALLSYMS
+ help
+ Normally kallsyms contains compressed symbols (using a token table),
+ reducing the uncompressed kernel image size. Keeping the symbol table
+ uncompressed significantly reduces the size of this part in compressed
+ kernel images.
+
+ Say N unless you need compressed kernel images to be small.
+
config HAVE_PCSPKR_PLATFORM
bool
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -61,6 +61,11 @@ static unsigned int kallsyms_expand_symb
* For every byte on the compressed symbol data, copy the table
* entry for that byte.
*/
+#ifdef CONFIG_KALLSYMS_UNCOMPRESSED
+ memcpy(result, data + 1, len - 1);
+ result += len - 1;
+ len = 0;
+#endif
while (len) {
tptr = &kallsyms_token_table[kallsyms_token_index[*data]];
data++;
@@ -93,6 +98,9 @@ tail:
*/
static char kallsyms_get_symbol_type(unsigned int off)
{
+#ifdef CONFIG_KALLSYMS_UNCOMPRESSED
+ return kallsyms_names[off + 1];
+#endif
/*
* Get just the first code, look it up in the token table,
* and return the first char from this token.
--- a/scripts/kallsyms.c
+++ b/scripts/kallsyms.c
@@ -58,6 +58,7 @@ static struct addr_range percpu_range =
static struct sym_entry **table;
static unsigned int table_size, table_cnt;
static int all_symbols;
+static int uncompressed;
static int absolute_percpu;
static int base_relative;
@@ -487,6 +488,9 @@ static void write_src(void)
free(markers);
+ if (uncompressed)
+ return;
+
output_label("kallsyms_token_table");
off = 0;
for (i = 0; i < 256; i++) {
@@ -538,6 +542,9 @@ static unsigned char *find_token(unsigne
{
int i;
+ if (uncompressed)
+ return NULL;
+
for (i = 0; i < len - 1; i++) {
if (str[i] == token[0] && str[i+1] == token[1])
return &str[i];
@@ -610,6 +617,9 @@ static void optimize_result(void)
{
int i, best;
+ if (uncompressed)
+ return;
+
/* using the '\0' symbol last allows compress_symbols to use standard
* fast string functions */
for (i = 255; i >= 0; i--) {
@@ -774,6 +784,8 @@ int main(int argc, char **argv)
absolute_percpu = 1;
else if (strcmp(argv[i], "--base-relative") == 0)
base_relative = 1;
+ else if (strcmp(argv[i], "--uncompressed") == 0)
+ uncompressed = 1;
else
usage();
}
--- a/scripts/link-vmlinux.sh
+++ b/scripts/link-vmlinux.sh
@@ -156,6 +156,10 @@ kallsyms()
kallsymopt="${kallsymopt} --base-relative"
fi
+ if [ -n "${CONFIG_KALLSYMS_UNCOMPRESSED}" ]; then
+ kallsymopt="${kallsymopt} --uncompressed"
+ fi
+
info KSYMS ${2}
${NM} -n ${1} | scripts/kallsyms ${kallsymopt} > ${2}
}
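
To make the trade-off concrete, here is a hedged user-space sketch of the name framing kallsyms uses: each kallsyms_names[] entry is a length byte followed by data bytes, which are token indices when compressed and literal characters with CONFIG_KALLSYMS_UNCOMPRESSED (the memcpy path added above). The token table contents are fabricated, and the real format also stores the symbol type as the first expanded character, which is glossed over here.

#include <stdio.h>
#include <string.h>

/* Fabricated token table: every byte maps to itself except two
 * multi-character tokens; a real kernel generates all 256 entries
 * with scripts/kallsyms. */
static char single[256][2];
static const char *token_table[256];

static void expand(const unsigned char *entry, int uncompressed,
		   char *out, size_t outlen)
{
	size_t len = entry[0];		/* length byte */
	const unsigned char *data = entry + 1;
	size_t off = 0;

	if (uncompressed) {
		/* CONFIG_KALLSYMS_UNCOMPRESSED path: bytes are literal */
		memcpy(out, data, len);
		out[len] = '\0';
		return;
	}
	while (len--) {
		const char *tok = token_table[*data++];
		size_t tlen = strlen(tok);

		if (off + tlen + 1 > outlen)
			break;
		memcpy(out + off, tok, tlen);
		off += tlen;
	}
	out[off] = '\0';
}

int main(void)
{
	char buf[64];
	int i;

	for (i = 0; i < 256; i++) {
		single[i][0] = (char)i;
		token_table[i] = single[i];
	}
	token_table[1] = "do_";
	token_table[2] = "fork";

	const unsigned char compressed[] = { 2, 1, 2 };	/* two token indices */
	expand(compressed, 0, buf, sizeof(buf));
	printf("compressed   -> %s\n", buf);	/* do_fork */

	const unsigned char plain[] = { 7, 'd', 'o', '_', 'f', 'o', 'r', 'k' };
	expand(plain, 1, buf, sizeof(buf));
	printf("uncompressed -> %s\n", buf);	/* do_fork */
	return 0;
}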

View File

@ -1,41 +0,0 @@
From: Felix Fietkau <nbd@nbd.name>
Subject: kernel: when KALLSYMS is disabled, print module address + size for matching backtrace entries
[john@phrozen.org: felix will add this to his upstream queue]
lede-commit 53827cdc824556cda910b23ce5030c363b8f1461
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
lib/vsprintf.c | 15 +++++++++++----
1 file changed, 11 insertions(+), 4 deletions(-)
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -980,8 +980,10 @@ char *symbol_string(char *buf, char *end
struct printf_spec spec, const char *fmt)
{
unsigned long value;
-#ifdef CONFIG_KALLSYMS
char sym[KSYM_SYMBOL_LEN];
+#ifndef CONFIG_KALLSYMS
+ struct module *mod;
+ int len;
#endif
if (fmt[1] == 'R')
@@ -1002,8 +1004,14 @@ char *symbol_string(char *buf, char *end
return string_nocheck(buf, end, sym, spec);
#else
- return special_hex_number(buf, end, value, sizeof(void *));
+ len = snprintf(sym, sizeof(sym), "0x%lx", value);
+ mod = __module_address(value);
+ if (mod)
+ snprintf(sym + len, sizeof(sym) - len, " [%s@%p+0x%x]",
+ mod->name, mod->core_layout.base,
+ mod->core_layout.size);
#endif
+ return string(buf, end, sym, spec);
}
static const struct printf_spec default_str_spec = {
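
An illustrative rendering of the annotation the fallback produces (all values fabricated); the point is that a raw backtrace address can later be matched against a module's load address and size offline.

#include <stdio.h>

int main(void)
{
	/* Mimics the !CONFIG_KALLSYMS fallback: raw address plus
	 * "[module@base+0xsize]". */
	unsigned long value = 0x8f2041d8UL;		/* fabricated */
	const char *mod_name = "mac80211";		/* fabricated */
	void *core_base = (void *)0x8f1f0000UL;		/* fabricated */
	unsigned int core_size = 0x6a000;		/* fabricated */
	char sym[128];
	int len;

	len = snprintf(sym, sizeof(sym), "0x%lx", value);
	snprintf(sym + len, sizeof(sym) - len, " [%s@%p+0x%x]",
		 mod_name, core_base, core_size);
	printf("%s\n", sym);	/* 0x8f2041d8 [mac80211@0x8f1f0000+0x6a000] */
	return 0;
}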

View File

@ -1,30 +0,0 @@
From: Gabor Juhos <juhosg@openwrt.org>
Subject: usr: sanitize deps_initramfs list
If any filename in the initramfs dependency
list contains a colon, that causes a kernel
build error like this:
/devel/openwrt/build_dir/linux-ar71xx_generic/linux-3.6.6/usr/Makefile:58: *** multiple target patterns. Stop.
make[5]: *** [usr] Error 2
Fix it by removing such filenames from the
deps_initramfs list.
Signed-off-by: Gabor Juhos <juhosg@openwrt.org>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
usr/Makefile | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
--- a/usr/Makefile
+++ b/usr/Makefile
@@ -56,6 +56,8 @@ hostprogs := gen_init_cpio
# The dependency list is generated by gen_initramfs.sh -l
-include $(obj)/.initramfs_data.cpio.d
+deps_initramfs := $(foreach v,$(deps_initramfs),$(if $(findstring :,$(v)),,$(v)))
+
# do not try to update files included in initramfs
$(deps_initramfs): ;

View File

@ -1,20 +0,0 @@
From: Imre Kaloz <kaloz@openwrt.org>
Subject: [PATCH] hack: net: wireless: make the wl12xx glue code available with
compat-wireless, too
Signed-off-by: Imre Kaloz <kaloz@openwrt.org>
---
drivers/net/wireless/ti/Kconfig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/net/wireless/ti/Kconfig
+++ b/drivers/net/wireless/ti/Kconfig
@@ -20,7 +20,7 @@ source "drivers/net/wireless/ti/wlcore/K
config WILINK_PLATFORM_DATA
bool "TI WiLink platform data"
- depends on WLCORE_SDIO || WL1251_SDIO
+ depends on WLCORE_SDIO || WL1251_SDIO || ARCH_OMAP2PLUS
default y
help
Small platform data bit needed to pass data to the sdio modules.

View File

@ -1,35 +0,0 @@
From c2deb5ef01a0ef09088832744cbace9e239a6ee0 Mon Sep 17 00:00:00 2001
From: Thibaut VARÈNE <hacks@slashdirt.org>
Date: Sat, 28 Mar 2020 12:11:50 +0100
Subject: [PATCH] generic: platform/mikrotik build bits (5.4)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This patch adds platform/mikrotik kernel build bits
Signed-off-by: Thibaut VARÈNE <hacks@slashdirt.org>
---
drivers/platform/Kconfig | 2 ++
drivers/platform/Makefile | 1 +
2 files changed, 3 insertions(+)
--- a/drivers/platform/Kconfig
+++ b/drivers/platform/Kconfig
@@ -9,6 +9,8 @@ source "drivers/platform/chrome/Kconfig"
source "drivers/platform/mellanox/Kconfig"
+source "drivers/platform/mikrotik/Kconfig"
+
source "drivers/platform/olpc/Kconfig"
source "drivers/platform/surface/Kconfig"
--- a/drivers/platform/Makefile
+++ b/drivers/platform/Makefile
@@ -9,4 +9,5 @@ obj-$(CONFIG_MIPS) += mips/
obj-$(CONFIG_OLPC_EC) += olpc/
obj-$(CONFIG_GOLDFISH) += goldfish/
obj-$(CONFIG_CHROME_PLATFORMS) += chrome/
+obj-$(CONFIG_MIKROTIK) += mikrotik/
obj-$(CONFIG_SURFACE_PLATFORMS) += surface/

View File

@ -1,40 +0,0 @@
From: Mark Miller <mark@mirell.org>
Subject: mips: expose CONFIG_BOOT_RAW
This exposes the CONFIG_BOOT_RAW symbol in Kconfig. This is needed on
certain Broadcom chipsets running CFE in order to load the kernel.
Signed-off-by: Mark Miller <mark@mirell.org>
Acked-by: Rob Landley <rob@landley.net>
---
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -1032,9 +1032,6 @@ config FW_ARC
config ARCH_MAY_HAVE_PC_FDC
bool
-config BOOT_RAW
- bool
-
config CEVT_BCM1480
bool
@@ -3089,6 +3086,18 @@ choice
bool "Extend builtin kernel arguments with bootloader arguments"
endchoice
+config BOOT_RAW
+ bool "Enable the kernel to be executed from the load address"
+ default n
+ help
+ Allow the kernel to be executed from the load address for
+ bootloaders which cannot read the ELF format. This places
+ a jump to start_kernel at the load address.
+
+ If unsure, say N.
+
+
+
endmenu
config LOCKDEP_SUPPORT

View File

@ -1,22 +0,0 @@
From: Felix Fietkau <nbd@nbd.name>
Subject: mips: use -mno-branch-likely for kernel and userspace
saves ~11k kernel size after lzma and ~12k squashfs size in the
lede-commit: 41a039f46450ffae9483d6216422098669da2900
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
arch/mips/Makefile | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/mips/Makefile
+++ b/arch/mips/Makefile
@@ -94,7 +94,7 @@ all-$(CONFIG_SYS_SUPPORTS_ZBOOT)+= vmlin
# machines may also. Since BFD is incredibly buggy with respect to
# crossformat linking we rely on the elf2ecoff tool for format conversion.
#
-cflags-y += -G 0 -mno-abicalls -fno-pic -pipe
+cflags-y += -G 0 -mno-abicalls -fno-pic -pipe -mno-branch-likely
cflags-y += -msoft-float
LDFLAGS_vmlinux += -G 0 -static -n -nostdlib
KBUILD_AFLAGS_MODULE += -mlong-calls

View File

@ -1,370 +0,0 @@
From: Felix Fietkau <nbd@nbd.name>
Subject: mips: replace -mlong-calls with -mno-long-calls to make function calls faster in kernel modules to achieve this, try to
lede-commit: 3b3d64743ba2a874df9d70cd19e242205b0a788c
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
arch/mips/Makefile | 5 +
arch/mips/include/asm/module.h | 5 +
arch/mips/kernel/module.c | 279 ++++++++++++++++++++++++++++++++++++++++-
3 files changed, 284 insertions(+), 5 deletions(-)
--- a/arch/mips/Makefile
+++ b/arch/mips/Makefile
@@ -97,8 +97,18 @@ all-$(CONFIG_SYS_SUPPORTS_ZBOOT)+= vmlin
cflags-y += -G 0 -mno-abicalls -fno-pic -pipe -mno-branch-likely
cflags-y += -msoft-float
LDFLAGS_vmlinux += -G 0 -static -n -nostdlib
+ifdef CONFIG_64BIT
KBUILD_AFLAGS_MODULE += -mlong-calls
KBUILD_CFLAGS_MODULE += -mlong-calls
+else
+ ifdef CONFIG_DYNAMIC_FTRACE
+ KBUILD_AFLAGS_MODULE += -mlong-calls
+ KBUILD_CFLAGS_MODULE += -mlong-calls
+ else
+ KBUILD_AFLAGS_MODULE += -mno-long-calls
+ KBUILD_CFLAGS_MODULE += -mno-long-calls
+ endif
+endif
ifeq ($(CONFIG_RELOCATABLE),y)
LDFLAGS_vmlinux += --emit-relocs
--- a/arch/mips/include/asm/module.h
+++ b/arch/mips/include/asm/module.h
@@ -12,6 +12,11 @@ struct mod_arch_specific {
const struct exception_table_entry *dbe_start;
const struct exception_table_entry *dbe_end;
struct mips_hi16 *r_mips_hi16_list;
+
+ void *phys_plt_tbl;
+ void *virt_plt_tbl;
+ unsigned int phys_plt_offset;
+ unsigned int virt_plt_offset;
};
typedef uint8_t Elf64_Byte; /* Type for a 8-bit quantity. */
--- a/arch/mips/kernel/module.c
+++ b/arch/mips/kernel/module.c
@@ -32,23 +32,261 @@ struct mips_hi16 {
static LIST_HEAD(dbe_list);
static DEFINE_SPINLOCK(dbe_lock);
-#ifdef MODULE_START
+/*
+ * Get the potential max trampolines size required of the init and
+ * non-init sections. Only used if we cannot find enough contiguous
+ * physically mapped memory to put the module into.
+ */
+static unsigned int
+get_plt_size(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs,
+ const char *secstrings, unsigned int symindex, bool is_init)
+{
+ unsigned long ret = 0;
+ unsigned int i, j;
+ Elf_Sym *syms;
+
+ /* Everything marked ALLOC (this includes the exported symbols) */
+ for (i = 1; i < hdr->e_shnum; ++i) {
+ unsigned int info = sechdrs[i].sh_info;
+
+ if (sechdrs[i].sh_type != SHT_REL
+ && sechdrs[i].sh_type != SHT_RELA)
+ continue;
+
+ /* Not a valid relocation section? */
+ if (info >= hdr->e_shnum)
+ continue;
+
+ /* Don't bother with non-allocated sections */
+ if (!(sechdrs[info].sh_flags & SHF_ALLOC))
+ continue;
+
+ /* If it's called *.init*, and we're not init, we're
+ not interested */
+ if ((strstr(secstrings + sechdrs[i].sh_name, ".init") != 0)
+ != is_init)
+ continue;
+
+ syms = (Elf_Sym *) sechdrs[symindex].sh_addr;
+ if (sechdrs[i].sh_type == SHT_REL) {
+ Elf_Mips_Rel *rel = (void *) sechdrs[i].sh_addr;
+ unsigned int size = sechdrs[i].sh_size / sizeof(*rel);
+
+ for (j = 0; j < size; ++j) {
+ Elf_Sym *sym;
+
+ if (ELF_MIPS_R_TYPE(rel[j]) != R_MIPS_26)
+ continue;
+
+ sym = syms + ELF_MIPS_R_SYM(rel[j]);
+ if (!is_init && sym->st_shndx != SHN_UNDEF)
+ continue;
+
+ ret += 4 * sizeof(int);
+ }
+ } else {
+ Elf_Mips_Rela *rela = (void *) sechdrs[i].sh_addr;
+ unsigned int size = sechdrs[i].sh_size / sizeof(*rela);
+
+ for (j = 0; j < size; ++j) {
+ Elf_Sym *sym;
+
+ if (ELF_MIPS_R_TYPE(rela[j]) != R_MIPS_26)
+ continue;
+
+ sym = syms + ELF_MIPS_R_SYM(rela[j]);
+ if (!is_init && sym->st_shndx != SHN_UNDEF)
+ continue;
+
+ ret += 4 * sizeof(int);
+ }
+ }
+ }
+
+ return ret;
+}
+
+#ifndef MODULE_START
+static void *alloc_phys(unsigned long size)
+{
+ unsigned order;
+ struct page *page;
+ struct page *p;
+
+ size = PAGE_ALIGN(size);
+ order = get_order(size);
+
+ page = alloc_pages(GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN |
+ __GFP_THISNODE, order);
+ if (!page)
+ return NULL;
+
+ split_page(page, order);
+
+ /* mark all pages except for the last one */
+ for (p = page; p + 1 < page + (size >> PAGE_SHIFT); ++p)
+ set_bit(PG_owner_priv_1, &p->flags);
+
+ for (p = page + (size >> PAGE_SHIFT); p < page + (1 << order); ++p)
+ __free_page(p);
+
+ return page_address(page);
+}
+#endif
+
+static void free_phys(void *ptr)
+{
+ struct page *page;
+ bool free;
+
+ page = virt_to_page(ptr);
+ do {
+ free = test_and_clear_bit(PG_owner_priv_1, &page->flags);
+ __free_page(page);
+ page++;
+ } while (free);
+}
+
void *module_alloc(unsigned long size)
{
+#ifdef MODULE_START
return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END,
GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
__builtin_return_address(0));
+#else
+ void *ptr;
+
+ if (size == 0)
+ return NULL;
+
+ ptr = alloc_phys(size);
+
+ /* If we failed to allocate physically contiguous memory,
+ * fall back to regular vmalloc. The module loader code will
+ * create jump tables to handle long jumps */
+ if (!ptr)
+ return vmalloc(size);
+
+ return ptr;
+#endif
}
+
+static inline bool is_phys_addr(void *ptr)
+{
+#ifdef CONFIG_64BIT
+ return (KSEGX((unsigned long)ptr) == CKSEG0);
+#else
+ return (KSEGX(ptr) == KSEG0);
#endif
+}
+
+/* Free memory returned from module_alloc */
+void module_memfree(void *module_region)
+{
+ if (is_phys_addr(module_region))
+ free_phys(module_region);
+ else
+ vfree(module_region);
+}
+
+static void *__module_alloc(int size, bool phys)
+{
+ void *ptr;
+
+ if (phys)
+ ptr = kmalloc(size, GFP_KERNEL);
+ else
+ ptr = vmalloc(size);
+ return ptr;
+}
+
+static void __module_free(void *ptr)
+{
+ if (is_phys_addr(ptr))
+ kfree(ptr);
+ else
+ vfree(ptr);
+}
+
+int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
+ char *secstrings, struct module *mod)
+{
+ unsigned int symindex = 0;
+ unsigned int core_size, init_size;
+ int i;
+
+ mod->arch.phys_plt_offset = 0;
+ mod->arch.virt_plt_offset = 0;
+ mod->arch.phys_plt_tbl = NULL;
+ mod->arch.virt_plt_tbl = NULL;
+
+ if (IS_ENABLED(CONFIG_64BIT))
+ return 0;
+
+ for (i = 1; i < hdr->e_shnum; i++)
+ if (sechdrs[i].sh_type == SHT_SYMTAB)
+ symindex = i;
+
+ core_size = get_plt_size(hdr, sechdrs, secstrings, symindex, false);
+ init_size = get_plt_size(hdr, sechdrs, secstrings, symindex, true);
+
+ if ((core_size + init_size) == 0)
+ return 0;
+
+ mod->arch.phys_plt_tbl = __module_alloc(core_size + init_size, 1);
+ if (!mod->arch.phys_plt_tbl)
+ return -ENOMEM;
+
+ mod->arch.virt_plt_tbl = __module_alloc(core_size + init_size, 0);
+ if (!mod->arch.virt_plt_tbl) {
+ __module_free(mod->arch.phys_plt_tbl);
+ mod->arch.phys_plt_tbl = NULL;
+ return -ENOMEM;
+ }
+
+ return 0;
+}
static void apply_r_mips_32(u32 *location, u32 base, Elf_Addr v)
{
*location = base + v;
}
+static Elf_Addr add_plt_entry_to(unsigned *plt_offset,
+ void *start, Elf_Addr v)
+{
+ unsigned *tramp = start + *plt_offset;
+ *plt_offset += 4 * sizeof(int);
+
+ /* adjust carry for addiu */
+ if (v & 0x00008000)
+ v += 0x10000;
+
+ tramp[0] = 0x3c190000 | (v >> 16); /* lui t9, hi16 */
+ tramp[1] = 0x27390000 | (v & 0xffff); /* addiu t9, t9, lo16 */
+ tramp[2] = 0x03200008; /* jr t9 */
+ tramp[3] = 0x00000000; /* nop */
+
+ return (Elf_Addr) tramp;
+}
+
+static Elf_Addr add_plt_entry(struct module *me, void *location, Elf_Addr v)
+{
+ if (is_phys_addr(location))
+ return add_plt_entry_to(&me->arch.phys_plt_offset,
+ me->arch.phys_plt_tbl, v);
+ else
+ return add_plt_entry_to(&me->arch.virt_plt_offset,
+ me->arch.virt_plt_tbl, v);
+
+}
+
+
static int apply_r_mips_26(struct module *me, u32 *location, u32 base,
Elf_Addr v)
{
+ u32 ofs = base & 0x03ffffff;
+
if (v % 4) {
pr_err("module %s: dangerous R_MIPS_26 relocation\n",
me->name);
@@ -56,13 +294,17 @@ static int apply_r_mips_26(struct module
}
if ((v & 0xf0000000) != (((unsigned long)location + 4) & 0xf0000000)) {
- pr_err("module %s: relocation overflow\n",
- me->name);
- return -ENOEXEC;
+ v = add_plt_entry(me, location, v + (ofs << 2));
+ if (!v) {
+ pr_err("module %s: relocation overflow\n",
+ me->name);
+ return -ENOEXEC;
+ }
+ ofs = 0;
}
*location = (*location & ~0x03ffffff) |
- ((base + (v >> 2)) & 0x03ffffff);
+ ((ofs + (v >> 2)) & 0x03ffffff);
return 0;
}
@@ -442,9 +684,36 @@ int module_finalize(const Elf_Ehdr *hdr,
list_add(&me->arch.dbe_list, &dbe_list);
spin_unlock_irq(&dbe_lock);
}
+
+ /* Get rid of the fixup trampoline if we're running the module
+ * from physically mapped address space */
+ if (me->arch.phys_plt_offset == 0) {
+ __module_free(me->arch.phys_plt_tbl);
+ me->arch.phys_plt_tbl = NULL;
+ }
+ if (me->arch.virt_plt_offset == 0) {
+ __module_free(me->arch.virt_plt_tbl);
+ me->arch.virt_plt_tbl = NULL;
+ }
+
return 0;
}
+void module_arch_freeing_init(struct module *mod)
+{
+ if (mod->state == MODULE_STATE_LIVE)
+ return;
+
+ if (mod->arch.phys_plt_tbl) {
+ __module_free(mod->arch.phys_plt_tbl);
+ mod->arch.phys_plt_tbl = NULL;
+ }
+ if (mod->arch.virt_plt_tbl) {
+ __module_free(mod->arch.virt_plt_tbl);
+ mod->arch.virt_plt_tbl = NULL;
+ }
+}
+
void module_arch_cleanup(struct module *mod)
{
spin_lock_irq(&dbe_lock);
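
The two calculations at the heart of this patch, rendered as a stand-alone sketch (assuming 32-bit MIPS with a 32-bit unsigned int, encodings as in the hunk above): the 256 MB segment reachability test that R_MIPS_26 imposes, and the four-instruction trampoline emitted by add_plt_entry_to().

#include <stdio.h>

/* A J/JAL instruction can only reach targets within the same 256 MB
 * segment as the instruction in its delay slot (PC + 4). */
static int j_reachable(unsigned int pc, unsigned int target)
{
	return ((pc + 4) & 0xf0000000) == (target & 0xf0000000);
}

/* The 4-word trampoline emitted when the target is out of range,
 * as in add_plt_entry_to() above. */
static void build_plt_entry(unsigned int tramp[4], unsigned int v)
{
	if (v & 0x00008000)	/* addiu sign-extends, carry into hi16 */
		v += 0x10000;

	tramp[0] = 0x3c190000 | (v >> 16);	/* lui   t9, hi16     */
	tramp[1] = 0x27390000 | (v & 0xffff);	/* addiu t9, t9, lo16 */
	tramp[2] = 0x03200008;			/* jr    t9           */
	tramp[3] = 0x00000000;			/* nop                */
}

int main(void)
{
	unsigned int tramp[4], i;
	unsigned int pc = 0xc0001000, target = 0x80102468;

	printf("reachable by j/jal: %d\n", j_reachable(pc, target)); /* 0 */
	build_plt_entry(tramp, target);
	for (i = 0; i < 4; i++)
		printf("tramp[%u] = 0x%08x\n", i, tramp[i]);
	return 0;
}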

View File

@ -1,19 +0,0 @@
From: Felix Fietkau <nbd@nbd.name>
Subject: kernel: adjust mips highmem offset to avoid the need for -mlong-calls on systems with >256M RAM
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
arch/mips/include/asm/mach-generic/spaces.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/mips/include/asm/mach-generic/spaces.h
+++ b/arch/mips/include/asm/mach-generic/spaces.h
@@ -46,7 +46,7 @@
* Memory above this physical address will be considered highmem.
*/
#ifndef HIGHMEM_START
-#define HIGHMEM_START _AC(0x20000000, UL)
+#define HIGHMEM_START _AC(0x10000000, UL)
#endif
#endif /* CONFIG_32BIT */

View File

@ -1,22 +0,0 @@
From: Felix Fietkau <nbd@nbd.name>
Subject: kernel: add -mtune=34kc to MIPS CFLAGS when building for mips32r2
This provides a good tradeoff across at least 24Kc-74Kc, while also
producing smaller code.
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
arch/mips/Makefile | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/mips/Makefile
+++ b/arch/mips/Makefile
@@ -172,7 +172,7 @@ cflags-$(CONFIG_CPU_R4300) += -march=r43
cflags-$(CONFIG_CPU_R4X00) += -march=r4600 -Wa,--trap
cflags-$(CONFIG_CPU_TX49XX) += -march=r4600 -Wa,--trap
cflags-$(CONFIG_CPU_MIPS32_R1) += -march=mips32 -Wa,--trap
-cflags-$(CONFIG_CPU_MIPS32_R2) += -march=mips32r2 -Wa,--trap
+cflags-$(CONFIG_CPU_MIPS32_R2) += -march=mips32r2 -mtune=34kc -Wa,--trap
cflags-$(CONFIG_CPU_MIPS32_R5) += -march=mips32r5 -Wa,--trap -modd-spreg
cflags-$(CONFIG_CPU_MIPS32_R6) += -march=mips32r6 -Wa,--trap -modd-spreg
cflags-$(CONFIG_CPU_MIPS64_R1) += -march=mips64 -Wa,--trap

View File

@ -1,22 +0,0 @@
From: Felix Fietkau <nbd@nbd.name>
Subject: fix errors in unresolved weak symbols on arm
lede-commit: 570699d4838a907c3ef9f2819bf19eb72997b32f
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
arch/arm/kernel/module.c | 4 ++++
1 file changed, 4 insertions(+)
--- a/arch/arm/kernel/module.c
+++ b/arch/arm/kernel/module.c
@@ -146,6 +146,10 @@ apply_relocate(Elf32_Shdr *sechdrs, cons
return -ENOEXEC;
}
+ if ((IS_ERR_VALUE(sym->st_value) || !sym->st_value) &&
+ ELF_ST_BIND(sym->st_info) == STB_WEAK)
+ continue;
+
loc = dstsec->sh_addr + rel->r_offset;
switch (ELF32_R_TYPE(rel->r_info)) {
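
For background, a short illustration (not from the patch) of why skipping the relocation is safe: an unresolved weak symbol legitimately resolves to address 0, and callers are expected to test it before use.

#include <stdio.h>

/* Weak declaration: if no definition exists anywhere at link/load
 * time, the symbol resolves to NULL instead of causing an error. */
extern void optional_hook(void) __attribute__((weak));

int main(void)
{
	if (optional_hook)	/* NULL when unresolved */
		optional_hook();
	else
		printf("optional_hook not present, skipping\n");
	return 0;
}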

View File

@ -1,282 +0,0 @@
From: Yousong Zhou <yszhou4tech@gmail.com>
Subject: MIPS: kexec: Accept command line parameters from userspace.
Signed-off-by: Yousong Zhou <yszhou4tech@gmail.com>
---
arch/mips/kernel/machine_kexec.c | 153 +++++++++++++++++++++++++++++++-----
arch/mips/kernel/machine_kexec.h | 20 +++++
arch/mips/kernel/relocate_kernel.S | 21 +++--
3 files changed, 167 insertions(+), 27 deletions(-)
create mode 100644 arch/mips/kernel/machine_kexec.h
--- a/arch/mips/kernel/machine_kexec.c
+++ b/arch/mips/kernel/machine_kexec.c
@@ -9,14 +9,11 @@
#include <linux/delay.h>
#include <linux/libfdt.h>
+#include <asm/bootinfo.h>
#include <asm/cacheflush.h>
#include <asm/page.h>
-
-extern const unsigned char relocate_new_kernel[];
-extern const size_t relocate_new_kernel_size;
-
-extern unsigned long kexec_start_address;
-extern unsigned long kexec_indirection_page;
+#include <linux/uaccess.h>
+#include "machine_kexec.h"
static unsigned long reboot_code_buffer;
@@ -30,6 +27,101 @@ void (*_crash_smp_send_stop)(void) = NUL
void (*_machine_kexec_shutdown)(void) = NULL;
void (*_machine_crash_shutdown)(struct pt_regs *regs) = NULL;
+static void machine_kexec_print_args(void)
+{
+ unsigned long argc = (int)kexec_args[0];
+ int i;
+
+ pr_info("kexec_args[0] (argc): %lu\n", argc);
+ pr_info("kexec_args[1] (argv): %p\n", (void *)kexec_args[1]);
+ pr_info("kexec_args[2] (env ): %p\n", (void *)kexec_args[2]);
+ pr_info("kexec_args[3] (desc): %p\n", (void *)kexec_args[3]);
+
+ for (i = 0; i < argc; i++) {
+ pr_info("kexec_argv[%d] = %p, %s\n",
+ i, kexec_argv[i], kexec_argv[i]);
+ }
+}
+
+static void machine_kexec_init_argv(struct kimage *image)
+{
+ void __user *buf = NULL;
+ size_t bufsz;
+ size_t size;
+ int i;
+
+ bufsz = 0;
+ for (i = 0; i < image->nr_segments; i++) {
+ struct kexec_segment *seg;
+
+ seg = &image->segment[i];
+ if (seg->bufsz < 6)
+ continue;
+
+ if (strncmp((char *) seg->buf, "kexec ", 6))
+ continue;
+
+ buf = seg->buf;
+ bufsz = seg->bufsz;
+ break;
+ }
+
+ if (!buf)
+ return;
+
+ size = KEXEC_COMMAND_LINE_SIZE;
+ size = min(size, bufsz);
+ if (size < bufsz)
+ pr_warn("kexec command line truncated to %zd bytes\n", size);
+
+ /* Copy to kernel space */
+ if (copy_from_user(kexec_argv_buf, buf, size))
+ pr_warn("kexec command line copy to kernel space failed\n");
+
+ kexec_argv_buf[size - 1] = 0;
+}
+
+static void machine_kexec_parse_argv(struct kimage *image)
+{
+ char *reboot_code_buffer;
+ int reloc_delta;
+ char *ptr;
+ int argc;
+ int i;
+
+ ptr = kexec_argv_buf;
+ argc = 0;
+
+ /*
+ * convert command line string to array of parameters
+ * (as bootloader does).
+ */
+ while (ptr && *ptr && (KEXEC_MAX_ARGC > argc)) {
+ if (*ptr == ' ') {
+ *ptr++ = '\0';
+ continue;
+ }
+
+ kexec_argv[argc++] = ptr;
+ ptr = strchr(ptr, ' ');
+ }
+
+ if (!argc)
+ return;
+
+ kexec_args[0] = argc;
+ kexec_args[1] = (unsigned long)kexec_argv;
+ kexec_args[2] = 0;
+ kexec_args[3] = 0;
+
+ reboot_code_buffer = page_address(image->control_code_page);
+ reloc_delta = reboot_code_buffer - (char *)kexec_relocate_new_kernel;
+
+ kexec_args[1] += reloc_delta;
+ for (i = 0; i < argc; i++)
+ kexec_argv[i] += reloc_delta;
+}
+
static void kexec_image_info(const struct kimage *kimage)
{
unsigned long i;
@@ -99,6 +191,18 @@ machine_kexec_prepare(struct kimage *kim
#endif
kexec_image_info(kimage);
+ /*
+ * Whenever arguments are passed from kexec-tools, init the arguments
+ * to the original ones to try to avoid boot failure.
+ */
+
+ kexec_args[0] = fw_arg0;
+ kexec_args[1] = fw_arg1;
+ kexec_args[2] = fw_arg2;
+ kexec_args[3] = fw_arg3;
+
+ machine_kexec_init_argv(kimage);
+ machine_kexec_parse_argv(kimage);
if (_machine_kexec_prepare)
return _machine_kexec_prepare(kimage);
@@ -161,7 +265,7 @@ machine_crash_shutdown(struct pt_regs *r
void kexec_nonboot_cpu_jump(void)
{
local_flush_icache_range((unsigned long)relocated_kexec_smp_wait,
- reboot_code_buffer + relocate_new_kernel_size);
+ reboot_code_buffer + KEXEC_RELOCATE_NEW_KERNEL_SIZE);
relocated_kexec_smp_wait(NULL);
}
@@ -199,7 +303,7 @@ void kexec_reboot(void)
* machine_kexec() CPU.
*/
local_flush_icache_range(reboot_code_buffer,
- reboot_code_buffer + relocate_new_kernel_size);
+ reboot_code_buffer + KEXEC_RELOCATE_NEW_KERNEL_SIZE);
do_kexec = (void *)reboot_code_buffer;
do_kexec();
@@ -212,10 +316,12 @@ machine_kexec(struct kimage *image)
unsigned long *ptr;
reboot_code_buffer =
- (unsigned long)page_address(image->control_code_page);
+ (unsigned long)page_address(image->control_code_page);
+ pr_info("reboot_code_buffer = %p\n", (void *)reboot_code_buffer);
kexec_start_address =
(unsigned long) phys_to_virt(image->start);
+ pr_info("kexec_start_address = %p\n", (void *)kexec_start_address);
if (image->type == KEXEC_TYPE_DEFAULT) {
kexec_indirection_page =
@@ -223,9 +329,19 @@ machine_kexec(struct kimage *image)
} else {
kexec_indirection_page = (unsigned long)&image->head;
}
+ pr_info("kexec_indirection_page = %p\n", (void *)kexec_indirection_page);
- memcpy((void*)reboot_code_buffer, relocate_new_kernel,
- relocate_new_kernel_size);
+ pr_info("Where is memcpy: %p\n", memcpy);
+ pr_info("kexec_relocate_new_kernel = %p, kexec_relocate_new_kernel_end = %p\n",
+ (void *)kexec_relocate_new_kernel, &kexec_relocate_new_kernel_end);
+ pr_info("Copy %lu bytes from %p to %p\n", KEXEC_RELOCATE_NEW_KERNEL_SIZE,
+ (void *)kexec_relocate_new_kernel, (void *)reboot_code_buffer);
+ memcpy((void*)reboot_code_buffer, kexec_relocate_new_kernel,
+ KEXEC_RELOCATE_NEW_KERNEL_SIZE);
+
+ pr_info("Before _print_args().\n");
+ machine_kexec_print_args();
+ pr_info("Before eval loop.\n");
/*
* The generic kexec code builds a page list with physical
@@ -256,7 +372,7 @@ machine_kexec(struct kimage *image)
#ifdef CONFIG_SMP
/* All secondary cpus now may jump to kexec_wait cycle */
relocated_kexec_smp_wait = reboot_code_buffer +
- (void *)(kexec_smp_wait - relocate_new_kernel);
+ (void *)(kexec_smp_wait - kexec_relocate_new_kernel);
smp_wmb();
atomic_set(&kexec_ready_to_reboot, 1);
#endif
--- /dev/null
+++ b/arch/mips/kernel/machine_kexec.h
@@ -0,0 +1,20 @@
+#ifndef _MACHINE_KEXEC_H
+#define _MACHINE_KEXEC_H
+
+#ifndef __ASSEMBLY__
+extern const unsigned char kexec_relocate_new_kernel[];
+extern unsigned long kexec_relocate_new_kernel_end;
+extern unsigned long kexec_start_address;
+extern unsigned long kexec_indirection_page;
+
+extern char kexec_argv_buf[];
+extern char *kexec_argv[];
+
+#define KEXEC_RELOCATE_NEW_KERNEL_SIZE ((unsigned long)&kexec_relocate_new_kernel_end - (unsigned long)kexec_relocate_new_kernel)
+#endif /* !__ASSEMBLY__ */
+
+#define KEXEC_COMMAND_LINE_SIZE 256
+#define KEXEC_ARGV_SIZE (KEXEC_COMMAND_LINE_SIZE / 16)
+#define KEXEC_MAX_ARGC (KEXEC_ARGV_SIZE / sizeof(long))
+
+#endif
--- a/arch/mips/kernel/relocate_kernel.S
+++ b/arch/mips/kernel/relocate_kernel.S
@@ -10,10 +10,11 @@
#include <asm/mipsregs.h>
#include <asm/stackframe.h>
#include <asm/addrspace.h>
+#include "machine_kexec.h"
#include <kernel-entry-init.h>
-LEAF(relocate_new_kernel)
+LEAF(kexec_relocate_new_kernel)
PTR_L a0, arg0
PTR_L a1, arg1
PTR_L a2, arg2
@@ -98,7 +99,7 @@ done:
#endif
/* jump to kexec_start_address */
j s1
- END(relocate_new_kernel)
+ END(kexec_relocate_new_kernel)
#ifdef CONFIG_SMP
/*
@@ -177,8 +178,15 @@ EXPORT(kexec_indirection_page)
PTR_WD 0
.size kexec_indirection_page, PTRSIZE
-relocate_new_kernel_end:
+kexec_argv_buf:
+ EXPORT(kexec_argv_buf)
+ .skip KEXEC_COMMAND_LINE_SIZE
+ .size kexec_argv_buf, KEXEC_COMMAND_LINE_SIZE
+
+kexec_argv:
+ EXPORT(kexec_argv)
+ .skip KEXEC_ARGV_SIZE
+ .size kexec_argv, KEXEC_ARGV_SIZE
-EXPORT(relocate_new_kernel_size)
- PTR_WD relocate_new_kernel_end - relocate_new_kernel
- .size relocate_new_kernel_size, PTRSIZE
+kexec_relocate_new_kernel_end:
+ EXPORT(kexec_relocate_new_kernel_end)
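
The argv construction above is plain in-place whitespace splitting; a stand-alone sketch of the same loop follows, using the patch's buffer sizes as computed for 32-bit MIPS. The relocation-delta fixup is omitted since it only matters once the buffer is copied next to the relocation stub.

#include <stdio.h>
#include <string.h>

#define KEXEC_COMMAND_LINE_SIZE	256
#define KEXEC_MAX_ARGC		4	/* KEXEC_ARGV_SIZE / sizeof(long) on 32-bit MIPS */

static char kexec_argv_buf[KEXEC_COMMAND_LINE_SIZE];
static char *kexec_argv[KEXEC_MAX_ARGC];

/* Same split as machine_kexec_parse_argv(): spaces become NULs and
 * each run of non-space characters becomes one argv entry. */
static int parse_cmdline(const char *cmdline)
{
	char *ptr = kexec_argv_buf;
	int argc = 0;

	strncpy(kexec_argv_buf, cmdline, sizeof(kexec_argv_buf) - 1);

	while (ptr && *ptr && (KEXEC_MAX_ARGC > argc)) {
		if (*ptr == ' ') {
			*ptr++ = '\0';
			continue;
		}
		kexec_argv[argc++] = ptr;
		ptr = strchr(ptr, ' ');
	}
	return argc;
}

int main(void)
{
	int i, argc = parse_cmdline("kexec console=ttyS0,115200 root=/dev/mtdblock5");

	for (i = 0; i < argc; i++)
		printf("kexec_argv[%d] = %s\n", i, kexec_argv[i]);
	return 0;
}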

View File

@ -1,84 +0,0 @@
From bb0c3b0175240bf152fd7c644821a0cf9f77c37c Mon Sep 17 00:00:00 2001
From: Evgeniy Didin <Evgeniy.Didin@synopsys.com>
Date: Fri, 15 Mar 2019 18:53:38 +0300
Subject: [PATCH] arc add OWRTDTB section
This change allows OpenWRT to patch the resulting kernel binary with an
external .dtb.
That allows us to re-use exactly the same vmlinux on different boards
given its ARC core configurations match (at least cache line sizes etc).
"patch-dtb" searches for the ASCII "OWRTDTB:" string and copies the external
.dtb right after it, keeping the string in place.
Signed-off-by: Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
Signed-off-by: Alexey Brodkin <abrodkin@synopsys.com>
Signed-off-by: Evgeniy Didin <Evgeniy.Didin@synopsys.com>
---
arch/arc/kernel/head.S | 10 ++++++++++
arch/arc/kernel/setup.c | 4 +++-
arch/arc/kernel/vmlinux.lds.S | 13 +++++++++++++
3 files changed, 26 insertions(+), 1 deletion(-)
--- a/arch/arc/kernel/head.S
+++ b/arch/arc/kernel/head.S
@@ -88,6 +88,16 @@
DSP_EARLY_INIT
.endm
+ ; Here "patch-dtb" will embed external .dtb
+ ; Note "patch-dtb" searches for ASCII "OWRTDTB:" string
; and pastes .dtb right after it, hence the string precedes
+ ; __image_dtb symbol.
+ .section .owrt, "aw",@progbits
+ .ascii "OWRTDTB:"
+ENTRY(__image_dtb)
+ .fill 0x4000
+END(__image_dtb)
+
.section .init.text, "ax",@progbits
;----------------------------------------------------------------
--- a/arch/arc/kernel/setup.c
+++ b/arch/arc/kernel/setup.c
@@ -495,6 +495,8 @@ static inline bool uboot_arg_invalid(uns
/* We always pass 0 as magic from U-boot */
#define UBOOT_MAGIC_VALUE 0
+extern struct boot_param_header __image_dtb;
+
void __init handle_uboot_args(void)
{
bool use_embedded_dtb = true;
@@ -533,7 +535,7 @@ void __init handle_uboot_args(void)
ignore_uboot_args:
if (use_embedded_dtb) {
- machine_desc = setup_machine_fdt(__dtb_start);
+ machine_desc = setup_machine_fdt(&__image_dtb);
if (!machine_desc)
panic("Embedded DT invalid\n");
}
--- a/arch/arc/kernel/vmlinux.lds.S
+++ b/arch/arc/kernel/vmlinux.lds.S
@@ -27,6 +27,19 @@ SECTIONS
. = CONFIG_LINUX_LINK_BASE;
+ /*
+ * In OpenWRT we want to patch built binary embedding .dtb of choice.
+ * This is implemented with "patch-dtb" utility which searches for
+ * "OWRTDTB:" string in first 16k of image and if it is found
+ * copies .dtb right after mentioned string.
+ *
+ * Note: "OWRTDTB:" won't be overwritten with .dtb, .dtb will follow it.
+ */
+ .owrt : {
+ *(.owrt)
+ . = ALIGN(PAGE_SIZE);
+ }
+
_int_vec_base_lds = .;
.vector : {
*(.vector)
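
A hedged sketch of what a patch-dtb-style host tool does with this section (the real OpenWrt tool differs in detail; file I/O is elided, and the 0x4000 limit comes from the .fill reservation above):

#define _GNU_SOURCE	/* for memmem() */
#include <stdio.h>
#include <string.h>

#define MARKER	"OWRTDTB:"
#define DTB_MAX	0x4000	/* matches the .fill reservation in head.S */

/* Find the ASCII marker in a kernel image and copy the .dtb right
 * after it, keeping the marker in place. */
static int embed_dtb(unsigned char *image, size_t image_len,
		     const unsigned char *dtb, size_t dtb_len)
{
	unsigned char *pos;

	if (dtb_len > DTB_MAX)
		return -1;

	pos = memmem(image, image_len, MARKER, strlen(MARKER));
	if (!pos || pos + strlen(MARKER) + dtb_len > image + image_len)
		return -1;

	memcpy(pos + strlen(MARKER), dtb, dtb_len);
	return 0;
}

int main(void)
{
	/* Toy buffers standing in for mmap()ed image and .dtb files */
	unsigned char image[64] = "....OWRTDTB:....................";
	unsigned char dtb[] = { 0xd0, 0x0d, 0xfe, 0xed };	/* FDT magic */

	if (embed_dtb(image, sizeof(image), dtb, sizeof(dtb)) == 0)
		printf("dtb embedded after marker\n");
	return 0;
}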

View File

@ -1,24 +0,0 @@
From: Alexey Brodkin <abrodkin@synopsys.com>
Subject: arc: enable unaligned access in kernel mode
This enables misaligned access handling even in kernel mode.
Some wireless drivers (ath9k-htc and mt7601u) use misaligned accesses
here and there, and to cope with that without fixing stuff in the drivers
we just handle it gracefully on ARC.
Signed-off-by: Alexey Brodkin <abrodkin@synopsys.com>
---
arch/arc/kernel/unaligned.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/arc/kernel/unaligned.c
+++ b/arch/arc/kernel/unaligned.c
@@ -202,7 +202,7 @@ int misaligned_fixup(unsigned long addre
char buf[TASK_COMM_LEN];
/* handle user mode only and only if enabled by sysadmin */
- if (!user_mode(regs) || !unaligned_enabled)
+ if (!unaligned_enabled)
return 1;
if (no_unaligned_warning) {

View File

@ -1,25 +0,0 @@
From 66770a004afe10df11d3902e16eaa0c2c39436bb Mon Sep 17 00:00:00 2001
From: Pawel Dembicki <paweldembicki@gmail.com>
Date: Fri, 24 May 2019 17:56:19 +0200
Subject: [PATCH] powerpc: Enable kernel XZ compression option on PPC_85xx
Enable kernel XZ compression option on PPC_85xx. Tested with
simpleImage on TP-Link TL-WDR4900 (Freescale P1014 processor).
Suggested-by: Christian Lamparter <chunkeey@gmail.com>
Signed-off-by: Pawel Dembicki <paweldembicki@gmail.com>
---
arch/powerpc/Kconfig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -228,7 +228,7 @@ config PPC
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_LZMA if DEFAULT_UIMAGE
select HAVE_KERNEL_LZO if DEFAULT_UIMAGE
- select HAVE_KERNEL_XZ if PPC_BOOK3S || 44x
+ select HAVE_KERNEL_XZ if PPC_BOOK3S || 44x || PPC_85xx
select HAVE_KPROBES
select HAVE_KPROBES_ON_FTRACE
select HAVE_KRETPROBES

View File

@ -1,314 +0,0 @@
--- a/drivers/mtd/Kconfig
+++ b/drivers/mtd/Kconfig
@@ -12,6 +12,25 @@ menuconfig MTD
if MTD
+menu "OpenWrt specific MTD options"
+
+config MTD_ROOTFS_ROOT_DEV
+ bool "Automatically set 'rootfs' partition to be root filesystem"
+ default y
+
+config MTD_SPLIT_FIRMWARE
+ bool "Automatically split firmware partition for kernel+rootfs"
+ default y
+
+config MTD_SPLIT_FIRMWARE_NAME
+ string "Firmware partition name"
+ depends on MTD_SPLIT_FIRMWARE
+ default "firmware"
+
+source "drivers/mtd/mtdsplit/Kconfig"
+
+endmenu
+
config MTD_TESTS
tristate "MTD tests support (DANGEROUS)"
depends on m
--- a/drivers/mtd/mtdpart.c
+++ b/drivers/mtd/mtdpart.c
@@ -15,11 +15,13 @@
#include <linux/kmod.h>
#include <linux/mtd/mtd.h>
#include <linux/mtd/partitions.h>
+#include <linux/magic.h>
#include <linux/err.h>
#include <linux/of.h>
#include <linux/of_platform.h>
#include "mtdcore.h"
+#include "mtdsplit/mtdsplit.h"
/*
* MTD methods which simply translate the effective address and pass through
@@ -236,6 +238,146 @@ static int mtd_add_partition_attrs(struc
return ret;
}
+static DEFINE_SPINLOCK(part_parser_lock);
+static LIST_HEAD(part_parsers);
+
+static struct mtd_part_parser *mtd_part_parser_get(const char *name)
+{
+ struct mtd_part_parser *p, *ret = NULL;
+
+ spin_lock(&part_parser_lock);
+
+ list_for_each_entry(p, &part_parsers, list)
+ if (!strcmp(p->name, name) && try_module_get(p->owner)) {
+ ret = p;
+ break;
+ }
+
+ spin_unlock(&part_parser_lock);
+
+ return ret;
+}
+
+static inline void mtd_part_parser_put(const struct mtd_part_parser *p)
+{
+ module_put(p->owner);
+}
+
+static struct mtd_part_parser *
+get_partition_parser_by_type(enum mtd_parser_type type,
+ struct mtd_part_parser *start)
+{
+ struct mtd_part_parser *p, *ret = NULL;
+
+ spin_lock(&part_parser_lock);
+
+ p = list_prepare_entry(start, &part_parsers, list);
+ if (start)
+ mtd_part_parser_put(start);
+
+ list_for_each_entry_continue(p, &part_parsers, list) {
+ if (p->type == type && try_module_get(p->owner)) {
+ ret = p;
+ break;
+ }
+ }
+
+ spin_unlock(&part_parser_lock);
+
+ return ret;
+}
+
+static int parse_mtd_partitions_by_type(struct mtd_info *master,
+ enum mtd_parser_type type,
+ const struct mtd_partition **pparts,
+ struct mtd_part_parser_data *data)
+{
+ struct mtd_part_parser *prev = NULL;
+ int ret = 0;
+
+ while (1) {
+ struct mtd_part_parser *parser;
+
+ parser = get_partition_parser_by_type(type, prev);
+ if (!parser)
+ break;
+
+ ret = (*parser->parse_fn)(master, pparts, data);
+
+ if (ret > 0) {
+ mtd_part_parser_put(parser);
+ printk(KERN_NOTICE
+ "%d %s partitions found on MTD device %s\n",
+ ret, parser->name, master->name);
+ break;
+ }
+
+ prev = parser;
+ }
+
+ return ret;
+}
+
+static int
+run_parsers_by_type(struct mtd_info *child, enum mtd_parser_type type)
+{
+ struct mtd_partition *parts;
+ int nr_parts;
+ int i;
+
+ nr_parts = parse_mtd_partitions_by_type(child, type, (const struct mtd_partition **)&parts,
+ NULL);
+ if (nr_parts <= 0)
+ return nr_parts;
+
+ if (WARN_ON(!parts))
+ return 0;
+
+ for (i = 0; i < nr_parts; i++) {
+ /* adjust partition offsets */
+ parts[i].offset += child->part.offset;
+
+ mtd_add_partition(child->parent,
+ parts[i].name,
+ parts[i].offset,
+ parts[i].size);
+ }
+
+ kfree(parts);
+
+ return nr_parts;
+}
+
+#ifdef CONFIG_MTD_SPLIT_FIRMWARE_NAME
+#define SPLIT_FIRMWARE_NAME CONFIG_MTD_SPLIT_FIRMWARE_NAME
+#else
+#define SPLIT_FIRMWARE_NAME "unused"
+#endif
+
+static void split_firmware(struct mtd_info *master, struct mtd_info *part)
+{
+ run_parsers_by_type(part, MTD_PARSER_TYPE_FIRMWARE);
+}
+
+static void mtd_partition_split(struct mtd_info *master, struct mtd_info *part)
+{
+ static int rootfs_found = 0;
+
+ if (rootfs_found)
+ return;
+
+ if (!strcmp(part->name, "rootfs")) {
+ run_parsers_by_type(part, MTD_PARSER_TYPE_ROOTFS);
+
+ rootfs_found = 1;
+ }
+
+ if (IS_ENABLED(CONFIG_MTD_SPLIT_FIRMWARE) &&
+ !strcmp(part->name, SPLIT_FIRMWARE_NAME) &&
+ !of_find_property(mtd_get_of_node(part), "compatible", NULL))
+ split_firmware(master, part);
+}
+
int mtd_add_partition(struct mtd_info *parent, const char *name,
long long offset, long long length)
{
@@ -274,6 +416,7 @@ int mtd_add_partition(struct mtd_info *p
if (ret)
goto err_remove_part;
+ mtd_partition_split(parent, child);
mtd_add_partition_attrs(child);
return 0;
@@ -422,6 +565,7 @@ int add_mtd_partitions(struct mtd_info *
goto err_del_partitions;
}
+ mtd_partition_split(master, child);
mtd_add_partition_attrs(child);
/* Look for subpartitions */
@@ -438,31 +582,6 @@ err_del_partitions:
return ret;
}
-static DEFINE_SPINLOCK(part_parser_lock);
-static LIST_HEAD(part_parsers);
-
-static struct mtd_part_parser *mtd_part_parser_get(const char *name)
-{
- struct mtd_part_parser *p, *ret = NULL;
-
- spin_lock(&part_parser_lock);
-
- list_for_each_entry(p, &part_parsers, list)
- if (!strcmp(p->name, name) && try_module_get(p->owner)) {
- ret = p;
- break;
- }
-
- spin_unlock(&part_parser_lock);
-
- return ret;
-}
-
-static inline void mtd_part_parser_put(const struct mtd_part_parser *p)
-{
- module_put(p->owner);
-}
-
/*
* Many partition parsers just expected the core to kfree() all their data in
* one chunk. Do that by default.
--- a/include/linux/mtd/partitions.h
+++ b/include/linux/mtd/partitions.h
@@ -75,6 +75,12 @@ struct mtd_part_parser_data {
* Functions dealing with the various ways of partitioning the space
*/
+enum mtd_parser_type {
+ MTD_PARSER_TYPE_DEVICE = 0,
+ MTD_PARSER_TYPE_ROOTFS,
+ MTD_PARSER_TYPE_FIRMWARE,
+};
+
struct mtd_part_parser {
struct list_head list;
struct module *owner;
@@ -83,6 +89,7 @@ struct mtd_part_parser {
int (*parse_fn)(struct mtd_info *, const struct mtd_partition **,
struct mtd_part_parser_data *);
void (*cleanup)(const struct mtd_partition *pparts, int nr_parts);
+ enum mtd_parser_type type;
};
/* Container for passing around a set of parsed partitions */
--- a/drivers/mtd/Makefile
+++ b/drivers/mtd/Makefile
@@ -9,6 +9,8 @@ mtd-y := mtdcore.o mtdsuper.o mtdconc
obj-y += parsers/
+obj-$(CONFIG_MTD_SPLIT) += mtdsplit/
+
# 'Users' - code which presents functionality to userspace.
obj-$(CONFIG_MTD_BLKDEVS) += mtd_blkdevs.o
obj-$(CONFIG_MTD_BLOCK) += mtdblock.o
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -608,6 +608,24 @@ static inline void mtd_align_erase_req(s
req->len += mtd->erasesize - mod;
}
+static inline uint64_t mtd_roundup_to_eb(uint64_t sz, struct mtd_info *mtd)
+{
+ if (mtd_mod_by_eb(sz, mtd) == 0)
+ return sz;
+
+ /* Round up to next erase block */
+ return (mtd_div_by_eb(sz, mtd) + 1) * mtd->erasesize;
+}
+
+static inline uint64_t mtd_rounddown_to_eb(uint64_t sz, struct mtd_info *mtd)
+{
+ if (mtd_mod_by_eb(sz, mtd) == 0)
+ return sz;
+
+ /* Round down to the start of the current erase block */
+ return (mtd_div_by_eb(sz, mtd)) * mtd->erasesize;
+}
+
static inline uint32_t mtd_div_by_ws(uint64_t sz, struct mtd_info *mtd)
{
if (mtd->writesize_shift)
@@ -680,6 +698,13 @@ extern void __put_mtd_device(struct mtd_
extern struct mtd_info *get_mtd_device_nm(const char *name);
extern void put_mtd_device(struct mtd_info *mtd);
+static inline uint64_t mtdpart_get_offset(const struct mtd_info *mtd)
+{
+ if (!mtd_is_partition(mtd))
+ return 0;
+
+ return mtd->part.offset;
+}
struct mtd_notifier {
void (*add)(struct mtd_info *mtd);
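
For reference, a hedged sketch of how an mtdsplit-style parser plugs into the hooks this patch adds: it registers with MTD_PARSER_TYPE_FIRMWARE so run_parsers_by_type() can find it when a partition named "firmware" appears. The 2 MB split point is fabricated; real parsers locate the kernel/rootfs boundary by parsing image headers.

// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/mtd/mtd.h>
#include <linux/mtd/partitions.h>

/* Split "firmware" into kernel + rootfs at a fabricated fixed
 * boundary; offsets returned are relative to the parsed partition. */
static int demo_parse(struct mtd_info *mtd,
		      const struct mtd_partition **pparts,
		      struct mtd_part_parser_data *data)
{
	struct mtd_partition *parts;
	uint64_t split = mtd_roundup_to_eb(0x200000, mtd); /* helper added above */

	parts = kcalloc(2, sizeof(*parts), GFP_KERNEL);
	if (!parts)
		return -ENOMEM;

	parts[0].name = "kernel";
	parts[0].offset = 0;
	parts[0].size = split;
	parts[1].name = "rootfs";
	parts[1].offset = split;
	parts[1].size = mtd->size - split;

	*pparts = parts;
	return 2;
}

static struct mtd_part_parser demo_parser = {
	.owner = THIS_MODULE,
	.name = "demo-fw",
	.parse_fn = demo_parse,
	.type = MTD_PARSER_TYPE_FIRMWARE,	/* enum added by this patch */
};
module_mtd_part_parser(demo_parser);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Demo firmware split parser");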

View File

@ -1,389 +0,0 @@
From patchwork Tue Jun 8 04:07:19 2021
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: John Thomson <git@johnthomson.fastmail.com.au>
X-Patchwork-Id: 1489105
X-Patchwork-Delegate: tudor.ambarus@gmail.com
From: John Thomson <git@johnthomson.fastmail.com.au>
To: Miquel Raynal <miquel.raynal@bootlin.com>,
Richard Weinberger <richard@nod.at>, Vignesh Raghavendra <vigneshr@ti.com>,
Tudor Ambarus <tudor.ambarus@microchip.com>,
Michael Walle <michael@walle.cc>, Pratyush Yadav <p.yadav@ti.com>,
linux-mtd@lists.infradead.org
Cc: linux-kernel@vger.kernel.org,
John Thomson <git@johnthomson.fastmail.com.au>,
kernel test robot <lkp@intel.com>, Dan Carpenter <dan.carpenter@oracle.com>
Subject: [PATCH] mtd: spi-nor: write support for minor aligned partitions
Date: Tue, 8 Jun 2021 14:07:19 +1000
Message-Id: <20210608040719.14431-1-git@johnthomson.fastmail.com.au>
X-Mailer: git-send-email 2.31.1
MIME-Version: 1.0
Do not prevent writing to mtd partitions where a partition boundary sits
on a minor erasesize boundary.
This addresses a FIXME that has been present since the start of the
linux git history:
/* Doesn't start on a boundary of major erase size */
/* FIXME: Let it be writable if it is on a boundary of
* _minor_ erase size though */
Allow a uniform erase region spi-nor device to be configured
to use the non-uniform erase regions code path for an erase with:
CONFIG_MTD_SPI_NOR_USE_VARIABLE_ERASE=y
On supporting hardware (SECT_4K: majority of current SPI-NOR device)
provide the facility for an erase to use the least number
of SPI-NOR operations, as well as access to 4K erase without
requiring CONFIG_MTD_SPI_NOR_USE_4K_SECTORS
Introduce erasesize_minor to the mtd struct,
the smallest erasesize supported by the device.
On existing devices, this is useful where write support is wanted
for data on a 4K partition, such as some u-boot-env partitions,
or RouterBoot soft_config, while still netting the performance
benefits of using 64K sectors.
Performance:
time mtd erase firmware
OpenWrt 5.10 ramips MT7621 w25q128jv 0xfc0000 partition length
Without this patch
MTD_SPI_NOR_USE_4K_SECTORS=y |n
real 2m 11.66s |0m 50.86s
user 0m 0.00s |0m 0.00s
sys 1m 56.20s |0m 50.80s
With this patch
MTD_SPI_NOR_USE_VARIABLE_ERASE=n|y |4K_SECTORS=y
real 0m 51.68s |0m 50.85s |2m 12.89s
user 0m 0.00s |0m 0.00s |0m 0.01s
sys 0m 46.94s |0m 50.38s |2m 12.46s
Signed-off-by: John Thomson <git@johnthomson.fastmail.com.au>
---
Not tested on a variable erase regions device.
checkpatch does not like the printk(KERN_WARNING lines;
should these be changed separately beforehand?
Changes RFC -> v1:
Fix uninitialized variable smatch warning
Reported-by: kernel test robot <lkp@intel.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
---
drivers/mtd/mtdpart.c | 52 ++++++++++++++++++++++++++++---------
drivers/mtd/spi-nor/Kconfig | 10 +++++++
drivers/mtd/spi-nor/core.c | 10 +++++--
include/linux/mtd/mtd.h | 2 ++
4 files changed, 60 insertions(+), 14 deletions(-)
--- a/drivers/mtd/mtdpart.c
+++ b/drivers/mtd/mtdpart.c
@@ -41,10 +41,11 @@ static struct mtd_info *allocate_partiti
struct mtd_info *master = mtd_get_master(parent);
int wr_alignment = (parent->flags & MTD_NO_ERASE) ?
master->writesize : master->erasesize;
+ int wr_alignment_minor = 0;
u64 parent_size = mtd_is_partition(parent) ?
parent->part.size : parent->size;
struct mtd_info *child;
- u32 remainder;
+ u32 remainder, remainder_minor;
char *name;
u64 tmp;
@@ -146,6 +147,7 @@ static struct mtd_info *allocate_partiti
int i, max = parent->numeraseregions;
u64 end = child->part.offset + child->part.size;
struct mtd_erase_region_info *regions = parent->eraseregions;
+ uint32_t erasesize_minor = child->erasesize;
/* Find the first erase regions which is part of this
* partition. */
@@ -156,15 +158,24 @@ static struct mtd_info *allocate_partiti
if (i > 0)
i--;
- /* Pick biggest erasesize */
for (; i < max && regions[i].offset < end; i++) {
+ /* Pick biggest erasesize */
if (child->erasesize < regions[i].erasesize)
child->erasesize = regions[i].erasesize;
+ /* Pick smallest non-zero erasesize */
+ if ((erasesize_minor > regions[i].erasesize) && (regions[i].erasesize > 0))
+ erasesize_minor = regions[i].erasesize;
}
+
+ if (erasesize_minor < child->erasesize)
+ child->erasesize_minor = erasesize_minor;
+
BUG_ON(child->erasesize == 0);
} else {
/* Single erase size */
child->erasesize = master->erasesize;
+ if (master->erasesize_minor)
+ child->erasesize_minor = master->erasesize_minor;
}
/*
@@ -172,26 +183,43 @@ static struct mtd_info *allocate_partiti
* exposes several regions with different erasesize. Adjust
* wr_alignment accordingly.
*/
- if (!(child->flags & MTD_NO_ERASE))
+ if (!(child->flags & MTD_NO_ERASE)) {
wr_alignment = child->erasesize;
+ if (IS_ENABLED(CONFIG_MTD_SPI_NOR_USE_VARIABLE_ERASE) && child->erasesize_minor)
+ wr_alignment_minor = child->erasesize_minor;
+ }
tmp = mtd_get_master_ofs(child, 0);
remainder = do_div(tmp, wr_alignment);
if ((child->flags & MTD_WRITEABLE) && remainder) {
- /* Doesn't start on a boundary of major erase size */
- /* FIXME: Let it be writable if it is on a boundary of
- * _minor_ erase size though */
- child->flags &= ~MTD_WRITEABLE;
- printk(KERN_WARNING"mtd: partition \"%s\" doesn't start on an erase/write block boundary -- force read-only\n",
- part->name);
+ if (wr_alignment_minor) {
+ tmp = mtd_get_master_ofs(child, 0);
+ remainder_minor = do_div(tmp, wr_alignment_minor);
+ if (remainder_minor == 0)
+ child->erasesize = child->erasesize_minor;
+ }
+
+ if ((!wr_alignment_minor) || (wr_alignment_minor && remainder_minor != 0)) {
+ child->flags &= ~MTD_WRITEABLE;
+ printk(KERN_WARNING"mtd: partition \"%s\" doesn't start on an erase/write block boundary -- force read-only\n",
+ part->name);
+ }
}
tmp = mtd_get_master_ofs(child, 0) + child->part.size;
remainder = do_div(tmp, wr_alignment);
if ((child->flags & MTD_WRITEABLE) && remainder) {
- child->flags &= ~MTD_WRITEABLE;
- printk(KERN_WARNING"mtd: partition \"%s\" doesn't end on an erase/write block -- force read-only\n",
- part->name);
+ if (wr_alignment_minor) {
+ tmp = mtd_get_master_ofs(child, 0) + child->part.size;
+ remainder_minor = do_div(tmp, wr_alignment_minor);
+ if (remainder_minor == 0)
+ child->erasesize = child->erasesize_minor;
+ }
+ if ((!wr_alignment_minor) || (wr_alignment_minor && remainder_minor != 0)) {
+ child->flags &= ~MTD_WRITEABLE;
+ printk(KERN_WARNING"mtd: partition \"%s\" doesn't end on an erase/write block -- force read-only\n",
+ part->name);
+ }
}
child->size = child->part.size;
--- a/drivers/mtd/spi-nor/Kconfig
+++ b/drivers/mtd/spi-nor/Kconfig
@@ -10,6 +10,16 @@ menuconfig MTD_SPI_NOR
if MTD_SPI_NOR
+config MTD_SPI_NOR_USE_VARIABLE_ERASE
+ bool "Disable uniform_erase to allow use of all hardware supported erasesizes"
+ depends on !MTD_SPI_NOR_USE_4K_SECTORS
+ default n
+ help
+ Allow mixed use of all hardware supported erasesizes,
+ by forcing spi_nor to use the multiple eraseregions code path.
+ For example: A 68K erase will use one 64K erase, and one 4K erase
+ on supporting hardware.
+
config MTD_SPI_NOR_USE_4K_SECTORS
bool "Use small 4096 B erase sectors"
default y
--- a/drivers/mtd/spi-nor/core.c
+++ b/drivers/mtd/spi-nor/core.c
@@ -1048,6 +1048,8 @@ static u8 spi_nor_convert_3to4_erase(u8
static bool spi_nor_has_uniform_erase(const struct spi_nor *nor)
{
+ if (IS_ENABLED(CONFIG_MTD_SPI_NOR_USE_VARIABLE_ERASE))
+ return false;
return !!nor->params->erase_map.uniform_erase_type;
}
@@ -2144,6 +2146,7 @@ static int spi_nor_select_erase(struct s
{
struct spi_nor_erase_map *map = &nor->params->erase_map;
const struct spi_nor_erase_type *erase = NULL;
+ const struct spi_nor_erase_type *erase_minor = NULL;
struct mtd_info *mtd = &nor->mtd;
u32 wanted_size = nor->info->sector_size;
int i;
@@ -2176,8 +2179,9 @@ static int spi_nor_select_erase(struct s
*/
for (i = SNOR_ERASE_TYPE_MAX - 1; i >= 0; i--) {
if (map->erase_type[i].size) {
- erase = &map->erase_type[i];
- break;
+ if (!erase)
+ erase = &map->erase_type[i];
+ erase_minor = &map->erase_type[i];
}
}
@@ -2185,6 +2189,8 @@ static int spi_nor_select_erase(struct s
return -EINVAL;
mtd->erasesize = erase->size;
+ if (erase_minor && erase_minor->size < erase->size)
+ mtd->erasesize_minor = erase_minor->size;
return 0;
}
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -238,6 +238,8 @@ struct mtd_info {
* information below if they desire
*/
uint32_t erasesize;
+ /* "Minor" (smallest) erase size supported by the whole device */
+ uint32_t erasesize_minor;
/* Minimal writable flash unit size. In case of NOR flash it is 1 (even
* though individual bits can be cleared), in case of NAND flash it is
* one NAND page (or half, or one-fourths of it), in case of ECC-ed NOR
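
The saving comes from simple arithmetic; below is a stand-alone sketch (illustrative, not the spi-nor erase-map code verbatim) that greedily covers a range with 64 KiB sectors and falls back to 4 KiB ones, reproducing the 68K = one 64K + one 4K example from the Kconfig help. It assumes offset and length are multiples of the 4 KiB minor size.

#include <stdio.h>

/* Greedily pick the largest erase size that is aligned at the
 * current offset and fits in the remaining length. */
static void erase_plan(unsigned int offs, unsigned int len)
{
	static const unsigned int sizes[] = { 0x10000, 0x1000 };	/* 64K, 4K */
	unsigned int i, ops = 0;

	while (len) {
		for (i = 0; i < 2; i++) {
			if ((offs % sizes[i]) == 0 && len >= sizes[i]) {
				printf("erase 0x%05x +0x%05x\n", offs, sizes[i]);
				offs += sizes[i];
				len -= sizes[i];
				ops++;
				break;
			}
		}
	}
	printf("%u operations\n", ops);
}

int main(void)
{
	erase_plan(0x0, 0x11000);	/* 68K: one 64K erase + one 4K erase */
	return 0;
}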

Some files were not shown because too many files have changed in this diff.