kernel: bump 5.4 to 5.4.119

lean 2021-05-20 12:54:36 +08:00
parent 5402ff8230
commit a92a7010eb
227 changed files with 78473 additions and 191 deletions

@@ -8,11 +8,11 @@ endif
LINUX_VERSION-4.14 = .195
LINUX_VERSION-4.19 = .138
LINUX_VERSION-5.4 = .117
LINUX_VERSION-5.4 = .119
LINUX_KERNEL_HASH-4.14.195 = 394f28798670240baacd9e2cce521fbd79f8da5e1fc191695b0e11381445a021
LINUX_KERNEL_HASH-4.19.138 = d15c27d05f6c527269b75b30cc72972748e55720e7e00ad8abbaa4fe3b1d5e02
LINUX_KERNEL_HASH-5.4.117 = 4e989b5775830092e5c76b5cca65ebff862ad0c87d0b58c3a20d415c3d4ec770
LINUX_KERNEL_HASH-5.4.119 = 71e7decf1e8149a8aed88d30df4f2a62a6c6b168111de6b261685ac7c0ecb2a0
remove_uri_prefix=$(subst git://,,$(subst http://,,$(subst https://,,$(1))))
sanitize_uri=$(call qstrip,$(subst @,_,$(subst :,_,$(subst .,_,$(subst -,_,$(subst /,_,$(1)))))))

@@ -23,7 +23,7 @@ produce a noisy warning.
xhci->quirks |= XHCI_RESET_ON_RESUME;
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -423,10 +423,14 @@ static int xhci_try_enable_msi(struct us
@@ -427,10 +427,14 @@ static int xhci_try_enable_msi(struct us
free_irq(hcd->irq, hcd);
hcd->irq = 0;

@@ -52,7 +52,7 @@ Signed-off-by: John Crispin <john@phrozen.org>
unsigned long rate;
int ret;
@@ -152,16 +150,10 @@ static int ath79_spi_probe(struct platfo
@@ -152,15 +150,9 @@ static int ath79_spi_probe(struct platfo
master->dev.of_node = pdev->dev.of_node;
platform_set_drvdata(pdev, sp);
@@ -60,8 +60,7 @@ Signed-off-by: John Crispin <john@phrozen.org>
-
master->use_gpio_descriptors = true;
master->bits_per_word_mask = SPI_BPW_RANGE_MASK(1, 32);
master->setup = spi_bitbang_setup;
master->cleanup = spi_bitbang_cleanup;
master->flags = SPI_MASTER_GPIO_SS;
- if (pdata) {
- master->bus_num = pdata->bus_num;
- master->num_chipselect = pdata->num_chipselect;

@@ -1,28 +0,0 @@
From b142b1beb199f62d47370c98a3dd8e13f792e9c0 Mon Sep 17 00:00:00 2001
From: David Bauer <mail@david-bauer.net>
Date: Thu, 27 Feb 2020 23:03:20 +0100
Subject: [PATCH] spi: ath79: remove spi-master setup and cleanup assignment
This removes the assignment of setup and cleanup functions for the ath79
target. Assigning the setup-method will lead to 'setup_transfer' not
being assigned in spi_bitbang_init.
Also drop the redundant cleanup assignment, as this also happens in
spi_bitbang_init.
Signed-off-by: David Bauer <mail@david-bauer.net>
---
drivers/spi/spi-ath79.c | 2 --
1 file changed, 2 deletions(-)
--- a/drivers/spi/spi-ath79.c
+++ b/drivers/spi/spi-ath79.c
@@ -152,8 +152,6 @@ static int ath79_spi_probe(struct platfo
master->use_gpio_descriptors = true;
master->bits_per_word_mask = SPI_BPW_RANGE_MASK(1, 32);
- master->setup = spi_bitbang_setup;
- master->cleanup = spi_bitbang_cleanup;
sp->bitbang.master = master;
sp->bitbang.chipselect = ath79_spi_chipselect;

@@ -58,7 +58,7 @@ Signed-off-by: Luiz Angelo Daros de Luca <luizluca@gmail.com>
static int ath79_spi_probe(struct platform_device *pdev)
{
struct spi_master *master;
@@ -163,6 +197,7 @@ static int ath79_spi_probe(struct platfo
@@ -164,6 +198,7 @@ static int ath79_spi_probe(struct platfo
ret = PTR_ERR(sp->base);
goto err_put_master;
}

@@ -222,7 +222,7 @@ Signed-off-by: Yaroslav Rosomakho <yaroslavros@gmail.com>
static inline int mmc_blk_part_switch(struct mmc_card *card,
unsigned int part_type);
@@ -2868,6 +2875,7 @@ static int mmc_blk_probe(struct mmc_card
@@ -2884,6 +2891,7 @@ static int mmc_blk_probe(struct mmc_card
{
struct mmc_blk_data *md, *part_md;
char cap_str[10];
@@ -230,7 +230,7 @@ Signed-off-by: Yaroslav Rosomakho <yaroslavros@gmail.com>
/*
* Check that the card supports the command class(es) we need.
@@ -2875,7 +2883,16 @@ static int mmc_blk_probe(struct mmc_card
@@ -2891,7 +2899,16 @@ static int mmc_blk_probe(struct mmc_card
if (!(card->csd.cmdclass & CCC_BLOCK_READ))
return -ENODEV;
@@ -248,7 +248,7 @@ Signed-off-by: Yaroslav Rosomakho <yaroslavros@gmail.com>
card->complete_wq = alloc_workqueue("mmc_complete",
WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
@@ -2890,9 +2907,14 @@ static int mmc_blk_probe(struct mmc_card
@@ -2906,9 +2923,14 @@ static int mmc_blk_probe(struct mmc_card
string_get_size((u64)get_capacity(md->disk), 512, STRING_UNITS_2,
cap_str, sizeof(cap_str));
@@ -279,7 +279,7 @@ Signed-off-by: Yaroslav Rosomakho <yaroslavros@gmail.com>
}
--- a/drivers/mmc/core/host.c
+++ b/drivers/mmc/core/host.c
@@ -397,15 +397,30 @@ struct mmc_host *mmc_alloc_host(int extr
@@ -434,15 +434,30 @@ struct mmc_host *mmc_alloc_host(int extr
{
int err;
struct mmc_host *host;

@@ -15,7 +15,7 @@ Signed-off-by: Jonathan Bell <jonathan@raspberrypi.org>
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -1464,6 +1464,103 @@ command_cleanup:
@@ -1468,6 +1468,103 @@ command_cleanup:
}
/*
@@ -119,7 +119,7 @@ Signed-off-by: Jonathan Bell <jonathan@raspberrypi.org>
* non-error returns are a promise to giveback() the urb later
* we drop ownership so next owner (or urb unlink) can get it
*/
@@ -5345,6 +5442,7 @@ static const struct hc_driver xhci_hc_dr
@@ -5357,6 +5454,7 @@ static const struct hc_driver xhci_hc_dr
.endpoint_reset = xhci_endpoint_reset,
.check_bandwidth = xhci_check_bandwidth,
.reset_bandwidth = xhci_reset_bandwidth,

@@ -33,7 +33,7 @@ Signed-off-by: Jonathan Bell <jonathan@raspberrypi.org>
#define USB_VENDOR_ID_BELKIN 0x050d
#define USB_DEVICE_ID_FLIP_KVM 0x3201
@@ -1258,6 +1261,9 @@
@@ -1259,6 +1262,9 @@
#define USB_VENDOR_ID_XAT 0x2505
#define USB_DEVICE_ID_XAT_CSR 0x0220

@@ -22,7 +22,7 @@ Signed-off-by: Jonathan Bell <jonathan@raspberrypi.org>
--- a/drivers/usb/host/xhci-mem.c
+++ b/drivers/usb/host/xhci-mem.c
@@ -2503,9 +2503,11 @@ int xhci_mem_init(struct xhci_hcd *xhci,
@@ -2512,9 +2512,11 @@ int xhci_mem_init(struct xhci_hcd *xhci,
* Event ring setup: Allocate a normal ring, but also setup
* the event ring segment table (ERST). Section 4.9.3.
*/
@@ -36,7 +36,7 @@ Signed-off-by: Jonathan Bell <jonathan@raspberrypi.org>
if (!xhci->event_ring)
goto fail;
if (xhci_check_trb_in_td_math(xhci) < 0)
@@ -2518,7 +2520,7 @@ int xhci_mem_init(struct xhci_hcd *xhci,
@@ -2527,7 +2529,7 @@ int xhci_mem_init(struct xhci_hcd *xhci,
/* set ERST count with the number of entries in the segment table */
val = readl(&xhci->ir_set->erst_size);
val &= ERST_SIZE_MASK;

@@ -31,7 +31,7 @@ Signed-off-by: Christoph Hellwig <hch@lst.de>
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -708,7 +708,6 @@ config X86_SUPPORTS_MEMORY_FAILURE
@@ -709,7 +709,6 @@ config X86_SUPPORTS_MEMORY_FAILURE
config STA2X11
bool "STA2X11 Companion Chip Support"
depends on X86_32_NON_STANDARD && PCI

@@ -39,7 +39,7 @@ Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
default:
return -EINVAL;
}
@@ -2423,6 +2429,9 @@ static struct v4l2_ctrl *v4l2_ctrl_new(s
@@ -2431,6 +2437,9 @@ static struct v4l2_ctrl *v4l2_ctrl_new(s
case V4L2_CTRL_TYPE_VP8_FRAME_HEADER:
elem_size = sizeof(struct v4l2_ctrl_vp8_frame_header);
break;
@@ -49,7 +49,7 @@ Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
default:
if (type < V4L2_CTRL_COMPOUND_TYPES)
elem_size = sizeof(s32);
@@ -4087,6 +4096,18 @@ int __v4l2_ctrl_s_ctrl_string(struct v4l
@@ -4098,6 +4107,18 @@ int __v4l2_ctrl_s_ctrl_string(struct v4l
}
EXPORT_SYMBOL(__v4l2_ctrl_s_ctrl_string);

@@ -820,7 +820,7 @@ Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
default:
return -EINVAL;
}
@@ -2434,6 +2532,15 @@ static struct v4l2_ctrl *v4l2_ctrl_new(s
@@ -2442,6 +2540,15 @@ static struct v4l2_ctrl *v4l2_ctrl_new(s
case V4L2_CTRL_TYPE_VP8_FRAME_HEADER:
elem_size = sizeof(struct v4l2_ctrl_vp8_frame_header);
break;

@@ -106,7 +106,7 @@ Signed-off-by: Jernej Skrabec <jernej.skrabec@siol.net>
case V4L2_CTRL_TYPE_AREA:
area = p;
if (!area->width || !area->height)
@@ -2541,6 +2548,9 @@ static struct v4l2_ctrl *v4l2_ctrl_new(s
@@ -2549,6 +2556,9 @@ static struct v4l2_ctrl *v4l2_ctrl_new(s
case V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS:
elem_size = sizeof(struct v4l2_ctrl_hevc_slice_params);
break;

@@ -22,7 +22,7 @@ Signed-off-by: Maxime Ripard <maxime@cerno.tech>
#define MAX_TUNING_LOOP 40
@@ -2756,7 +2756,7 @@ static void sdhci_timeout_timer(struct t
@@ -2758,7 +2758,7 @@ static void sdhci_timeout_timer(struct t
spin_lock_irqsave(&host->lock, flags);
if (host->cmd && !sdhci_data_line_cmd(host->cmd)) {
@@ -31,7 +31,7 @@ Signed-off-by: Maxime Ripard <maxime@cerno.tech>
mmc_hostname(host->mmc));
sdhci_dumpregs(host);
@@ -2778,7 +2778,7 @@ static void sdhci_timeout_data_timer(str
@@ -2780,7 +2780,7 @@ static void sdhci_timeout_data_timer(str
if (host->data || host->data_cmd ||
(host->cmd && sdhci_data_line_cmd(host->cmd))) {

@@ -23,7 +23,7 @@ Signed-off-by: Phil Elwell <phil@raspberrypi.com>
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -3115,6 +3115,7 @@ static int __spi_validate_bits_per_word(
@@ -3110,6 +3110,7 @@ static int __spi_validate_bits_per_word(
*/
int spi_setup(struct spi_device *spi)
{
@@ -31,7 +31,7 @@ Signed-off-by: Phil Elwell <phil@raspberrypi.com>
unsigned bad_bits, ugly_bits;
int status;
@@ -3132,6 +3133,14 @@ int spi_setup(struct spi_device *spi)
@@ -3127,6 +3128,14 @@ int spi_setup(struct spi_device *spi)
(SPI_TX_DUAL | SPI_TX_QUAD | SPI_TX_OCTAL |
SPI_RX_DUAL | SPI_RX_QUAD | SPI_RX_OCTAL)))
return -EINVAL;

@@ -15,7 +15,7 @@ Signed-off-by: Phil Elwell <phil@raspberrypi.com>
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -3127,8 +3127,8 @@ int spi_setup(struct spi_device *spi)
@@ -3122,8 +3122,8 @@ int spi_setup(struct spi_device *spi)
if (ctlr->use_gpio_descriptors && ctlr->cs_gpiods &&
ctlr->cs_gpiods[spi->chip_select] && !(spi->mode & SPI_CS_HIGH)) {

@@ -29,7 +29,7 @@ Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
#define dprintk(vdev, fmt, arg...) do { \
if (!WARN_ON(!(vdev)) && ((vdev)->dev_debug & V4L2_DEV_DEBUG_CTRL)) \
@@ -4578,3 +4579,42 @@ __poll_t v4l2_ctrl_poll(struct file *fil
@@ -4589,3 +4590,42 @@ __poll_t v4l2_ctrl_poll(struct file *fil
return 0;
}
EXPORT_SYMBOL(v4l2_ctrl_poll);

@@ -90,7 +90,7 @@ it on BCM4708 family.
/*
* Reset a halted HC.
*
@@ -604,10 +647,20 @@ static int xhci_init(struct usb_hcd *hcd
@@ -608,10 +651,20 @@ static int xhci_init(struct usb_hcd *hcd
static int xhci_run_finished(struct xhci_hcd *xhci)
{
@@ -114,7 +114,7 @@ it on BCM4708 family.
xhci->shared_hcd->state = HC_STATE_RUNNING;
xhci->cmd_ring_state = CMD_RING_STATE_RUNNING;
@@ -617,6 +670,10 @@ static int xhci_run_finished(struct xhci
@@ -621,6 +674,10 @@ static int xhci_run_finished(struct xhci
xhci_dbg_trace(xhci, trace_xhci_dbg_init,
"Finished xhci_run for USB3 roothub");
return 0;

@@ -183,7 +183,7 @@ Link: https://lore.kernel.org/linux-mtd/20200522121524.4161539-6-noltari@gmail.c
nand_writereg(ctrl, acc_control_offs, tmp);
brcmnand_set_sector_size_1k(host, cfg->sector_size_1k);
@@ -2524,6 +2589,8 @@ const struct dev_pm_ops brcmnand_pm_ops
@@ -2530,6 +2595,8 @@ const struct dev_pm_ops brcmnand_pm_ops
EXPORT_SYMBOL_GPL(brcmnand_pm_ops);
static const struct of_device_id brcmnand_of_match[] = {

@@ -0,0 +1,34 @@
From cf0d2fbaae9e962d91a321de75e0d4f9f9ccbdfe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Fern=C3=A1ndez=20Rojas?= <noltari@gmail.com>
Date: Thu, 21 Jan 2021 18:17:37 +0100
Subject: [PATCH] nand: brcmnand: fix OOB R/W with Hamming ECC
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Hamming ECC doesn't cover the OOB data, so reading or writing OOB shall
always be done without ECC enabled.
This is a problem when adding JFFS2 cleanmarkers to erased blocks. When JFFS2
cleanmarkers are added to the OOB with ECC enabled, OOB bytes will be changed
from ff ff ff to 00 00 00, reporting incorrect ECC errors.
Signed-off-by: Álvaro Fernández Rojas <noltari@gmail.com>
---
drivers/mtd/nand/raw/brcmnand/brcmnand.c | 6 ++++++
1 file changed, 6 insertions(+)
--- a/drivers/mtd/nand/raw/brcmnand/brcmnand.c
+++ b/drivers/mtd/nand/raw/brcmnand/brcmnand.c
@@ -2432,6 +2432,12 @@ static int brcmnand_attach_chip(struct n
chip->ecc.read_oob = brcmnand_read_oob_raw;
}
+ /* If OOB is written with ECC enabled it will cause ECC errors */
+ if (is_hamming_ecc(host->ctrl, &host->hwcfg)) {
+ chip->ecc.write_oob = brcmnand_write_oob_raw;
+ chip->ecc.read_oob = brcmnand_read_oob_raw;
+ }
+
return ret;
}

@@ -1,75 +0,0 @@
From 3007b05df4301aad179acc6ca1c3645785576df6 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <alobakin@pm.me>
Date: Mon, 19 Apr 2021 12:53:06 +0000
Subject: gro: fix napi_gro_frags() Fast GRO breakage due to IP
alignment check
Commit 7ad18ff6449cbd6beb26b53128ddf56d2685aa93 upstream.
Commit 38ec4944b593 ("gro: ensure frag0 meets IP header alignment")
did the right thing, but missed the fact that napi_gro_frags() logics
calls for skb_gro_reset_offset() *before* pulling Ethernet header
to the skb linear space.
That said, the introduced check for frag0 address being aligned to 4
always fails for it as Ethernet header is obviously 14 bytes long,
and in case with NET_IP_ALIGN its start is not aligned to 4.
Fix this by adding @nhoff argument to skb_gro_reset_offset() which
tells if an IP header is placed right at the start of frag0 or not.
This restores Fast GRO for napi_gro_frags() that became very slow
after the mentioned commit, and preserves the introduced check to
avoid silent unaligned accesses.
From v1 [0]:
- inline tiny skb_gro_reset_offset() to let the code be optimized
more efficiently (esp. for the !NET_IP_ALIGN case) (Eric);
- pull in Reviewed-by from Eric.
[0] https://lore.kernel.org/netdev/20210418114200.5839-1-alobakin@pm.me
Fixes: 38ec4944b593 ("gro: ensure frag0 meets IP header alignment")
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Alexander Lobakin <alobakin@pm.me>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
net/core/dev.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5395,7 +5395,7 @@ static struct list_head *gro_list_prepar
return head;
}
-static void skb_gro_reset_offset(struct sk_buff *skb)
+static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff)
{
const struct skb_shared_info *pinfo = skb_shinfo(skb);
const skb_frag_t *frag0 = &pinfo->frags[0];
@@ -5407,7 +5407,7 @@ static void skb_gro_reset_offset(struct
if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
pinfo->nr_frags &&
!PageHighMem(skb_frag_page(frag0)) &&
- (!NET_IP_ALIGN || !(skb_frag_off(frag0) & 3))) {
+ (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) {
NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
skb_frag_size(frag0),
@@ -5640,7 +5640,7 @@ gro_result_t napi_gro_receive(struct nap
skb_mark_napi_id(skb, napi);
trace_napi_gro_receive_entry(skb);
- skb_gro_reset_offset(skb);
+ skb_gro_reset_offset(skb, 0);
ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
trace_napi_gro_receive_exit(ret);
@@ -5733,7 +5733,7 @@ static struct sk_buff *napi_frags_skb(st
napi->skb = NULL;
skb_reset_mac_header(skb);
- skb_gro_reset_offset(skb);
+ skb_gro_reset_offset(skb, hlen);
if (unlikely(skb_gro_header_hard(skb, hlen))) {
eth = skb_gro_header_slow(skb, hlen, 0);
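A minimal standalone illustration (not kernel code) of the arithmetic behind the changed check in the backport above, which is presumably dropped here because the fix is included in 5.4.119. It assumes NET_IP_ALIGN == 2 and a plain 14-byte Ethernet header; the variable names are illustrative, not kernel identifiers. The start of frag0 is not 4-byte aligned, but the IP header that follows the pulled Ethernet header is, so passing @nhoff keeps fast GRO enabled for napi_gro_frags().

#include <stdio.h>

int main(void)
{
	unsigned int frag0_off = 2;	/* assumed: NET_IP_ALIGN == 2 */
	unsigned int nhoff = 14;	/* Ethernet header pulled later by napi_frags_skb() */

	/* check before the fix: 2 & 3 != 0, so fast GRO was disabled */
	printf("frag0_off & 3           = %u\n", frag0_off & 3);

	/* check after the fix: (2 + 14) & 3 == 0, so fast GRO stays enabled */
	printf("(frag0_off + nhoff) & 3 = %u\n", (frag0_off + nhoff) & 3);

	return 0;
}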

@@ -0,0 +1,112 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 8 Nov 2019 13:22:07 +0100
Subject: [PATCH] crypto: lib - tidy up lib/crypto Kconfig and Makefile
commit 746b2e024c67aa605ac12d135cd7085a49cf9dc4 upstream.
In preparation of introducing a set of crypto library interfaces, tidy
up the Makefile and split off the Kconfig symbols into a separate file.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
crypto/Kconfig | 13 +------------
lib/crypto/Kconfig | 15 +++++++++++++++
lib/crypto/Makefile | 16 ++++++++--------
3 files changed, 24 insertions(+), 20 deletions(-)
create mode 100644 lib/crypto/Kconfig
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -878,9 +878,6 @@ config CRYPTO_SHA1_PPC_SPE
SHA-1 secure hash standard (DFIPS 180-4) implemented
using powerpc SPE SIMD instruction set.
-config CRYPTO_LIB_SHA256
- tristate
-
config CRYPTO_SHA256
tristate "SHA224 and SHA256 digest algorithm"
select CRYPTO_HASH
@@ -1019,9 +1016,6 @@ config CRYPTO_GHASH_CLMUL_NI_INTEL
comment "Ciphers"
-config CRYPTO_LIB_AES
- tristate
-
config CRYPTO_AES
tristate "AES cipher algorithms"
select CRYPTO_ALGAPI
@@ -1150,9 +1144,6 @@ config CRYPTO_ANUBIS
<https://www.cosic.esat.kuleuven.be/nessie/reports/>
<http://www.larc.usp.br/~pbarreto/AnubisPage.html>
-config CRYPTO_LIB_ARC4
- tristate
-
config CRYPTO_ARC4
tristate "ARC4 cipher algorithm"
select CRYPTO_BLKCIPHER
@@ -1339,9 +1330,6 @@ config CRYPTO_CAST6_AVX_X86_64
This module provides the Cast6 cipher algorithm that processes
eight blocks parallel using the AVX instruction set.
-config CRYPTO_LIB_DES
- tristate
-
config CRYPTO_DES
tristate "DES and Triple DES EDE cipher algorithms"
select CRYPTO_ALGAPI
@@ -1845,6 +1833,7 @@ config CRYPTO_STATS
config CRYPTO_HASH_INFO
bool
+source "lib/crypto/Kconfig"
source "drivers/crypto/Kconfig"
source "crypto/asymmetric_keys/Kconfig"
source "certs/Kconfig"
--- /dev/null
+++ b/lib/crypto/Kconfig
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0
+
+comment "Crypto library routines"
+
+config CRYPTO_LIB_AES
+ tristate
+
+config CRYPTO_LIB_ARC4
+ tristate
+
+config CRYPTO_LIB_DES
+ tristate
+
+config CRYPTO_LIB_SHA256
+ tristate
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -1,13 +1,13 @@
# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_CRYPTO_LIB_AES) += libaes.o
-libaes-y := aes.o
+obj-$(CONFIG_CRYPTO_LIB_AES) += libaes.o
+libaes-y := aes.o
-obj-$(CONFIG_CRYPTO_LIB_ARC4) += libarc4.o
-libarc4-y := arc4.o
+obj-$(CONFIG_CRYPTO_LIB_ARC4) += libarc4.o
+libarc4-y := arc4.o
-obj-$(CONFIG_CRYPTO_LIB_DES) += libdes.o
-libdes-y := des.o
+obj-$(CONFIG_CRYPTO_LIB_DES) += libdes.o
+libdes-y := des.o
-obj-$(CONFIG_CRYPTO_LIB_SHA256) += libsha256.o
-libsha256-y := sha256.o
+obj-$(CONFIG_CRYPTO_LIB_SHA256) += libsha256.o
+libsha256-y := sha256.o

@@ -0,0 +1,668 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 8 Nov 2019 13:22:08 +0100
Subject: [PATCH] crypto: chacha - move existing library code into lib/crypto
commit 5fb8ef25803ef33e2eb60b626435828b937bed75 upstream.
Currently, our generic ChaCha implementation consists of a permute
function in lib/chacha.c that operates on the 64-byte ChaCha state
directly [and which is always included into the core kernel since it
is used by the /dev/random driver], and the crypto API plumbing to
expose it as a skcipher.
In order to support in-kernel users that need the ChaCha streamcipher
but have no need [or tolerance] for going through the abstractions of
the crypto API, let's expose the streamcipher bits via a library API
as well, in a way that permits the implementation to be superseded by
an architecture specific one if provided.
So move the streamcipher code into a separate module in lib/crypto,
and expose the init() and crypt() routines to users of the library.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/arm/crypto/chacha-neon-glue.c | 2 +-
arch/arm64/crypto/chacha-neon-glue.c | 2 +-
arch/x86/crypto/chacha_glue.c | 2 +-
crypto/Kconfig | 1 +
crypto/chacha_generic.c | 60 ++--------------------
include/crypto/chacha.h | 77 ++++++++++++++++++++++------
include/crypto/internal/chacha.h | 53 +++++++++++++++++++
lib/Makefile | 3 +-
lib/crypto/Kconfig | 26 ++++++++++
lib/crypto/Makefile | 4 ++
lib/{ => crypto}/chacha.c | 20 ++++----
lib/crypto/libchacha.c | 35 +++++++++++++
12 files changed, 199 insertions(+), 86 deletions(-)
create mode 100644 include/crypto/internal/chacha.h
rename lib/{ => crypto}/chacha.c (88%)
create mode 100644 lib/crypto/libchacha.c
--- a/arch/arm/crypto/chacha-neon-glue.c
+++ b/arch/arm/crypto/chacha-neon-glue.c
@@ -20,7 +20,7 @@
*/
#include <crypto/algapi.h>
-#include <crypto/chacha.h>
+#include <crypto/internal/chacha.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <linux/kernel.h>
--- a/arch/arm64/crypto/chacha-neon-glue.c
+++ b/arch/arm64/crypto/chacha-neon-glue.c
@@ -20,7 +20,7 @@
*/
#include <crypto/algapi.h>
-#include <crypto/chacha.h>
+#include <crypto/internal/chacha.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <linux/kernel.h>
--- a/arch/x86/crypto/chacha_glue.c
+++ b/arch/x86/crypto/chacha_glue.c
@@ -7,7 +7,7 @@
*/
#include <crypto/algapi.h>
-#include <crypto/chacha.h>
+#include <crypto/internal/chacha.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <linux/kernel.h>
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1393,6 +1393,7 @@ config CRYPTO_SALSA20
config CRYPTO_CHACHA20
tristate "ChaCha stream cipher algorithms"
+ select CRYPTO_LIB_CHACHA_GENERIC
select CRYPTO_BLKCIPHER
help
The ChaCha20, XChaCha20, and XChaCha12 stream cipher algorithms.
--- a/crypto/chacha_generic.c
+++ b/crypto/chacha_generic.c
@@ -8,29 +8,10 @@
#include <asm/unaligned.h>
#include <crypto/algapi.h>
-#include <crypto/chacha.h>
+#include <crypto/internal/chacha.h>
#include <crypto/internal/skcipher.h>
#include <linux/module.h>
-static void chacha_docrypt(u32 *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
-{
- /* aligned to potentially speed up crypto_xor() */
- u8 stream[CHACHA_BLOCK_SIZE] __aligned(sizeof(long));
-
- while (bytes >= CHACHA_BLOCK_SIZE) {
- chacha_block(state, stream, nrounds);
- crypto_xor_cpy(dst, src, stream, CHACHA_BLOCK_SIZE);
- bytes -= CHACHA_BLOCK_SIZE;
- dst += CHACHA_BLOCK_SIZE;
- src += CHACHA_BLOCK_SIZE;
- }
- if (bytes) {
- chacha_block(state, stream, nrounds);
- crypto_xor_cpy(dst, src, stream, bytes);
- }
-}
-
static int chacha_stream_xor(struct skcipher_request *req,
const struct chacha_ctx *ctx, const u8 *iv)
{
@@ -48,8 +29,8 @@ static int chacha_stream_xor(struct skci
if (nbytes < walk.total)
nbytes = round_down(nbytes, CHACHA_BLOCK_SIZE);
- chacha_docrypt(state, walk.dst.virt.addr, walk.src.virt.addr,
- nbytes, ctx->nrounds);
+ chacha_crypt_generic(state, walk.dst.virt.addr,
+ walk.src.virt.addr, nbytes, ctx->nrounds);
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
@@ -58,41 +39,10 @@ static int chacha_stream_xor(struct skci
void crypto_chacha_init(u32 *state, const struct chacha_ctx *ctx, const u8 *iv)
{
- state[0] = 0x61707865; /* "expa" */
- state[1] = 0x3320646e; /* "nd 3" */
- state[2] = 0x79622d32; /* "2-by" */
- state[3] = 0x6b206574; /* "te k" */
- state[4] = ctx->key[0];
- state[5] = ctx->key[1];
- state[6] = ctx->key[2];
- state[7] = ctx->key[3];
- state[8] = ctx->key[4];
- state[9] = ctx->key[5];
- state[10] = ctx->key[6];
- state[11] = ctx->key[7];
- state[12] = get_unaligned_le32(iv + 0);
- state[13] = get_unaligned_le32(iv + 4);
- state[14] = get_unaligned_le32(iv + 8);
- state[15] = get_unaligned_le32(iv + 12);
+ chacha_init_generic(state, ctx->key, iv);
}
EXPORT_SYMBOL_GPL(crypto_chacha_init);
-static int chacha_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keysize, int nrounds)
-{
- struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
- int i;
-
- if (keysize != CHACHA_KEY_SIZE)
- return -EINVAL;
-
- for (i = 0; i < ARRAY_SIZE(ctx->key); i++)
- ctx->key[i] = get_unaligned_le32(key + i * sizeof(u32));
-
- ctx->nrounds = nrounds;
- return 0;
-}
-
int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keysize)
{
@@ -126,7 +76,7 @@ int crypto_xchacha_crypt(struct skcipher
/* Compute the subkey given the original key and first 128 nonce bits */
crypto_chacha_init(state, ctx, req->iv);
- hchacha_block(state, subctx.key, ctx->nrounds);
+ hchacha_block_generic(state, subctx.key, ctx->nrounds);
subctx.nrounds = ctx->nrounds;
/* Build the real IV */
--- a/include/crypto/chacha.h
+++ b/include/crypto/chacha.h
@@ -15,9 +15,8 @@
#ifndef _CRYPTO_CHACHA_H
#define _CRYPTO_CHACHA_H
-#include <crypto/skcipher.h>
+#include <asm/unaligned.h>
#include <linux/types.h>
-#include <linux/crypto.h>
/* 32-bit stream position, then 96-bit nonce (RFC7539 convention) */
#define CHACHA_IV_SIZE 16
@@ -29,26 +28,70 @@
/* 192-bit nonce, then 64-bit stream position */
#define XCHACHA_IV_SIZE 32
-struct chacha_ctx {
- u32 key[8];
- int nrounds;
-};
-
-void chacha_block(u32 *state, u8 *stream, int nrounds);
+void chacha_block_generic(u32 *state, u8 *stream, int nrounds);
static inline void chacha20_block(u32 *state, u8 *stream)
{
- chacha_block(state, stream, 20);
+ chacha_block_generic(state, stream, 20);
}
-void hchacha_block(const u32 *in, u32 *out, int nrounds);
-void crypto_chacha_init(u32 *state, const struct chacha_ctx *ctx, const u8 *iv);
+void hchacha_block_arch(const u32 *state, u32 *out, int nrounds);
+void hchacha_block_generic(const u32 *state, u32 *out, int nrounds);
+
+static inline void hchacha_block(const u32 *state, u32 *out, int nrounds)
+{
+ if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA))
+ hchacha_block_arch(state, out, nrounds);
+ else
+ hchacha_block_generic(state, out, nrounds);
+}
-int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keysize);
-int crypto_chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keysize);
+void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv);
+static inline void chacha_init_generic(u32 *state, const u32 *key, const u8 *iv)
+{
+ state[0] = 0x61707865; /* "expa" */
+ state[1] = 0x3320646e; /* "nd 3" */
+ state[2] = 0x79622d32; /* "2-by" */
+ state[3] = 0x6b206574; /* "te k" */
+ state[4] = key[0];
+ state[5] = key[1];
+ state[6] = key[2];
+ state[7] = key[3];
+ state[8] = key[4];
+ state[9] = key[5];
+ state[10] = key[6];
+ state[11] = key[7];
+ state[12] = get_unaligned_le32(iv + 0);
+ state[13] = get_unaligned_le32(iv + 4);
+ state[14] = get_unaligned_le32(iv + 8);
+ state[15] = get_unaligned_le32(iv + 12);
+}
-int crypto_chacha_crypt(struct skcipher_request *req);
-int crypto_xchacha_crypt(struct skcipher_request *req);
+static inline void chacha_init(u32 *state, const u32 *key, const u8 *iv)
+{
+ if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA))
+ chacha_init_arch(state, key, iv);
+ else
+ chacha_init_generic(state, key, iv);
+}
+
+void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds);
+void chacha_crypt_generic(u32 *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds);
+
+static inline void chacha_crypt(u32 *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds)
+{
+ if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA))
+ chacha_crypt_arch(state, dst, src, bytes, nrounds);
+ else
+ chacha_crypt_generic(state, dst, src, bytes, nrounds);
+}
+
+static inline void chacha20_crypt(u32 *state, u8 *dst, const u8 *src,
+ unsigned int bytes)
+{
+ chacha_crypt(state, dst, src, bytes, 20);
+}
#endif /* _CRYPTO_CHACHA_H */
--- /dev/null
+++ b/include/crypto/internal/chacha.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _CRYPTO_INTERNAL_CHACHA_H
+#define _CRYPTO_INTERNAL_CHACHA_H
+
+#include <crypto/chacha.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/crypto.h>
+
+struct chacha_ctx {
+ u32 key[8];
+ int nrounds;
+};
+
+void crypto_chacha_init(u32 *state, const struct chacha_ctx *ctx, const u8 *iv);
+
+static inline int chacha_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keysize, int nrounds)
+{
+ struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+ int i;
+
+ if (keysize != CHACHA_KEY_SIZE)
+ return -EINVAL;
+
+ for (i = 0; i < ARRAY_SIZE(ctx->key); i++)
+ ctx->key[i] = get_unaligned_le32(key + i * sizeof(u32));
+
+ ctx->nrounds = nrounds;
+ return 0;
+}
+
+static inline int chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keysize)
+{
+ return chacha_setkey(tfm, key, keysize, 20);
+}
+
+static int inline chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keysize)
+{
+ return chacha_setkey(tfm, key, keysize, 12);
+}
+
+int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keysize);
+int crypto_chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keysize);
+
+int crypto_chacha_crypt(struct skcipher_request *req);
+int crypto_xchacha_crypt(struct skcipher_request *req);
+
+#endif /* _CRYPTO_CHACHA_H */
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -26,8 +26,7 @@ endif
lib-y := ctype.o string.o vsprintf.o cmdline.o \
rbtree.o radix-tree.o timerqueue.o xarray.o \
- idr.o extable.o \
- sha1.o chacha.o irq_regs.o argv_split.o \
+ idr.o extable.o sha1.o irq_regs.o argv_split.o \
flex_proportions.o ratelimit.o show_mem.o \
is_single_threaded.o plist.o decompress.o kobject_uevent.o \
earlycpio.o seq_buf.o siphash.o dec_and_lock.o \
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -8,6 +8,32 @@ config CRYPTO_LIB_AES
config CRYPTO_LIB_ARC4
tristate
+config CRYPTO_ARCH_HAVE_LIB_CHACHA
+ tristate
+ help
+ Declares whether the architecture provides an arch-specific
+ accelerated implementation of the ChaCha library interface,
+ either builtin or as a module.
+
+config CRYPTO_LIB_CHACHA_GENERIC
+ tristate
+ select CRYPTO_ALGAPI
+ help
+ This symbol can be depended upon by arch implementations of the
+ ChaCha library interface that require the generic code as a
+ fallback, e.g., for SIMD implementations. If no arch specific
+ implementation is enabled, this implementation serves the users
+ of CRYPTO_LIB_CHACHA.
+
+config CRYPTO_LIB_CHACHA
+ tristate "ChaCha library interface"
+ depends on CRYPTO_ARCH_HAVE_LIB_CHACHA || !CRYPTO_ARCH_HAVE_LIB_CHACHA
+ select CRYPTO_LIB_CHACHA_GENERIC if CRYPTO_ARCH_HAVE_LIB_CHACHA=n
+ help
+ Enable the ChaCha library interface. This interface may be fulfilled
+ by either the generic implementation or an arch-specific one, if one
+ is available and enabled.
+
config CRYPTO_LIB_DES
tristate
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -1,5 +1,9 @@
# SPDX-License-Identifier: GPL-2.0
+# chacha is used by the /dev/random driver which is always builtin
+obj-y += chacha.o
+obj-$(CONFIG_CRYPTO_LIB_CHACHA_GENERIC) += libchacha.o
+
obj-$(CONFIG_CRYPTO_LIB_AES) += libaes.o
libaes-y := aes.o
--- a/lib/chacha.c
+++ /dev/null
@@ -1,113 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * The "hash function" used as the core of the ChaCha stream cipher (RFC7539)
- *
- * Copyright (C) 2015 Martin Willi
- */
-
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/bitops.h>
-#include <linux/cryptohash.h>
-#include <asm/unaligned.h>
-#include <crypto/chacha.h>
-
-static void chacha_permute(u32 *x, int nrounds)
-{
- int i;
-
- /* whitelist the allowed round counts */
- WARN_ON_ONCE(nrounds != 20 && nrounds != 12);
-
- for (i = 0; i < nrounds; i += 2) {
- x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 16);
- x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 16);
- x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 16);
- x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 16);
-
- x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 12);
- x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 12);
- x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 12);
- x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 12);
-
- x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 8);
- x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 8);
- x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 8);
- x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 8);
-
- x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 7);
- x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 7);
- x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 7);
- x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 7);
-
- x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 16);
- x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 16);
- x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 16);
- x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 16);
-
- x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 12);
- x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 12);
- x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 12);
- x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 12);
-
- x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 8);
- x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 8);
- x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 8);
- x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 8);
-
- x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 7);
- x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 7);
- x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 7);
- x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 7);
- }
-}
-
-/**
- * chacha_block - generate one keystream block and increment block counter
- * @state: input state matrix (16 32-bit words)
- * @stream: output keystream block (64 bytes)
- * @nrounds: number of rounds (20 or 12; 20 is recommended)
- *
- * This is the ChaCha core, a function from 64-byte strings to 64-byte strings.
- * The caller has already converted the endianness of the input. This function
- * also handles incrementing the block counter in the input matrix.
- */
-void chacha_block(u32 *state, u8 *stream, int nrounds)
-{
- u32 x[16];
- int i;
-
- memcpy(x, state, 64);
-
- chacha_permute(x, nrounds);
-
- for (i = 0; i < ARRAY_SIZE(x); i++)
- put_unaligned_le32(x[i] + state[i], &stream[i * sizeof(u32)]);
-
- state[12]++;
-}
-EXPORT_SYMBOL(chacha_block);
-
-/**
- * hchacha_block - abbreviated ChaCha core, for XChaCha
- * @in: input state matrix (16 32-bit words)
- * @out: output (8 32-bit words)
- * @nrounds: number of rounds (20 or 12; 20 is recommended)
- *
- * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step
- * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf). HChaCha
- * skips the final addition of the initial state, and outputs only certain words
- * of the state. It should not be used for streaming directly.
- */
-void hchacha_block(const u32 *in, u32 *out, int nrounds)
-{
- u32 x[16];
-
- memcpy(x, in, 64);
-
- chacha_permute(x, nrounds);
-
- memcpy(&out[0], &x[0], 16);
- memcpy(&out[4], &x[12], 16);
-}
-EXPORT_SYMBOL(hchacha_block);
--- /dev/null
+++ b/lib/crypto/chacha.c
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * The "hash function" used as the core of the ChaCha stream cipher (RFC7539)
+ *
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <linux/bug.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/bitops.h>
+#include <linux/string.h>
+#include <linux/cryptohash.h>
+#include <asm/unaligned.h>
+#include <crypto/chacha.h>
+
+static void chacha_permute(u32 *x, int nrounds)
+{
+ int i;
+
+ /* whitelist the allowed round counts */
+ WARN_ON_ONCE(nrounds != 20 && nrounds != 12);
+
+ for (i = 0; i < nrounds; i += 2) {
+ x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 16);
+ x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 16);
+ x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 16);
+ x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 16);
+
+ x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 12);
+ x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 12);
+ x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 12);
+ x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 12);
+
+ x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 8);
+ x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 8);
+ x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 8);
+ x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 8);
+
+ x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 7);
+ x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 7);
+ x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 7);
+ x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 7);
+
+ x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 16);
+ x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 16);
+ x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 16);
+ x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 16);
+
+ x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 12);
+ x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 12);
+ x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 12);
+ x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 12);
+
+ x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 8);
+ x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 8);
+ x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 8);
+ x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 8);
+
+ x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 7);
+ x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 7);
+ x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 7);
+ x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 7);
+ }
+}
+
+/**
+ * chacha_block - generate one keystream block and increment block counter
+ * @state: input state matrix (16 32-bit words)
+ * @stream: output keystream block (64 bytes)
+ * @nrounds: number of rounds (20 or 12; 20 is recommended)
+ *
+ * This is the ChaCha core, a function from 64-byte strings to 64-byte strings.
+ * The caller has already converted the endianness of the input. This function
+ * also handles incrementing the block counter in the input matrix.
+ */
+void chacha_block_generic(u32 *state, u8 *stream, int nrounds)
+{
+ u32 x[16];
+ int i;
+
+ memcpy(x, state, 64);
+
+ chacha_permute(x, nrounds);
+
+ for (i = 0; i < ARRAY_SIZE(x); i++)
+ put_unaligned_le32(x[i] + state[i], &stream[i * sizeof(u32)]);
+
+ state[12]++;
+}
+EXPORT_SYMBOL(chacha_block_generic);
+
+/**
+ * hchacha_block_generic - abbreviated ChaCha core, for XChaCha
+ * @state: input state matrix (16 32-bit words)
+ * @out: output (8 32-bit words)
+ * @nrounds: number of rounds (20 or 12; 20 is recommended)
+ *
+ * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step
+ * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf). HChaCha
+ * skips the final addition of the initial state, and outputs only certain words
+ * of the state. It should not be used for streaming directly.
+ */
+void hchacha_block_generic(const u32 *state, u32 *stream, int nrounds)
+{
+ u32 x[16];
+
+ memcpy(x, state, 64);
+
+ chacha_permute(x, nrounds);
+
+ memcpy(&stream[0], &x[0], 16);
+ memcpy(&stream[4], &x[12], 16);
+}
+EXPORT_SYMBOL(hchacha_block_generic);
--- /dev/null
+++ b/lib/crypto/libchacha.c
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * The ChaCha stream cipher (RFC7539)
+ *
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/module.h>
+
+#include <crypto/algapi.h> // for crypto_xor_cpy
+#include <crypto/chacha.h>
+
+void chacha_crypt_generic(u32 *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds)
+{
+ /* aligned to potentially speed up crypto_xor() */
+ u8 stream[CHACHA_BLOCK_SIZE] __aligned(sizeof(long));
+
+ while (bytes >= CHACHA_BLOCK_SIZE) {
+ chacha_block_generic(state, stream, nrounds);
+ crypto_xor_cpy(dst, src, stream, CHACHA_BLOCK_SIZE);
+ bytes -= CHACHA_BLOCK_SIZE;
+ dst += CHACHA_BLOCK_SIZE;
+ src += CHACHA_BLOCK_SIZE;
+ }
+ if (bytes) {
+ chacha_block_generic(state, stream, nrounds);
+ crypto_xor_cpy(dst, src, stream, bytes);
+ }
+}
+EXPORT_SYMBOL(chacha_crypt_generic);
+
+MODULE_LICENSE("GPL");
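A brief sketch of how an in-kernel user might call the library interface declared in include/crypto/chacha.h above, assuming CONFIG_CRYPTO_LIB_CHACHA is selected. The wrapper function and its key/IV handling are illustrative assumptions, not part of the patch; chacha_init() and chacha_crypt() dispatch to an arch implementation when CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA is enabled and otherwise fall back to the generic code moved into lib/crypto here.

#include <crypto/chacha.h>
#include <linux/types.h>

/* Hypothetical caller of the ChaCha library API (sketch only). */
static void example_chacha20_xor(u8 *dst, const u8 *src, unsigned int len,
				 const u32 key[8], const u8 iv[CHACHA_IV_SIZE])
{
	u32 state[16];	/* constants, 256-bit key, 32-bit counter, 96-bit nonce */

	chacha_init(state, key, iv);		/* iv = counter || nonce, 16 bytes */
	chacha_crypt(state, dst, src, len, 20);	/* 20 rounds: ChaCha20 */
}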

@@ -0,0 +1,192 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 8 Nov 2019 13:22:09 +0100
Subject: [PATCH] crypto: x86/chacha - depend on generic chacha library instead
of crypto driver
commit 28e8d89b1ce8d2e7badfb5f69971dd635acb8863 upstream.
In preparation of extending the x86 ChaCha driver to also expose the ChaCha
library interface, drop the dependency on the chacha_generic crypto driver
as a non-SIMD fallback, and depend on the generic ChaCha library directly.
This way, we only pull in the code we actually need, without registering
a set of ChaCha skciphers that we will never use.
Since turning the FPU on and off is cheap these days, simplify the SIMD
routine by dropping the per-page yield, which makes for a cleaner switch
to the library API as well. This also allows us to invoke the skcipher
walk routines in non-atomic mode.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/x86/crypto/chacha_glue.c | 90 ++++++++++++++---------------------
crypto/Kconfig | 2 +-
2 files changed, 36 insertions(+), 56 deletions(-)
--- a/arch/x86/crypto/chacha_glue.c
+++ b/arch/x86/crypto/chacha_glue.c
@@ -123,37 +123,38 @@ static void chacha_dosimd(u32 *state, u8
}
}
-static int chacha_simd_stream_xor(struct skcipher_walk *walk,
+static int chacha_simd_stream_xor(struct skcipher_request *req,
const struct chacha_ctx *ctx, const u8 *iv)
{
u32 *state, state_buf[16 + 2] __aligned(8);
- int next_yield = 4096; /* bytes until next FPU yield */
- int err = 0;
+ struct skcipher_walk walk;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
- crypto_chacha_init(state, ctx, iv);
+ chacha_init_generic(state, ctx->key, iv);
- while (walk->nbytes > 0) {
- unsigned int nbytes = walk->nbytes;
+ while (walk.nbytes > 0) {
+ unsigned int nbytes = walk.nbytes;
- if (nbytes < walk->total) {
- nbytes = round_down(nbytes, walk->stride);
- next_yield -= nbytes;
- }
-
- chacha_dosimd(state, walk->dst.virt.addr, walk->src.virt.addr,
- nbytes, ctx->nrounds);
+ if (nbytes < walk.total)
+ nbytes = round_down(nbytes, walk.stride);
- if (next_yield <= 0) {
- /* temporarily allow preemption */
- kernel_fpu_end();
+ if (!crypto_simd_usable()) {
+ chacha_crypt_generic(state, walk.dst.virt.addr,
+ walk.src.virt.addr, nbytes,
+ ctx->nrounds);
+ } else {
kernel_fpu_begin();
- next_yield = 4096;
+ chacha_dosimd(state, walk.dst.virt.addr,
+ walk.src.virt.addr, nbytes,
+ ctx->nrounds);
+ kernel_fpu_end();
}
-
- err = skcipher_walk_done(walk, walk->nbytes - nbytes);
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
return err;
@@ -163,55 +164,34 @@ static int chacha_simd(struct skcipher_r
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
- int err;
-
- if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
- return crypto_chacha_crypt(req);
- err = skcipher_walk_virt(&walk, req, true);
- if (err)
- return err;
-
- kernel_fpu_begin();
- err = chacha_simd_stream_xor(&walk, ctx, req->iv);
- kernel_fpu_end();
- return err;
+ return chacha_simd_stream_xor(req, ctx, req->iv);
}
static int xchacha_simd(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
- struct chacha_ctx subctx;
u32 *state, state_buf[16 + 2] __aligned(8);
+ struct chacha_ctx subctx;
u8 real_iv[16];
- int err;
-
- if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
- return crypto_xchacha_crypt(req);
-
- err = skcipher_walk_virt(&walk, req, true);
- if (err)
- return err;
BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
- crypto_chacha_init(state, ctx, req->iv);
+ chacha_init_generic(state, ctx->key, req->iv);
- kernel_fpu_begin();
-
- hchacha_block_ssse3(state, subctx.key, ctx->nrounds);
+ if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) {
+ kernel_fpu_begin();
+ hchacha_block_ssse3(state, subctx.key, ctx->nrounds);
+ kernel_fpu_end();
+ } else {
+ hchacha_block_generic(state, subctx.key, ctx->nrounds);
+ }
subctx.nrounds = ctx->nrounds;
memcpy(&real_iv[0], req->iv + 24, 8);
memcpy(&real_iv[8], req->iv + 16, 8);
- err = chacha_simd_stream_xor(&walk, &subctx, real_iv);
-
- kernel_fpu_end();
-
- return err;
+ return chacha_simd_stream_xor(req, &subctx, real_iv);
}
static struct skcipher_alg algs[] = {
@@ -227,7 +207,7 @@ static struct skcipher_alg algs[] = {
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
- .setkey = crypto_chacha20_setkey,
+ .setkey = chacha20_setkey,
.encrypt = chacha_simd,
.decrypt = chacha_simd,
}, {
@@ -242,7 +222,7 @@ static struct skcipher_alg algs[] = {
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
- .setkey = crypto_chacha20_setkey,
+ .setkey = chacha20_setkey,
.encrypt = xchacha_simd,
.decrypt = xchacha_simd,
}, {
@@ -257,7 +237,7 @@ static struct skcipher_alg algs[] = {
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
- .setkey = crypto_chacha12_setkey,
+ .setkey = chacha12_setkey,
.encrypt = xchacha_simd,
.decrypt = xchacha_simd,
},
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1417,7 +1417,7 @@ config CRYPTO_CHACHA20_X86_64
tristate "ChaCha stream cipher algorithms (x86_64/SSSE3/AVX2/AVX-512VL)"
depends on X86 && 64BIT
select CRYPTO_BLKCIPHER
- select CRYPTO_CHACHA20
+ select CRYPTO_LIB_CHACHA_GENERIC
help
SSSE3, AVX2, and AVX-512VL optimized implementations of the ChaCha20,
XChaCha20, and XChaCha12 stream ciphers.

@@ -0,0 +1,205 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 8 Nov 2019 13:22:10 +0100
Subject: [PATCH] crypto: x86/chacha - expose SIMD ChaCha routine as library
function
commit 84e03fa39fbe95a5567d43bff458c6d3b3a23ad1 upstream.
Wire the existing x86 SIMD ChaCha code into the new ChaCha library
interface, so that users of the library interface will get the
accelerated version when available.
Given that calls into the library API will always go through the
routines in this module if it is enabled, switch to static keys
to select the optimal implementation available (which may be none
at all, in which case we defer to the generic implementation for
all invocations).
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/x86/crypto/chacha_glue.c | 91 +++++++++++++++++++++++++----------
crypto/Kconfig | 1 +
include/crypto/chacha.h | 6 +++
3 files changed, 73 insertions(+), 25 deletions(-)
--- a/arch/x86/crypto/chacha_glue.c
+++ b/arch/x86/crypto/chacha_glue.c
@@ -21,24 +21,24 @@ asmlinkage void chacha_block_xor_ssse3(u
asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);
-#ifdef CONFIG_AS_AVX2
+
asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
-static bool chacha_use_avx2;
-#ifdef CONFIG_AS_AVX512
+
asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
-static bool chacha_use_avx512vl;
-#endif
-#endif
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl);
static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
{
@@ -49,9 +49,8 @@ static unsigned int chacha_advance(unsig
static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
unsigned int bytes, int nrounds)
{
-#ifdef CONFIG_AS_AVX2
-#ifdef CONFIG_AS_AVX512
- if (chacha_use_avx512vl) {
+ if (IS_ENABLED(CONFIG_AS_AVX512) &&
+ static_branch_likely(&chacha_use_avx512vl)) {
while (bytes >= CHACHA_BLOCK_SIZE * 8) {
chacha_8block_xor_avx512vl(state, dst, src, bytes,
nrounds);
@@ -79,8 +78,9 @@ static void chacha_dosimd(u32 *state, u8
return;
}
}
-#endif
- if (chacha_use_avx2) {
+
+ if (IS_ENABLED(CONFIG_AS_AVX2) &&
+ static_branch_likely(&chacha_use_avx2)) {
while (bytes >= CHACHA_BLOCK_SIZE * 8) {
chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
bytes -= CHACHA_BLOCK_SIZE * 8;
@@ -104,7 +104,7 @@ static void chacha_dosimd(u32 *state, u8
return;
}
}
-#endif
+
while (bytes >= CHACHA_BLOCK_SIZE * 4) {
chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
bytes -= CHACHA_BLOCK_SIZE * 4;
@@ -123,6 +123,43 @@ static void chacha_dosimd(u32 *state, u8
}
}
+void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
+{
+ state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
+
+ if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) {
+ hchacha_block_generic(state, stream, nrounds);
+ } else {
+ kernel_fpu_begin();
+ hchacha_block_ssse3(state, stream, nrounds);
+ kernel_fpu_end();
+ }
+}
+EXPORT_SYMBOL(hchacha_block_arch);
+
+void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
+{
+ state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
+
+ chacha_init_generic(state, key, iv);
+}
+EXPORT_SYMBOL(chacha_init_arch);
+
+void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
+ int nrounds)
+{
+ state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
+
+ if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() ||
+ bytes <= CHACHA_BLOCK_SIZE)
+ return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
+ kernel_fpu_begin();
+ chacha_dosimd(state, dst, src, bytes, nrounds);
+ kernel_fpu_end();
+}
+EXPORT_SYMBOL(chacha_crypt_arch);
+
static int chacha_simd_stream_xor(struct skcipher_request *req,
const struct chacha_ctx *ctx, const u8 *iv)
{
@@ -143,7 +180,8 @@ static int chacha_simd_stream_xor(struct
if (nbytes < walk.total)
nbytes = round_down(nbytes, walk.stride);
- if (!crypto_simd_usable()) {
+ if (!static_branch_likely(&chacha_use_simd) ||
+ !crypto_simd_usable()) {
chacha_crypt_generic(state, walk.dst.virt.addr,
walk.src.virt.addr, nbytes,
ctx->nrounds);
@@ -246,18 +284,21 @@ static struct skcipher_alg algs[] = {
static int __init chacha_simd_mod_init(void)
{
if (!boot_cpu_has(X86_FEATURE_SSSE3))
- return -ENODEV;
+ return 0;
-#ifdef CONFIG_AS_AVX2
- chacha_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
- boot_cpu_has(X86_FEATURE_AVX2) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
-#ifdef CONFIG_AS_AVX512
- chacha_use_avx512vl = chacha_use_avx2 &&
- boot_cpu_has(X86_FEATURE_AVX512VL) &&
- boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */
-#endif
-#endif
+ static_branch_enable(&chacha_use_simd);
+
+ if (IS_ENABLED(CONFIG_AS_AVX2) &&
+ boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
+ static_branch_enable(&chacha_use_avx2);
+
+ if (IS_ENABLED(CONFIG_AS_AVX512) &&
+ boot_cpu_has(X86_FEATURE_AVX512VL) &&
+ boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */
+ static_branch_enable(&chacha_use_avx512vl);
+ }
return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
}
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1418,6 +1418,7 @@ config CRYPTO_CHACHA20_X86_64
depends on X86 && 64BIT
select CRYPTO_BLKCIPHER
select CRYPTO_LIB_CHACHA_GENERIC
+ select CRYPTO_ARCH_HAVE_LIB_CHACHA
help
SSSE3, AVX2, and AVX-512VL optimized implementations of the ChaCha20,
XChaCha20, and XChaCha12 stream ciphers.
--- a/include/crypto/chacha.h
+++ b/include/crypto/chacha.h
@@ -25,6 +25,12 @@
#define CHACHA_BLOCK_SIZE 64
#define CHACHAPOLY_IV_SIZE 12
+#ifdef CONFIG_X86_64
+#define CHACHA_STATE_WORDS ((CHACHA_BLOCK_SIZE + 12) / sizeof(u32))
+#else
+#define CHACHA_STATE_WORDS (CHACHA_BLOCK_SIZE / sizeof(u32))
+#endif
+
/* 192-bit nonce, then 64-bit stream position */
#define XCHACHA_IV_SIZE 32
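For reference, the static-key dispatch pattern used in the patch above, reduced to a hedged sketch. The function names are illustrative; boot_cpu_has()/X86_FEATURE_SSSE3 mirror the check in chacha_simd_mod_init() above, and the jump-label helpers come from <linux/jump_label.h> (the x86 cpufeature include is an assumption about where boot_cpu_has() is declared).

#include <linux/init.h>
#include <linux/jump_label.h>
#include <asm/cpufeature.h>	/* assumed: boot_cpu_has(), X86_FEATURE_SSSE3 */

/* Key defaults to false, i.e. the generic path, as in the patch above. */
static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_simd);

static void example_dispatch(void)	/* illustrative name */
{
	if (static_branch_likely(&use_simd)) {
		/* SIMD path: branch is patched in at runtime, ~free when disabled */
	} else {
		/* generic fallback, also taken when the CPU lacks SSSE3 */
	}
}

static int __init example_init(void)
{
	/* Flip the key once at init time if the CPU qualifies. */
	if (boot_cpu_has(X86_FEATURE_SSSE3))
		static_branch_enable(&use_simd);
	return 0;
}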

@@ -0,0 +1,129 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 8 Nov 2019 13:22:11 +0100
Subject: [PATCH] crypto: arm64/chacha - depend on generic chacha library
instead of crypto driver
commit c77da4867cbb7841177275dbb250f5c09679fae4 upstream.
Depend on the generic ChaCha library routines instead of pulling in the
generic ChaCha skcipher driver, which is more than we need, and makes
managing the dependencies between the generic library, generic driver,
accelerated library and driver more complicated.
While at it, drop the logic to prefer the scalar code on short inputs.
Turning the NEON on and off is cheap these days, and one major use case
for ChaCha20 is ChaCha20-Poly1305, which is guaranteed to hit the scalar
path upon every invocation (when doing the Poly1305 nonce generation)
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/arm64/crypto/Kconfig | 2 +-
arch/arm64/crypto/chacha-neon-glue.c | 40 +++++++++++++++-------------
2 files changed, 23 insertions(+), 19 deletions(-)
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -103,7 +103,7 @@ config CRYPTO_CHACHA20_NEON
tristate "ChaCha20, XChaCha20, and XChaCha12 stream ciphers using NEON instructions"
depends on KERNEL_MODE_NEON
select CRYPTO_BLKCIPHER
- select CRYPTO_CHACHA20
+ select CRYPTO_LIB_CHACHA_GENERIC
config CRYPTO_NHPOLY1305_NEON
tristate "NHPoly1305 hash function using NEON instructions (for Adiantum)"
--- a/arch/arm64/crypto/chacha-neon-glue.c
+++ b/arch/arm64/crypto/chacha-neon-glue.c
@@ -68,7 +68,7 @@ static int chacha_neon_stream_xor(struct
err = skcipher_walk_virt(&walk, req, false);
- crypto_chacha_init(state, ctx, iv);
+ chacha_init_generic(state, ctx->key, iv);
while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes;
@@ -76,10 +76,16 @@ static int chacha_neon_stream_xor(struct
if (nbytes < walk.total)
nbytes = rounddown(nbytes, walk.stride);
- kernel_neon_begin();
- chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
- nbytes, ctx->nrounds);
- kernel_neon_end();
+ if (!crypto_simd_usable()) {
+ chacha_crypt_generic(state, walk.dst.virt.addr,
+ walk.src.virt.addr, nbytes,
+ ctx->nrounds);
+ } else {
+ kernel_neon_begin();
+ chacha_doneon(state, walk.dst.virt.addr,
+ walk.src.virt.addr, nbytes, ctx->nrounds);
+ kernel_neon_end();
+ }
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
@@ -91,9 +97,6 @@ static int chacha_neon(struct skcipher_r
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
- if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
- return crypto_chacha_crypt(req);
-
return chacha_neon_stream_xor(req, ctx, req->iv);
}
@@ -105,14 +108,15 @@ static int xchacha_neon(struct skcipher_
u32 state[16];
u8 real_iv[16];
- if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
- return crypto_xchacha_crypt(req);
-
- crypto_chacha_init(state, ctx, req->iv);
+ chacha_init_generic(state, ctx->key, req->iv);
- kernel_neon_begin();
- hchacha_block_neon(state, subctx.key, ctx->nrounds);
- kernel_neon_end();
+ if (crypto_simd_usable()) {
+ kernel_neon_begin();
+ hchacha_block_neon(state, subctx.key, ctx->nrounds);
+ kernel_neon_end();
+ } else {
+ hchacha_block_generic(state, subctx.key, ctx->nrounds);
+ }
subctx.nrounds = ctx->nrounds;
memcpy(&real_iv[0], req->iv + 24, 8);
@@ -134,7 +138,7 @@ static struct skcipher_alg algs[] = {
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 5 * CHACHA_BLOCK_SIZE,
- .setkey = crypto_chacha20_setkey,
+ .setkey = chacha20_setkey,
.encrypt = chacha_neon,
.decrypt = chacha_neon,
}, {
@@ -150,7 +154,7 @@ static struct skcipher_alg algs[] = {
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 5 * CHACHA_BLOCK_SIZE,
- .setkey = crypto_chacha20_setkey,
+ .setkey = chacha20_setkey,
.encrypt = xchacha_neon,
.decrypt = xchacha_neon,
}, {
@@ -166,7 +170,7 @@ static struct skcipher_alg algs[] = {
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 5 * CHACHA_BLOCK_SIZE,
- .setkey = crypto_chacha12_setkey,
+ .setkey = chacha12_setkey,
.encrypt = xchacha_neon,
.decrypt = xchacha_neon,
}

@@ -0,0 +1,138 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 8 Nov 2019 13:22:12 +0100
Subject: [PATCH] crypto: arm64/chacha - expose arm64 ChaCha routine as library
function
commit b3aad5bad26a01a4bd8c49a5c5f52aec665f3b7c upstream.
Expose the accelerated NEON ChaCha routine directly as a symbol
export so that users of the ChaCha library API can use it directly.
Given that calls into the library API will always go through the
routines in this module if it is enabled, switch to static keys
to select the optimal implementation available (which may be none
at all, in which case we defer to the generic implementation for
all invocations).
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/arm64/crypto/Kconfig | 1 +
arch/arm64/crypto/chacha-neon-glue.c | 53 ++++++++++++++++++++++------
2 files changed, 43 insertions(+), 11 deletions(-)
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -104,6 +104,7 @@ config CRYPTO_CHACHA20_NEON
depends on KERNEL_MODE_NEON
select CRYPTO_BLKCIPHER
select CRYPTO_LIB_CHACHA_GENERIC
+ select CRYPTO_ARCH_HAVE_LIB_CHACHA
config CRYPTO_NHPOLY1305_NEON
tristate "NHPoly1305 hash function using NEON instructions (for Adiantum)"
--- a/arch/arm64/crypto/chacha-neon-glue.c
+++ b/arch/arm64/crypto/chacha-neon-glue.c
@@ -23,6 +23,7 @@
#include <crypto/internal/chacha.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
+#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>
@@ -36,6 +37,8 @@ asmlinkage void chacha_4block_xor_neon(u
int nrounds, int bytes);
asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
int bytes, int nrounds)
{
@@ -59,6 +62,37 @@ static void chacha_doneon(u32 *state, u8
}
}
+void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
+{
+ if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) {
+ hchacha_block_generic(state, stream, nrounds);
+ } else {
+ kernel_neon_begin();
+ hchacha_block_neon(state, stream, nrounds);
+ kernel_neon_end();
+ }
+}
+EXPORT_SYMBOL(hchacha_block_arch);
+
+void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
+{
+ chacha_init_generic(state, key, iv);
+}
+EXPORT_SYMBOL(chacha_init_arch);
+
+void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
+ int nrounds)
+{
+ if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE ||
+ !crypto_simd_usable())
+ return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
+ kernel_neon_begin();
+ chacha_doneon(state, dst, src, bytes, nrounds);
+ kernel_neon_end();
+}
+EXPORT_SYMBOL(chacha_crypt_arch);
+
static int chacha_neon_stream_xor(struct skcipher_request *req,
const struct chacha_ctx *ctx, const u8 *iv)
{
@@ -76,7 +110,8 @@ static int chacha_neon_stream_xor(struct
if (nbytes < walk.total)
nbytes = rounddown(nbytes, walk.stride);
- if (!crypto_simd_usable()) {
+ if (!static_branch_likely(&have_neon) ||
+ !crypto_simd_usable()) {
chacha_crypt_generic(state, walk.dst.virt.addr,
walk.src.virt.addr, nbytes,
ctx->nrounds);
@@ -109,14 +144,7 @@ static int xchacha_neon(struct skcipher_
u8 real_iv[16];
chacha_init_generic(state, ctx->key, req->iv);
-
- if (crypto_simd_usable()) {
- kernel_neon_begin();
- hchacha_block_neon(state, subctx.key, ctx->nrounds);
- kernel_neon_end();
- } else {
- hchacha_block_generic(state, subctx.key, ctx->nrounds);
- }
+ hchacha_block_arch(state, subctx.key, ctx->nrounds);
subctx.nrounds = ctx->nrounds;
memcpy(&real_iv[0], req->iv + 24, 8);
@@ -179,14 +207,17 @@ static struct skcipher_alg algs[] = {
static int __init chacha_simd_mod_init(void)
{
if (!cpu_have_named_feature(ASIMD))
- return -ENODEV;
+ return 0;
+
+ static_branch_enable(&have_neon);
return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
}
static void __exit chacha_simd_mod_fini(void)
{
- crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+ if (cpu_have_named_feature(ASIMD))
+ crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}
module_init(chacha_simd_mod_init);
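
Editor's note: the patch above keys the choice of implementation off a have_neon static key that is enabled once in chacha_simd_mod_init(). A rough userspace model of that dispatch, with a plain bool standing in for the jump label and placeholder function names; the kernel version additionally checks crypto_simd_usable() before entering NEON:

/* Userspace model of the static-key dispatch in chacha_crypt_arch() above.
 * In the kernel, have_neon is a DEFINE_STATIC_KEY_FALSE() jump label patched
 * at init time; a plain bool stands in for it here, and the function names
 * are stand-ins, not the exported kernel symbols.
 */
#include <stdbool.h>
#include <stdio.h>

static bool have_neon;	/* analogue of static_branch_enable(&have_neon) */

static void chacha_generic(unsigned int bytes) { printf("generic: %u\n", bytes); }
static void chacha_neon(unsigned int bytes)    { printf("neon: %u\n", bytes); }

/* Library entry point: always safe to call, picks the best backend. */
static void chacha_crypt(unsigned int bytes)
{
	/* Small inputs and missing NEON fall back to the generic code,
	 * matching the bytes <= CHACHA_BLOCK_SIZE test in the patch. */
	if (!have_neon || bytes <= 64) {
		chacha_generic(bytes);
		return;
	}
	chacha_neon(bytes);
}

int main(void)
{
	chacha_crypt(256);	/* generic: NEON not detected yet */
	have_neon = true;	/* module init detected ASIMD */
	chacha_crypt(256);	/* neon */
	chacha_crypt(32);	/* generic: too small to pay for kernel_neon_begin() */
	return 0;
}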

View File

@ -0,0 +1,480 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 8 Nov 2019 13:22:13 +0100
Subject: [PATCH] crypto: arm/chacha - import Eric Biggers's scalar accelerated
ChaCha code
commit 29621d099f9c642b22a69dc8e7e20c108473a392 upstream.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/arm/crypto/chacha-scalar-core.S | 461 +++++++++++++++++++++++++++
1 file changed, 461 insertions(+)
create mode 100644 arch/arm/crypto/chacha-scalar-core.S
--- /dev/null
+++ b/arch/arm/crypto/chacha-scalar-core.S
@@ -0,0 +1,461 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2018 Google, Inc.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/*
+ * Design notes:
+ *
+ * 16 registers would be needed to hold the state matrix, but only 14 are
+ * available because 'sp' and 'pc' cannot be used. So we spill the elements
+ * (x8, x9) to the stack and swap them out with (x10, x11). This adds one
+ * 'ldrd' and one 'strd' instruction per round.
+ *
+ * All rotates are performed using the implicit rotate operand accepted by the
+ * 'add' and 'eor' instructions. This is faster than using explicit rotate
+ * instructions. To make this work, we allow the values in the second and last
+ * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
+ * wrong rotation amount. The rotation amount is then fixed up just in time
+ * when the values are used. 'brot' is the number of bits the values in row 'b'
+ * need to be rotated right to arrive at the correct values, and 'drot'
+ * similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
+ * that they end up as (25, 24) after every round.
+ */
+
+ // ChaCha state registers
+ X0 .req r0
+ X1 .req r1
+ X2 .req r2
+ X3 .req r3
+ X4 .req r4
+ X5 .req r5
+ X6 .req r6
+ X7 .req r7
+ X8_X10 .req r8 // shared by x8 and x10
+ X9_X11 .req r9 // shared by x9 and x11
+ X12 .req r10
+ X13 .req r11
+ X14 .req r12
+ X15 .req r14
+
+.Lexpand_32byte_k:
+ // "expand 32-byte k"
+ .word 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
+
+#ifdef __thumb2__
+# define adrl adr
+#endif
+
+.macro __rev out, in, t0, t1, t2
+.if __LINUX_ARM_ARCH__ >= 6
+ rev \out, \in
+.else
+ lsl \t0, \in, #24
+ and \t1, \in, #0xff00
+ and \t2, \in, #0xff0000
+ orr \out, \t0, \in, lsr #24
+ orr \out, \out, \t1, lsl #8
+ orr \out, \out, \t2, lsr #8
+.endif
+.endm
+
+.macro _le32_bswap x, t0, t1, t2
+#ifdef __ARMEB__
+ __rev \x, \x, \t0, \t1, \t2
+#endif
+.endm
+
+.macro _le32_bswap_4x a, b, c, d, t0, t1, t2
+ _le32_bswap \a, \t0, \t1, \t2
+ _le32_bswap \b, \t0, \t1, \t2
+ _le32_bswap \c, \t0, \t1, \t2
+ _le32_bswap \d, \t0, \t1, \t2
+.endm
+
+.macro __ldrd a, b, src, offset
+#if __LINUX_ARM_ARCH__ >= 6
+ ldrd \a, \b, [\src, #\offset]
+#else
+ ldr \a, [\src, #\offset]
+ ldr \b, [\src, #\offset + 4]
+#endif
+.endm
+
+.macro __strd a, b, dst, offset
+#if __LINUX_ARM_ARCH__ >= 6
+ strd \a, \b, [\dst, #\offset]
+#else
+ str \a, [\dst, #\offset]
+ str \b, [\dst, #\offset + 4]
+#endif
+.endm
+
+.macro _halfround a1, b1, c1, d1, a2, b2, c2, d2
+
+ // a += b; d ^= a; d = rol(d, 16);
+ add \a1, \a1, \b1, ror #brot
+ add \a2, \a2, \b2, ror #brot
+ eor \d1, \a1, \d1, ror #drot
+ eor \d2, \a2, \d2, ror #drot
+ // drot == 32 - 16 == 16
+
+ // c += d; b ^= c; b = rol(b, 12);
+ add \c1, \c1, \d1, ror #16
+ add \c2, \c2, \d2, ror #16
+ eor \b1, \c1, \b1, ror #brot
+ eor \b2, \c2, \b2, ror #brot
+ // brot == 32 - 12 == 20
+
+ // a += b; d ^= a; d = rol(d, 8);
+ add \a1, \a1, \b1, ror #20
+ add \a2, \a2, \b2, ror #20
+ eor \d1, \a1, \d1, ror #16
+ eor \d2, \a2, \d2, ror #16
+ // drot == 32 - 8 == 24
+
+ // c += d; b ^= c; b = rol(b, 7);
+ add \c1, \c1, \d1, ror #24
+ add \c2, \c2, \d2, ror #24
+ eor \b1, \c1, \b1, ror #20
+ eor \b2, \c2, \b2, ror #20
+ // brot == 32 - 7 == 25
+.endm
+
+.macro _doubleround
+
+ // column round
+
+ // quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
+ _halfround X0, X4, X8_X10, X12, X1, X5, X9_X11, X13
+
+ // save (x8, x9); restore (x10, x11)
+ __strd X8_X10, X9_X11, sp, 0
+ __ldrd X8_X10, X9_X11, sp, 8
+
+ // quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
+ _halfround X2, X6, X8_X10, X14, X3, X7, X9_X11, X15
+
+ .set brot, 25
+ .set drot, 24
+
+ // diagonal round
+
+ // quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
+ _halfround X0, X5, X8_X10, X15, X1, X6, X9_X11, X12
+
+ // save (x10, x11); restore (x8, x9)
+ __strd X8_X10, X9_X11, sp, 8
+ __ldrd X8_X10, X9_X11, sp, 0
+
+ // quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
+ _halfround X2, X7, X8_X10, X13, X3, X4, X9_X11, X14
+.endm
+
+.macro _chacha_permute nrounds
+ .set brot, 0
+ .set drot, 0
+ .rept \nrounds / 2
+ _doubleround
+ .endr
+.endm
+
+.macro _chacha nrounds
+
+.Lnext_block\@:
+ // Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
+ // Registers contain x0-x9,x12-x15.
+
+ // Do the core ChaCha permutation to update x0-x15.
+ _chacha_permute \nrounds
+
+ add sp, #8
+ // Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
+ // Registers contain x0-x9,x12-x15.
+ // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
+
+ // Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
+ push {X8_X10, X9_X11, X12, X13, X14, X15}
+
+ // Load (OUT, IN, LEN).
+ ldr r14, [sp, #96]
+ ldr r12, [sp, #100]
+ ldr r11, [sp, #104]
+
+ orr r10, r14, r12
+
+ // Use slow path if fewer than 64 bytes remain.
+ cmp r11, #64
+ blt .Lxor_slowpath\@
+
+ // Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on
+ // ARMv6+, since ldmia and stmia (used below) still require alignment.
+ tst r10, #3
+ bne .Lxor_slowpath\@
+
+ // Fast path: XOR 64 bytes of aligned data.
+
+ // Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
+ // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
+ // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
+
+ // x0-x3
+ __ldrd r8, r9, sp, 32
+ __ldrd r10, r11, sp, 40
+ add X0, X0, r8
+ add X1, X1, r9
+ add X2, X2, r10
+ add X3, X3, r11
+ _le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
+ ldmia r12!, {r8-r11}
+ eor X0, X0, r8
+ eor X1, X1, r9
+ eor X2, X2, r10
+ eor X3, X3, r11
+ stmia r14!, {X0-X3}
+
+ // x4-x7
+ __ldrd r8, r9, sp, 48
+ __ldrd r10, r11, sp, 56
+ add X4, r8, X4, ror #brot
+ add X5, r9, X5, ror #brot
+ ldmia r12!, {X0-X3}
+ add X6, r10, X6, ror #brot
+ add X7, r11, X7, ror #brot
+ _le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
+ eor X4, X4, X0
+ eor X5, X5, X1
+ eor X6, X6, X2
+ eor X7, X7, X3
+ stmia r14!, {X4-X7}
+
+ // x8-x15
+ pop {r0-r7} // (x8-x9,x12-x15,x10-x11)
+ __ldrd r8, r9, sp, 32
+ __ldrd r10, r11, sp, 40
+ add r0, r0, r8 // x8
+ add r1, r1, r9 // x9
+ add r6, r6, r10 // x10
+ add r7, r7, r11 // x11
+ _le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
+ ldmia r12!, {r8-r11}
+ eor r0, r0, r8 // x8
+ eor r1, r1, r9 // x9
+ eor r6, r6, r10 // x10
+ eor r7, r7, r11 // x11
+ stmia r14!, {r0,r1,r6,r7}
+ ldmia r12!, {r0,r1,r6,r7}
+ __ldrd r8, r9, sp, 48
+ __ldrd r10, r11, sp, 56
+ add r2, r8, r2, ror #drot // x12
+ add r3, r9, r3, ror #drot // x13
+ add r4, r10, r4, ror #drot // x14
+ add r5, r11, r5, ror #drot // x15
+ _le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
+ ldr r9, [sp, #72] // load LEN
+ eor r2, r2, r0 // x12
+ eor r3, r3, r1 // x13
+ eor r4, r4, r6 // x14
+ eor r5, r5, r7 // x15
+ subs r9, #64 // decrement and check LEN
+ stmia r14!, {r2-r5}
+
+ beq .Ldone\@
+
+.Lprepare_for_next_block\@:
+
+ // Stack: x0-x15 OUT IN LEN
+
+ // Increment block counter (x12)
+ add r8, #1
+
+ // Store updated (OUT, IN, LEN)
+ str r14, [sp, #64]
+ str r12, [sp, #68]
+ str r9, [sp, #72]
+
+ mov r14, sp
+
+ // Store updated block counter (x12)
+ str r8, [sp, #48]
+
+ sub sp, #16
+
+ // Reload state and do next block
+ ldmia r14!, {r0-r11} // load x0-x11
+ __strd r10, r11, sp, 8 // store x10-x11 before state
+ ldmia r14, {r10-r12,r14} // load x12-x15
+ b .Lnext_block\@
+
+.Lxor_slowpath\@:
+ // Slow path: < 64 bytes remaining, or unaligned input or output buffer.
+ // We handle it by storing the 64 bytes of keystream to the stack, then
+ // XOR-ing the needed portion with the data.
+
+ // Allocate keystream buffer
+ sub sp, #64
+ mov r14, sp
+
+ // Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
+ // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
+ // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
+
+ // Save keystream for x0-x3
+ __ldrd r8, r9, sp, 96
+ __ldrd r10, r11, sp, 104
+ add X0, X0, r8
+ add X1, X1, r9
+ add X2, X2, r10
+ add X3, X3, r11
+ _le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
+ stmia r14!, {X0-X3}
+
+ // Save keystream for x4-x7
+ __ldrd r8, r9, sp, 112
+ __ldrd r10, r11, sp, 120
+ add X4, r8, X4, ror #brot
+ add X5, r9, X5, ror #brot
+ add X6, r10, X6, ror #brot
+ add X7, r11, X7, ror #brot
+ _le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
+ add r8, sp, #64
+ stmia r14!, {X4-X7}
+
+ // Save keystream for x8-x15
+ ldm r8, {r0-r7} // (x8-x9,x12-x15,x10-x11)
+ __ldrd r8, r9, sp, 128
+ __ldrd r10, r11, sp, 136
+ add r0, r0, r8 // x8
+ add r1, r1, r9 // x9
+ add r6, r6, r10 // x10
+ add r7, r7, r11 // x11
+ _le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
+ stmia r14!, {r0,r1,r6,r7}
+ __ldrd r8, r9, sp, 144
+ __ldrd r10, r11, sp, 152
+ add r2, r8, r2, ror #drot // x12
+ add r3, r9, r3, ror #drot // x13
+ add r4, r10, r4, ror #drot // x14
+ add r5, r11, r5, ror #drot // x15
+ _le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
+ stmia r14, {r2-r5}
+
+ // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
+ // Registers: r8 is block counter, r12 is IN.
+
+ ldr r9, [sp, #168] // LEN
+ ldr r14, [sp, #160] // OUT
+ cmp r9, #64
+ mov r0, sp
+ movle r1, r9
+ movgt r1, #64
+ // r1 is number of bytes to XOR, in range [1, 64]
+
+.if __LINUX_ARM_ARCH__ < 6
+ orr r2, r12, r14
+ tst r2, #3 // IN or OUT misaligned?
+ bne .Lxor_next_byte\@
+.endif
+
+ // XOR a word at a time
+.rept 16
+ subs r1, #4
+ blt .Lxor_words_done\@
+ ldr r2, [r12], #4
+ ldr r3, [r0], #4
+ eor r2, r2, r3
+ str r2, [r14], #4
+.endr
+ b .Lxor_slowpath_done\@
+.Lxor_words_done\@:
+ ands r1, r1, #3
+ beq .Lxor_slowpath_done\@
+
+ // XOR a byte at a time
+.Lxor_next_byte\@:
+ ldrb r2, [r12], #1
+ ldrb r3, [r0], #1
+ eor r2, r2, r3
+ strb r2, [r14], #1
+ subs r1, #1
+ bne .Lxor_next_byte\@
+
+.Lxor_slowpath_done\@:
+ subs r9, #64
+ add sp, #96
+ bgt .Lprepare_for_next_block\@
+
+.Ldone\@:
+.endm // _chacha
+
+/*
+ * void chacha20_arm(u8 *out, const u8 *in, size_t len, const u32 key[8],
+ * const u32 iv[4]);
+ */
+ENTRY(chacha20_arm)
+ cmp r2, #0 // len == 0?
+ reteq lr
+
+ push {r0-r2,r4-r11,lr}
+
+ // Push state x0-x15 onto stack.
+ // Also store an extra copy of x10-x11 just before the state.
+
+ ldr r4, [sp, #48] // iv
+ mov r0, sp
+ sub sp, #80
+
+ // iv: x12-x15
+ ldm r4, {X12,X13,X14,X15}
+ stmdb r0!, {X12,X13,X14,X15}
+
+ // key: x4-x11
+ __ldrd X8_X10, X9_X11, r3, 24
+ __strd X8_X10, X9_X11, sp, 8
+ stmdb r0!, {X8_X10, X9_X11}
+ ldm r3, {X4-X9_X11}
+ stmdb r0!, {X4-X9_X11}
+
+ // constants: x0-x3
+ adrl X3, .Lexpand_32byte_k
+ ldm X3, {X0-X3}
+ __strd X0, X1, sp, 16
+ __strd X2, X3, sp, 24
+
+ _chacha 20
+
+ add sp, #76
+ pop {r4-r11, pc}
+ENDPROC(chacha20_arm)
+
+/*
+ * void hchacha20_arm(const u32 state[16], u32 out[8]);
+ */
+ENTRY(hchacha20_arm)
+ push {r1,r4-r11,lr}
+
+ mov r14, r0
+ ldmia r14!, {r0-r11} // load x0-x11
+ push {r10-r11} // store x10-x11 to stack
+ ldm r14, {r10-r12,r14} // load x12-x15
+ sub sp, #8
+
+ _chacha_permute 20
+
+ // Skip over (unused0-unused1, x10-x11)
+ add sp, #16
+
+ // Fix up rotations of x12-x15
+ ror X12, X12, #drot
+ ror X13, X13, #drot
+ pop {r4} // load 'out'
+ ror X14, X14, #drot
+ ror X15, X15, #drot
+
+ // Store (x0-x3,x12-x15) to 'out'
+ stm r4, {X0,X1,X2,X3,X12,X13,X14,X15}
+
+ pop {r4-r11,pc}
+ENDPROC(hchacha20_arm)
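
Editor's note: the design notes at the top of this file describe deferring the quarter-round rotations on rows 'b' and 'd' and consuming them later through the ror # operand. A small C check of the underlying identity, rol(x, r) == ror(x, 32 - r); rol32()/ror32() here are local helpers, not the kernel's:

/* Worked example of the deferred-rotation trick described in the design
 * notes above: instead of materializing b = rol(b, 12), the ARM code keeps
 * the un-rotated value and remembers brot = 32 - 12, then reads the operand
 * as "b, ror #brot" at every later use.  Plain C, just to show the identity.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t ror32(uint32_t x, unsigned int r)
{
	r &= 31;
	return r ? (x >> r) | (x << (32 - r)) : x;
}

static uint32_t rol32(uint32_t x, unsigned int r)
{
	return ror32(x, (32 - r) & 31);
}

int main(void)
{
	uint32_t b = 0x12345678;

	/* Eager form: rotate now. */
	uint32_t eager = rol32(b, 12);

	/* Lazy form: keep b as-is, remember how far it still has to go. */
	unsigned int brot = 32 - 12;	/* 20, as in the comment "brot == 20" */
	uint32_t lazy_at_use = ror32(b, brot);

	assert(eager == lazy_at_use);
	printf("rol(b,12) = %08x == (b ror %u) = %08x\n", eager, brot, lazy_at_use);
	return 0;
}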

View File

@ -0,0 +1,691 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 8 Nov 2019 13:22:14 +0100
Subject: [PATCH] crypto: arm/chacha - remove dependency on generic ChaCha
driver
commit b36d8c09e710c71f6a9690b6586fea2d1c9e1e27 upstream.
Instead of falling back to the generic ChaCha skcipher driver for
non-SIMD cases, use a fast scalar implementation for ARM authored
by Eric Biggers. This removes the module dependency on chacha-generic
altogether, which also simplifies things when we expose the ChaCha
library interface from this module.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/arm/crypto/Kconfig | 4 +-
arch/arm/crypto/Makefile | 3 +-
arch/arm/crypto/chacha-glue.c | 304 +++++++++++++++++++++++++++
arch/arm/crypto/chacha-neon-glue.c | 202 ------------------
arch/arm/crypto/chacha-scalar-core.S | 65 +++---
arch/arm64/crypto/chacha-neon-glue.c | 2 +-
6 files changed, 340 insertions(+), 240 deletions(-)
create mode 100644 arch/arm/crypto/chacha-glue.c
delete mode 100644 arch/arm/crypto/chacha-neon-glue.c
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -127,10 +127,8 @@ config CRYPTO_CRC32_ARM_CE
select CRYPTO_HASH
config CRYPTO_CHACHA20_NEON
- tristate "NEON accelerated ChaCha stream cipher algorithms"
- depends on KERNEL_MODE_NEON
+ tristate "NEON and scalar accelerated ChaCha stream cipher algorithms"
select CRYPTO_BLKCIPHER
- select CRYPTO_CHACHA20
config CRYPTO_NHPOLY1305_NEON
tristate "NEON accelerated NHPoly1305 hash function (for Adiantum)"
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -53,7 +53,8 @@ aes-arm-ce-y := aes-ce-core.o aes-ce-glu
ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
-chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
+chacha-neon-y := chacha-scalar-core.o chacha-glue.o
+chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
ifdef REGENERATE_ARM_CRYPTO
--- /dev/null
+++ b/arch/arm/crypto/chacha-glue.c
@@ -0,0 +1,304 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ARM NEON accelerated ChaCha and XChaCha stream ciphers,
+ * including ChaCha20 (RFC7539)
+ *
+ * Copyright (C) 2016-2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/internal/chacha.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/cputype.h>
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
+ int nrounds);
+asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
+ int nrounds);
+asmlinkage void hchacha_block_arm(const u32 *state, u32 *out, int nrounds);
+asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
+
+asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
+ const u32 *state, int nrounds);
+
+static inline bool neon_usable(void)
+{
+ return crypto_simd_usable();
+}
+
+static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds)
+{
+ u8 buf[CHACHA_BLOCK_SIZE];
+
+ while (bytes >= CHACHA_BLOCK_SIZE * 4) {
+ chacha_4block_xor_neon(state, dst, src, nrounds);
+ bytes -= CHACHA_BLOCK_SIZE * 4;
+ src += CHACHA_BLOCK_SIZE * 4;
+ dst += CHACHA_BLOCK_SIZE * 4;
+ state[12] += 4;
+ }
+ while (bytes >= CHACHA_BLOCK_SIZE) {
+ chacha_block_xor_neon(state, dst, src, nrounds);
+ bytes -= CHACHA_BLOCK_SIZE;
+ src += CHACHA_BLOCK_SIZE;
+ dst += CHACHA_BLOCK_SIZE;
+ state[12]++;
+ }
+ if (bytes) {
+ memcpy(buf, src, bytes);
+ chacha_block_xor_neon(state, buf, buf, nrounds);
+ memcpy(dst, buf, bytes);
+ }
+}
+
+static int chacha_stream_xor(struct skcipher_request *req,
+ const struct chacha_ctx *ctx, const u8 *iv,
+ bool neon)
+{
+ struct skcipher_walk walk;
+ u32 state[16];
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ chacha_init_generic(state, ctx->key, iv);
+
+ while (walk.nbytes > 0) {
+ unsigned int nbytes = walk.nbytes;
+
+ if (nbytes < walk.total)
+ nbytes = round_down(nbytes, walk.stride);
+
+ if (!neon) {
+ chacha_doarm(walk.dst.virt.addr, walk.src.virt.addr,
+ nbytes, state, ctx->nrounds);
+ state[12] += DIV_ROUND_UP(nbytes, CHACHA_BLOCK_SIZE);
+ } else {
+ kernel_neon_begin();
+ chacha_doneon(state, walk.dst.virt.addr,
+ walk.src.virt.addr, nbytes, ctx->nrounds);
+ kernel_neon_end();
+ }
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ }
+
+ return err;
+}
+
+static int do_chacha(struct skcipher_request *req, bool neon)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return chacha_stream_xor(req, ctx, req->iv, neon);
+}
+
+static int chacha_arm(struct skcipher_request *req)
+{
+ return do_chacha(req, false);
+}
+
+static int chacha_neon(struct skcipher_request *req)
+{
+ return do_chacha(req, neon_usable());
+}
+
+static int do_xchacha(struct skcipher_request *req, bool neon)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct chacha_ctx subctx;
+ u32 state[16];
+ u8 real_iv[16];
+
+ chacha_init_generic(state, ctx->key, req->iv);
+
+ if (!neon) {
+ hchacha_block_arm(state, subctx.key, ctx->nrounds);
+ } else {
+ kernel_neon_begin();
+ hchacha_block_neon(state, subctx.key, ctx->nrounds);
+ kernel_neon_end();
+ }
+ subctx.nrounds = ctx->nrounds;
+
+ memcpy(&real_iv[0], req->iv + 24, 8);
+ memcpy(&real_iv[8], req->iv + 16, 8);
+ return chacha_stream_xor(req, &subctx, real_iv, neon);
+}
+
+static int xchacha_arm(struct skcipher_request *req)
+{
+ return do_xchacha(req, false);
+}
+
+static int xchacha_neon(struct skcipher_request *req)
+{
+ return do_xchacha(req, neon_usable());
+}
+
+static struct skcipher_alg arm_algs[] = {
+ {
+ .base.cra_name = "chacha20",
+ .base.cra_driver_name = "chacha20-arm",
+ .base.cra_priority = 200,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = CHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .setkey = chacha20_setkey,
+ .encrypt = chacha_arm,
+ .decrypt = chacha_arm,
+ }, {
+ .base.cra_name = "xchacha20",
+ .base.cra_driver_name = "xchacha20-arm",
+ .base.cra_priority = 200,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = XCHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .setkey = chacha20_setkey,
+ .encrypt = xchacha_arm,
+ .decrypt = xchacha_arm,
+ }, {
+ .base.cra_name = "xchacha12",
+ .base.cra_driver_name = "xchacha12-arm",
+ .base.cra_priority = 200,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = XCHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .setkey = chacha12_setkey,
+ .encrypt = xchacha_arm,
+ .decrypt = xchacha_arm,
+ },
+};
+
+static struct skcipher_alg neon_algs[] = {
+ {
+ .base.cra_name = "chacha20",
+ .base.cra_driver_name = "chacha20-neon",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = CHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .walksize = 4 * CHACHA_BLOCK_SIZE,
+ .setkey = chacha20_setkey,
+ .encrypt = chacha_neon,
+ .decrypt = chacha_neon,
+ }, {
+ .base.cra_name = "xchacha20",
+ .base.cra_driver_name = "xchacha20-neon",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = XCHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .walksize = 4 * CHACHA_BLOCK_SIZE,
+ .setkey = chacha20_setkey,
+ .encrypt = xchacha_neon,
+ .decrypt = xchacha_neon,
+ }, {
+ .base.cra_name = "xchacha12",
+ .base.cra_driver_name = "xchacha12-neon",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = XCHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .walksize = 4 * CHACHA_BLOCK_SIZE,
+ .setkey = chacha12_setkey,
+ .encrypt = xchacha_neon,
+ .decrypt = xchacha_neon,
+ }
+};
+
+static int __init chacha_simd_mod_init(void)
+{
+ int err;
+
+ err = crypto_register_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
+ if (err)
+ return err;
+
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
+ int i;
+
+ switch (read_cpuid_part()) {
+ case ARM_CPU_PART_CORTEX_A7:
+ case ARM_CPU_PART_CORTEX_A5:
+ /*
+ * The Cortex-A7 and Cortex-A5 do not perform well with
+ * the NEON implementation but do incredibly with the
+ * scalar one and use less power.
+ */
+ for (i = 0; i < ARRAY_SIZE(neon_algs); i++)
+ neon_algs[i].base.cra_priority = 0;
+ break;
+ }
+
+ err = crypto_register_skciphers(neon_algs, ARRAY_SIZE(neon_algs));
+ if (err)
+ crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
+ }
+ return err;
+}
+
+static void __exit chacha_simd_mod_fini(void)
+{
+ crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON))
+ crypto_unregister_skciphers(neon_algs, ARRAY_SIZE(neon_algs));
+}
+
+module_init(chacha_simd_mod_init);
+module_exit(chacha_simd_mod_fini);
+
+MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (scalar and NEON accelerated)");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("chacha20");
+MODULE_ALIAS_CRYPTO("chacha20-arm");
+MODULE_ALIAS_CRYPTO("xchacha20");
+MODULE_ALIAS_CRYPTO("xchacha20-arm");
+MODULE_ALIAS_CRYPTO("xchacha12");
+MODULE_ALIAS_CRYPTO("xchacha12-arm");
+#ifdef CONFIG_KERNEL_MODE_NEON
+MODULE_ALIAS_CRYPTO("chacha20-neon");
+MODULE_ALIAS_CRYPTO("xchacha20-neon");
+MODULE_ALIAS_CRYPTO("xchacha12-neon");
+#endif
--- a/arch/arm/crypto/chacha-neon-glue.c
+++ /dev/null
@@ -1,202 +0,0 @@
-/*
- * ARM NEON accelerated ChaCha and XChaCha stream ciphers,
- * including ChaCha20 (RFC7539)
- *
- * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Based on:
- * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <crypto/algapi.h>
-#include <crypto/internal/chacha.h>
-#include <crypto/internal/simd.h>
-#include <crypto/internal/skcipher.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
- int nrounds);
-asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
- int nrounds);
-asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
-
-static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
-{
- u8 buf[CHACHA_BLOCK_SIZE];
-
- while (bytes >= CHACHA_BLOCK_SIZE * 4) {
- chacha_4block_xor_neon(state, dst, src, nrounds);
- bytes -= CHACHA_BLOCK_SIZE * 4;
- src += CHACHA_BLOCK_SIZE * 4;
- dst += CHACHA_BLOCK_SIZE * 4;
- state[12] += 4;
- }
- while (bytes >= CHACHA_BLOCK_SIZE) {
- chacha_block_xor_neon(state, dst, src, nrounds);
- bytes -= CHACHA_BLOCK_SIZE;
- src += CHACHA_BLOCK_SIZE;
- dst += CHACHA_BLOCK_SIZE;
- state[12]++;
- }
- if (bytes) {
- memcpy(buf, src, bytes);
- chacha_block_xor_neon(state, buf, buf, nrounds);
- memcpy(dst, buf, bytes);
- }
-}
-
-static int chacha_neon_stream_xor(struct skcipher_request *req,
- const struct chacha_ctx *ctx, const u8 *iv)
-{
- struct skcipher_walk walk;
- u32 state[16];
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- crypto_chacha_init(state, ctx, iv);
-
- while (walk.nbytes > 0) {
- unsigned int nbytes = walk.nbytes;
-
- if (nbytes < walk.total)
- nbytes = round_down(nbytes, walk.stride);
-
- kernel_neon_begin();
- chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
- nbytes, ctx->nrounds);
- kernel_neon_end();
- err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
- }
-
- return err;
-}
-
-static int chacha_neon(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
- return crypto_chacha_crypt(req);
-
- return chacha_neon_stream_xor(req, ctx, req->iv);
-}
-
-static int xchacha_neon(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct chacha_ctx subctx;
- u32 state[16];
- u8 real_iv[16];
-
- if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
- return crypto_xchacha_crypt(req);
-
- crypto_chacha_init(state, ctx, req->iv);
-
- kernel_neon_begin();
- hchacha_block_neon(state, subctx.key, ctx->nrounds);
- kernel_neon_end();
- subctx.nrounds = ctx->nrounds;
-
- memcpy(&real_iv[0], req->iv + 24, 8);
- memcpy(&real_iv[8], req->iv + 16, 8);
- return chacha_neon_stream_xor(req, &subctx, real_iv);
-}
-
-static struct skcipher_alg algs[] = {
- {
- .base.cra_name = "chacha20",
- .base.cra_driver_name = "chacha20-neon",
- .base.cra_priority = 300,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct chacha_ctx),
- .base.cra_module = THIS_MODULE,
-
- .min_keysize = CHACHA_KEY_SIZE,
- .max_keysize = CHACHA_KEY_SIZE,
- .ivsize = CHACHA_IV_SIZE,
- .chunksize = CHACHA_BLOCK_SIZE,
- .walksize = 4 * CHACHA_BLOCK_SIZE,
- .setkey = crypto_chacha20_setkey,
- .encrypt = chacha_neon,
- .decrypt = chacha_neon,
- }, {
- .base.cra_name = "xchacha20",
- .base.cra_driver_name = "xchacha20-neon",
- .base.cra_priority = 300,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct chacha_ctx),
- .base.cra_module = THIS_MODULE,
-
- .min_keysize = CHACHA_KEY_SIZE,
- .max_keysize = CHACHA_KEY_SIZE,
- .ivsize = XCHACHA_IV_SIZE,
- .chunksize = CHACHA_BLOCK_SIZE,
- .walksize = 4 * CHACHA_BLOCK_SIZE,
- .setkey = crypto_chacha20_setkey,
- .encrypt = xchacha_neon,
- .decrypt = xchacha_neon,
- }, {
- .base.cra_name = "xchacha12",
- .base.cra_driver_name = "xchacha12-neon",
- .base.cra_priority = 300,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct chacha_ctx),
- .base.cra_module = THIS_MODULE,
-
- .min_keysize = CHACHA_KEY_SIZE,
- .max_keysize = CHACHA_KEY_SIZE,
- .ivsize = XCHACHA_IV_SIZE,
- .chunksize = CHACHA_BLOCK_SIZE,
- .walksize = 4 * CHACHA_BLOCK_SIZE,
- .setkey = crypto_chacha12_setkey,
- .encrypt = xchacha_neon,
- .decrypt = xchacha_neon,
- }
-};
-
-static int __init chacha_simd_mod_init(void)
-{
- if (!(elf_hwcap & HWCAP_NEON))
- return -ENODEV;
-
- return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
-}
-
-static void __exit chacha_simd_mod_fini(void)
-{
- crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
-}
-
-module_init(chacha_simd_mod_init);
-module_exit(chacha_simd_mod_fini);
-
-MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (NEON accelerated)");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_CRYPTO("chacha20");
-MODULE_ALIAS_CRYPTO("chacha20-neon");
-MODULE_ALIAS_CRYPTO("xchacha20");
-MODULE_ALIAS_CRYPTO("xchacha20-neon");
-MODULE_ALIAS_CRYPTO("xchacha12");
-MODULE_ALIAS_CRYPTO("xchacha12-neon");
--- a/arch/arm/crypto/chacha-scalar-core.S
+++ b/arch/arm/crypto/chacha-scalar-core.S
@@ -41,14 +41,6 @@
X14 .req r12
X15 .req r14
-.Lexpand_32byte_k:
- // "expand 32-byte k"
- .word 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
-
-#ifdef __thumb2__
-# define adrl adr
-#endif
-
.macro __rev out, in, t0, t1, t2
.if __LINUX_ARM_ARCH__ >= 6
rev \out, \in
@@ -391,61 +383,65 @@
.endm // _chacha
/*
- * void chacha20_arm(u8 *out, const u8 *in, size_t len, const u32 key[8],
- * const u32 iv[4]);
+ * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
+ * const u32 *state, int nrounds);
*/
-ENTRY(chacha20_arm)
+ENTRY(chacha_doarm)
cmp r2, #0 // len == 0?
reteq lr
+ ldr ip, [sp]
+ cmp ip, #12
+
push {r0-r2,r4-r11,lr}
// Push state x0-x15 onto stack.
// Also store an extra copy of x10-x11 just before the state.
- ldr r4, [sp, #48] // iv
- mov r0, sp
- sub sp, #80
-
- // iv: x12-x15
- ldm r4, {X12,X13,X14,X15}
- stmdb r0!, {X12,X13,X14,X15}
+ add X12, r3, #48
+ ldm X12, {X12,X13,X14,X15}
+ push {X12,X13,X14,X15}
+ sub sp, sp, #64
- // key: x4-x11
- __ldrd X8_X10, X9_X11, r3, 24
+ __ldrd X8_X10, X9_X11, r3, 40
__strd X8_X10, X9_X11, sp, 8
- stmdb r0!, {X8_X10, X9_X11}
- ldm r3, {X4-X9_X11}
- stmdb r0!, {X4-X9_X11}
-
- // constants: x0-x3
- adrl X3, .Lexpand_32byte_k
- ldm X3, {X0-X3}
+ __strd X8_X10, X9_X11, sp, 56
+ ldm r3, {X0-X9_X11}
__strd X0, X1, sp, 16
__strd X2, X3, sp, 24
+ __strd X4, X5, sp, 32
+ __strd X6, X7, sp, 40
+ __strd X8_X10, X9_X11, sp, 48
+ beq 1f
_chacha 20
- add sp, #76
+0: add sp, #76
pop {r4-r11, pc}
-ENDPROC(chacha20_arm)
+
+1: _chacha 12
+ b 0b
+ENDPROC(chacha_doarm)
/*
- * void hchacha20_arm(const u32 state[16], u32 out[8]);
+ * void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds);
*/
-ENTRY(hchacha20_arm)
+ENTRY(hchacha_block_arm)
push {r1,r4-r11,lr}
+ cmp r2, #12 // ChaCha12 ?
+
mov r14, r0
ldmia r14!, {r0-r11} // load x0-x11
push {r10-r11} // store x10-x11 to stack
ldm r14, {r10-r12,r14} // load x12-x15
sub sp, #8
+ beq 1f
_chacha_permute 20
// Skip over (unused0-unused1, x10-x11)
- add sp, #16
+0: add sp, #16
// Fix up rotations of x12-x15
ror X12, X12, #drot
@@ -458,4 +454,7 @@ ENTRY(hchacha20_arm)
stm r4, {X0,X1,X2,X3,X12,X13,X14,X15}
pop {r4-r11,pc}
-ENDPROC(hchacha20_arm)
+
+1: _chacha_permute 12
+ b 0b
+ENDPROC(hchacha_block_arm)
--- a/arch/arm64/crypto/chacha-neon-glue.c
+++ b/arch/arm64/crypto/chacha-neon-glue.c
@@ -1,5 +1,5 @@
/*
- * ARM NEON accelerated ChaCha and XChaCha stream ciphers,
+ * ARM NEON and scalar accelerated ChaCha and XChaCha stream ciphers,
* including ChaCha20 (RFC7539)
*
* Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
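
Editor's note: in chacha_stream_xor() above, the scalar path advances the block counter itself because chacha_doarm() works on a private copy of the state. A quick standalone check of that DIV_ROUND_UP() arithmetic, with arbitrarily chosen sizes:

/* Caller-side counter bump used in the scalar path of chacha_stream_xor():
 * state[12] advances by DIV_ROUND_UP(nbytes, 64), counting a partial
 * trailing block as a full one.
 */
#include <stdio.h>

#define CHACHA_BLOCK_SIZE 64
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int sizes[] = { 64, 65, 200, 256 };
	unsigned int i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("%3u bytes -> counter += %u\n", sizes[i],
		       DIV_ROUND_UP(sizes[i], CHACHA_BLOCK_SIZE));
	/* Prints: 64 -> 1, 65 -> 2, 200 -> 4, 256 -> 4 */
	return 0;
}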

View File

@ -0,0 +1,108 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 8 Nov 2019 13:22:15 +0100
Subject: [PATCH] crypto: arm/chacha - expose ARM ChaCha routine as library
function
commit a44a3430d71bad4ee56788a59fff099b291ea54c upstream.
Expose the accelerated NEON ChaCha routine directly as a symbol
export so that users of the ChaCha library API can use it directly.
Given that calls into the library API will always go through the
routines in this module if it is enabled, switch to static keys
to select the optimal implementation available (which may be none
at all, in which case we defer to the generic implementation for
all invocations).
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/arm/crypto/Kconfig | 1 +
arch/arm/crypto/chacha-glue.c | 41 ++++++++++++++++++++++++++++++++++-
2 files changed, 41 insertions(+), 1 deletion(-)
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -129,6 +129,7 @@ config CRYPTO_CRC32_ARM_CE
config CRYPTO_CHACHA20_NEON
tristate "NEON and scalar accelerated ChaCha stream cipher algorithms"
select CRYPTO_BLKCIPHER
+ select CRYPTO_ARCH_HAVE_LIB_CHACHA
config CRYPTO_NHPOLY1305_NEON
tristate "NEON accelerated NHPoly1305 hash function (for Adiantum)"
--- a/arch/arm/crypto/chacha-glue.c
+++ b/arch/arm/crypto/chacha-glue.c
@@ -11,6 +11,7 @@
#include <crypto/internal/chacha.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
+#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>
@@ -29,9 +30,11 @@ asmlinkage void hchacha_block_neon(const
asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
const u32 *state, int nrounds);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon);
+
static inline bool neon_usable(void)
{
- return crypto_simd_usable();
+ return static_branch_likely(&use_neon) && crypto_simd_usable();
}
static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
@@ -60,6 +63,40 @@ static void chacha_doneon(u32 *state, u8
}
}
+void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
+{
+ if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) {
+ hchacha_block_arm(state, stream, nrounds);
+ } else {
+ kernel_neon_begin();
+ hchacha_block_neon(state, stream, nrounds);
+ kernel_neon_end();
+ }
+}
+EXPORT_SYMBOL(hchacha_block_arch);
+
+void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
+{
+ chacha_init_generic(state, key, iv);
+}
+EXPORT_SYMBOL(chacha_init_arch);
+
+void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
+ int nrounds)
+{
+ if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() ||
+ bytes <= CHACHA_BLOCK_SIZE) {
+ chacha_doarm(dst, src, bytes, state, nrounds);
+ state[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE);
+ return;
+ }
+
+ kernel_neon_begin();
+ chacha_doneon(state, dst, src, bytes, nrounds);
+ kernel_neon_end();
+}
+EXPORT_SYMBOL(chacha_crypt_arch);
+
static int chacha_stream_xor(struct skcipher_request *req,
const struct chacha_ctx *ctx, const u8 *iv,
bool neon)
@@ -269,6 +306,8 @@ static int __init chacha_simd_mod_init(v
for (i = 0; i < ARRAY_SIZE(neon_algs); i++)
neon_algs[i].base.cra_priority = 0;
break;
+ default:
+ static_branch_enable(&use_neon);
}
err = crypto_register_skciphers(neon_algs, ARRAY_SIZE(neon_algs));

View File

@ -0,0 +1,451 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Fri, 8 Nov 2019 13:22:16 +0100
Subject: [PATCH] crypto: mips/chacha - import 32r2 ChaCha code from Zinc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
commit 49aa7c00eddf8d8f462b0256bd82e81762d7b0c6 upstream.
This imports the accelerated MIPS 32r2 ChaCha20 implementation from the
Zinc patch set.
Co-developed-by: René van Dorst <opensource@vdorst.com>
Signed-off-by: René van Dorst <opensource@vdorst.com>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/mips/crypto/chacha-core.S | 424 +++++++++++++++++++++++++++++++++
1 file changed, 424 insertions(+)
create mode 100644 arch/mips/crypto/chacha-core.S
--- /dev/null
+++ b/arch/mips/crypto/chacha-core.S
@@ -0,0 +1,424 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#define MASK_U32 0x3c
+#define CHACHA20_BLOCK_SIZE 64
+#define STACK_SIZE 32
+
+#define X0 $t0
+#define X1 $t1
+#define X2 $t2
+#define X3 $t3
+#define X4 $t4
+#define X5 $t5
+#define X6 $t6
+#define X7 $t7
+#define X8 $t8
+#define X9 $t9
+#define X10 $v1
+#define X11 $s6
+#define X12 $s5
+#define X13 $s4
+#define X14 $s3
+#define X15 $s2
+/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
+#define T0 $s1
+#define T1 $s0
+#define T(n) T ## n
+#define X(n) X ## n
+
+/* Input arguments */
+#define STATE $a0
+#define OUT $a1
+#define IN $a2
+#define BYTES $a3
+
+/* Output argument */
+/* NONCE[0] is kept in a register and not in memory.
+ * We don't want to touch original value in memory.
+ * Must be incremented every loop iteration.
+ */
+#define NONCE_0 $v0
+
+/* SAVED_X and SAVED_CA are set in the jump table.
+ * Use regs which are overwritten on exit else we don't leak clear data.
+ * They are used to handling the last bytes which are not multiple of 4.
+ */
+#define SAVED_X X15
+#define SAVED_CA $s7
+
+#define IS_UNALIGNED $s7
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define MSB 0
+#define LSB 3
+#define ROTx rotl
+#define ROTR(n) rotr n, 24
+#define CPU_TO_LE32(n) \
+ wsbh n; \
+ rotr n, 16;
+#else
+#define MSB 3
+#define LSB 0
+#define ROTx rotr
+#define CPU_TO_LE32(n)
+#define ROTR(n)
+#endif
+
+#define FOR_EACH_WORD(x) \
+ x( 0); \
+ x( 1); \
+ x( 2); \
+ x( 3); \
+ x( 4); \
+ x( 5); \
+ x( 6); \
+ x( 7); \
+ x( 8); \
+ x( 9); \
+ x(10); \
+ x(11); \
+ x(12); \
+ x(13); \
+ x(14); \
+ x(15);
+
+#define FOR_EACH_WORD_REV(x) \
+ x(15); \
+ x(14); \
+ x(13); \
+ x(12); \
+ x(11); \
+ x(10); \
+ x( 9); \
+ x( 8); \
+ x( 7); \
+ x( 6); \
+ x( 5); \
+ x( 4); \
+ x( 3); \
+ x( 2); \
+ x( 1); \
+ x( 0);
+
+#define PLUS_ONE_0 1
+#define PLUS_ONE_1 2
+#define PLUS_ONE_2 3
+#define PLUS_ONE_3 4
+#define PLUS_ONE_4 5
+#define PLUS_ONE_5 6
+#define PLUS_ONE_6 7
+#define PLUS_ONE_7 8
+#define PLUS_ONE_8 9
+#define PLUS_ONE_9 10
+#define PLUS_ONE_10 11
+#define PLUS_ONE_11 12
+#define PLUS_ONE_12 13
+#define PLUS_ONE_13 14
+#define PLUS_ONE_14 15
+#define PLUS_ONE_15 16
+#define PLUS_ONE(x) PLUS_ONE_ ## x
+#define _CONCAT3(a,b,c) a ## b ## c
+#define CONCAT3(a,b,c) _CONCAT3(a,b,c)
+
+#define STORE_UNALIGNED(x) \
+CONCAT3(.Lchacha20_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
+ .if (x != 12); \
+ lw T0, (x*4)(STATE); \
+ .endif; \
+ lwl T1, (x*4)+MSB ## (IN); \
+ lwr T1, (x*4)+LSB ## (IN); \
+ .if (x == 12); \
+ addu X ## x, NONCE_0; \
+ .else; \
+ addu X ## x, T0; \
+ .endif; \
+ CPU_TO_LE32(X ## x); \
+ xor X ## x, T1; \
+ swl X ## x, (x*4)+MSB ## (OUT); \
+ swr X ## x, (x*4)+LSB ## (OUT);
+
+#define STORE_ALIGNED(x) \
+CONCAT3(.Lchacha20_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
+ .if (x != 12); \
+ lw T0, (x*4)(STATE); \
+ .endif; \
+ lw T1, (x*4) ## (IN); \
+ .if (x == 12); \
+ addu X ## x, NONCE_0; \
+ .else; \
+ addu X ## x, T0; \
+ .endif; \
+ CPU_TO_LE32(X ## x); \
+ xor X ## x, T1; \
+ sw X ## x, (x*4) ## (OUT);
+
+/* Jump table macro.
+ * Used for setup and handling the last bytes, which are not multiple of 4.
+ * X15 is free to store Xn
+ * Every jumptable entry must be equal in size.
+ */
+#define JMPTBL_ALIGNED(x) \
+.Lchacha20_mips_jmptbl_aligned_ ## x: ; \
+ .set noreorder; \
+ b .Lchacha20_mips_xor_aligned_ ## x ## _b; \
+ .if (x == 12); \
+ addu SAVED_X, X ## x, NONCE_0; \
+ .else; \
+ addu SAVED_X, X ## x, SAVED_CA; \
+ .endif; \
+ .set reorder
+
+#define JMPTBL_UNALIGNED(x) \
+.Lchacha20_mips_jmptbl_unaligned_ ## x: ; \
+ .set noreorder; \
+ b .Lchacha20_mips_xor_unaligned_ ## x ## _b; \
+ .if (x == 12); \
+ addu SAVED_X, X ## x, NONCE_0; \
+ .else; \
+ addu SAVED_X, X ## x, SAVED_CA; \
+ .endif; \
+ .set reorder
+
+#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
+ addu X(A), X(K); \
+ addu X(B), X(L); \
+ addu X(C), X(M); \
+ addu X(D), X(N); \
+ xor X(V), X(A); \
+ xor X(W), X(B); \
+ xor X(Y), X(C); \
+ xor X(Z), X(D); \
+ rotl X(V), S; \
+ rotl X(W), S; \
+ rotl X(Y), S; \
+ rotl X(Z), S;
+
+.text
+.set reorder
+.set noat
+.globl chacha20_mips
+.ent chacha20_mips
+chacha20_mips:
+ .frame $sp, STACK_SIZE, $ra
+
+ addiu $sp, -STACK_SIZE
+
+ /* Return bytes = 0. */
+ beqz BYTES, .Lchacha20_mips_end
+
+ lw NONCE_0, 48(STATE)
+
+ /* Save s0-s7 */
+ sw $s0, 0($sp)
+ sw $s1, 4($sp)
+ sw $s2, 8($sp)
+ sw $s3, 12($sp)
+ sw $s4, 16($sp)
+ sw $s5, 20($sp)
+ sw $s6, 24($sp)
+ sw $s7, 28($sp)
+
+ /* Test IN or OUT is unaligned.
+ * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
+ */
+ or IS_UNALIGNED, IN, OUT
+ andi IS_UNALIGNED, 0x3
+
+ /* Set number of rounds */
+ li $at, 20
+
+ b .Lchacha20_rounds_start
+
+.align 4
+.Loop_chacha20_rounds:
+ addiu IN, CHACHA20_BLOCK_SIZE
+ addiu OUT, CHACHA20_BLOCK_SIZE
+ addiu NONCE_0, 1
+
+.Lchacha20_rounds_start:
+ lw X0, 0(STATE)
+ lw X1, 4(STATE)
+ lw X2, 8(STATE)
+ lw X3, 12(STATE)
+
+ lw X4, 16(STATE)
+ lw X5, 20(STATE)
+ lw X6, 24(STATE)
+ lw X7, 28(STATE)
+ lw X8, 32(STATE)
+ lw X9, 36(STATE)
+ lw X10, 40(STATE)
+ lw X11, 44(STATE)
+
+ move X12, NONCE_0
+ lw X13, 52(STATE)
+ lw X14, 56(STATE)
+ lw X15, 60(STATE)
+
+.Loop_chacha20_xor_rounds:
+ addiu $at, -2
+ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
+ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
+ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
+ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
+ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
+ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
+ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
+ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
+ bnez $at, .Loop_chacha20_xor_rounds
+
+ addiu BYTES, -(CHACHA20_BLOCK_SIZE)
+
+ /* Is data src/dst unaligned? Jump */
+ bnez IS_UNALIGNED, .Loop_chacha20_unaligned
+
+ /* Set number rounds here to fill delayslot. */
+ li $at, 20
+
+ /* BYTES < 0, it has no full block. */
+ bltz BYTES, .Lchacha20_mips_no_full_block_aligned
+
+ FOR_EACH_WORD_REV(STORE_ALIGNED)
+
+ /* BYTES > 0? Loop again. */
+ bgtz BYTES, .Loop_chacha20_rounds
+
+ /* Place this here to fill delay slot */
+ addiu NONCE_0, 1
+
+ /* BYTES < 0? Handle last bytes */
+ bltz BYTES, .Lchacha20_mips_xor_bytes
+
+.Lchacha20_mips_xor_done:
+ /* Restore used registers */
+ lw $s0, 0($sp)
+ lw $s1, 4($sp)
+ lw $s2, 8($sp)
+ lw $s3, 12($sp)
+ lw $s4, 16($sp)
+ lw $s5, 20($sp)
+ lw $s6, 24($sp)
+ lw $s7, 28($sp)
+
+ /* Write NONCE_0 back to right location in state */
+ sw NONCE_0, 48(STATE)
+
+.Lchacha20_mips_end:
+ addiu $sp, STACK_SIZE
+ jr $ra
+
+.Lchacha20_mips_no_full_block_aligned:
+ /* Restore the offset on BYTES */
+ addiu BYTES, CHACHA20_BLOCK_SIZE
+
+ /* Get number of full WORDS */
+ andi $at, BYTES, MASK_U32
+
+ /* Load upper half of jump table addr */
+ lui T0, %hi(.Lchacha20_mips_jmptbl_aligned_0)
+
+ /* Calculate lower half jump table offset */
+ ins T0, $at, 1, 6
+
+ /* Add offset to STATE */
+ addu T1, STATE, $at
+
+ /* Add lower half jump table addr */
+ addiu T0, %lo(.Lchacha20_mips_jmptbl_aligned_0)
+
+ /* Read value from STATE */
+ lw SAVED_CA, 0(T1)
+
+ /* Store remaining bytecounter as negative value */
+ subu BYTES, $at, BYTES
+
+ jr T0
+
+ /* Jump table */
+ FOR_EACH_WORD(JMPTBL_ALIGNED)
+
+
+.Loop_chacha20_unaligned:
+ /* Set number rounds here to fill delayslot. */
+ li $at, 20
+
+ /* BYTES > 0, it has no full block. */
+ bltz BYTES, .Lchacha20_mips_no_full_block_unaligned
+
+ FOR_EACH_WORD_REV(STORE_UNALIGNED)
+
+ /* BYTES > 0? Loop again. */
+ bgtz BYTES, .Loop_chacha20_rounds
+
+ /* Write NONCE_0 back to right location in state */
+ sw NONCE_0, 48(STATE)
+
+ .set noreorder
+ /* Fall through to byte handling */
+ bgez BYTES, .Lchacha20_mips_xor_done
+.Lchacha20_mips_xor_unaligned_0_b:
+.Lchacha20_mips_xor_aligned_0_b:
+ /* Place this here to fill delay slot */
+ addiu NONCE_0, 1
+ .set reorder
+
+.Lchacha20_mips_xor_bytes:
+ addu IN, $at
+ addu OUT, $at
+ /* First byte */
+ lbu T1, 0(IN)
+ addiu $at, BYTES, 1
+ CPU_TO_LE32(SAVED_X)
+ ROTR(SAVED_X)
+ xor T1, SAVED_X
+ sb T1, 0(OUT)
+ beqz $at, .Lchacha20_mips_xor_done
+ /* Second byte */
+ lbu T1, 1(IN)
+ addiu $at, BYTES, 2
+ ROTx SAVED_X, 8
+ xor T1, SAVED_X
+ sb T1, 1(OUT)
+ beqz $at, .Lchacha20_mips_xor_done
+ /* Third byte */
+ lbu T1, 2(IN)
+ ROTx SAVED_X, 8
+ xor T1, SAVED_X
+ sb T1, 2(OUT)
+ b .Lchacha20_mips_xor_done
+
+.Lchacha20_mips_no_full_block_unaligned:
+ /* Restore the offset on BYTES */
+ addiu BYTES, CHACHA20_BLOCK_SIZE
+
+ /* Get number of full WORDS */
+ andi $at, BYTES, MASK_U32
+
+ /* Load upper half of jump table addr */
+ lui T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0)
+
+ /* Calculate lower half jump table offset */
+ ins T0, $at, 1, 6
+
+ /* Add offset to STATE */
+ addu T1, STATE, $at
+
+ /* Add lower half jump table addr */
+ addiu T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0)
+
+ /* Read value from STATE */
+ lw SAVED_CA, 0(T1)
+
+ /* Store remaining bytecounter as negative value */
+ subu BYTES, $at, BYTES
+
+ jr T0
+
+ /* Jump table */
+ FOR_EACH_WORD(JMPTBL_UNALIGNED)
+.end chacha20_mips
+.set at
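
Editor's note: for comparison with the AXR macro above, which applies one add/xor/rotate step to four columns or diagonals of the state at a time, this is the plain C quarter round from RFC 7539, checked against the test vector in its section 2.1.1; rotl32() is a local helper:

/* Reference ChaCha quarter round (RFC 7539) for comparison with the AXR
 * macro in the MIPS assembly above.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t rotl32(uint32_t x, unsigned int r)
{
	return (x << r) | (x >> (32 - r));
}

static void quarterround(uint32_t x[16], int a, int b, int c, int d)
{
	x[a] += x[b]; x[d] ^= x[a]; x[d] = rotl32(x[d], 16);
	x[c] += x[d]; x[b] ^= x[c]; x[b] = rotl32(x[b], 12);
	x[a] += x[b]; x[d] ^= x[a]; x[d] = rotl32(x[d], 8);
	x[c] += x[d]; x[b] ^= x[c]; x[b] = rotl32(x[b], 7);
}

int main(void)
{
	/* Test vector from RFC 7539, section 2.1.1. */
	uint32_t x[16] = { 0 };

	x[0] = 0x11111111; x[1] = 0x01020304;
	x[2] = 0x9b8d6f43; x[3] = 0x01234567;
	quarterround(x, 0, 1, 2, 3);
	printf("%08x %08x %08x %08x\n", x[0], x[1], x[2], x[3]);
	/* Expected: ea2a92f4 cb1cf8ce 4581472e 5881c4bb */
	return 0;
}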

View File

@ -0,0 +1,559 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 8 Nov 2019 13:22:17 +0100
Subject: [PATCH] crypto: mips/chacha - wire up accelerated 32r2 code from Zinc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
commit 3a2f58f3ba4f6f44e33d1a48240d5eadb882cb59 upstream.
This integrates the accelerated MIPS 32r2 implementation of ChaCha
into both the API and library interfaces of the kernel crypto stack.
The significance of this is that, in addition to becoming available
as an accelerated library implementation, it can also be used by
existing crypto API code such as Adiantum (for block encryption on
ultra low performance cores) or IPsec using chacha20poly1305. These
are use cases that have already opted into using the abstract crypto
API. In order to support Adiantum, the core assembler routine has
been adapted to take the round count as a function argument rather
than hardcoding it to 20.
Co-developed-by: René van Dorst <opensource@vdorst.com>
Signed-off-by: René van Dorst <opensource@vdorst.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/mips/Makefile | 2 +-
arch/mips/crypto/Makefile | 4 +
arch/mips/crypto/chacha-core.S | 159 ++++++++++++++++++++++++---------
arch/mips/crypto/chacha-glue.c | 150 +++++++++++++++++++++++++++++++
crypto/Kconfig | 6 ++
5 files changed, 277 insertions(+), 44 deletions(-)
create mode 100644 arch/mips/crypto/chacha-glue.c
--- a/arch/mips/Makefile
+++ b/arch/mips/Makefile
@@ -334,7 +334,7 @@ libs-$(CONFIG_MIPS_FP_SUPPORT) += arch/m
# See arch/mips/Kbuild for content of core part of the kernel
core-y += arch/mips/
-drivers-$(CONFIG_MIPS_CRC_SUPPORT) += arch/mips/crypto/
+drivers-y += arch/mips/crypto/
drivers-$(CONFIG_OPROFILE) += arch/mips/oprofile/
# suspend and hibernation support
--- a/arch/mips/crypto/Makefile
+++ b/arch/mips/crypto/Makefile
@@ -4,3 +4,7 @@
#
obj-$(CONFIG_CRYPTO_CRC32_MIPS) += crc32-mips.o
+
+obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o
+chacha-mips-y := chacha-core.o chacha-glue.o
+AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots
--- a/arch/mips/crypto/chacha-core.S
+++ b/arch/mips/crypto/chacha-core.S
@@ -125,7 +125,7 @@
#define CONCAT3(a,b,c) _CONCAT3(a,b,c)
#define STORE_UNALIGNED(x) \
-CONCAT3(.Lchacha20_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
+CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
.if (x != 12); \
lw T0, (x*4)(STATE); \
.endif; \
@@ -142,7 +142,7 @@ CONCAT3(.Lchacha20_mips_xor_unaligned_,
swr X ## x, (x*4)+LSB ## (OUT);
#define STORE_ALIGNED(x) \
-CONCAT3(.Lchacha20_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
+CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
.if (x != 12); \
lw T0, (x*4)(STATE); \
.endif; \
@@ -162,9 +162,9 @@ CONCAT3(.Lchacha20_mips_xor_aligned_, PL
* Every jumptable entry must be equal in size.
*/
#define JMPTBL_ALIGNED(x) \
-.Lchacha20_mips_jmptbl_aligned_ ## x: ; \
+.Lchacha_mips_jmptbl_aligned_ ## x: ; \
.set noreorder; \
- b .Lchacha20_mips_xor_aligned_ ## x ## _b; \
+ b .Lchacha_mips_xor_aligned_ ## x ## _b; \
.if (x == 12); \
addu SAVED_X, X ## x, NONCE_0; \
.else; \
@@ -173,9 +173,9 @@ CONCAT3(.Lchacha20_mips_xor_aligned_, PL
.set reorder
#define JMPTBL_UNALIGNED(x) \
-.Lchacha20_mips_jmptbl_unaligned_ ## x: ; \
+.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
.set noreorder; \
- b .Lchacha20_mips_xor_unaligned_ ## x ## _b; \
+ b .Lchacha_mips_xor_unaligned_ ## x ## _b; \
.if (x == 12); \
addu SAVED_X, X ## x, NONCE_0; \
.else; \
@@ -200,15 +200,18 @@ CONCAT3(.Lchacha20_mips_xor_aligned_, PL
.text
.set reorder
.set noat
-.globl chacha20_mips
-.ent chacha20_mips
-chacha20_mips:
+.globl chacha_crypt_arch
+.ent chacha_crypt_arch
+chacha_crypt_arch:
.frame $sp, STACK_SIZE, $ra
+ /* Load number of rounds */
+ lw $at, 16($sp)
+
addiu $sp, -STACK_SIZE
/* Return bytes = 0. */
- beqz BYTES, .Lchacha20_mips_end
+ beqz BYTES, .Lchacha_mips_end
lw NONCE_0, 48(STATE)
@@ -228,18 +231,15 @@ chacha20_mips:
or IS_UNALIGNED, IN, OUT
andi IS_UNALIGNED, 0x3
- /* Set number of rounds */
- li $at, 20
-
- b .Lchacha20_rounds_start
+ b .Lchacha_rounds_start
.align 4
-.Loop_chacha20_rounds:
+.Loop_chacha_rounds:
addiu IN, CHACHA20_BLOCK_SIZE
addiu OUT, CHACHA20_BLOCK_SIZE
addiu NONCE_0, 1
-.Lchacha20_rounds_start:
+.Lchacha_rounds_start:
lw X0, 0(STATE)
lw X1, 4(STATE)
lw X2, 8(STATE)
@@ -259,7 +259,7 @@ chacha20_mips:
lw X14, 56(STATE)
lw X15, 60(STATE)
-.Loop_chacha20_xor_rounds:
+.Loop_chacha_xor_rounds:
addiu $at, -2
AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
@@ -269,31 +269,31 @@ chacha20_mips:
AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
- bnez $at, .Loop_chacha20_xor_rounds
+ bnez $at, .Loop_chacha_xor_rounds
addiu BYTES, -(CHACHA20_BLOCK_SIZE)
/* Is data src/dst unaligned? Jump */
- bnez IS_UNALIGNED, .Loop_chacha20_unaligned
+ bnez IS_UNALIGNED, .Loop_chacha_unaligned
/* Set number rounds here to fill delayslot. */
- li $at, 20
+ lw $at, (STACK_SIZE+16)($sp)
/* BYTES < 0, it has no full block. */
- bltz BYTES, .Lchacha20_mips_no_full_block_aligned
+ bltz BYTES, .Lchacha_mips_no_full_block_aligned
FOR_EACH_WORD_REV(STORE_ALIGNED)
/* BYTES > 0? Loop again. */
- bgtz BYTES, .Loop_chacha20_rounds
+ bgtz BYTES, .Loop_chacha_rounds
/* Place this here to fill delay slot */
addiu NONCE_0, 1
/* BYTES < 0? Handle last bytes */
- bltz BYTES, .Lchacha20_mips_xor_bytes
+ bltz BYTES, .Lchacha_mips_xor_bytes
-.Lchacha20_mips_xor_done:
+.Lchacha_mips_xor_done:
/* Restore used registers */
lw $s0, 0($sp)
lw $s1, 4($sp)
@@ -307,11 +307,11 @@ chacha20_mips:
/* Write NONCE_0 back to right location in state */
sw NONCE_0, 48(STATE)
-.Lchacha20_mips_end:
+.Lchacha_mips_end:
addiu $sp, STACK_SIZE
jr $ra
-.Lchacha20_mips_no_full_block_aligned:
+.Lchacha_mips_no_full_block_aligned:
/* Restore the offset on BYTES */
addiu BYTES, CHACHA20_BLOCK_SIZE
@@ -319,7 +319,7 @@ chacha20_mips:
andi $at, BYTES, MASK_U32
/* Load upper half of jump table addr */
- lui T0, %hi(.Lchacha20_mips_jmptbl_aligned_0)
+ lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0)
/* Calculate lower half jump table offset */
ins T0, $at, 1, 6
@@ -328,7 +328,7 @@ chacha20_mips:
addu T1, STATE, $at
/* Add lower half jump table addr */
- addiu T0, %lo(.Lchacha20_mips_jmptbl_aligned_0)
+ addiu T0, %lo(.Lchacha_mips_jmptbl_aligned_0)
/* Read value from STATE */
lw SAVED_CA, 0(T1)
@@ -342,31 +342,31 @@ chacha20_mips:
FOR_EACH_WORD(JMPTBL_ALIGNED)
-.Loop_chacha20_unaligned:
+.Loop_chacha_unaligned:
/* Set number rounds here to fill delayslot. */
- li $at, 20
+ lw $at, (STACK_SIZE+16)($sp)
/* BYTES > 0, it has no full block. */
- bltz BYTES, .Lchacha20_mips_no_full_block_unaligned
+ bltz BYTES, .Lchacha_mips_no_full_block_unaligned
FOR_EACH_WORD_REV(STORE_UNALIGNED)
/* BYTES > 0? Loop again. */
- bgtz BYTES, .Loop_chacha20_rounds
+ bgtz BYTES, .Loop_chacha_rounds
/* Write NONCE_0 back to right location in state */
sw NONCE_0, 48(STATE)
.set noreorder
/* Fall through to byte handling */
- bgez BYTES, .Lchacha20_mips_xor_done
-.Lchacha20_mips_xor_unaligned_0_b:
-.Lchacha20_mips_xor_aligned_0_b:
+ bgez BYTES, .Lchacha_mips_xor_done
+.Lchacha_mips_xor_unaligned_0_b:
+.Lchacha_mips_xor_aligned_0_b:
/* Place this here to fill delay slot */
addiu NONCE_0, 1
.set reorder
-.Lchacha20_mips_xor_bytes:
+.Lchacha_mips_xor_bytes:
addu IN, $at
addu OUT, $at
/* First byte */
@@ -376,22 +376,22 @@ chacha20_mips:
ROTR(SAVED_X)
xor T1, SAVED_X
sb T1, 0(OUT)
- beqz $at, .Lchacha20_mips_xor_done
+ beqz $at, .Lchacha_mips_xor_done
/* Second byte */
lbu T1, 1(IN)
addiu $at, BYTES, 2
ROTx SAVED_X, 8
xor T1, SAVED_X
sb T1, 1(OUT)
- beqz $at, .Lchacha20_mips_xor_done
+ beqz $at, .Lchacha_mips_xor_done
/* Third byte */
lbu T1, 2(IN)
ROTx SAVED_X, 8
xor T1, SAVED_X
sb T1, 2(OUT)
- b .Lchacha20_mips_xor_done
+ b .Lchacha_mips_xor_done
-.Lchacha20_mips_no_full_block_unaligned:
+.Lchacha_mips_no_full_block_unaligned:
/* Restore the offset on BYTES */
addiu BYTES, CHACHA20_BLOCK_SIZE
@@ -399,7 +399,7 @@ chacha20_mips:
andi $at, BYTES, MASK_U32
/* Load upper half of jump table addr */
- lui T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0)
+ lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)
/* Calculate lower half jump table offset */
ins T0, $at, 1, 6
@@ -408,7 +408,7 @@ chacha20_mips:
addu T1, STATE, $at
/* Add lower half jump table addr */
- addiu T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0)
+ addiu T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)
/* Read value from STATE */
lw SAVED_CA, 0(T1)
@@ -420,5 +420,78 @@ chacha20_mips:
/* Jump table */
FOR_EACH_WORD(JMPTBL_UNALIGNED)
-.end chacha20_mips
+.end chacha_crypt_arch
+.set at
+
+/* Input arguments
+ * STATE $a0
+ * OUT $a1
+ * NROUND $a2
+ */
+
+#undef X12
+#undef X13
+#undef X14
+#undef X15
+
+#define X12 $a3
+#define X13 $at
+#define X14 $v0
+#define X15 STATE
+
+.set noat
+.globl hchacha_block_arch
+.ent hchacha_block_arch
+hchacha_block_arch:
+ .frame $sp, STACK_SIZE, $ra
+
+ addiu $sp, -STACK_SIZE
+
+ /* Save X11(s6) */
+ sw X11, 0($sp)
+
+ lw X0, 0(STATE)
+ lw X1, 4(STATE)
+ lw X2, 8(STATE)
+ lw X3, 12(STATE)
+ lw X4, 16(STATE)
+ lw X5, 20(STATE)
+ lw X6, 24(STATE)
+ lw X7, 28(STATE)
+ lw X8, 32(STATE)
+ lw X9, 36(STATE)
+ lw X10, 40(STATE)
+ lw X11, 44(STATE)
+ lw X12, 48(STATE)
+ lw X13, 52(STATE)
+ lw X14, 56(STATE)
+ lw X15, 60(STATE)
+
+.Loop_hchacha_xor_rounds:
+ addiu $a2, -2
+ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
+ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
+ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
+ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
+ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
+ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
+ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
+ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
+ bnez $a2, .Loop_hchacha_xor_rounds
+
+ /* Restore used register */
+ lw X11, 0($sp)
+
+ sw X0, 0(OUT)
+ sw X1, 4(OUT)
+ sw X2, 8(OUT)
+ sw X3, 12(OUT)
+ sw X12, 16(OUT)
+ sw X13, 20(OUT)
+ sw X14, 24(OUT)
+ sw X15, 28(OUT)
+
+ addiu $sp, STACK_SIZE
+ jr $ra
+.end hchacha_block_arch
.set at
--- /dev/null
+++ b/arch/mips/crypto/chacha-glue.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * MIPS accelerated ChaCha and XChaCha stream ciphers,
+ * including ChaCha20 (RFC7539)
+ *
+ * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <asm/byteorder.h>
+#include <crypto/algapi.h>
+#include <crypto/internal/chacha.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+asmlinkage void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds);
+EXPORT_SYMBOL(chacha_crypt_arch);
+
+asmlinkage void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds);
+EXPORT_SYMBOL(hchacha_block_arch);
+
+void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
+{
+ chacha_init_generic(state, key, iv);
+}
+EXPORT_SYMBOL(chacha_init_arch);
+
+static int chacha_mips_stream_xor(struct skcipher_request *req,
+ const struct chacha_ctx *ctx, const u8 *iv)
+{
+ struct skcipher_walk walk;
+ u32 state[16];
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ chacha_init_generic(state, ctx->key, iv);
+
+ while (walk.nbytes > 0) {
+ unsigned int nbytes = walk.nbytes;
+
+ if (nbytes < walk.total)
+ nbytes = round_down(nbytes, walk.stride);
+
+ chacha_crypt(state, walk.dst.virt.addr, walk.src.virt.addr,
+ nbytes, ctx->nrounds);
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ }
+
+ return err;
+}
+
+static int chacha_mips(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return chacha_mips_stream_xor(req, ctx, req->iv);
+}
+
+static int xchacha_mips(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct chacha_ctx subctx;
+ u32 state[16];
+ u8 real_iv[16];
+
+ chacha_init_generic(state, ctx->key, req->iv);
+
+ hchacha_block(state, subctx.key, ctx->nrounds);
+ subctx.nrounds = ctx->nrounds;
+
+ memcpy(&real_iv[0], req->iv + 24, 8);
+ memcpy(&real_iv[8], req->iv + 16, 8);
+ return chacha_mips_stream_xor(req, &subctx, real_iv);
+}
+
+static struct skcipher_alg algs[] = {
+ {
+ .base.cra_name = "chacha20",
+ .base.cra_driver_name = "chacha20-mips",
+ .base.cra_priority = 200,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = CHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .setkey = chacha20_setkey,
+ .encrypt = chacha_mips,
+ .decrypt = chacha_mips,
+ }, {
+ .base.cra_name = "xchacha20",
+ .base.cra_driver_name = "xchacha20-mips",
+ .base.cra_priority = 200,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = XCHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .setkey = chacha20_setkey,
+ .encrypt = xchacha_mips,
+ .decrypt = xchacha_mips,
+ }, {
+ .base.cra_name = "xchacha12",
+ .base.cra_driver_name = "xchacha12-mips",
+ .base.cra_priority = 200,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = XCHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .setkey = chacha12_setkey,
+ .encrypt = xchacha_mips,
+ .decrypt = xchacha_mips,
+ }
+};
+
+static int __init chacha_simd_mod_init(void)
+{
+ return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+static void __exit chacha_simd_mod_fini(void)
+{
+ crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+module_init(chacha_simd_mod_init);
+module_exit(chacha_simd_mod_fini);
+
+MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (MIPS accelerated)");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("chacha20");
+MODULE_ALIAS_CRYPTO("chacha20-mips");
+MODULE_ALIAS_CRYPTO("xchacha20");
+MODULE_ALIAS_CRYPTO("xchacha20-mips");
+MODULE_ALIAS_CRYPTO("xchacha12");
+MODULE_ALIAS_CRYPTO("xchacha12-mips");
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1423,6 +1423,12 @@ config CRYPTO_CHACHA20_X86_64
SSSE3, AVX2, and AVX-512VL optimized implementations of the ChaCha20,
XChaCha20, and XChaCha12 stream ciphers.
+config CRYPTO_CHACHA_MIPS
+ tristate "ChaCha stream cipher algorithms (MIPS 32r2 optimized)"
+ depends on CPU_MIPS32_R2
+ select CRYPTO_BLKCIPHER
+ select CRYPTO_ARCH_HAVE_LIB_CHACHA
+
config CRYPTO_SEED
tristate "SEED cipher algorithm"
select CRYPTO_ALGAPI
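
Usage note for the glue code above: once CRYPTO_CHACHA_MIPS is enabled, in-kernel callers can reach the accelerated code either through the registered "chacha20-mips" skcipher or directly through the ChaCha library calls the glue exports. Below is a minimal sketch of the library path, assuming the backported <crypto/chacha.h> helpers (chacha_init_generic(), chacha_crypt()) that chacha-glue.c itself uses; the function and buffer names are illustrative, not part of the patch.

/* Sketch only: XOR a buffer with a ChaCha20 keystream via the library API. */
#include <crypto/chacha.h>

static void demo_chacha20_xor(u8 *dst, const u8 *src, unsigned int len,
                              const u32 key[8], const u8 iv[CHACHA_IV_SIZE])
{
        u32 state[16];

        /* Fill the constants, the 256-bit key and the counter/nonce words. */
        chacha_init_generic(state, key, iv);
        /* With CRYPTO_ARCH_HAVE_LIB_CHACHA selected by CRYPTO_CHACHA_MIPS,
         * this should dispatch to chacha_crypt_arch() from the assembly above. */
        chacha_crypt(state, dst, src, len, 20);
}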

View File

@ -0,0 +1,115 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 8 Nov 2019 13:22:18 +0100
Subject: [PATCH] crypto: chacha - unexport chacha_generic routines
commit 22cf705360707ced15f9fe5423938f313c7df536 upstream.
Now that all users of generic ChaCha code have moved to the core library,
there is no longer a need for the generic ChaCha skcipher driver to
export parts of its implementation for reuse by other drivers. So drop
the exports, and make the symbols static.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
crypto/chacha_generic.c | 26 ++++++++------------------
include/crypto/internal/chacha.h | 10 ----------
2 files changed, 8 insertions(+), 28 deletions(-)
--- a/crypto/chacha_generic.c
+++ b/crypto/chacha_generic.c
@@ -21,7 +21,7 @@ static int chacha_stream_xor(struct skci
err = skcipher_walk_virt(&walk, req, false);
- crypto_chacha_init(state, ctx, iv);
+ chacha_init_generic(state, ctx->key, iv);
while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes;
@@ -37,36 +37,27 @@ static int chacha_stream_xor(struct skci
return err;
}
-void crypto_chacha_init(u32 *state, const struct chacha_ctx *ctx, const u8 *iv)
-{
- chacha_init_generic(state, ctx->key, iv);
-}
-EXPORT_SYMBOL_GPL(crypto_chacha_init);
-
-int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keysize)
+static int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keysize)
{
return chacha_setkey(tfm, key, keysize, 20);
}
-EXPORT_SYMBOL_GPL(crypto_chacha20_setkey);
-int crypto_chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keysize)
+static int crypto_chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keysize)
{
return chacha_setkey(tfm, key, keysize, 12);
}
-EXPORT_SYMBOL_GPL(crypto_chacha12_setkey);
-int crypto_chacha_crypt(struct skcipher_request *req)
+static int crypto_chacha_crypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
return chacha_stream_xor(req, ctx, req->iv);
}
-EXPORT_SYMBOL_GPL(crypto_chacha_crypt);
-int crypto_xchacha_crypt(struct skcipher_request *req)
+static int crypto_xchacha_crypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
@@ -75,7 +66,7 @@ int crypto_xchacha_crypt(struct skcipher
u8 real_iv[16];
/* Compute the subkey given the original key and first 128 nonce bits */
- crypto_chacha_init(state, ctx, req->iv);
+ chacha_init_generic(state, ctx->key, req->iv);
hchacha_block_generic(state, subctx.key, ctx->nrounds);
subctx.nrounds = ctx->nrounds;
@@ -86,7 +77,6 @@ int crypto_xchacha_crypt(struct skcipher
/* Generate the stream and XOR it with the data */
return chacha_stream_xor(req, &subctx, real_iv);
}
-EXPORT_SYMBOL_GPL(crypto_xchacha_crypt);
static struct skcipher_alg algs[] = {
{
--- a/include/crypto/internal/chacha.h
+++ b/include/crypto/internal/chacha.h
@@ -12,8 +12,6 @@ struct chacha_ctx {
int nrounds;
};
-void crypto_chacha_init(u32 *state, const struct chacha_ctx *ctx, const u8 *iv);
-
static inline int chacha_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keysize, int nrounds)
{
@@ -42,12 +40,4 @@ static int inline chacha12_setkey(struct
return chacha_setkey(tfm, key, keysize, 12);
}
-int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keysize);
-int crypto_chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keysize);
-
-int crypto_chacha_crypt(struct skcipher_request *req);
-int crypto_xchacha_crypt(struct skcipher_request *req);
-
#endif /* _CRYPTO_CHACHA_H */

View File

@ -0,0 +1,649 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 8 Nov 2019 13:22:19 +0100
Subject: [PATCH] crypto: poly1305 - move core routines into a separate library
commit 48ea8c6ebc96bc0990e12ee1c43d0832c23576bb upstream.
Move the core Poly1305 routines shared between the generic Poly1305
shash driver and the Adiantum and NHPoly1305 drivers into a separate
library so that using just these pieces does not pull in the crypto
API pieces of the generic Poly1305 routine.
In a subsequent patch, we will augment this generic library with
init/update/final routines so that the Poly1305 algorithm can be used
directly without the need for using the crypto API's shash abstraction.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/x86/crypto/poly1305_glue.c | 2 +-
crypto/Kconfig | 5 +-
crypto/adiantum.c | 5 +-
crypto/nhpoly1305.c | 3 +-
crypto/poly1305_generic.c | 195 ++---------------------------
include/crypto/internal/poly1305.h | 67 ++++++++++
include/crypto/poly1305.h | 23 ----
lib/crypto/Kconfig | 3 +
lib/crypto/Makefile | 3 +
lib/crypto/poly1305.c | 158 +++++++++++++++++++++++
10 files changed, 248 insertions(+), 216 deletions(-)
create mode 100644 include/crypto/internal/poly1305.h
create mode 100644 lib/crypto/poly1305.c
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -7,8 +7,8 @@
#include <crypto/algapi.h>
#include <crypto/internal/hash.h>
+#include <crypto/internal/poly1305.h>
#include <crypto/internal/simd.h>
-#include <crypto/poly1305.h>
#include <linux/crypto.h>
#include <linux/kernel.h>
#include <linux/module.h>
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -446,7 +446,7 @@ config CRYPTO_KEYWRAP
config CRYPTO_NHPOLY1305
tristate
select CRYPTO_HASH
- select CRYPTO_POLY1305
+ select CRYPTO_LIB_POLY1305_GENERIC
config CRYPTO_NHPOLY1305_SSE2
tristate "NHPoly1305 hash function (x86_64 SSE2 implementation)"
@@ -467,7 +467,7 @@ config CRYPTO_NHPOLY1305_AVX2
config CRYPTO_ADIANTUM
tristate "Adiantum support"
select CRYPTO_CHACHA20
- select CRYPTO_POLY1305
+ select CRYPTO_LIB_POLY1305_GENERIC
select CRYPTO_NHPOLY1305
select CRYPTO_MANAGER
help
@@ -686,6 +686,7 @@ config CRYPTO_GHASH
config CRYPTO_POLY1305
tristate "Poly1305 authenticator algorithm"
select CRYPTO_HASH
+ select CRYPTO_LIB_POLY1305_GENERIC
help
Poly1305 authenticator algorithm, RFC7539.
--- a/crypto/adiantum.c
+++ b/crypto/adiantum.c
@@ -33,6 +33,7 @@
#include <crypto/b128ops.h>
#include <crypto/chacha.h>
#include <crypto/internal/hash.h>
+#include <crypto/internal/poly1305.h>
#include <crypto/internal/skcipher.h>
#include <crypto/nhpoly1305.h>
#include <crypto/scatterwalk.h>
@@ -242,11 +243,11 @@ static void adiantum_hash_header(struct
BUILD_BUG_ON(sizeof(header) % POLY1305_BLOCK_SIZE != 0);
poly1305_core_blocks(&state, &tctx->header_hash_key,
- &header, sizeof(header) / POLY1305_BLOCK_SIZE);
+ &header, sizeof(header) / POLY1305_BLOCK_SIZE, 1);
BUILD_BUG_ON(TWEAK_SIZE % POLY1305_BLOCK_SIZE != 0);
poly1305_core_blocks(&state, &tctx->header_hash_key, req->iv,
- TWEAK_SIZE / POLY1305_BLOCK_SIZE);
+ TWEAK_SIZE / POLY1305_BLOCK_SIZE, 1);
poly1305_core_emit(&state, &rctx->header_hash);
}
--- a/crypto/nhpoly1305.c
+++ b/crypto/nhpoly1305.c
@@ -33,6 +33,7 @@
#include <asm/unaligned.h>
#include <crypto/algapi.h>
#include <crypto/internal/hash.h>
+#include <crypto/internal/poly1305.h>
#include <crypto/nhpoly1305.h>
#include <linux/crypto.h>
#include <linux/kernel.h>
@@ -78,7 +79,7 @@ static void process_nh_hash_value(struct
BUILD_BUG_ON(NH_HASH_BYTES % POLY1305_BLOCK_SIZE != 0);
poly1305_core_blocks(&state->poly_state, &key->poly_key, state->nh_hash,
- NH_HASH_BYTES / POLY1305_BLOCK_SIZE);
+ NH_HASH_BYTES / POLY1305_BLOCK_SIZE, 1);
}
/*
--- a/crypto/poly1305_generic.c
+++ b/crypto/poly1305_generic.c
@@ -13,27 +13,12 @@
#include <crypto/algapi.h>
#include <crypto/internal/hash.h>
-#include <crypto/poly1305.h>
+#include <crypto/internal/poly1305.h>
#include <linux/crypto.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/unaligned.h>
-static inline u64 mlt(u64 a, u64 b)
-{
- return a * b;
-}
-
-static inline u32 sr(u64 v, u_char n)
-{
- return v >> n;
-}
-
-static inline u32 and(u32 v, u32 mask)
-{
- return v & mask;
-}
-
int crypto_poly1305_init(struct shash_desc *desc)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
@@ -47,124 +32,8 @@ int crypto_poly1305_init(struct shash_de
}
EXPORT_SYMBOL_GPL(crypto_poly1305_init);
-void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key)
-{
- /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
- key->r[0] = (get_unaligned_le32(raw_key + 0) >> 0) & 0x3ffffff;
- key->r[1] = (get_unaligned_le32(raw_key + 3) >> 2) & 0x3ffff03;
- key->r[2] = (get_unaligned_le32(raw_key + 6) >> 4) & 0x3ffc0ff;
- key->r[3] = (get_unaligned_le32(raw_key + 9) >> 6) & 0x3f03fff;
- key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff;
-}
-EXPORT_SYMBOL_GPL(poly1305_core_setkey);
-
-/*
- * Poly1305 requires a unique key for each tag, which implies that we can't set
- * it on the tfm that gets accessed by multiple users simultaneously. Instead we
- * expect the key as the first 32 bytes in the update() call.
- */
-unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
- const u8 *src, unsigned int srclen)
-{
- if (!dctx->sset) {
- if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
- poly1305_core_setkey(&dctx->r, src);
- src += POLY1305_BLOCK_SIZE;
- srclen -= POLY1305_BLOCK_SIZE;
- dctx->rset = true;
- }
- if (srclen >= POLY1305_BLOCK_SIZE) {
- dctx->s[0] = get_unaligned_le32(src + 0);
- dctx->s[1] = get_unaligned_le32(src + 4);
- dctx->s[2] = get_unaligned_le32(src + 8);
- dctx->s[3] = get_unaligned_le32(src + 12);
- src += POLY1305_BLOCK_SIZE;
- srclen -= POLY1305_BLOCK_SIZE;
- dctx->sset = true;
- }
- }
- return srclen;
-}
-EXPORT_SYMBOL_GPL(crypto_poly1305_setdesckey);
-
-static void poly1305_blocks_internal(struct poly1305_state *state,
- const struct poly1305_key *key,
- const void *src, unsigned int nblocks,
- u32 hibit)
-{
- u32 r0, r1, r2, r3, r4;
- u32 s1, s2, s3, s4;
- u32 h0, h1, h2, h3, h4;
- u64 d0, d1, d2, d3, d4;
-
- if (!nblocks)
- return;
-
- r0 = key->r[0];
- r1 = key->r[1];
- r2 = key->r[2];
- r3 = key->r[3];
- r4 = key->r[4];
-
- s1 = r1 * 5;
- s2 = r2 * 5;
- s3 = r3 * 5;
- s4 = r4 * 5;
-
- h0 = state->h[0];
- h1 = state->h[1];
- h2 = state->h[2];
- h3 = state->h[3];
- h4 = state->h[4];
-
- do {
- /* h += m[i] */
- h0 += (get_unaligned_le32(src + 0) >> 0) & 0x3ffffff;
- h1 += (get_unaligned_le32(src + 3) >> 2) & 0x3ffffff;
- h2 += (get_unaligned_le32(src + 6) >> 4) & 0x3ffffff;
- h3 += (get_unaligned_le32(src + 9) >> 6) & 0x3ffffff;
- h4 += (get_unaligned_le32(src + 12) >> 8) | hibit;
-
- /* h *= r */
- d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) +
- mlt(h3, s2) + mlt(h4, s1);
- d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) +
- mlt(h3, s3) + mlt(h4, s2);
- d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) +
- mlt(h3, s4) + mlt(h4, s3);
- d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) +
- mlt(h3, r0) + mlt(h4, s4);
- d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) +
- mlt(h3, r1) + mlt(h4, r0);
-
- /* (partial) h %= p */
- d1 += sr(d0, 26); h0 = and(d0, 0x3ffffff);
- d2 += sr(d1, 26); h1 = and(d1, 0x3ffffff);
- d3 += sr(d2, 26); h2 = and(d2, 0x3ffffff);
- d4 += sr(d3, 26); h3 = and(d3, 0x3ffffff);
- h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff);
- h1 += h0 >> 26; h0 = h0 & 0x3ffffff;
-
- src += POLY1305_BLOCK_SIZE;
- } while (--nblocks);
-
- state->h[0] = h0;
- state->h[1] = h1;
- state->h[2] = h2;
- state->h[3] = h3;
- state->h[4] = h4;
-}
-
-void poly1305_core_blocks(struct poly1305_state *state,
- const struct poly1305_key *key,
- const void *src, unsigned int nblocks)
-{
- poly1305_blocks_internal(state, key, src, nblocks, 1 << 24);
-}
-EXPORT_SYMBOL_GPL(poly1305_core_blocks);
-
-static void poly1305_blocks(struct poly1305_desc_ctx *dctx,
- const u8 *src, unsigned int srclen, u32 hibit)
+static void poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
+ unsigned int srclen)
{
unsigned int datalen;
@@ -174,8 +43,8 @@ static void poly1305_blocks(struct poly1
srclen = datalen;
}
- poly1305_blocks_internal(&dctx->h, &dctx->r,
- src, srclen / POLY1305_BLOCK_SIZE, hibit);
+ poly1305_core_blocks(&dctx->h, &dctx->r, src,
+ srclen / POLY1305_BLOCK_SIZE, 1);
}
int crypto_poly1305_update(struct shash_desc *desc,
@@ -193,13 +62,13 @@ int crypto_poly1305_update(struct shash_
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
poly1305_blocks(dctx, dctx->buf,
- POLY1305_BLOCK_SIZE, 1 << 24);
+ POLY1305_BLOCK_SIZE);
dctx->buflen = 0;
}
}
if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
- poly1305_blocks(dctx, src, srclen, 1 << 24);
+ poly1305_blocks(dctx, src, srclen);
src += srclen - (srclen % POLY1305_BLOCK_SIZE);
srclen %= POLY1305_BLOCK_SIZE;
}
@@ -213,54 +82,6 @@ int crypto_poly1305_update(struct shash_
}
EXPORT_SYMBOL_GPL(crypto_poly1305_update);
-void poly1305_core_emit(const struct poly1305_state *state, void *dst)
-{
- u32 h0, h1, h2, h3, h4;
- u32 g0, g1, g2, g3, g4;
- u32 mask;
-
- /* fully carry h */
- h0 = state->h[0];
- h1 = state->h[1];
- h2 = state->h[2];
- h3 = state->h[3];
- h4 = state->h[4];
-
- h2 += (h1 >> 26); h1 = h1 & 0x3ffffff;
- h3 += (h2 >> 26); h2 = h2 & 0x3ffffff;
- h4 += (h3 >> 26); h3 = h3 & 0x3ffffff;
- h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff;
- h1 += (h0 >> 26); h0 = h0 & 0x3ffffff;
-
- /* compute h + -p */
- g0 = h0 + 5;
- g1 = h1 + (g0 >> 26); g0 &= 0x3ffffff;
- g2 = h2 + (g1 >> 26); g1 &= 0x3ffffff;
- g3 = h3 + (g2 >> 26); g2 &= 0x3ffffff;
- g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff;
-
- /* select h if h < p, or h + -p if h >= p */
- mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1;
- g0 &= mask;
- g1 &= mask;
- g2 &= mask;
- g3 &= mask;
- g4 &= mask;
- mask = ~mask;
- h0 = (h0 & mask) | g0;
- h1 = (h1 & mask) | g1;
- h2 = (h2 & mask) | g2;
- h3 = (h3 & mask) | g3;
- h4 = (h4 & mask) | g4;
-
- /* h = h % (2^128) */
- put_unaligned_le32((h0 >> 0) | (h1 << 26), dst + 0);
- put_unaligned_le32((h1 >> 6) | (h2 << 20), dst + 4);
- put_unaligned_le32((h2 >> 12) | (h3 << 14), dst + 8);
- put_unaligned_le32((h3 >> 18) | (h4 << 8), dst + 12);
-}
-EXPORT_SYMBOL_GPL(poly1305_core_emit);
-
int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
@@ -274,7 +95,7 @@ int crypto_poly1305_final(struct shash_d
dctx->buf[dctx->buflen++] = 1;
memset(dctx->buf + dctx->buflen, 0,
POLY1305_BLOCK_SIZE - dctx->buflen);
- poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 0);
+ poly1305_core_blocks(&dctx->h, &dctx->r, dctx->buf, 1, 0);
}
poly1305_core_emit(&dctx->h, digest);
--- /dev/null
+++ b/include/crypto/internal/poly1305.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Common values for the Poly1305 algorithm
+ */
+
+#ifndef _CRYPTO_INTERNAL_POLY1305_H
+#define _CRYPTO_INTERNAL_POLY1305_H
+
+#include <asm/unaligned.h>
+#include <linux/types.h>
+#include <crypto/poly1305.h>
+
+struct shash_desc;
+
+/*
+ * Poly1305 core functions. These implement the ε-almost-∆-universal hash
+ * function underlying the Poly1305 MAC, i.e. they don't add an encrypted nonce
+ * ("s key") at the end. They also only support block-aligned inputs.
+ */
+void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key);
+static inline void poly1305_core_init(struct poly1305_state *state)
+{
+ *state = (struct poly1305_state){};
+}
+
+void poly1305_core_blocks(struct poly1305_state *state,
+ const struct poly1305_key *key, const void *src,
+ unsigned int nblocks, u32 hibit);
+void poly1305_core_emit(const struct poly1305_state *state, void *dst);
+
+/* Crypto API helper functions for the Poly1305 MAC */
+int crypto_poly1305_init(struct shash_desc *desc);
+
+int crypto_poly1305_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen);
+int crypto_poly1305_final(struct shash_desc *desc, u8 *dst);
+
+/*
+ * Poly1305 requires a unique key for each tag, which implies that we can't set
+ * it on the tfm that gets accessed by multiple users simultaneously. Instead we
+ * expect the key as the first 32 bytes in the update() call.
+ */
+static inline
+unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
+ const u8 *src, unsigned int srclen)
+{
+ if (!dctx->sset) {
+ if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
+ poly1305_core_setkey(&dctx->r, src);
+ src += POLY1305_BLOCK_SIZE;
+ srclen -= POLY1305_BLOCK_SIZE;
+ dctx->rset = true;
+ }
+ if (srclen >= POLY1305_BLOCK_SIZE) {
+ dctx->s[0] = get_unaligned_le32(src + 0);
+ dctx->s[1] = get_unaligned_le32(src + 4);
+ dctx->s[2] = get_unaligned_le32(src + 8);
+ dctx->s[3] = get_unaligned_le32(src + 12);
+ src += POLY1305_BLOCK_SIZE;
+ srclen -= POLY1305_BLOCK_SIZE;
+ dctx->sset = true;
+ }
+ }
+ return srclen;
+}
+
+#endif
--- a/include/crypto/poly1305.h
+++ b/include/crypto/poly1305.h
@@ -38,27 +38,4 @@ struct poly1305_desc_ctx {
bool sset;
};
-/*
- * Poly1305 core functions. These implement the ε-almost-∆-universal hash
- * function underlying the Poly1305 MAC, i.e. they don't add an encrypted nonce
- * ("s key") at the end. They also only support block-aligned inputs.
- */
-void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key);
-static inline void poly1305_core_init(struct poly1305_state *state)
-{
- memset(state->h, 0, sizeof(state->h));
-}
-void poly1305_core_blocks(struct poly1305_state *state,
- const struct poly1305_key *key,
- const void *src, unsigned int nblocks);
-void poly1305_core_emit(const struct poly1305_state *state, void *dst);
-
-/* Crypto API helper functions for the Poly1305 MAC */
-int crypto_poly1305_init(struct shash_desc *desc);
-unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
- const u8 *src, unsigned int srclen);
-int crypto_poly1305_update(struct shash_desc *desc,
- const u8 *src, unsigned int srclen);
-int crypto_poly1305_final(struct shash_desc *desc, u8 *dst);
-
#endif
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -37,5 +37,8 @@ config CRYPTO_LIB_CHACHA
config CRYPTO_LIB_DES
tristate
+config CRYPTO_LIB_POLY1305_GENERIC
+ tristate
+
config CRYPTO_LIB_SHA256
tristate
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -13,5 +13,8 @@ libarc4-y := arc4.o
obj-$(CONFIG_CRYPTO_LIB_DES) += libdes.o
libdes-y := des.o
+obj-$(CONFIG_CRYPTO_LIB_POLY1305_GENERIC) += libpoly1305.o
+libpoly1305-y := poly1305.o
+
obj-$(CONFIG_CRYPTO_LIB_SHA256) += libsha256.o
libsha256-y := sha256.o
--- /dev/null
+++ b/lib/crypto/poly1305.c
@@ -0,0 +1,158 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Poly1305 authenticator algorithm, RFC7539
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * Based on public domain code by Andrew Moon and Daniel J. Bernstein.
+ */
+
+#include <crypto/internal/poly1305.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/unaligned.h>
+
+static inline u64 mlt(u64 a, u64 b)
+{
+ return a * b;
+}
+
+static inline u32 sr(u64 v, u_char n)
+{
+ return v >> n;
+}
+
+static inline u32 and(u32 v, u32 mask)
+{
+ return v & mask;
+}
+
+void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key)
+{
+ /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
+ key->r[0] = (get_unaligned_le32(raw_key + 0) >> 0) & 0x3ffffff;
+ key->r[1] = (get_unaligned_le32(raw_key + 3) >> 2) & 0x3ffff03;
+ key->r[2] = (get_unaligned_le32(raw_key + 6) >> 4) & 0x3ffc0ff;
+ key->r[3] = (get_unaligned_le32(raw_key + 9) >> 6) & 0x3f03fff;
+ key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff;
+}
+EXPORT_SYMBOL_GPL(poly1305_core_setkey);
+
+void poly1305_core_blocks(struct poly1305_state *state,
+ const struct poly1305_key *key, const void *src,
+ unsigned int nblocks, u32 hibit)
+{
+ u32 r0, r1, r2, r3, r4;
+ u32 s1, s2, s3, s4;
+ u32 h0, h1, h2, h3, h4;
+ u64 d0, d1, d2, d3, d4;
+
+ if (!nblocks)
+ return;
+
+ r0 = key->r[0];
+ r1 = key->r[1];
+ r2 = key->r[2];
+ r3 = key->r[3];
+ r4 = key->r[4];
+
+ s1 = r1 * 5;
+ s2 = r2 * 5;
+ s3 = r3 * 5;
+ s4 = r4 * 5;
+
+ h0 = state->h[0];
+ h1 = state->h[1];
+ h2 = state->h[2];
+ h3 = state->h[3];
+ h4 = state->h[4];
+
+ do {
+ /* h += m[i] */
+ h0 += (get_unaligned_le32(src + 0) >> 0) & 0x3ffffff;
+ h1 += (get_unaligned_le32(src + 3) >> 2) & 0x3ffffff;
+ h2 += (get_unaligned_le32(src + 6) >> 4) & 0x3ffffff;
+ h3 += (get_unaligned_le32(src + 9) >> 6) & 0x3ffffff;
+ h4 += (get_unaligned_le32(src + 12) >> 8) | (hibit << 24);
+
+ /* h *= r */
+ d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) +
+ mlt(h3, s2) + mlt(h4, s1);
+ d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) +
+ mlt(h3, s3) + mlt(h4, s2);
+ d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) +
+ mlt(h3, s4) + mlt(h4, s3);
+ d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) +
+ mlt(h3, r0) + mlt(h4, s4);
+ d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) +
+ mlt(h3, r1) + mlt(h4, r0);
+
+ /* (partial) h %= p */
+ d1 += sr(d0, 26); h0 = and(d0, 0x3ffffff);
+ d2 += sr(d1, 26); h1 = and(d1, 0x3ffffff);
+ d3 += sr(d2, 26); h2 = and(d2, 0x3ffffff);
+ d4 += sr(d3, 26); h3 = and(d3, 0x3ffffff);
+ h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff);
+ h1 += h0 >> 26; h0 = h0 & 0x3ffffff;
+
+ src += POLY1305_BLOCK_SIZE;
+ } while (--nblocks);
+
+ state->h[0] = h0;
+ state->h[1] = h1;
+ state->h[2] = h2;
+ state->h[3] = h3;
+ state->h[4] = h4;
+}
+EXPORT_SYMBOL_GPL(poly1305_core_blocks);
+
+void poly1305_core_emit(const struct poly1305_state *state, void *dst)
+{
+ u32 h0, h1, h2, h3, h4;
+ u32 g0, g1, g2, g3, g4;
+ u32 mask;
+
+ /* fully carry h */
+ h0 = state->h[0];
+ h1 = state->h[1];
+ h2 = state->h[2];
+ h3 = state->h[3];
+ h4 = state->h[4];
+
+ h2 += (h1 >> 26); h1 = h1 & 0x3ffffff;
+ h3 += (h2 >> 26); h2 = h2 & 0x3ffffff;
+ h4 += (h3 >> 26); h3 = h3 & 0x3ffffff;
+ h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff;
+ h1 += (h0 >> 26); h0 = h0 & 0x3ffffff;
+
+ /* compute h + -p */
+ g0 = h0 + 5;
+ g1 = h1 + (g0 >> 26); g0 &= 0x3ffffff;
+ g2 = h2 + (g1 >> 26); g1 &= 0x3ffffff;
+ g3 = h3 + (g2 >> 26); g2 &= 0x3ffffff;
+ g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff;
+
+ /* select h if h < p, or h + -p if h >= p */
+ mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1;
+ g0 &= mask;
+ g1 &= mask;
+ g2 &= mask;
+ g3 &= mask;
+ g4 &= mask;
+ mask = ~mask;
+ h0 = (h0 & mask) | g0;
+ h1 = (h1 & mask) | g1;
+ h2 = (h2 & mask) | g2;
+ h3 = (h3 & mask) | g3;
+ h4 = (h4 & mask) | g4;
+
+ /* h = h % (2^128) */
+ put_unaligned_le32((h0 >> 0) | (h1 << 26), dst + 0);
+ put_unaligned_le32((h1 >> 6) | (h2 << 20), dst + 4);
+ put_unaligned_le32((h2 >> 12) | (h3 << 14), dst + 8);
+ put_unaligned_le32((h3 >> 18) | (h4 << 8), dst + 12);
+}
+EXPORT_SYMBOL_GPL(poly1305_core_emit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
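
For orientation, the split leaves two layers: the poly1305_core_* routines now in lib/crypto/poly1305.c above (block-aligned input, no 's' key), and the crypto API shash wrapper that stays in crypto/poly1305_generic.c. A minimal sketch of driving the core layer directly follows, assuming block-aligned data; the helper name demo_poly1305_core and the buffer names are illustrative only.

#include <crypto/internal/poly1305.h>

/* Sketch only: hash nblocks * 16 bytes with the core routines. The final
 * 128-bit 's' addition of a real Poly1305 MAC is deliberately not part of
 * the core API, as the header comment above states.
 */
static void demo_poly1305_core(u8 out[POLY1305_DIGEST_SIZE],
                               const u8 raw_r_key[POLY1305_BLOCK_SIZE],
                               const u8 *msg, unsigned int nblocks)
{
        struct poly1305_key r;
        struct poly1305_state state;

        poly1305_core_setkey(&r, raw_r_key);               /* clamps r */
        poly1305_core_init(&state);
        poly1305_core_blocks(&state, &r, msg, nblocks, 1); /* hibit = 1 */
        poly1305_core_emit(&state, out);                   /* h mod 2^128 */
}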

View File

@ -0,0 +1,251 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 8 Nov 2019 13:22:20 +0100
Subject: [PATCH] crypto: x86/poly1305 - unify Poly1305 state struct with
generic code
commit ad8f5b88383ea685f2b8df2a12ee3e08089a1287 upstream.
In preparation for exposing a Poly1305 library interface directly from
the accelerated x86 driver, align the state descriptor of the x86 code
with the one used by the generic driver. This is needed to make the
library interface unified between all implementations.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/x86/crypto/poly1305_glue.c | 88 ++++++++++--------------------
crypto/poly1305_generic.c | 6 +-
include/crypto/internal/poly1305.h | 4 +-
include/crypto/poly1305.h | 18 +++---
4 files changed, 43 insertions(+), 73 deletions(-)
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -14,40 +14,14 @@
#include <linux/module.h>
#include <asm/simd.h>
-struct poly1305_simd_desc_ctx {
- struct poly1305_desc_ctx base;
- /* derived key u set? */
- bool uset;
-#ifdef CONFIG_AS_AVX2
- /* derived keys r^3, r^4 set? */
- bool wset;
-#endif
- /* derived Poly1305 key r^2 */
- u32 u[5];
- /* ... silently appended r^3 and r^4 when using AVX2 */
-};
-
asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src,
const u32 *r, unsigned int blocks);
asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src, const u32 *r,
unsigned int blocks, const u32 *u);
-#ifdef CONFIG_AS_AVX2
asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r,
unsigned int blocks, const u32 *u);
-static bool poly1305_use_avx2;
-#endif
-static int poly1305_simd_init(struct shash_desc *desc)
-{
- struct poly1305_simd_desc_ctx *sctx = shash_desc_ctx(desc);
-
- sctx->uset = false;
-#ifdef CONFIG_AS_AVX2
- sctx->wset = false;
-#endif
-
- return crypto_poly1305_init(desc);
-}
+static bool poly1305_use_avx2 __ro_after_init;
static void poly1305_simd_mult(u32 *a, const u32 *b)
{
@@ -63,53 +37,49 @@ static void poly1305_simd_mult(u32 *a, c
static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx,
const u8 *src, unsigned int srclen)
{
- struct poly1305_simd_desc_ctx *sctx;
unsigned int blocks, datalen;
- BUILD_BUG_ON(offsetof(struct poly1305_simd_desc_ctx, base));
- sctx = container_of(dctx, struct poly1305_simd_desc_ctx, base);
-
if (unlikely(!dctx->sset)) {
datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
src += srclen - datalen;
srclen = datalen;
}
-#ifdef CONFIG_AS_AVX2
- if (poly1305_use_avx2 && srclen >= POLY1305_BLOCK_SIZE * 4) {
- if (unlikely(!sctx->wset)) {
- if (!sctx->uset) {
- memcpy(sctx->u, dctx->r.r, sizeof(sctx->u));
- poly1305_simd_mult(sctx->u, dctx->r.r);
- sctx->uset = true;
+ if (IS_ENABLED(CONFIG_AS_AVX2) &&
+ poly1305_use_avx2 &&
+ srclen >= POLY1305_BLOCK_SIZE * 4) {
+ if (unlikely(dctx->rset < 4)) {
+ if (dctx->rset < 2) {
+ dctx->r[1] = dctx->r[0];
+ poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r);
}
- memcpy(sctx->u + 5, sctx->u, sizeof(sctx->u));
- poly1305_simd_mult(sctx->u + 5, dctx->r.r);
- memcpy(sctx->u + 10, sctx->u + 5, sizeof(sctx->u));
- poly1305_simd_mult(sctx->u + 10, dctx->r.r);
- sctx->wset = true;
+ dctx->r[2] = dctx->r[1];
+ poly1305_simd_mult(dctx->r[2].r, dctx->r[0].r);
+ dctx->r[3] = dctx->r[2];
+ poly1305_simd_mult(dctx->r[3].r, dctx->r[0].r);
+ dctx->rset = 4;
}
blocks = srclen / (POLY1305_BLOCK_SIZE * 4);
- poly1305_4block_avx2(dctx->h.h, src, dctx->r.r, blocks,
- sctx->u);
+ poly1305_4block_avx2(dctx->h.h, src, dctx->r[0].r, blocks,
+ dctx->r[1].r);
src += POLY1305_BLOCK_SIZE * 4 * blocks;
srclen -= POLY1305_BLOCK_SIZE * 4 * blocks;
}
-#endif
+
if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) {
- if (unlikely(!sctx->uset)) {
- memcpy(sctx->u, dctx->r.r, sizeof(sctx->u));
- poly1305_simd_mult(sctx->u, dctx->r.r);
- sctx->uset = true;
+ if (unlikely(dctx->rset < 2)) {
+ dctx->r[1] = dctx->r[0];
+ poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r);
+ dctx->rset = 2;
}
blocks = srclen / (POLY1305_BLOCK_SIZE * 2);
- poly1305_2block_sse2(dctx->h.h, src, dctx->r.r, blocks,
- sctx->u);
+ poly1305_2block_sse2(dctx->h.h, src, dctx->r[0].r,
+ blocks, dctx->r[1].r);
src += POLY1305_BLOCK_SIZE * 2 * blocks;
srclen -= POLY1305_BLOCK_SIZE * 2 * blocks;
}
if (srclen >= POLY1305_BLOCK_SIZE) {
- poly1305_block_sse2(dctx->h.h, src, dctx->r.r, 1);
+ poly1305_block_sse2(dctx->h.h, src, dctx->r[0].r, 1);
srclen -= POLY1305_BLOCK_SIZE;
}
return srclen;
@@ -159,10 +129,10 @@ static int poly1305_simd_update(struct s
static struct shash_alg alg = {
.digestsize = POLY1305_DIGEST_SIZE,
- .init = poly1305_simd_init,
+ .init = crypto_poly1305_init,
.update = poly1305_simd_update,
.final = crypto_poly1305_final,
- .descsize = sizeof(struct poly1305_simd_desc_ctx),
+ .descsize = sizeof(struct poly1305_desc_ctx),
.base = {
.cra_name = "poly1305",
.cra_driver_name = "poly1305-simd",
@@ -177,14 +147,14 @@ static int __init poly1305_simd_mod_init
if (!boot_cpu_has(X86_FEATURE_XMM2))
return -ENODEV;
-#ifdef CONFIG_AS_AVX2
- poly1305_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
+ poly1305_use_avx2 = IS_ENABLED(CONFIG_AS_AVX2) &&
+ boot_cpu_has(X86_FEATURE_AVX) &&
boot_cpu_has(X86_FEATURE_AVX2) &&
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
- alg.descsize = sizeof(struct poly1305_simd_desc_ctx);
+ alg.descsize = sizeof(struct poly1305_desc_ctx) + 5 * sizeof(u32);
if (poly1305_use_avx2)
alg.descsize += 10 * sizeof(u32);
-#endif
+
return crypto_register_shash(&alg);
}
--- a/crypto/poly1305_generic.c
+++ b/crypto/poly1305_generic.c
@@ -25,7 +25,7 @@ int crypto_poly1305_init(struct shash_de
poly1305_core_init(&dctx->h);
dctx->buflen = 0;
- dctx->rset = false;
+ dctx->rset = 0;
dctx->sset = false;
return 0;
@@ -43,7 +43,7 @@ static void poly1305_blocks(struct poly1
srclen = datalen;
}
- poly1305_core_blocks(&dctx->h, &dctx->r, src,
+ poly1305_core_blocks(&dctx->h, dctx->r, src,
srclen / POLY1305_BLOCK_SIZE, 1);
}
@@ -95,7 +95,7 @@ int crypto_poly1305_final(struct shash_d
dctx->buf[dctx->buflen++] = 1;
memset(dctx->buf + dctx->buflen, 0,
POLY1305_BLOCK_SIZE - dctx->buflen);
- poly1305_core_blocks(&dctx->h, &dctx->r, dctx->buf, 1, 0);
+ poly1305_core_blocks(&dctx->h, dctx->r, dctx->buf, 1, 0);
}
poly1305_core_emit(&dctx->h, digest);
--- a/include/crypto/internal/poly1305.h
+++ b/include/crypto/internal/poly1305.h
@@ -46,10 +46,10 @@ unsigned int crypto_poly1305_setdesckey(
{
if (!dctx->sset) {
if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
- poly1305_core_setkey(&dctx->r, src);
+ poly1305_core_setkey(dctx->r, src);
src += POLY1305_BLOCK_SIZE;
srclen -= POLY1305_BLOCK_SIZE;
- dctx->rset = true;
+ dctx->rset = 1;
}
if (srclen >= POLY1305_BLOCK_SIZE) {
dctx->s[0] = get_unaligned_le32(src + 0);
--- a/include/crypto/poly1305.h
+++ b/include/crypto/poly1305.h
@@ -22,20 +22,20 @@ struct poly1305_state {
};
struct poly1305_desc_ctx {
- /* key */
- struct poly1305_key r;
- /* finalize key */
- u32 s[4];
- /* accumulator */
- struct poly1305_state h;
/* partial buffer */
u8 buf[POLY1305_BLOCK_SIZE];
/* bytes used in partial buffer */
unsigned int buflen;
- /* r key has been set */
- bool rset;
- /* s key has been set */
+ /* how many keys have been set in r[] */
+ unsigned short rset;
+ /* whether s[] has been set */
bool sset;
+ /* finalize key */
+ u32 s[4];
+ /* accumulator */
+ struct poly1305_state h;
+ /* key */
+ struct poly1305_key r[1];
};
#endif
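
A brief note on why the descriptor now ends in an r[] array sized by the driver: the SSE2/AVX2 paths above consume several message blocks per call, which needs the precomputed powers r^2 (and, for AVX2, r^3 and r^4) that poly1305_simd_mult() fills into r[1]..r[3]. The regrouping behind this is the standard Horner identity, sketched here in LaTeX:

\[
\Big(\big(\big((h + c_1)\,r + c_2\big)\,r + c_3\big)\,r + c_4\Big)\,r
\equiv (h + c_1)\,r^4 + c_2\,r^3 + c_3\,r^2 + c_4\,r \pmod{2^{130} - 5}
\]

so four consecutive blocks c_1..c_4 still cost one multiplication each, but the multiplications no longer depend on one another and can run in parallel SIMD lanes; the two-block SSE2 path is the same idea with r and r^2.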

View File

@ -0,0 +1,224 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 8 Nov 2019 13:22:21 +0100
Subject: [PATCH] crypto: poly1305 - expose init/update/final library interface
commit a1d93064094cc5e24d64e35cf093e7191d0c9344 upstream.
Expose the existing generic Poly1305 code via an init/update/final
library interface so that callers are not required to go through
the crypto API's shash abstraction to access it. At the same time,
make some preparations so that the library implementation can be
superseded by an accelerated arch-specific version in the future.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
crypto/poly1305_generic.c | 22 +-----------
include/crypto/poly1305.h | 38 +++++++++++++++++++-
lib/crypto/Kconfig | 26 ++++++++++++++
lib/crypto/poly1305.c | 74 +++++++++++++++++++++++++++++++++++++++
4 files changed, 138 insertions(+), 22 deletions(-)
--- a/crypto/poly1305_generic.c
+++ b/crypto/poly1305_generic.c
@@ -85,31 +85,11 @@ EXPORT_SYMBOL_GPL(crypto_poly1305_update
int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
- __le32 digest[4];
- u64 f = 0;
if (unlikely(!dctx->sset))
return -ENOKEY;
- if (unlikely(dctx->buflen)) {
- dctx->buf[dctx->buflen++] = 1;
- memset(dctx->buf + dctx->buflen, 0,
- POLY1305_BLOCK_SIZE - dctx->buflen);
- poly1305_core_blocks(&dctx->h, dctx->r, dctx->buf, 1, 0);
- }
-
- poly1305_core_emit(&dctx->h, digest);
-
- /* mac = (h + s) % (2^128) */
- f = (f >> 32) + le32_to_cpu(digest[0]) + dctx->s[0];
- put_unaligned_le32(f, dst + 0);
- f = (f >> 32) + le32_to_cpu(digest[1]) + dctx->s[1];
- put_unaligned_le32(f, dst + 4);
- f = (f >> 32) + le32_to_cpu(digest[2]) + dctx->s[2];
- put_unaligned_le32(f, dst + 8);
- f = (f >> 32) + le32_to_cpu(digest[3]) + dctx->s[3];
- put_unaligned_le32(f, dst + 12);
-
+ poly1305_final_generic(dctx, dst);
return 0;
}
EXPORT_SYMBOL_GPL(crypto_poly1305_final);
--- a/include/crypto/poly1305.h
+++ b/include/crypto/poly1305.h
@@ -35,7 +35,43 @@ struct poly1305_desc_ctx {
/* accumulator */
struct poly1305_state h;
/* key */
- struct poly1305_key r[1];
+ struct poly1305_key r[CONFIG_CRYPTO_LIB_POLY1305_RSIZE];
};
+void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key);
+void poly1305_init_generic(struct poly1305_desc_ctx *desc, const u8 *key);
+
+static inline void poly1305_init(struct poly1305_desc_ctx *desc, const u8 *key)
+{
+ if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305))
+ poly1305_init_arch(desc, key);
+ else
+ poly1305_init_generic(desc, key);
+}
+
+void poly1305_update_arch(struct poly1305_desc_ctx *desc, const u8 *src,
+ unsigned int nbytes);
+void poly1305_update_generic(struct poly1305_desc_ctx *desc, const u8 *src,
+ unsigned int nbytes);
+
+static inline void poly1305_update(struct poly1305_desc_ctx *desc,
+ const u8 *src, unsigned int nbytes)
+{
+ if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305))
+ poly1305_update_arch(desc, src, nbytes);
+ else
+ poly1305_update_generic(desc, src, nbytes);
+}
+
+void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *digest);
+void poly1305_final_generic(struct poly1305_desc_ctx *desc, u8 *digest);
+
+static inline void poly1305_final(struct poly1305_desc_ctx *desc, u8 *digest)
+{
+ if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305))
+ poly1305_final_arch(desc, digest);
+ else
+ poly1305_final_generic(desc, digest);
+}
+
#endif
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -37,8 +37,34 @@ config CRYPTO_LIB_CHACHA
config CRYPTO_LIB_DES
tristate
+config CRYPTO_LIB_POLY1305_RSIZE
+ int
+ default 1
+
+config CRYPTO_ARCH_HAVE_LIB_POLY1305
+ tristate
+ help
+ Declares whether the architecture provides an arch-specific
+ accelerated implementation of the Poly1305 library interface,
+ either builtin or as a module.
+
config CRYPTO_LIB_POLY1305_GENERIC
tristate
+ help
+ This symbol can be depended upon by arch implementations of the
+ Poly1305 library interface that require the generic code as a
+ fallback, e.g., for SIMD implementations. If no arch specific
+ implementation is enabled, this implementation serves the users
+ of CRYPTO_LIB_POLY1305.
+
+config CRYPTO_LIB_POLY1305
+ tristate "Poly1305 library interface"
+ depends on CRYPTO_ARCH_HAVE_LIB_POLY1305 || !CRYPTO_ARCH_HAVE_LIB_POLY1305
+ select CRYPTO_LIB_POLY1305_GENERIC if CRYPTO_ARCH_HAVE_LIB_POLY1305=n
+ help
+ Enable the Poly1305 library interface. This interface may be fulfilled
+ by either the generic implementation or an arch-specific one, if one
+ is available and enabled.
config CRYPTO_LIB_SHA256
tristate
--- a/lib/crypto/poly1305.c
+++ b/lib/crypto/poly1305.c
@@ -154,5 +154,79 @@ void poly1305_core_emit(const struct pol
}
EXPORT_SYMBOL_GPL(poly1305_core_emit);
+void poly1305_init_generic(struct poly1305_desc_ctx *desc, const u8 *key)
+{
+ poly1305_core_setkey(desc->r, key);
+ desc->s[0] = get_unaligned_le32(key + 16);
+ desc->s[1] = get_unaligned_le32(key + 20);
+ desc->s[2] = get_unaligned_le32(key + 24);
+ desc->s[3] = get_unaligned_le32(key + 28);
+ poly1305_core_init(&desc->h);
+ desc->buflen = 0;
+ desc->sset = true;
+ desc->rset = 1;
+}
+EXPORT_SYMBOL_GPL(poly1305_init_generic);
+
+void poly1305_update_generic(struct poly1305_desc_ctx *desc, const u8 *src,
+ unsigned int nbytes)
+{
+ unsigned int bytes;
+
+ if (unlikely(desc->buflen)) {
+ bytes = min(nbytes, POLY1305_BLOCK_SIZE - desc->buflen);
+ memcpy(desc->buf + desc->buflen, src, bytes);
+ src += bytes;
+ nbytes -= bytes;
+ desc->buflen += bytes;
+
+ if (desc->buflen == POLY1305_BLOCK_SIZE) {
+ poly1305_core_blocks(&desc->h, desc->r, desc->buf, 1, 1);
+ desc->buflen = 0;
+ }
+ }
+
+ if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
+ poly1305_core_blocks(&desc->h, desc->r, src,
+ nbytes / POLY1305_BLOCK_SIZE, 1);
+ src += nbytes - (nbytes % POLY1305_BLOCK_SIZE);
+ nbytes %= POLY1305_BLOCK_SIZE;
+ }
+
+ if (unlikely(nbytes)) {
+ desc->buflen = nbytes;
+ memcpy(desc->buf, src, nbytes);
+ }
+}
+EXPORT_SYMBOL_GPL(poly1305_update_generic);
+
+void poly1305_final_generic(struct poly1305_desc_ctx *desc, u8 *dst)
+{
+ __le32 digest[4];
+ u64 f = 0;
+
+ if (unlikely(desc->buflen)) {
+ desc->buf[desc->buflen++] = 1;
+ memset(desc->buf + desc->buflen, 0,
+ POLY1305_BLOCK_SIZE - desc->buflen);
+ poly1305_core_blocks(&desc->h, desc->r, desc->buf, 1, 0);
+ }
+
+ poly1305_core_emit(&desc->h, digest);
+
+ /* mac = (h + s) % (2^128) */
+ f = (f >> 32) + le32_to_cpu(digest[0]) + desc->s[0];
+ put_unaligned_le32(f, dst + 0);
+ f = (f >> 32) + le32_to_cpu(digest[1]) + desc->s[1];
+ put_unaligned_le32(f, dst + 4);
+ f = (f >> 32) + le32_to_cpu(digest[2]) + desc->s[2];
+ put_unaligned_le32(f, dst + 8);
+ f = (f >> 32) + le32_to_cpu(digest[3]) + desc->s[3];
+ put_unaligned_le32(f, dst + 12);
+
+ *desc = (struct poly1305_desc_ctx){};
+}
+EXPORT_SYMBOL_GPL(poly1305_final_generic);
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
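
From a caller's perspective the new interface is deliberately small. Below is a minimal sketch of a one-shot MAC using only the wrappers added to <crypto/poly1305.h> above; the helper name and buffers are illustrative, and the 32-byte key is the usual r || s layout consumed by poly1305_init_generic().

#include <crypto/poly1305.h>

/* Sketch only: one-shot Poly1305 tag over an arbitrary-length buffer. The
 * wrappers fall back to the *_generic() routines unless an arch
 * implementation selects CRYPTO_ARCH_HAVE_LIB_POLY1305.
 */
static void demo_poly1305_tag(u8 tag[POLY1305_DIGEST_SIZE],
                              const u8 key[POLY1305_KEY_SIZE],
                              const u8 *data, unsigned int len)
{
        struct poly1305_desc_ctx desc;

        poly1305_init(&desc, key);          /* sets clamped r and the s words */
        poly1305_update(&desc, data, len);  /* buffers partial blocks internally */
        poly1305_final(&desc, tag);         /* emits (h + s) mod 2^128, wipes desc */
}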

View File

@ -0,0 +1,217 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 8 Nov 2019 13:22:22 +0100
Subject: [PATCH] crypto: x86/poly1305 - depend on generic library not generic
shash
commit 1b2c6a5120489d41c8ea3b8dacd0b4586289b158 upstream.
Remove the dependency on the generic Poly1305 driver. Instead, depend
on the generic library so that we only reuse code without pulling in
the generic shash implementation as well.
While at it, remove the logic that prefers the non-SIMD path for short
inputs - this is no longer necessary after recent FPU handling changes
on x86.
Since this removes the last remaining user of the routines exported
by the generic shash driver, unexport them and make them static.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/x86/crypto/poly1305_glue.c | 66 +++++++++++++++++++++++++-----
crypto/Kconfig | 2 +-
crypto/poly1305_generic.c | 11 ++---
include/crypto/internal/poly1305.h | 9 ----
4 files changed, 60 insertions(+), 28 deletions(-)
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -34,6 +34,24 @@ static void poly1305_simd_mult(u32 *a, c
poly1305_block_sse2(a, m, b, 1);
}
+static unsigned int poly1305_scalar_blocks(struct poly1305_desc_ctx *dctx,
+ const u8 *src, unsigned int srclen)
+{
+ unsigned int datalen;
+
+ if (unlikely(!dctx->sset)) {
+ datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
+ src += srclen - datalen;
+ srclen = datalen;
+ }
+ if (srclen >= POLY1305_BLOCK_SIZE) {
+ poly1305_core_blocks(&dctx->h, dctx->r, src,
+ srclen / POLY1305_BLOCK_SIZE, 1);
+ srclen %= POLY1305_BLOCK_SIZE;
+ }
+ return srclen;
+}
+
static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx,
const u8 *src, unsigned int srclen)
{
@@ -91,12 +109,6 @@ static int poly1305_simd_update(struct s
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
unsigned int bytes;
- /* kernel_fpu_begin/end is costly, use fallback for small updates */
- if (srclen <= 288 || !crypto_simd_usable())
- return crypto_poly1305_update(desc, src, srclen);
-
- kernel_fpu_begin();
-
if (unlikely(dctx->buflen)) {
bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen);
memcpy(dctx->buf + dctx->buflen, src, bytes);
@@ -105,25 +117,57 @@ static int poly1305_simd_update(struct s
dctx->buflen += bytes;
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
- poly1305_simd_blocks(dctx, dctx->buf,
- POLY1305_BLOCK_SIZE);
+ if (likely(crypto_simd_usable())) {
+ kernel_fpu_begin();
+ poly1305_simd_blocks(dctx, dctx->buf,
+ POLY1305_BLOCK_SIZE);
+ kernel_fpu_end();
+ } else {
+ poly1305_scalar_blocks(dctx, dctx->buf,
+ POLY1305_BLOCK_SIZE);
+ }
dctx->buflen = 0;
}
}
if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
- bytes = poly1305_simd_blocks(dctx, src, srclen);
+ if (likely(crypto_simd_usable())) {
+ kernel_fpu_begin();
+ bytes = poly1305_simd_blocks(dctx, src, srclen);
+ kernel_fpu_end();
+ } else {
+ bytes = poly1305_scalar_blocks(dctx, src, srclen);
+ }
src += srclen - bytes;
srclen = bytes;
}
- kernel_fpu_end();
-
if (unlikely(srclen)) {
dctx->buflen = srclen;
memcpy(dctx->buf, src, srclen);
}
+}
+
+static int crypto_poly1305_init(struct shash_desc *desc)
+{
+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ poly1305_core_init(&dctx->h);
+ dctx->buflen = 0;
+ dctx->rset = 0;
+ dctx->sset = false;
+
+ return 0;
+}
+
+static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
+{
+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ if (unlikely(!dctx->sset))
+ return -ENOKEY;
+ poly1305_final_generic(dctx, dst);
return 0;
}
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -697,7 +697,7 @@ config CRYPTO_POLY1305
config CRYPTO_POLY1305_X86_64
tristate "Poly1305 authenticator algorithm (x86_64/SSE2/AVX2)"
depends on X86 && 64BIT
- select CRYPTO_POLY1305
+ select CRYPTO_LIB_POLY1305_GENERIC
help
Poly1305 authenticator algorithm, RFC7539.
--- a/crypto/poly1305_generic.c
+++ b/crypto/poly1305_generic.c
@@ -19,7 +19,7 @@
#include <linux/module.h>
#include <asm/unaligned.h>
-int crypto_poly1305_init(struct shash_desc *desc)
+static int crypto_poly1305_init(struct shash_desc *desc)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
@@ -30,7 +30,6 @@ int crypto_poly1305_init(struct shash_de
return 0;
}
-EXPORT_SYMBOL_GPL(crypto_poly1305_init);
static void poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
unsigned int srclen)
@@ -47,8 +46,8 @@ static void poly1305_blocks(struct poly1
srclen / POLY1305_BLOCK_SIZE, 1);
}
-int crypto_poly1305_update(struct shash_desc *desc,
- const u8 *src, unsigned int srclen)
+static int crypto_poly1305_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
unsigned int bytes;
@@ -80,9 +79,8 @@ int crypto_poly1305_update(struct shash_
return 0;
}
-EXPORT_SYMBOL_GPL(crypto_poly1305_update);
-int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
+static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
@@ -92,7 +90,6 @@ int crypto_poly1305_final(struct shash_d
poly1305_final_generic(dctx, dst);
return 0;
}
-EXPORT_SYMBOL_GPL(crypto_poly1305_final);
static struct shash_alg poly1305_alg = {
.digestsize = POLY1305_DIGEST_SIZE,
--- a/include/crypto/internal/poly1305.h
+++ b/include/crypto/internal/poly1305.h
@@ -10,8 +10,6 @@
#include <linux/types.h>
#include <crypto/poly1305.h>
-struct shash_desc;
-
/*
* Poly1305 core functions. These implement the ε-almost-∆-universal hash
* function underlying the Poly1305 MAC, i.e. they don't add an encrypted nonce
@@ -28,13 +26,6 @@ void poly1305_core_blocks(struct poly130
unsigned int nblocks, u32 hibit);
void poly1305_core_emit(const struct poly1305_state *state, void *dst);
-/* Crypto API helper functions for the Poly1305 MAC */
-int crypto_poly1305_init(struct shash_desc *desc);
-
-int crypto_poly1305_update(struct shash_desc *desc,
- const u8 *src, unsigned int srclen);
-int crypto_poly1305_final(struct shash_desc *desc, u8 *dst);
-
/*
* Poly1305 requires a unique key for each tag, which implies that we can't set
* it on the tfm that gets accessed by multiple users simultaneously. Instead we

View File

@ -0,0 +1,163 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 8 Nov 2019 13:22:23 +0100
Subject: [PATCH] crypto: x86/poly1305 - expose existing driver as poly1305
library
commit f0e89bcfbb894e5844cd1bbf6b3cf7c63cb0f5ac upstream.
Implement the arch init/update/final Poly1305 library routines in the
accelerated SIMD driver for x86 so they are accessible to users of
the Poly1305 library interface as well.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/x86/crypto/poly1305_glue.c | 57 ++++++++++++++++++++++++---------
crypto/Kconfig | 1 +
lib/crypto/Kconfig | 1 +
3 files changed, 43 insertions(+), 16 deletions(-)
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -10,6 +10,7 @@
#include <crypto/internal/poly1305.h>
#include <crypto/internal/simd.h>
#include <linux/crypto.h>
+#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/simd.h>
@@ -21,7 +22,8 @@ asmlinkage void poly1305_2block_sse2(u32
asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r,
unsigned int blocks, const u32 *u);
-static bool poly1305_use_avx2 __ro_after_init;
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_simd);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
static void poly1305_simd_mult(u32 *a, const u32 *b)
{
@@ -64,7 +66,7 @@ static unsigned int poly1305_simd_blocks
}
if (IS_ENABLED(CONFIG_AS_AVX2) &&
- poly1305_use_avx2 &&
+ static_branch_likely(&poly1305_use_avx2) &&
srclen >= POLY1305_BLOCK_SIZE * 4) {
if (unlikely(dctx->rset < 4)) {
if (dctx->rset < 2) {
@@ -103,10 +105,15 @@ static unsigned int poly1305_simd_blocks
return srclen;
}
-static int poly1305_simd_update(struct shash_desc *desc,
- const u8 *src, unsigned int srclen)
+void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key)
+{
+ poly1305_init_generic(desc, key);
+}
+EXPORT_SYMBOL(poly1305_init_arch);
+
+void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
+ unsigned int srclen)
{
- struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
unsigned int bytes;
if (unlikely(dctx->buflen)) {
@@ -117,7 +124,8 @@ static int poly1305_simd_update(struct s
dctx->buflen += bytes;
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
- if (likely(crypto_simd_usable())) {
+ if (static_branch_likely(&poly1305_use_simd) &&
+ likely(crypto_simd_usable())) {
kernel_fpu_begin();
poly1305_simd_blocks(dctx, dctx->buf,
POLY1305_BLOCK_SIZE);
@@ -131,7 +139,8 @@ static int poly1305_simd_update(struct s
}
if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
- if (likely(crypto_simd_usable())) {
+ if (static_branch_likely(&poly1305_use_simd) &&
+ likely(crypto_simd_usable())) {
kernel_fpu_begin();
bytes = poly1305_simd_blocks(dctx, src, srclen);
kernel_fpu_end();
@@ -147,6 +156,13 @@ static int poly1305_simd_update(struct s
memcpy(dctx->buf, src, srclen);
}
}
+EXPORT_SYMBOL(poly1305_update_arch);
+
+void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *digest)
+{
+ poly1305_final_generic(desc, digest);
+}
+EXPORT_SYMBOL(poly1305_final_arch);
static int crypto_poly1305_init(struct shash_desc *desc)
{
@@ -171,6 +187,15 @@ static int crypto_poly1305_final(struct
return 0;
}
+static int poly1305_simd_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ poly1305_update_arch(dctx, src, srclen);
+ return 0;
+}
+
static struct shash_alg alg = {
.digestsize = POLY1305_DIGEST_SIZE,
.init = crypto_poly1305_init,
@@ -189,15 +214,15 @@ static struct shash_alg alg = {
static int __init poly1305_simd_mod_init(void)
{
if (!boot_cpu_has(X86_FEATURE_XMM2))
- return -ENODEV;
+ return 0;
- poly1305_use_avx2 = IS_ENABLED(CONFIG_AS_AVX2) &&
- boot_cpu_has(X86_FEATURE_AVX) &&
- boot_cpu_has(X86_FEATURE_AVX2) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
- alg.descsize = sizeof(struct poly1305_desc_ctx) + 5 * sizeof(u32);
- if (poly1305_use_avx2)
- alg.descsize += 10 * sizeof(u32);
+ static_branch_enable(&poly1305_use_simd);
+
+ if (IS_ENABLED(CONFIG_AS_AVX2) &&
+ boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
+ static_branch_enable(&poly1305_use_avx2);
return crypto_register_shash(&alg);
}
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -698,6 +698,7 @@ config CRYPTO_POLY1305_X86_64
tristate "Poly1305 authenticator algorithm (x86_64/SSE2/AVX2)"
depends on X86 && 64BIT
select CRYPTO_LIB_POLY1305_GENERIC
+ select CRYPTO_ARCH_HAVE_LIB_POLY1305
help
Poly1305 authenticator algorithm, RFC7539.
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -39,6 +39,7 @@ config CRYPTO_LIB_DES
config CRYPTO_LIB_POLY1305_RSIZE
int
+ default 4 if X86_64
default 1
config CRYPTO_ARCH_HAVE_LIB_POLY1305

View File

@ -0,0 +1,322 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 8 Nov 2019 13:22:29 +0100
Subject: [PATCH] crypto: testmgr - add test cases for Blake2s
commit 17e1df67023a5c9ccaeb5de8bf5b88f63127ecf7 upstream.
As suggested by Eric for the Blake2b implementation contributed by
David, introduce a set of test vectors for Blake2s covering different
digest and key sizes.
            blake2s-128   blake2s-160   blake2s-224   blake2s-256
          -------------------------------------------------------
 len=0    |  klen=0        klen=1        klen=16       klen=32
 len=1    |  klen=16       klen=32       klen=0        klen=1
 len=7    |  klen=32       klen=0        klen=1        klen=16
 len=15   |  klen=1        klen=16       klen=32       klen=0
 len=64   |  klen=0        klen=1        klen=16       klen=32
 len=247  |  klen=16       klen=32       klen=0        klen=1
 len=256  |  klen=32       klen=0        klen=1        klen=16
Cc: David Sterba <dsterba@suse.com>
Cc: Eric Biggers <ebiggers@google.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
crypto/testmgr.c | 24 +++++
crypto/testmgr.h | 251 +++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 275 insertions(+)
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -4035,6 +4035,30 @@ static const struct alg_test_desc alg_te
.test = alg_test_null,
.fips_allowed = 1,
}, {
+ .alg = "blake2s-128",
+ .test = alg_test_hash,
+ .suite = {
+ .hash = __VECS(blakes2s_128_tv_template)
+ }
+ }, {
+ .alg = "blake2s-160",
+ .test = alg_test_hash,
+ .suite = {
+ .hash = __VECS(blakes2s_160_tv_template)
+ }
+ }, {
+ .alg = "blake2s-224",
+ .test = alg_test_hash,
+ .suite = {
+ .hash = __VECS(blakes2s_224_tv_template)
+ }
+ }, {
+ .alg = "blake2s-256",
+ .test = alg_test_hash,
+ .suite = {
+ .hash = __VECS(blakes2s_256_tv_template)
+ }
+ }, {
.alg = "cbc(aes)",
.test = alg_test_skcipher,
.fips_allowed = 1,
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -31567,4 +31567,255 @@ static const struct aead_testvec essiv_h
},
};
+static const char blake2_ordered_sequence[] =
+ "\x00\x01\x02\x03\x04\x05\x06\x07"
+ "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
+ "\x10\x11\x12\x13\x14\x15\x16\x17"
+ "\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
+ "\x20\x21\x22\x23\x24\x25\x26\x27"
+ "\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f"
+ "\x30\x31\x32\x33\x34\x35\x36\x37"
+ "\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f"
+ "\x40\x41\x42\x43\x44\x45\x46\x47"
+ "\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f"
+ "\x50\x51\x52\x53\x54\x55\x56\x57"
+ "\x58\x59\x5a\x5b\x5c\x5d\x5e\x5f"
+ "\x60\x61\x62\x63\x64\x65\x66\x67"
+ "\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f"
+ "\x70\x71\x72\x73\x74\x75\x76\x77"
+ "\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f"
+ "\x80\x81\x82\x83\x84\x85\x86\x87"
+ "\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
+ "\x90\x91\x92\x93\x94\x95\x96\x97"
+ "\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
+ "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7"
+ "\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
+ "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
+ "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"
+ "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7"
+ "\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
+ "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7"
+ "\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"
+ "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7"
+ "\xe8\xe9\xea\xeb\xec\xed\xee\xef"
+ "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"
+ "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff";
+
+static const struct hash_testvec blakes2s_128_tv_template[] = {{
+ .digest = (u8[]){ 0x64, 0x55, 0x0d, 0x6f, 0xfe, 0x2c, 0x0a, 0x01,
+ 0xa1, 0x4a, 0xba, 0x1e, 0xad, 0xe0, 0x20, 0x0c, },
+}, {
+ .plaintext = blake2_ordered_sequence,
+ .psize = 64,
+ .digest = (u8[]){ 0xdc, 0x66, 0xca, 0x8f, 0x03, 0x86, 0x58, 0x01,
+ 0xb0, 0xff, 0xe0, 0x6e, 0xd8, 0xa1, 0xa9, 0x0e, },
+}, {
+ .ksize = 16,
+ .key = blake2_ordered_sequence,
+ .plaintext = blake2_ordered_sequence,
+ .psize = 1,
+ .digest = (u8[]){ 0x88, 0x1e, 0x42, 0xe7, 0xbb, 0x35, 0x80, 0x82,
+ 0x63, 0x7c, 0x0a, 0x0f, 0xd7, 0xec, 0x6c, 0x2f, },
+}, {
+ .ksize = 32,
+ .key = blake2_ordered_sequence,
+ .plaintext = blake2_ordered_sequence,
+ .psize = 7,
+ .digest = (u8[]){ 0xcf, 0x9e, 0x07, 0x2a, 0xd5, 0x22, 0xf2, 0xcd,
+ 0xa2, 0xd8, 0x25, 0x21, 0x80, 0x86, 0x73, 0x1c, },
+}, {
+ .ksize = 1,
+ .key = "B",
+ .plaintext = blake2_ordered_sequence,
+ .psize = 15,
+ .digest = (u8[]){ 0xf6, 0x33, 0x5a, 0x2c, 0x22, 0xa0, 0x64, 0xb2,
+ 0xb6, 0x3f, 0xeb, 0xbc, 0xd1, 0xc3, 0xe5, 0xb2, },
+}, {
+ .ksize = 16,
+ .key = blake2_ordered_sequence,
+ .plaintext = blake2_ordered_sequence,
+ .psize = 247,
+ .digest = (u8[]){ 0x72, 0x66, 0x49, 0x60, 0xf9, 0x4a, 0xea, 0xbe,
+ 0x1f, 0xf4, 0x60, 0xce, 0xb7, 0x81, 0xcb, 0x09, },
+}, {
+ .ksize = 32,
+ .key = blake2_ordered_sequence,
+ .plaintext = blake2_ordered_sequence,
+ .psize = 256,
+ .digest = (u8[]){ 0xd5, 0xa4, 0x0e, 0xc3, 0x16, 0xc7, 0x51, 0xa6,
+ 0x3c, 0xd0, 0xd9, 0x11, 0x57, 0xfa, 0x1e, 0xbb, },
+}};
+
+static const struct hash_testvec blakes2s_160_tv_template[] = {{
+ .plaintext = blake2_ordered_sequence,
+ .psize = 7,
+ .digest = (u8[]){ 0xb4, 0xf2, 0x03, 0x49, 0x37, 0xed, 0xb1, 0x3e,
+ 0x5b, 0x2a, 0xca, 0x64, 0x82, 0x74, 0xf6, 0x62,
+ 0xe3, 0xf2, 0x84, 0xff, },
+}, {
+ .plaintext = blake2_ordered_sequence,
+ .psize = 256,
+ .digest = (u8[]){ 0xaa, 0x56, 0x9b, 0xdc, 0x98, 0x17, 0x75, 0xf2,
+ 0xb3, 0x68, 0x83, 0xb7, 0x9b, 0x8d, 0x48, 0xb1,
+ 0x9b, 0x2d, 0x35, 0x05, },
+}, {
+ .ksize = 1,
+ .key = "B",
+ .digest = (u8[]){ 0x50, 0x16, 0xe7, 0x0c, 0x01, 0xd0, 0xd3, 0xc3,
+ 0xf4, 0x3e, 0xb1, 0x6e, 0x97, 0xa9, 0x4e, 0xd1,
+ 0x79, 0x65, 0x32, 0x93, },
+}, {
+ .ksize = 32,
+ .key = blake2_ordered_sequence,
+ .plaintext = blake2_ordered_sequence,
+ .psize = 1,
+ .digest = (u8[]){ 0x1c, 0x2b, 0xcd, 0x9a, 0x68, 0xca, 0x8c, 0x71,
+ 0x90, 0x29, 0x6c, 0x54, 0xfa, 0x56, 0x4a, 0xef,
+ 0xa2, 0x3a, 0x56, 0x9c, },
+}, {
+ .ksize = 16,
+ .key = blake2_ordered_sequence,
+ .plaintext = blake2_ordered_sequence,
+ .psize = 15,
+ .digest = (u8[]){ 0x36, 0xc3, 0x5f, 0x9a, 0xdc, 0x7e, 0xbf, 0x19,
+ 0x68, 0xaa, 0xca, 0xd8, 0x81, 0xbf, 0x09, 0x34,
+ 0x83, 0x39, 0x0f, 0x30, },
+}, {
+ .ksize = 1,
+ .key = "B",
+ .plaintext = blake2_ordered_sequence,
+ .psize = 64,
+ .digest = (u8[]){ 0x86, 0x80, 0x78, 0xa4, 0x14, 0xec, 0x03, 0xe5,
+ 0xb6, 0x9a, 0x52, 0x0e, 0x42, 0xee, 0x39, 0x9d,
+ 0xac, 0xa6, 0x81, 0x63, },
+}, {
+ .ksize = 32,
+ .key = blake2_ordered_sequence,
+ .plaintext = blake2_ordered_sequence,
+ .psize = 247,
+ .digest = (u8[]){ 0x2d, 0xd8, 0xd2, 0x53, 0x66, 0xfa, 0xa9, 0x01,
+ 0x1c, 0x9c, 0xaf, 0xa3, 0xe2, 0x9d, 0x9b, 0x10,
+ 0x0a, 0xf6, 0x73, 0xe8, },
+}};
+
+static const struct hash_testvec blakes2s_224_tv_template[] = {{
+ .plaintext = blake2_ordered_sequence,
+ .psize = 1,
+ .digest = (u8[]){ 0x61, 0xb9, 0x4e, 0xc9, 0x46, 0x22, 0xa3, 0x91,
+ 0xd2, 0xae, 0x42, 0xe6, 0x45, 0x6c, 0x90, 0x12,
+ 0xd5, 0x80, 0x07, 0x97, 0xb8, 0x86, 0x5a, 0xfc,
+ 0x48, 0x21, 0x97, 0xbb, },
+}, {
+ .plaintext = blake2_ordered_sequence,
+ .psize = 247,
+ .digest = (u8[]){ 0x9e, 0xda, 0xc7, 0x20, 0x2c, 0xd8, 0x48, 0x2e,
+ 0x31, 0x94, 0xab, 0x46, 0x6d, 0x94, 0xd8, 0xb4,
+ 0x69, 0xcd, 0xae, 0x19, 0x6d, 0x9e, 0x41, 0xcc,
+ 0x2b, 0xa4, 0xd5, 0xf6, },
+}, {
+ .ksize = 16,
+ .key = blake2_ordered_sequence,
+ .digest = (u8[]){ 0x32, 0xc0, 0xac, 0xf4, 0x3b, 0xd3, 0x07, 0x9f,
+ 0xbe, 0xfb, 0xfa, 0x4d, 0x6b, 0x4e, 0x56, 0xb3,
+ 0xaa, 0xd3, 0x27, 0xf6, 0x14, 0xbf, 0xb9, 0x32,
+ 0xa7, 0x19, 0xfc, 0xb8, },
+}, {
+ .ksize = 1,
+ .key = "B",
+ .plaintext = blake2_ordered_sequence,
+ .psize = 7,
+ .digest = (u8[]){ 0x73, 0xad, 0x5e, 0x6d, 0xb9, 0x02, 0x8e, 0x76,
+ 0xf2, 0x66, 0x42, 0x4b, 0x4c, 0xfa, 0x1f, 0xe6,
+ 0x2e, 0x56, 0x40, 0xe5, 0xa2, 0xb0, 0x3c, 0xe8,
+ 0x7b, 0x45, 0xfe, 0x05, },
+}, {
+ .ksize = 32,
+ .key = blake2_ordered_sequence,
+ .plaintext = blake2_ordered_sequence,
+ .psize = 15,
+ .digest = (u8[]){ 0x16, 0x60, 0xfb, 0x92, 0x54, 0xb3, 0x6e, 0x36,
+ 0x81, 0xf4, 0x16, 0x41, 0xc3, 0x3d, 0xd3, 0x43,
+ 0x84, 0xed, 0x10, 0x6f, 0x65, 0x80, 0x7a, 0x3e,
+ 0x25, 0xab, 0xc5, 0x02, },
+}, {
+ .ksize = 16,
+ .key = blake2_ordered_sequence,
+ .plaintext = blake2_ordered_sequence,
+ .psize = 64,
+ .digest = (u8[]){ 0xca, 0xaa, 0x39, 0x67, 0x9c, 0xf7, 0x6b, 0xc7,
+ 0xb6, 0x82, 0xca, 0x0e, 0x65, 0x36, 0x5b, 0x7c,
+ 0x24, 0x00, 0xfa, 0x5f, 0xda, 0x06, 0x91, 0x93,
+ 0x6a, 0x31, 0x83, 0xb5, },
+}, {
+ .ksize = 1,
+ .key = "B",
+ .plaintext = blake2_ordered_sequence,
+ .psize = 256,
+ .digest = (u8[]){ 0x90, 0x02, 0x26, 0xb5, 0x06, 0x9c, 0x36, 0x86,
+ 0x94, 0x91, 0x90, 0x1e, 0x7d, 0x2a, 0x71, 0xb2,
+ 0x48, 0xb5, 0xe8, 0x16, 0xfd, 0x64, 0x33, 0x45,
+ 0xb3, 0xd7, 0xec, 0xcc, },
+}};
+
+static const struct hash_testvec blakes2s_256_tv_template[] = {{
+ .plaintext = blake2_ordered_sequence,
+ .psize = 15,
+ .digest = (u8[]){ 0xd9, 0x7c, 0x82, 0x8d, 0x81, 0x82, 0xa7, 0x21,
+ 0x80, 0xa0, 0x6a, 0x78, 0x26, 0x83, 0x30, 0x67,
+ 0x3f, 0x7c, 0x4e, 0x06, 0x35, 0x94, 0x7c, 0x04,
+ 0xc0, 0x23, 0x23, 0xfd, 0x45, 0xc0, 0xa5, 0x2d, },
+}, {
+ .ksize = 32,
+ .key = blake2_ordered_sequence,
+ .digest = (u8[]){ 0x48, 0xa8, 0x99, 0x7d, 0xa4, 0x07, 0x87, 0x6b,
+ 0x3d, 0x79, 0xc0, 0xd9, 0x23, 0x25, 0xad, 0x3b,
+ 0x89, 0xcb, 0xb7, 0x54, 0xd8, 0x6a, 0xb7, 0x1a,
+ 0xee, 0x04, 0x7a, 0xd3, 0x45, 0xfd, 0x2c, 0x49, },
+}, {
+ .ksize = 1,
+ .key = "B",
+ .plaintext = blake2_ordered_sequence,
+ .psize = 1,
+ .digest = (u8[]){ 0x22, 0x27, 0xae, 0xaa, 0x6e, 0x81, 0x56, 0x03,
+ 0xa7, 0xe3, 0xa1, 0x18, 0xa5, 0x9a, 0x2c, 0x18,
+ 0xf4, 0x63, 0xbc, 0x16, 0x70, 0xf1, 0xe7, 0x4b,
+ 0x00, 0x6d, 0x66, 0x16, 0xae, 0x9e, 0x74, 0x4e, },
+}, {
+ .ksize = 16,
+ .key = blake2_ordered_sequence,
+ .plaintext = blake2_ordered_sequence,
+ .psize = 7,
+ .digest = (u8[]){ 0x58, 0x5d, 0xa8, 0x60, 0x1c, 0xa4, 0xd8, 0x03,
+ 0x86, 0x86, 0x84, 0x64, 0xd7, 0xa0, 0x8e, 0x15,
+ 0x2f, 0x05, 0xa2, 0x1b, 0xbc, 0xef, 0x7a, 0x34,
+ 0xb3, 0xc5, 0xbc, 0x4b, 0xf0, 0x32, 0xeb, 0x12, },
+}, {
+ .ksize = 32,
+ .key = blake2_ordered_sequence,
+ .plaintext = blake2_ordered_sequence,
+ .psize = 64,
+ .digest = (u8[]){ 0x89, 0x75, 0xb0, 0x57, 0x7f, 0xd3, 0x55, 0x66,
+ 0xd7, 0x50, 0xb3, 0x62, 0xb0, 0x89, 0x7a, 0x26,
+ 0xc3, 0x99, 0x13, 0x6d, 0xf0, 0x7b, 0xab, 0xab,
+ 0xbd, 0xe6, 0x20, 0x3f, 0xf2, 0x95, 0x4e, 0xd4, },
+}, {
+ .ksize = 1,
+ .key = "B",
+ .plaintext = blake2_ordered_sequence,
+ .psize = 247,
+ .digest = (u8[]){ 0x2e, 0x74, 0x1c, 0x1d, 0x03, 0xf4, 0x9d, 0x84,
+ 0x6f, 0xfc, 0x86, 0x32, 0x92, 0x49, 0x7e, 0x66,
+ 0xd7, 0xc3, 0x10, 0x88, 0xfe, 0x28, 0xb3, 0xe0,
+ 0xbf, 0x50, 0x75, 0xad, 0x8e, 0xa4, 0xe6, 0xb2, },
+}, {
+ .ksize = 16,
+ .key = blake2_ordered_sequence,
+ .plaintext = blake2_ordered_sequence,
+ .psize = 256,
+ .digest = (u8[]){ 0xb9, 0xd2, 0x81, 0x0e, 0x3a, 0xb1, 0x62, 0x9b,
+ 0xad, 0x44, 0x05, 0xf4, 0x92, 0x2e, 0x99, 0xc1,
+ 0x4a, 0x47, 0xbb, 0x5b, 0x6f, 0xb2, 0x96, 0xed,
+ 0xd5, 0x06, 0xb5, 0x3a, 0x7c, 0x7a, 0x65, 0x1d, },
+}};
+
#endif /* _CRYPTO_TESTMGR_H */

View File

@ -0,0 +1,245 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 8 Nov 2019 13:22:30 +0100
Subject: [PATCH] crypto: blake2s - implement generic shash driver
commit 7f9b0880925f1f9d7d59504ea0892d2ae9cfc233 upstream.
Wire up our newly added Blake2s implementation via the shash API.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
crypto/Kconfig | 18 ++++
crypto/Makefile | 1 +
crypto/blake2s_generic.c | 171 ++++++++++++++++++++++++++++++
include/crypto/internal/blake2s.h | 5 +
4 files changed, 195 insertions(+)
create mode 100644 crypto/blake2s_generic.c
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -639,6 +639,24 @@ config CRYPTO_XXHASH
xxHash non-cryptographic hash algorithm. Extremely fast, working at
speeds close to RAM limits.
+config CRYPTO_BLAKE2S
+ tristate "BLAKE2s digest algorithm"
+ select CRYPTO_LIB_BLAKE2S_GENERIC
+ select CRYPTO_HASH
+ help
+ Implementation of cryptographic hash function BLAKE2s
+ optimized for 8-32bit platforms and can produce digests of any size
+ between 1 to 32. The keyed hash is also implemented.
+
+ This module provides the following algorithms:
+
+ - blake2s-128
+ - blake2s-160
+ - blake2s-224
+ - blake2s-256
+
+ See https://blake2.net for further information.
+
config CRYPTO_CRCT10DIF
tristate "CRCT10DIF algorithm"
select CRYPTO_HASH
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -74,6 +74,7 @@ obj-$(CONFIG_CRYPTO_STREEBOG) += streebo
obj-$(CONFIG_CRYPTO_WP512) += wp512.o
CFLAGS_wp512.o := $(call cc-option,-fno-schedule-insns) # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79149
obj-$(CONFIG_CRYPTO_TGR192) += tgr192.o
+obj-$(CONFIG_CRYPTO_BLAKE2S) += blake2s_generic.o
obj-$(CONFIG_CRYPTO_GF128MUL) += gf128mul.o
obj-$(CONFIG_CRYPTO_ECB) += ecb.o
obj-$(CONFIG_CRYPTO_CBC) += cbc.o
--- /dev/null
+++ b/crypto/blake2s_generic.c
@@ -0,0 +1,171 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <crypto/internal/blake2s.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/hash.h>
+
+#include <linux/types.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm);
+
+ if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) {
+ crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+
+ memcpy(tctx->key, key, keylen);
+ tctx->keylen = keylen;
+
+ return 0;
+}
+
+static int crypto_blake2s_init(struct shash_desc *desc)
+{
+ struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+ struct blake2s_state *state = shash_desc_ctx(desc);
+ const int outlen = crypto_shash_digestsize(desc->tfm);
+
+ if (tctx->keylen)
+ blake2s_init_key(state, outlen, tctx->key, tctx->keylen);
+ else
+ blake2s_init(state, outlen);
+
+ return 0;
+}
+
+static int crypto_blake2s_update(struct shash_desc *desc, const u8 *in,
+ unsigned int inlen)
+{
+ struct blake2s_state *state = shash_desc_ctx(desc);
+ const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen;
+
+ if (unlikely(!inlen))
+ return 0;
+ if (inlen > fill) {
+ memcpy(state->buf + state->buflen, in, fill);
+ blake2s_compress_generic(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
+ state->buflen = 0;
+ in += fill;
+ inlen -= fill;
+ }
+ if (inlen > BLAKE2S_BLOCK_SIZE) {
+ const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
+ /* Hash one less (full) block than strictly possible */
+ blake2s_compress_generic(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
+ in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
+ inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
+ }
+ memcpy(state->buf + state->buflen, in, inlen);
+ state->buflen += inlen;
+
+ return 0;
+}
+
+static int crypto_blake2s_final(struct shash_desc *desc, u8 *out)
+{
+ struct blake2s_state *state = shash_desc_ctx(desc);
+
+ blake2s_set_lastblock(state);
+ memset(state->buf + state->buflen, 0,
+ BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
+ blake2s_compress_generic(state, state->buf, 1, state->buflen);
+ cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
+ memcpy(out, state->h, state->outlen);
+ memzero_explicit(state, sizeof(*state));
+
+ return 0;
+}
+
+static struct shash_alg blake2s_algs[] = {{
+ .base.cra_name = "blake2s-128",
+ .base.cra_driver_name = "blake2s-128-generic",
+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
+ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
+ .base.cra_priority = 200,
+ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+
+ .digestsize = BLAKE2S_128_HASH_SIZE,
+ .setkey = crypto_blake2s_setkey,
+ .init = crypto_blake2s_init,
+ .update = crypto_blake2s_update,
+ .final = crypto_blake2s_final,
+ .descsize = sizeof(struct blake2s_state),
+}, {
+ .base.cra_name = "blake2s-160",
+ .base.cra_driver_name = "blake2s-160-generic",
+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
+ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
+ .base.cra_priority = 200,
+ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+
+ .digestsize = BLAKE2S_160_HASH_SIZE,
+ .setkey = crypto_blake2s_setkey,
+ .init = crypto_blake2s_init,
+ .update = crypto_blake2s_update,
+ .final = crypto_blake2s_final,
+ .descsize = sizeof(struct blake2s_state),
+}, {
+ .base.cra_name = "blake2s-224",
+ .base.cra_driver_name = "blake2s-224-generic",
+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
+ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
+ .base.cra_priority = 200,
+ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+
+ .digestsize = BLAKE2S_224_HASH_SIZE,
+ .setkey = crypto_blake2s_setkey,
+ .init = crypto_blake2s_init,
+ .update = crypto_blake2s_update,
+ .final = crypto_blake2s_final,
+ .descsize = sizeof(struct blake2s_state),
+}, {
+ .base.cra_name = "blake2s-256",
+ .base.cra_driver_name = "blake2s-256-generic",
+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
+ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
+ .base.cra_priority = 200,
+ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+
+ .digestsize = BLAKE2S_256_HASH_SIZE,
+ .setkey = crypto_blake2s_setkey,
+ .init = crypto_blake2s_init,
+ .update = crypto_blake2s_update,
+ .final = crypto_blake2s_final,
+ .descsize = sizeof(struct blake2s_state),
+}};
+
+static int __init blake2s_mod_init(void)
+{
+ return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
+}
+
+static void __exit blake2s_mod_exit(void)
+{
+ crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
+}
+
+subsys_initcall(blake2s_mod_init);
+module_exit(blake2s_mod_exit);
+
+MODULE_ALIAS_CRYPTO("blake2s-128");
+MODULE_ALIAS_CRYPTO("blake2s-128-generic");
+MODULE_ALIAS_CRYPTO("blake2s-160");
+MODULE_ALIAS_CRYPTO("blake2s-160-generic");
+MODULE_ALIAS_CRYPTO("blake2s-224");
+MODULE_ALIAS_CRYPTO("blake2s-224-generic");
+MODULE_ALIAS_CRYPTO("blake2s-256");
+MODULE_ALIAS_CRYPTO("blake2s-256-generic");
+MODULE_LICENSE("GPL v2");
--- a/include/crypto/internal/blake2s.h
+++ b/include/crypto/internal/blake2s.h
@@ -5,6 +5,11 @@
#include <crypto/blake2s.h>
+struct blake2s_tfm_ctx {
+ u8 key[BLAKE2S_KEY_SIZE];
+ unsigned int keylen;
+};
+
void blake2s_compress_generic(struct blake2s_state *state,const u8 *block,
size_t nblocks, const u32 inc);

View File

@ -0,0 +1,557 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Fri, 8 Nov 2019 13:22:31 +0100
Subject: [PATCH] crypto: blake2s - x86_64 SIMD implementation
commit ed0356eda153f6a95649e11feb7b07083caf9e20 upstream.
These implementations from Samuel Neves support AVX and AVX-512VL.
Originally this used AVX-512F, but Skylake thermal throttling made
AVX-512VL more attractive and possible to do with negligible difference.
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
Co-developed-by: Samuel Neves <sneves@dei.uc.pt>
[ardb: move to arch/x86/crypto, wire into lib/crypto framework]
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/x86/crypto/Makefile | 2 +
arch/x86/crypto/blake2s-core.S | 258 +++++++++++++++++++++++++++++++++
arch/x86/crypto/blake2s-glue.c | 233 +++++++++++++++++++++++++++++
crypto/Kconfig | 6 +
4 files changed, 499 insertions(+)
create mode 100644 arch/x86/crypto/blake2s-core.S
create mode 100644 arch/x86/crypto/blake2s-glue.c
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -48,6 +48,7 @@ ifeq ($(avx_supported),yes)
obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
+ obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o
endif
# These modules require assembler to support AVX2.
@@ -70,6 +71,7 @@ serpent-sse2-x86_64-y := serpent-sse2-x8
aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
+blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
ifeq ($(avx_supported),yes)
camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
--- /dev/null
+++ b/arch/x86/crypto/blake2s-core.S
@@ -0,0 +1,258 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
+.align 32
+IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667
+ .octa 0x5BE0CD191F83D9AB9B05688C510E527F
+.section .rodata.cst16.ROT16, "aM", @progbits, 16
+.align 16
+ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302
+.section .rodata.cst16.ROR328, "aM", @progbits, 16
+.align 16
+ROR328: .octa 0x0C0F0E0D080B0A090407060500030201
+.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
+.align 64
+SIGMA:
+.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
+.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
+.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
+.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0
+.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8
+.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14
+.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2
+.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
+.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
+.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
+#ifdef CONFIG_AS_AVX512
+.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
+.align 64
+SIGMA2:
+.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
+.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
+.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
+.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
+.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
+.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
+.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
+.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
+.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
+.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
+#endif /* CONFIG_AS_AVX512 */
+
+.text
+#ifdef CONFIG_AS_SSSE3
+ENTRY(blake2s_compress_ssse3)
+ testq %rdx,%rdx
+ je .Lendofloop
+ movdqu (%rdi),%xmm0
+ movdqu 0x10(%rdi),%xmm1
+ movdqa ROT16(%rip),%xmm12
+ movdqa ROR328(%rip),%xmm13
+ movdqu 0x20(%rdi),%xmm14
+ movq %rcx,%xmm15
+ leaq SIGMA+0xa0(%rip),%r8
+ jmp .Lbeginofloop
+ .align 32
+.Lbeginofloop:
+ movdqa %xmm0,%xmm10
+ movdqa %xmm1,%xmm11
+ paddq %xmm15,%xmm14
+ movdqa IV(%rip),%xmm2
+ movdqa %xmm14,%xmm3
+ pxor IV+0x10(%rip),%xmm3
+ leaq SIGMA(%rip),%rcx
+.Lroundloop:
+ movzbl (%rcx),%eax
+ movd (%rsi,%rax,4),%xmm4
+ movzbl 0x1(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm5
+ movzbl 0x2(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm6
+ movzbl 0x3(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm7
+ punpckldq %xmm5,%xmm4
+ punpckldq %xmm7,%xmm6
+ punpcklqdq %xmm6,%xmm4
+ paddd %xmm4,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movzbl 0x4(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm5
+ movzbl 0x5(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm6
+ movzbl 0x6(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm7
+ movzbl 0x7(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm4
+ punpckldq %xmm6,%xmm5
+ punpckldq %xmm4,%xmm7
+ punpcklqdq %xmm7,%xmm5
+ paddd %xmm5,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x93,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x39,%xmm2,%xmm2
+ movzbl 0x8(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm6
+ movzbl 0x9(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm7
+ movzbl 0xa(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm4
+ movzbl 0xb(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm5
+ punpckldq %xmm7,%xmm6
+ punpckldq %xmm5,%xmm4
+ punpcklqdq %xmm4,%xmm6
+ paddd %xmm6,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movzbl 0xc(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm7
+ movzbl 0xd(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm4
+ movzbl 0xe(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm5
+ movzbl 0xf(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm6
+ punpckldq %xmm4,%xmm7
+ punpckldq %xmm6,%xmm5
+ punpcklqdq %xmm5,%xmm7
+ paddd %xmm7,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x93,%xmm2,%xmm2
+ addq $0x10,%rcx
+ cmpq %r8,%rcx
+ jnz .Lroundloop
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm1
+ pxor %xmm10,%xmm0
+ pxor %xmm11,%xmm1
+ addq $0x40,%rsi
+ decq %rdx
+ jnz .Lbeginofloop
+ movdqu %xmm0,(%rdi)
+ movdqu %xmm1,0x10(%rdi)
+ movdqu %xmm14,0x20(%rdi)
+.Lendofloop:
+ ret
+ENDPROC(blake2s_compress_ssse3)
+#endif /* CONFIG_AS_SSSE3 */
+
+#ifdef CONFIG_AS_AVX512
+ENTRY(blake2s_compress_avx512)
+ vmovdqu (%rdi),%xmm0
+ vmovdqu 0x10(%rdi),%xmm1
+ vmovdqu 0x20(%rdi),%xmm4
+ vmovq %rcx,%xmm5
+ vmovdqa IV(%rip),%xmm14
+ vmovdqa IV+16(%rip),%xmm15
+ jmp .Lblake2s_compress_avx512_mainloop
+.align 32
+.Lblake2s_compress_avx512_mainloop:
+ vmovdqa %xmm0,%xmm10
+ vmovdqa %xmm1,%xmm11
+ vpaddq %xmm5,%xmm4,%xmm4
+ vmovdqa %xmm14,%xmm2
+ vpxor %xmm15,%xmm4,%xmm3
+ vmovdqu (%rsi),%ymm6
+ vmovdqu 0x20(%rsi),%ymm7
+ addq $0x40,%rsi
+ leaq SIGMA2(%rip),%rax
+ movb $0xa,%cl
+.Lblake2s_compress_avx512_roundloop:
+ addq $0x40,%rax
+ vmovdqa -0x40(%rax),%ymm8
+ vmovdqa -0x20(%rax),%ymm9
+ vpermi2d %ymm7,%ymm6,%ymm8
+ vpermi2d %ymm7,%ymm6,%ymm9
+ vmovdqa %ymm8,%ymm6
+ vmovdqa %ymm9,%ymm7
+ vpaddd %xmm8,%xmm0,%xmm0
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+ vprord $0x10,%xmm3,%xmm3
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+ vprord $0xc,%xmm1,%xmm1
+ vextracti128 $0x1,%ymm8,%xmm8
+ vpaddd %xmm8,%xmm0,%xmm0
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+ vprord $0x8,%xmm3,%xmm3
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+ vprord $0x7,%xmm1,%xmm1
+ vpshufd $0x93,%xmm0,%xmm0
+ vpshufd $0x4e,%xmm3,%xmm3
+ vpshufd $0x39,%xmm2,%xmm2
+ vpaddd %xmm9,%xmm0,%xmm0
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+ vprord $0x10,%xmm3,%xmm3
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+ vprord $0xc,%xmm1,%xmm1
+ vextracti128 $0x1,%ymm9,%xmm9
+ vpaddd %xmm9,%xmm0,%xmm0
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+ vprord $0x8,%xmm3,%xmm3
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+ vprord $0x7,%xmm1,%xmm1
+ vpshufd $0x39,%xmm0,%xmm0
+ vpshufd $0x4e,%xmm3,%xmm3
+ vpshufd $0x93,%xmm2,%xmm2
+ decb %cl
+ jne .Lblake2s_compress_avx512_roundloop
+ vpxor %xmm10,%xmm0,%xmm0
+ vpxor %xmm11,%xmm1,%xmm1
+ vpxor %xmm2,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ decq %rdx
+ jne .Lblake2s_compress_avx512_mainloop
+ vmovdqu %xmm0,(%rdi)
+ vmovdqu %xmm1,0x10(%rdi)
+ vmovdqu %xmm4,0x20(%rdi)
+ vzeroupper
+ retq
+ENDPROC(blake2s_compress_avx512)
+#endif /* CONFIG_AS_AVX512 */
--- /dev/null
+++ b/arch/x86/crypto/blake2s-glue.c
@@ -0,0 +1,233 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <crypto/internal/blake2s.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/hash.h>
+
+#include <linux/types.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/cpufeature.h>
+#include <asm/fpu/api.h>
+#include <asm/processor.h>
+#include <asm/simd.h>
+
+asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
+ const u8 *block, const size_t nblocks,
+ const u32 inc);
+asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
+ const u8 *block, const size_t nblocks,
+ const u32 inc);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
+
+void blake2s_compress_arch(struct blake2s_state *state,
+ const u8 *block, size_t nblocks,
+ const u32 inc)
+{
+ /* SIMD disables preemption, so relax after processing each page. */
+ BUILD_BUG_ON(PAGE_SIZE / BLAKE2S_BLOCK_SIZE < 8);
+
+ if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) {
+ blake2s_compress_generic(state, block, nblocks, inc);
+ return;
+ }
+
+ for (;;) {
+ const size_t blocks = min_t(size_t, nblocks,
+ PAGE_SIZE / BLAKE2S_BLOCK_SIZE);
+
+ kernel_fpu_begin();
+ if (IS_ENABLED(CONFIG_AS_AVX512) &&
+ static_branch_likely(&blake2s_use_avx512))
+ blake2s_compress_avx512(state, block, blocks, inc);
+ else
+ blake2s_compress_ssse3(state, block, blocks, inc);
+ kernel_fpu_end();
+
+ nblocks -= blocks;
+ if (!nblocks)
+ break;
+ block += blocks * BLAKE2S_BLOCK_SIZE;
+ }
+}
+EXPORT_SYMBOL(blake2s_compress_arch);
+
+static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm);
+
+ if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) {
+ crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+
+ memcpy(tctx->key, key, keylen);
+ tctx->keylen = keylen;
+
+ return 0;
+}
+
+static int crypto_blake2s_init(struct shash_desc *desc)
+{
+ struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+ struct blake2s_state *state = shash_desc_ctx(desc);
+ const int outlen = crypto_shash_digestsize(desc->tfm);
+
+ if (tctx->keylen)
+ blake2s_init_key(state, outlen, tctx->key, tctx->keylen);
+ else
+ blake2s_init(state, outlen);
+
+ return 0;
+}
+
+static int crypto_blake2s_update(struct shash_desc *desc, const u8 *in,
+ unsigned int inlen)
+{
+ struct blake2s_state *state = shash_desc_ctx(desc);
+ const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen;
+
+ if (unlikely(!inlen))
+ return 0;
+ if (inlen > fill) {
+ memcpy(state->buf + state->buflen, in, fill);
+ blake2s_compress_arch(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
+ state->buflen = 0;
+ in += fill;
+ inlen -= fill;
+ }
+ if (inlen > BLAKE2S_BLOCK_SIZE) {
+ const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
+ /* Hash one less (full) block than strictly possible */
+ blake2s_compress_arch(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
+ in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
+ inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
+ }
+ memcpy(state->buf + state->buflen, in, inlen);
+ state->buflen += inlen;
+
+ return 0;
+}
+
+static int crypto_blake2s_final(struct shash_desc *desc, u8 *out)
+{
+ struct blake2s_state *state = shash_desc_ctx(desc);
+
+ blake2s_set_lastblock(state);
+ memset(state->buf + state->buflen, 0,
+ BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
+ blake2s_compress_arch(state, state->buf, 1, state->buflen);
+ cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
+ memcpy(out, state->h, state->outlen);
+ memzero_explicit(state, sizeof(*state));
+
+ return 0;
+}
+
+static struct shash_alg blake2s_algs[] = {{
+ .base.cra_name = "blake2s-128",
+ .base.cra_driver_name = "blake2s-128-x86",
+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
+ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
+ .base.cra_priority = 200,
+ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+
+ .digestsize = BLAKE2S_128_HASH_SIZE,
+ .setkey = crypto_blake2s_setkey,
+ .init = crypto_blake2s_init,
+ .update = crypto_blake2s_update,
+ .final = crypto_blake2s_final,
+ .descsize = sizeof(struct blake2s_state),
+}, {
+ .base.cra_name = "blake2s-160",
+ .base.cra_driver_name = "blake2s-160-x86",
+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
+ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
+ .base.cra_priority = 200,
+ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+
+ .digestsize = BLAKE2S_160_HASH_SIZE,
+ .setkey = crypto_blake2s_setkey,
+ .init = crypto_blake2s_init,
+ .update = crypto_blake2s_update,
+ .final = crypto_blake2s_final,
+ .descsize = sizeof(struct blake2s_state),
+}, {
+ .base.cra_name = "blake2s-224",
+ .base.cra_driver_name = "blake2s-224-x86",
+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
+ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
+ .base.cra_priority = 200,
+ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+
+ .digestsize = BLAKE2S_224_HASH_SIZE,
+ .setkey = crypto_blake2s_setkey,
+ .init = crypto_blake2s_init,
+ .update = crypto_blake2s_update,
+ .final = crypto_blake2s_final,
+ .descsize = sizeof(struct blake2s_state),
+}, {
+ .base.cra_name = "blake2s-256",
+ .base.cra_driver_name = "blake2s-256-x86",
+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
+ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
+ .base.cra_priority = 200,
+ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+
+ .digestsize = BLAKE2S_256_HASH_SIZE,
+ .setkey = crypto_blake2s_setkey,
+ .init = crypto_blake2s_init,
+ .update = crypto_blake2s_update,
+ .final = crypto_blake2s_final,
+ .descsize = sizeof(struct blake2s_state),
+}};
+
+static int __init blake2s_mod_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_SSSE3))
+ return 0;
+
+ static_branch_enable(&blake2s_use_ssse3);
+
+ if (IS_ENABLED(CONFIG_AS_AVX512) &&
+ boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_AVX512F) &&
+ boot_cpu_has(X86_FEATURE_AVX512VL) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
+ XFEATURE_MASK_AVX512, NULL))
+ static_branch_enable(&blake2s_use_avx512);
+
+ return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
+}
+
+static void __exit blake2s_mod_exit(void)
+{
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
+}
+
+module_init(blake2s_mod_init);
+module_exit(blake2s_mod_exit);
+
+MODULE_ALIAS_CRYPTO("blake2s-128");
+MODULE_ALIAS_CRYPTO("blake2s-128-x86");
+MODULE_ALIAS_CRYPTO("blake2s-160");
+MODULE_ALIAS_CRYPTO("blake2s-160-x86");
+MODULE_ALIAS_CRYPTO("blake2s-224");
+MODULE_ALIAS_CRYPTO("blake2s-224-x86");
+MODULE_ALIAS_CRYPTO("blake2s-256");
+MODULE_ALIAS_CRYPTO("blake2s-256-x86");
+MODULE_LICENSE("GPL v2");
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -657,6 +657,12 @@ config CRYPTO_BLAKE2S
See https://blake2.net for further information.
+config CRYPTO_BLAKE2S_X86
+ tristate "BLAKE2s digest algorithm (x86 accelerated version)"
+ depends on X86 && 64BIT
+ select CRYPTO_LIB_BLAKE2S_GENERIC
+ select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
+
config CRYPTO_CRCT10DIF
tristate "CRCT10DIF algorithm"
select CRYPTO_HASH

View File

@ -0,0 +1,136 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 8 Nov 2019 13:22:34 +0100
Subject: [PATCH] crypto: curve25519 - implement generic KPP driver
commit ee772cb641135739c1530647391d5a04c39db192 upstream.
Expose the generic Curve25519 library via the crypto API KPP interface.
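As a rough usage sketch (not part of the upstream patch), a kernel-side caller could derive a public key through this KPP driver along the following lines; the helper name is made up, error paths are abbreviated, and the key buffers are assumed to be heap-allocated since they are mapped through a scatterlist:
#include <crypto/curve25519.h>
#include <crypto/kpp.h>
#include <linux/crypto.h>
#include <linux/scatterlist.h>

/* Hypothetical helper: compute public = secret * basepoint via "curve25519". */
static int demo_curve25519_pubkey(const u8 *secret, u8 *public_key)
{
	struct crypto_kpp *tfm;
	struct kpp_request *req;
	struct scatterlist dst;
	DECLARE_CRYPTO_WAIT(wait);
	int err;

	tfm = crypto_alloc_kpp("curve25519", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_kpp_set_secret(tfm, secret, CURVE25519_KEY_SIZE);
	if (err)
		goto out_tfm;

	req = kpp_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		err = -ENOMEM;
		goto out_tfm;
	}

	sg_init_one(&dst, public_key, CURVE25519_KEY_SIZE);
	/* No src scatterlist: the driver falls back to the base point. */
	kpp_request_set_input(req, NULL, 0);
	kpp_request_set_output(req, &dst, CURVE25519_KEY_SIZE);
	kpp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
				 crypto_req_done, &wait);

	err = crypto_wait_req(crypto_kpp_generate_public_key(req), &wait);

	kpp_request_free(req);
out_tfm:
	crypto_free_kpp(tfm);
	return err;
}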
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
crypto/Kconfig | 5 +++
crypto/Makefile | 1 +
crypto/curve25519-generic.c | 90 +++++++++++++++++++++++++++++++++++++
3 files changed, 96 insertions(+)
create mode 100644 crypto/curve25519-generic.c
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -264,6 +264,11 @@ config CRYPTO_ECRDSA
standard algorithms (called GOST algorithms). Only signature verification
is implemented.
+config CRYPTO_CURVE25519
+ tristate "Curve25519 algorithm"
+ select CRYPTO_KPP
+ select CRYPTO_LIB_CURVE25519_GENERIC
+
comment "Authenticated Encryption with Associated Data"
config CRYPTO_CCM
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -167,6 +167,7 @@ obj-$(CONFIG_CRYPTO_ZSTD) += zstd.o
obj-$(CONFIG_CRYPTO_OFB) += ofb.o
obj-$(CONFIG_CRYPTO_ECC) += ecc.o
obj-$(CONFIG_CRYPTO_ESSIV) += essiv.o
+obj-$(CONFIG_CRYPTO_CURVE25519) += curve25519-generic.o
ecdh_generic-y += ecdh.o
ecdh_generic-y += ecdh_helper.o
--- /dev/null
+++ b/crypto/curve25519-generic.c
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <crypto/curve25519.h>
+#include <crypto/internal/kpp.h>
+#include <crypto/kpp.h>
+#include <linux/module.h>
+#include <linux/scatterlist.h>
+
+static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
+ unsigned int len)
+{
+ u8 *secret = kpp_tfm_ctx(tfm);
+
+ if (!len)
+ curve25519_generate_secret(secret);
+ else if (len == CURVE25519_KEY_SIZE &&
+ crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
+ memcpy(secret, buf, CURVE25519_KEY_SIZE);
+ else
+ return -EINVAL;
+ return 0;
+}
+
+static int curve25519_compute_value(struct kpp_request *req)
+{
+ struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
+ const u8 *secret = kpp_tfm_ctx(tfm);
+ u8 public_key[CURVE25519_KEY_SIZE];
+ u8 buf[CURVE25519_KEY_SIZE];
+ int copied, nbytes;
+ u8 const *bp;
+
+ if (req->src) {
+ copied = sg_copy_to_buffer(req->src,
+ sg_nents_for_len(req->src,
+ CURVE25519_KEY_SIZE),
+ public_key, CURVE25519_KEY_SIZE);
+ if (copied != CURVE25519_KEY_SIZE)
+ return -EINVAL;
+ bp = public_key;
+ } else {
+ bp = curve25519_base_point;
+ }
+
+ curve25519_generic(buf, secret, bp);
+
+ /* might want less than we've got */
+ nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
+ copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
+ nbytes),
+ buf, nbytes);
+ if (copied != nbytes)
+ return -EINVAL;
+ return 0;
+}
+
+static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
+{
+ return CURVE25519_KEY_SIZE;
+}
+
+static struct kpp_alg curve25519_alg = {
+ .base.cra_name = "curve25519",
+ .base.cra_driver_name = "curve25519-generic",
+ .base.cra_priority = 100,
+ .base.cra_module = THIS_MODULE,
+ .base.cra_ctxsize = CURVE25519_KEY_SIZE,
+
+ .set_secret = curve25519_set_secret,
+ .generate_public_key = curve25519_compute_value,
+ .compute_shared_secret = curve25519_compute_value,
+ .max_size = curve25519_max_size,
+};
+
+static int curve25519_init(void)
+{
+ return crypto_register_kpp(&curve25519_alg);
+}
+
+static void curve25519_exit(void)
+{
+ crypto_unregister_kpp(&curve25519_alg);
+}
+
+subsys_initcall(curve25519_init);
+module_exit(curve25519_exit);
+
+MODULE_ALIAS_CRYPTO("curve25519");
+MODULE_ALIAS_CRYPTO("curve25519-generic");
+MODULE_LICENSE("GPL");

View File

@ -0,0 +1,75 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 8 Nov 2019 13:22:35 +0100
Subject: [PATCH] crypto: lib/curve25519 - work around Clang stack spilling
issue
commit 660bb8e1f833ea63185fe80fde847e3e42f18e3b upstream.
Arnd reports that the 32-bit generic library code for Curve25519 ends
up using an excessive amount of stack space when built with Clang:
lib/crypto/curve25519-fiat32.c:756:6: error: stack frame size
of 1384 bytes in function 'curve25519_generic'
[-Werror,-Wframe-larger-than=]
Let's give some hints to the compiler regarding which routines should
not be inlined, to prevent it from running out of registers and spilling
to the stack. The resulting code performs identically under both GCC
and Clang, and makes the warning go away.
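The hint in question is the noinline attribute on the big field-arithmetic helpers. A minimal, self-contained sketch of the idea (illustrative names and throwaway arithmetic, not the fiat32 code):
#include <stdint.h>

#define noinline __attribute__((noinline))

/*
 * Stand-in for a register-hungry helper like fe_mul_impl(): many wide
 * temporaries. Keeping it out of line gives it its own stack frame instead
 * of spilling those temporaries into every caller it would be inlined into.
 */
static noinline void wide_helper(uint32_t out[10], const uint32_t in1[10],
				 const uint32_t in2[10])
{
	uint64_t tmp[10];
	int i;

	for (i = 0; i < 10; i++)
		tmp[i] = (uint64_t)in1[i] * in2[i] + in2[(i + 1) % 10];
	for (i = 0; i < 10; i++)
		out[i] = (uint32_t)(tmp[i] ^ (tmp[i] >> 32));
}

/* Thin wrappers can stay inlined; only the heavy body is kept out of line. */
static inline void wide_sqr(uint32_t out[10], const uint32_t a[10])
{
	wide_helper(out, a, a);
}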
Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
lib/crypto/curve25519-fiat32.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
--- a/lib/crypto/curve25519-fiat32.c
+++ b/lib/crypto/curve25519-fiat32.c
@@ -223,7 +223,7 @@ static __always_inline void fe_1(fe *h)
h->v[0] = 1;
}
-static void fe_add_impl(u32 out[10], const u32 in1[10], const u32 in2[10])
+static noinline void fe_add_impl(u32 out[10], const u32 in1[10], const u32 in2[10])
{
{ const u32 x20 = in1[9];
{ const u32 x21 = in1[8];
@@ -266,7 +266,7 @@ static __always_inline void fe_add(fe_lo
fe_add_impl(h->v, f->v, g->v);
}
-static void fe_sub_impl(u32 out[10], const u32 in1[10], const u32 in2[10])
+static noinline void fe_sub_impl(u32 out[10], const u32 in1[10], const u32 in2[10])
{
{ const u32 x20 = in1[9];
{ const u32 x21 = in1[8];
@@ -309,7 +309,7 @@ static __always_inline void fe_sub(fe_lo
fe_sub_impl(h->v, f->v, g->v);
}
-static void fe_mul_impl(u32 out[10], const u32 in1[10], const u32 in2[10])
+static noinline void fe_mul_impl(u32 out[10], const u32 in1[10], const u32 in2[10])
{
{ const u32 x20 = in1[9];
{ const u32 x21 = in1[8];
@@ -441,7 +441,7 @@ fe_mul_tll(fe *h, const fe_loose *f, con
fe_mul_impl(h->v, f->v, g->v);
}
-static void fe_sqr_impl(u32 out[10], const u32 in1[10])
+static noinline void fe_sqr_impl(u32 out[10], const u32 in1[10])
{
{ const u32 x17 = in1[9];
{ const u32 x18 = in1[8];
@@ -619,7 +619,7 @@ static __always_inline void fe_invert(fe
*
* Preconditions: b in {0,1}
*/
-static __always_inline void fe_cswap(fe *f, fe *g, unsigned int b)
+static noinline void fe_cswap(fe *f, fe *g, unsigned int b)
{
unsigned i;
b = 0 - b;

View File

@ -0,0 +1,295 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 8 Nov 2019 13:22:40 +0100
Subject: [PATCH] crypto: lib/chacha20poly1305 - reimplement crypt_from_sg()
routine
commit d95312a3ccc0cd544d374be2fc45aeaa803e5fd9 upstream.
Reimplement the library routines to perform chacha20poly1305 en/decryption
on scatterlists, without [ab]using the [deprecated] blkcipher interface,
which is rather heavyweight and does things we don't really need.
Instead, we use the sg_miter API in a novel and clever way, to iterate
over the scatterlist in-place (i.e., source == destination, which is the
only way this library is expected to be used). That way, we don't have to
iterate over two scatterlists in parallel.
Another optimization is that, instead of relying on the blkcipher walker
to present the input in suitable chunks, we recognize that ChaCha is a
stream cipher, and so we can simply deal with partial blocks by keeping a
block of cipherstream on the stack and use crypto_xor() to mix it with
the in/output.
Finally, we omit the scatterwalk_map_and_copy() call if the last element of
the scatterlist covers the MAC as well (which is the common case),
avoiding the need to walk the scatterlist and kmap() the page twice.
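To illustrate the partial-block bookkeeping described above outside the kernel, here is a generic sketch (not the patch's code, which uses chacha20_crypt()/crypto_xor() under the sg_miter walk; the stand-in generator below is NOT ChaCha20):
#include <stddef.h>
#include <stdint.h>

#define BLOCK_SIZE 64

/* Stand-in keystream generator (NOT ChaCha20), just enough to run the demo. */
static void keystream_block(uint8_t stream[BLOCK_SIZE], uint64_t *counter)
{
	for (size_t i = 0; i < BLOCK_SIZE; i++)
		stream[i] = (uint8_t)((*counter * 0x9E3779B97F4A7C15ull) >> (8 * (i % 8)));
	(*counter)++;
}

/*
 * XOR 'len' bytes of 'data' in place with the keystream, carrying a partially
 * consumed block across calls via *partial -- the same trick the sg version
 * uses to glue scatterlist elements together without a block-aligned walker.
 */
static void stream_xor_inplace(uint64_t *counter, uint8_t stream[BLOCK_SIZE],
			       size_t *partial, uint8_t *data, size_t len)
{
	size_t i, l;

	if (*partial) {			/* leftover keystream from last call */
		l = BLOCK_SIZE - *partial;
		if (l > len)
			l = len;
		for (i = 0; i < l; i++)
			data[i] ^= stream[*partial + i];
		*partial = (*partial + l) % BLOCK_SIZE;
		data += l;
		len -= l;
	}

	while (len >= BLOCK_SIZE) {	/* full blocks */
		keystream_block(stream, counter);
		for (i = 0; i < BLOCK_SIZE; i++)
			data[i] ^= stream[i];
		data += BLOCK_SIZE;
		len -= BLOCK_SIZE;
	}

	if (len) {			/* tail: remember how much was used */
		keystream_block(stream, counter);
		for (i = 0; i < len; i++)
			data[i] ^= stream[i];
		*partial = len;
	}
}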
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
include/crypto/chacha20poly1305.h | 11 ++
lib/crypto/chacha20poly1305-selftest.c | 45 ++++++++
lib/crypto/chacha20poly1305.c | 150 +++++++++++++++++++++++++
3 files changed, 206 insertions(+)
--- a/include/crypto/chacha20poly1305.h
+++ b/include/crypto/chacha20poly1305.h
@@ -7,6 +7,7 @@
#define __CHACHA20POLY1305_H
#include <linux/types.h>
+#include <linux/scatterlist.h>
enum chacha20poly1305_lengths {
XCHACHA20POLY1305_NONCE_SIZE = 24,
@@ -34,4 +35,14 @@ bool __must_check xchacha20poly1305_decr
const size_t ad_len, const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE],
const u8 key[CHACHA20POLY1305_KEY_SIZE]);
+bool chacha20poly1305_encrypt_sg_inplace(struct scatterlist *src, size_t src_len,
+ const u8 *ad, const size_t ad_len,
+ const u64 nonce,
+ const u8 key[CHACHA20POLY1305_KEY_SIZE]);
+
+bool chacha20poly1305_decrypt_sg_inplace(struct scatterlist *src, size_t src_len,
+ const u8 *ad, const size_t ad_len,
+ const u64 nonce,
+ const u8 key[CHACHA20POLY1305_KEY_SIZE]);
+
#endif /* __CHACHA20POLY1305_H */
--- a/lib/crypto/chacha20poly1305-selftest.c
+++ b/lib/crypto/chacha20poly1305-selftest.c
@@ -7250,6 +7250,7 @@ bool __init chacha20poly1305_selftest(vo
enum { MAXIMUM_TEST_BUFFER_LEN = 1UL << 12 };
size_t i;
u8 *computed_output = NULL, *heap_src = NULL;
+ struct scatterlist sg_src;
bool success = true, ret;
heap_src = kmalloc(MAXIMUM_TEST_BUFFER_LEN, GFP_KERNEL);
@@ -7280,6 +7281,29 @@ bool __init chacha20poly1305_selftest(vo
}
}
+ for (i = 0; i < ARRAY_SIZE(chacha20poly1305_enc_vectors); ++i) {
+ if (chacha20poly1305_enc_vectors[i].nlen != 8)
+ continue;
+ memcpy(heap_src, chacha20poly1305_enc_vectors[i].input,
+ chacha20poly1305_enc_vectors[i].ilen);
+ sg_init_one(&sg_src, heap_src,
+ chacha20poly1305_enc_vectors[i].ilen + POLY1305_DIGEST_SIZE);
+ chacha20poly1305_encrypt_sg_inplace(&sg_src,
+ chacha20poly1305_enc_vectors[i].ilen,
+ chacha20poly1305_enc_vectors[i].assoc,
+ chacha20poly1305_enc_vectors[i].alen,
+ get_unaligned_le64(chacha20poly1305_enc_vectors[i].nonce),
+ chacha20poly1305_enc_vectors[i].key);
+ if (memcmp(heap_src,
+ chacha20poly1305_enc_vectors[i].output,
+ chacha20poly1305_enc_vectors[i].ilen +
+ POLY1305_DIGEST_SIZE)) {
+ pr_err("chacha20poly1305 sg encryption self-test %zu: FAIL\n",
+ i + 1);
+ success = false;
+ }
+ }
+
for (i = 0; i < ARRAY_SIZE(chacha20poly1305_dec_vectors); ++i) {
memset(computed_output, 0, MAXIMUM_TEST_BUFFER_LEN);
ret = chacha20poly1305_decrypt(computed_output,
@@ -7301,6 +7325,27 @@ bool __init chacha20poly1305_selftest(vo
}
}
+ for (i = 0; i < ARRAY_SIZE(chacha20poly1305_dec_vectors); ++i) {
+ memcpy(heap_src, chacha20poly1305_dec_vectors[i].input,
+ chacha20poly1305_dec_vectors[i].ilen);
+ sg_init_one(&sg_src, heap_src,
+ chacha20poly1305_dec_vectors[i].ilen);
+ ret = chacha20poly1305_decrypt_sg_inplace(&sg_src,
+ chacha20poly1305_dec_vectors[i].ilen,
+ chacha20poly1305_dec_vectors[i].assoc,
+ chacha20poly1305_dec_vectors[i].alen,
+ get_unaligned_le64(chacha20poly1305_dec_vectors[i].nonce),
+ chacha20poly1305_dec_vectors[i].key);
+ if (!decryption_success(ret,
+ chacha20poly1305_dec_vectors[i].failure,
+ memcmp(heap_src, chacha20poly1305_dec_vectors[i].output,
+ chacha20poly1305_dec_vectors[i].ilen -
+ POLY1305_DIGEST_SIZE))) {
+ pr_err("chacha20poly1305 sg decryption self-test %zu: FAIL\n",
+ i + 1);
+ success = false;
+ }
+ }
for (i = 0; i < ARRAY_SIZE(xchacha20poly1305_enc_vectors); ++i) {
memset(computed_output, 0, MAXIMUM_TEST_BUFFER_LEN);
--- a/lib/crypto/chacha20poly1305.c
+++ b/lib/crypto/chacha20poly1305.c
@@ -11,6 +11,7 @@
#include <crypto/chacha20poly1305.h>
#include <crypto/chacha.h>
#include <crypto/poly1305.h>
+#include <crypto/scatterwalk.h>
#include <asm/unaligned.h>
#include <linux/kernel.h>
@@ -205,6 +206,155 @@ bool xchacha20poly1305_decrypt(u8 *dst,
}
EXPORT_SYMBOL(xchacha20poly1305_decrypt);
+static
+bool chacha20poly1305_crypt_sg_inplace(struct scatterlist *src,
+ const size_t src_len,
+ const u8 *ad, const size_t ad_len,
+ const u64 nonce,
+ const u8 key[CHACHA20POLY1305_KEY_SIZE],
+ int encrypt)
+{
+ const u8 *pad0 = page_address(ZERO_PAGE(0));
+ struct poly1305_desc_ctx poly1305_state;
+ u32 chacha_state[CHACHA_STATE_WORDS];
+ struct sg_mapping_iter miter;
+ size_t partial = 0;
+ unsigned int flags;
+ bool ret = true;
+ int sl;
+ union {
+ struct {
+ u32 k[CHACHA_KEY_WORDS];
+ __le64 iv[2];
+ };
+ u8 block0[POLY1305_KEY_SIZE];
+ u8 chacha_stream[CHACHA_BLOCK_SIZE];
+ struct {
+ u8 mac[2][POLY1305_DIGEST_SIZE];
+ };
+ __le64 lens[2];
+ } b __aligned(16);
+
+ chacha_load_key(b.k, key);
+
+ b.iv[0] = 0;
+ b.iv[1] = cpu_to_le64(nonce);
+
+ chacha_init(chacha_state, b.k, (u8 *)b.iv);
+ chacha_crypt(chacha_state, b.block0, pad0, sizeof(b.block0), 20);
+ poly1305_init(&poly1305_state, b.block0);
+
+ if (unlikely(ad_len)) {
+ poly1305_update(&poly1305_state, ad, ad_len);
+ if (ad_len & 0xf)
+ poly1305_update(&poly1305_state, pad0, 0x10 - (ad_len & 0xf));
+ }
+
+ flags = SG_MITER_TO_SG;
+ if (!preemptible())
+ flags |= SG_MITER_ATOMIC;
+
+ sg_miter_start(&miter, src, sg_nents(src), flags);
+
+ for (sl = src_len; sl > 0 && sg_miter_next(&miter); sl -= miter.length) {
+ u8 *addr = miter.addr;
+ size_t length = min_t(size_t, sl, miter.length);
+
+ if (!encrypt)
+ poly1305_update(&poly1305_state, addr, length);
+
+ if (unlikely(partial)) {
+ size_t l = min(length, CHACHA_BLOCK_SIZE - partial);
+
+ crypto_xor(addr, b.chacha_stream + partial, l);
+ partial = (partial + l) & (CHACHA_BLOCK_SIZE - 1);
+
+ addr += l;
+ length -= l;
+ }
+
+ if (likely(length >= CHACHA_BLOCK_SIZE || length == sl)) {
+ size_t l = length;
+
+ if (unlikely(length < sl))
+ l &= ~(CHACHA_BLOCK_SIZE - 1);
+ chacha_crypt(chacha_state, addr, addr, l, 20);
+ addr += l;
+ length -= l;
+ }
+
+ if (unlikely(length > 0)) {
+ chacha_crypt(chacha_state, b.chacha_stream, pad0,
+ CHACHA_BLOCK_SIZE, 20);
+ crypto_xor(addr, b.chacha_stream, length);
+ partial = length;
+ }
+
+ if (encrypt)
+ poly1305_update(&poly1305_state, miter.addr,
+ min_t(size_t, sl, miter.length));
+ }
+
+ if (src_len & 0xf)
+ poly1305_update(&poly1305_state, pad0, 0x10 - (src_len & 0xf));
+
+ b.lens[0] = cpu_to_le64(ad_len);
+ b.lens[1] = cpu_to_le64(src_len);
+ poly1305_update(&poly1305_state, (u8 *)b.lens, sizeof(b.lens));
+
+ if (likely(sl <= -POLY1305_DIGEST_SIZE)) {
+ if (encrypt) {
+ poly1305_final(&poly1305_state,
+ miter.addr + miter.length + sl);
+ ret = true;
+ } else {
+ poly1305_final(&poly1305_state, b.mac[0]);
+ ret = !crypto_memneq(b.mac[0],
+ miter.addr + miter.length + sl,
+ POLY1305_DIGEST_SIZE);
+ }
+ }
+
+ sg_miter_stop(&miter);
+
+ if (unlikely(sl > -POLY1305_DIGEST_SIZE)) {
+ poly1305_final(&poly1305_state, b.mac[1]);
+ scatterwalk_map_and_copy(b.mac[encrypt], src, src_len,
+ sizeof(b.mac[1]), encrypt);
+ ret = encrypt ||
+ !crypto_memneq(b.mac[0], b.mac[1], POLY1305_DIGEST_SIZE);
+ }
+
+ memzero_explicit(chacha_state, sizeof(chacha_state));
+ memzero_explicit(&b, sizeof(b));
+
+ return ret;
+}
+
+bool chacha20poly1305_encrypt_sg_inplace(struct scatterlist *src, size_t src_len,
+ const u8 *ad, const size_t ad_len,
+ const u64 nonce,
+ const u8 key[CHACHA20POLY1305_KEY_SIZE])
+{
+ return chacha20poly1305_crypt_sg_inplace(src, src_len, ad, ad_len,
+ nonce, key, 1);
+}
+EXPORT_SYMBOL(chacha20poly1305_encrypt_sg_inplace);
+
+bool chacha20poly1305_decrypt_sg_inplace(struct scatterlist *src, size_t src_len,
+ const u8 *ad, const size_t ad_len,
+ const u64 nonce,
+ const u8 key[CHACHA20POLY1305_KEY_SIZE])
+{
+ if (unlikely(src_len < POLY1305_DIGEST_SIZE))
+ return false;
+
+ return chacha20poly1305_crypt_sg_inplace(src,
+ src_len - POLY1305_DIGEST_SIZE,
+ ad, ad_len, nonce, key, 0);
+}
+EXPORT_SYMBOL(chacha20poly1305_decrypt_sg_inplace);
+
static int __init mod_init(void)
{
if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) &&

View File

@ -0,0 +1,68 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 17 Nov 2019 23:21:29 -0800
Subject: [PATCH] crypto: chacha_generic - remove unnecessary setkey()
functions
commit 2043323a799a660bc84bbee404cf7a2617ec6157 upstream.
Use chacha20_setkey() and chacha12_setkey() from
<crypto/internal/chacha.h> instead of defining them again in
chacha_generic.c.
Signed-off-by: Eric Biggers <ebiggers@google.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
crypto/chacha_generic.c | 18 +++---------------
1 file changed, 3 insertions(+), 15 deletions(-)
--- a/crypto/chacha_generic.c
+++ b/crypto/chacha_generic.c
@@ -37,18 +37,6 @@ static int chacha_stream_xor(struct skci
return err;
}
-static int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keysize)
-{
- return chacha_setkey(tfm, key, keysize, 20);
-}
-
-static int crypto_chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keysize)
-{
- return chacha_setkey(tfm, key, keysize, 12);
-}
-
static int crypto_chacha_crypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
@@ -91,7 +79,7 @@ static struct skcipher_alg algs[] = {
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
- .setkey = crypto_chacha20_setkey,
+ .setkey = chacha20_setkey,
.encrypt = crypto_chacha_crypt,
.decrypt = crypto_chacha_crypt,
}, {
@@ -106,7 +94,7 @@ static struct skcipher_alg algs[] = {
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
- .setkey = crypto_chacha20_setkey,
+ .setkey = chacha20_setkey,
.encrypt = crypto_xchacha_crypt,
.decrypt = crypto_xchacha_crypt,
}, {
@@ -121,7 +109,7 @@ static struct skcipher_alg algs[] = {
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
- .setkey = crypto_chacha12_setkey,
+ .setkey = chacha12_setkey,
.encrypt = crypto_xchacha_crypt,
.decrypt = crypto_xchacha_crypt,
}

View File

@ -0,0 +1,31 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 17 Nov 2019 23:21:58 -0800
Subject: [PATCH] crypto: x86/chacha - only unregister algorithms if registered
commit b62755aed3a3f5ca9edd2718339ccea3b6bbbe57 upstream.
It's not valid to call crypto_unregister_skciphers() without a prior
call to crypto_register_skciphers().
Fixes: 84e03fa39fbe ("crypto: x86/chacha - expose SIMD ChaCha routine as library function")
Signed-off-by: Eric Biggers <ebiggers@google.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/x86/crypto/chacha_glue.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
--- a/arch/x86/crypto/chacha_glue.c
+++ b/arch/x86/crypto/chacha_glue.c
@@ -304,7 +304,8 @@ static int __init chacha_simd_mod_init(v
static void __exit chacha_simd_mod_fini(void)
{
- crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}
module_init(chacha_simd_mod_init);

View File

@ -0,0 +1,83 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 17 Nov 2019 23:22:16 -0800
Subject: [PATCH] crypto: lib/chacha20poly1305 - use chacha20_crypt()
commit 413808b71e6204b0cc1eeaa77960f7c3cd381d33 upstream.
Use chacha20_crypt() instead of chacha_crypt(), since it's not really
appropriate for users of the ChaCha library API to be passing the number
of rounds as an argument.
Signed-off-by: Eric Biggers <ebiggers@google.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
lib/crypto/chacha20poly1305.c | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
--- a/lib/crypto/chacha20poly1305.c
+++ b/lib/crypto/chacha20poly1305.c
@@ -66,14 +66,14 @@ __chacha20poly1305_encrypt(u8 *dst, cons
__le64 lens[2];
} b;
- chacha_crypt(chacha_state, b.block0, pad0, sizeof(b.block0), 20);
+ chacha20_crypt(chacha_state, b.block0, pad0, sizeof(b.block0));
poly1305_init(&poly1305_state, b.block0);
poly1305_update(&poly1305_state, ad, ad_len);
if (ad_len & 0xf)
poly1305_update(&poly1305_state, pad0, 0x10 - (ad_len & 0xf));
- chacha_crypt(chacha_state, dst, src, src_len, 20);
+ chacha20_crypt(chacha_state, dst, src, src_len);
poly1305_update(&poly1305_state, dst, src_len);
if (src_len & 0xf)
@@ -140,7 +140,7 @@ __chacha20poly1305_decrypt(u8 *dst, cons
if (unlikely(src_len < POLY1305_DIGEST_SIZE))
return false;
- chacha_crypt(chacha_state, b.block0, pad0, sizeof(b.block0), 20);
+ chacha20_crypt(chacha_state, b.block0, pad0, sizeof(b.block0));
poly1305_init(&poly1305_state, b.block0);
poly1305_update(&poly1305_state, ad, ad_len);
@@ -160,7 +160,7 @@ __chacha20poly1305_decrypt(u8 *dst, cons
ret = crypto_memneq(b.mac, src + dst_len, POLY1305_DIGEST_SIZE);
if (likely(!ret))
- chacha_crypt(chacha_state, dst, src, dst_len, 20);
+ chacha20_crypt(chacha_state, dst, src, dst_len);
memzero_explicit(&b, sizeof(b));
@@ -241,7 +241,7 @@ bool chacha20poly1305_crypt_sg_inplace(s
b.iv[1] = cpu_to_le64(nonce);
chacha_init(chacha_state, b.k, (u8 *)b.iv);
- chacha_crypt(chacha_state, b.block0, pad0, sizeof(b.block0), 20);
+ chacha20_crypt(chacha_state, b.block0, pad0, sizeof(b.block0));
poly1305_init(&poly1305_state, b.block0);
if (unlikely(ad_len)) {
@@ -278,14 +278,14 @@ bool chacha20poly1305_crypt_sg_inplace(s
if (unlikely(length < sl))
l &= ~(CHACHA_BLOCK_SIZE - 1);
- chacha_crypt(chacha_state, addr, addr, l, 20);
+ chacha20_crypt(chacha_state, addr, addr, l);
addr += l;
length -= l;
}
if (unlikely(length > 0)) {
- chacha_crypt(chacha_state, b.chacha_stream, pad0,
- CHACHA_BLOCK_SIZE, 20);
+ chacha20_crypt(chacha_state, b.chacha_stream, pad0,
+ CHACHA_BLOCK_SIZE);
crypto_xor(addr, b.chacha_stream, length);
partial = length;
}

View File

@ -0,0 +1,275 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Mon, 25 Nov 2019 11:31:12 +0100
Subject: [PATCH] crypto: arch - conditionalize crypto api in arch glue for lib
code
commit 8394bfec51e0e565556101bcc4e2fe7551104cd8 upstream.
For glue code that's used by Zinc, the actual Crypto API functions might
not necessarily exist, and don't need to exist either. Before this
patch, there are valid build configurations that lead to an unbuildable
kernel. This fixes it to conditionalize those symbols on the existence
of the proper config entry.
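The idiom, distilled into a standalone sketch (hypothetical module; the algorithm definition is a placeholder, and the point is only the IS_REACHABLE() guard around register/unregister):
#include <linux/kconfig.h>
#include <linux/module.h>
#include <crypto/internal/hash.h>

/* Placeholder: a real driver fills in the digest callbacks and cra_* fields. */
static struct shash_alg demo_alg = { /* callbacks omitted in this sketch */ };

static void demo_lib_setup(void) { }	/* library-only init, always safe */

static int __init demo_mod_init(void)
{
	demo_lib_setup();

	/*
	 * IS_REACHABLE() is true only if CRYPTO_HASH is built in, or it is a
	 * module and so is this file; otherwise the call (and the symbol
	 * dependency) drops out at compile time.
	 */
	return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
	       crypto_register_shash(&demo_alg) : 0;
}

static void __exit demo_mod_exit(void)
{
	if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
		crypto_unregister_shash(&demo_alg);
}

module_init(demo_mod_init);
module_exit(demo_mod_exit);
MODULE_LICENSE("GPL v2");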
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/arm/crypto/chacha-glue.c | 26 ++++++++++++++++----------
arch/arm/crypto/curve25519-glue.c | 5 +++--
arch/arm/crypto/poly1305-glue.c | 9 ++++++---
arch/arm64/crypto/chacha-neon-glue.c | 5 +++--
arch/arm64/crypto/poly1305-glue.c | 5 +++--
arch/mips/crypto/chacha-glue.c | 6 ++++--
arch/mips/crypto/poly1305-glue.c | 6 ++++--
arch/x86/crypto/blake2s-glue.c | 6 ++++--
arch/x86/crypto/chacha_glue.c | 5 +++--
arch/x86/crypto/curve25519-x86_64.c | 7 ++++---
arch/x86/crypto/poly1305_glue.c | 5 +++--
11 files changed, 53 insertions(+), 32 deletions(-)
--- a/arch/arm/crypto/chacha-glue.c
+++ b/arch/arm/crypto/chacha-glue.c
@@ -286,11 +286,13 @@ static struct skcipher_alg neon_algs[] =
static int __init chacha_simd_mod_init(void)
{
- int err;
+ int err = 0;
- err = crypto_register_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
- if (err)
- return err;
+ if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER)) {
+ err = crypto_register_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
+ if (err)
+ return err;
+ }
if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
int i;
@@ -310,18 +312,22 @@ static int __init chacha_simd_mod_init(v
static_branch_enable(&use_neon);
}
- err = crypto_register_skciphers(neon_algs, ARRAY_SIZE(neon_algs));
- if (err)
- crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
+ if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER)) {
+ err = crypto_register_skciphers(neon_algs, ARRAY_SIZE(neon_algs));
+ if (err)
+ crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
+ }
}
return err;
}
static void __exit chacha_simd_mod_fini(void)
{
- crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON))
- crypto_unregister_skciphers(neon_algs, ARRAY_SIZE(neon_algs));
+ if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER)) {
+ crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON))
+ crypto_unregister_skciphers(neon_algs, ARRAY_SIZE(neon_algs));
+ }
}
module_init(chacha_simd_mod_init);
--- a/arch/arm/crypto/curve25519-glue.c
+++ b/arch/arm/crypto/curve25519-glue.c
@@ -108,14 +108,15 @@ static int __init mod_init(void)
{
if (elf_hwcap & HWCAP_NEON) {
static_branch_enable(&have_neon);
- return crypto_register_kpp(&curve25519_alg);
+ return IS_REACHABLE(CONFIG_CRYPTO_KPP) ?
+ crypto_register_kpp(&curve25519_alg) : 0;
}
return 0;
}
static void __exit mod_exit(void)
{
- if (elf_hwcap & HWCAP_NEON)
+ if (IS_REACHABLE(CONFIG_CRYPTO_KPP) && elf_hwcap & HWCAP_NEON)
crypto_unregister_kpp(&curve25519_alg);
}
--- a/arch/arm/crypto/poly1305-glue.c
+++ b/arch/arm/crypto/poly1305-glue.c
@@ -249,16 +249,19 @@ static int __init arm_poly1305_mod_init(
if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
(elf_hwcap & HWCAP_NEON))
static_branch_enable(&have_neon);
- else
+ else if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
/* register only the first entry */
return crypto_register_shash(&arm_poly1305_algs[0]);
- return crypto_register_shashes(arm_poly1305_algs,
- ARRAY_SIZE(arm_poly1305_algs));
+ return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
+ crypto_register_shashes(arm_poly1305_algs,
+ ARRAY_SIZE(arm_poly1305_algs)) : 0;
}
static void __exit arm_poly1305_mod_exit(void)
{
+ if (!IS_REACHABLE(CONFIG_CRYPTO_HASH))
+ return;
if (!static_branch_likely(&have_neon)) {
crypto_unregister_shash(&arm_poly1305_algs[0]);
return;
--- a/arch/arm64/crypto/chacha-neon-glue.c
+++ b/arch/arm64/crypto/chacha-neon-glue.c
@@ -211,12 +211,13 @@ static int __init chacha_simd_mod_init(v
static_branch_enable(&have_neon);
- return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
+ return IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) ?
+ crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
}
static void __exit chacha_simd_mod_fini(void)
{
- if (cpu_have_named_feature(ASIMD))
+ if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) && cpu_have_named_feature(ASIMD))
crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}
--- a/arch/arm64/crypto/poly1305-glue.c
+++ b/arch/arm64/crypto/poly1305-glue.c
@@ -220,12 +220,13 @@ static int __init neon_poly1305_mod_init
static_branch_enable(&have_neon);
- return crypto_register_shash(&neon_poly1305_alg);
+ return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
+ crypto_register_shash(&neon_poly1305_alg) : 0;
}
static void __exit neon_poly1305_mod_exit(void)
{
- if (cpu_have_named_feature(ASIMD))
+ if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && cpu_have_named_feature(ASIMD))
crypto_unregister_shash(&neon_poly1305_alg);
}
--- a/arch/mips/crypto/chacha-glue.c
+++ b/arch/mips/crypto/chacha-glue.c
@@ -128,12 +128,14 @@ static struct skcipher_alg algs[] = {
static int __init chacha_simd_mod_init(void)
{
- return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
+ return IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) ?
+ crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
}
static void __exit chacha_simd_mod_fini(void)
{
- crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+ if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER))
+ crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}
module_init(chacha_simd_mod_init);
--- a/arch/mips/crypto/poly1305-glue.c
+++ b/arch/mips/crypto/poly1305-glue.c
@@ -187,12 +187,14 @@ static struct shash_alg mips_poly1305_al
static int __init mips_poly1305_mod_init(void)
{
- return crypto_register_shash(&mips_poly1305_alg);
+ return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
+ crypto_register_shash(&mips_poly1305_alg) : 0;
}
static void __exit mips_poly1305_mod_exit(void)
{
- crypto_unregister_shash(&mips_poly1305_alg);
+ if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
+ crypto_unregister_shash(&mips_poly1305_alg);
}
module_init(mips_poly1305_mod_init);
--- a/arch/x86/crypto/blake2s-glue.c
+++ b/arch/x86/crypto/blake2s-glue.c
@@ -210,12 +210,14 @@ static int __init blake2s_mod_init(void)
XFEATURE_MASK_AVX512, NULL))
static_branch_enable(&blake2s_use_avx512);
- return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
+ return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
+ crypto_register_shashes(blake2s_algs,
+ ARRAY_SIZE(blake2s_algs)) : 0;
}
static void __exit blake2s_mod_exit(void)
{
- if (boot_cpu_has(X86_FEATURE_SSSE3))
+ if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3))
crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
}
--- a/arch/x86/crypto/chacha_glue.c
+++ b/arch/x86/crypto/chacha_glue.c
@@ -299,12 +299,13 @@ static int __init chacha_simd_mod_init(v
boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */
static_branch_enable(&chacha_use_avx512vl);
}
- return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
+ return IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) ?
+ crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
}
static void __exit chacha_simd_mod_fini(void)
{
- if (boot_cpu_has(X86_FEATURE_SSSE3))
+ if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) && boot_cpu_has(X86_FEATURE_SSSE3))
crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}
--- a/arch/x86/crypto/curve25519-x86_64.c
+++ b/arch/x86/crypto/curve25519-x86_64.c
@@ -2457,13 +2457,14 @@ static int __init curve25519_mod_init(vo
static_branch_enable(&curve25519_use_adx);
else
return 0;
- return crypto_register_kpp(&curve25519_alg);
+ return IS_REACHABLE(CONFIG_CRYPTO_KPP) ?
+ crypto_register_kpp(&curve25519_alg) : 0;
}
static void __exit curve25519_mod_exit(void)
{
- if (boot_cpu_has(X86_FEATURE_BMI2) ||
- boot_cpu_has(X86_FEATURE_ADX))
+ if (IS_REACHABLE(CONFIG_CRYPTO_KPP) &&
+ (boot_cpu_has(X86_FEATURE_BMI2) || boot_cpu_has(X86_FEATURE_ADX)))
crypto_unregister_kpp(&curve25519_alg);
}
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -224,12 +224,13 @@ static int __init poly1305_simd_mod_init
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
static_branch_enable(&poly1305_use_avx2);
- return crypto_register_shash(&alg);
+ return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0;
}
static void __exit poly1305_simd_mod_exit(void)
{
- crypto_unregister_shash(&alg);
+ if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
+ crypto_unregister_shash(&alg);
}
module_init(poly1305_simd_mod_init);
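
The same pattern repeats in every glue file above: the library entry points are always built, while Crypto API registration only happens when that API can actually be linked against. IS_REACHABLE() is true when the config symbol is built-in, or when it is a module and the referencing code is modular too, which is exactly the condition under which crypto_register_*() resolves. A condensed sketch of the shape, using a hypothetical example_alg shash, have_simd static key and cpu_has_fast_path() check (none of these are taken from the patch):

static int __init example_mod_init(void)
{
	/* The library fast path is enabled regardless of the Crypto API. */
	if (cpu_has_fast_path())
		static_branch_enable(&have_simd);

	/* Register with the Crypto API only when it is reachable from here. */
	return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
	       crypto_register_shash(&example_alg) : 0;
}

static void __exit example_mod_exit(void)
{
	if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
		crypto_unregister_shash(&example_alg);
}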

View File

@ -0,0 +1,35 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valdis=20Kl=C4=93tnieks?= <valdis.kletnieks@vt.edu>
Date: Thu, 5 Dec 2019 20:58:36 -0500
Subject: [PATCH] crypto: chacha - fix warning message in header file
commit 579d705cd64e44f3fcda1a6cfd5f37468a5ddf63 upstream.
Building with W=1 causes a warning:
CC [M] arch/x86/crypto/chacha_glue.o
In file included from arch/x86/crypto/chacha_glue.c:10:
./include/crypto/internal/chacha.h:37:1: warning: 'inline' is not at beginning of declaration [-Wold-style-declaration]
37 | static int inline chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key,
| ^~~~~~
Straighten out the order to match the rest of the header file.
Signed-off-by: Valdis Kletnieks <valdis.kletnieks@vt.edu>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
include/crypto/internal/chacha.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/include/crypto/internal/chacha.h
+++ b/include/crypto/internal/chacha.h
@@ -34,7 +34,7 @@ static inline int chacha20_setkey(struct
return chacha_setkey(tfm, key, keysize, 20);
}
-static int inline chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key,
+static inline int chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keysize)
{
return chacha_setkey(tfm, key, keysize, 12);

View File

@ -0,0 +1,38 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Wed, 11 Dec 2019 10:26:39 +0100
Subject: [PATCH] crypto: arm/curve25519 - add arch-specific key generation
function
commit 84faa307249b341f6ad8de3e1869d77a65e26669 upstream.
Somehow this was forgotten when Zinc was being split into oddly shaped
pieces, resulting in linker errors. The x86_64 glue has a specific key
generation implementation, but the Arm one does not. However, it can
still receive the NEON speedups by calling the ordinary DH function
using the base point.
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/arm/crypto/curve25519-glue.c | 7 +++++++
1 file changed, 7 insertions(+)
--- a/arch/arm/crypto/curve25519-glue.c
+++ b/arch/arm/crypto/curve25519-glue.c
@@ -38,6 +38,13 @@ void curve25519_arch(u8 out[CURVE25519_K
}
EXPORT_SYMBOL(curve25519_arch);
+void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE])
+{
+ return curve25519_arch(pub, secret, curve25519_base_point);
+}
+EXPORT_SYMBOL(curve25519_base_arch);
+
static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
unsigned int len)
{
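
With curve25519_base_arch() exported, the library-level public-key helper (curve25519_generate_public() in include/crypto/curve25519.h) can route to the NEON code on ARM just as it already does on x86_64. A hypothetical caller, sketched for illustration only:

static int example_derive_public(u8 pub[CURVE25519_KEY_SIZE],
				 const u8 secret[CURVE25519_KEY_SIZE])
{
	/* Returns false if the result is the all-zero (null) point. */
	if (!curve25519_generate_public(pub, secret))
		return -EKEYREJECTED;
	return 0;
}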

View File

@ -0,0 +1,171 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Sun, 5 Jan 2020 22:40:49 -0500
Subject: [PATCH] crypto: {arm,arm64,mips}/poly1305 - remove redundant
non-reduction from emit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
commit 31899908a0d248b030b4464425b86c717e0007d4 upstream.
This appears to be some kind of copy and paste error, and is actually
dead code.
Pre: f = 0 ⇒ (f >> 32) = 0
f = (f >> 32) + le32_to_cpu(digest[0]);
Post: 0 ≤ f < 2³²
put_unaligned_le32(f, dst);
Pre: 0 ≤ f < 2³² ⇒ (f >> 32) = 0
f = (f >> 32) + le32_to_cpu(digest[1]);
Post: 0 ≤ f < 2³²
put_unaligned_le32(f, dst + 4);
Pre: 0 ≤ f < 2³² ⇒ (f >> 32) = 0
f = (f >> 32) + le32_to_cpu(digest[2]);
Post: 0 ≤ f < 2³²
put_unaligned_le32(f, dst + 8);
Pre: 0 ≤ f < 2³² ⇒ (f >> 32) = 0
f = (f >> 32) + le32_to_cpu(digest[3]);
Post: 0 ≤ f < 2³²
put_unaligned_le32(f, dst + 12);
Therefore this sequence is redundant. And Andy's code appears to handle
misalignment acceptably.
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Tested-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/arm/crypto/poly1305-glue.c | 18 ++----------------
arch/arm64/crypto/poly1305-glue.c | 18 ++----------------
arch/mips/crypto/poly1305-glue.c | 18 ++----------------
3 files changed, 6 insertions(+), 48 deletions(-)
--- a/arch/arm/crypto/poly1305-glue.c
+++ b/arch/arm/crypto/poly1305-glue.c
@@ -20,7 +20,7 @@
void poly1305_init_arm(void *state, const u8 *key);
void poly1305_blocks_arm(void *state, const u8 *src, u32 len, u32 hibit);
-void poly1305_emit_arm(void *state, __le32 *digest, const u32 *nonce);
+void poly1305_emit_arm(void *state, u8 *digest, const u32 *nonce);
void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit)
{
@@ -179,9 +179,6 @@ EXPORT_SYMBOL(poly1305_update_arch);
void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
{
- __le32 digest[4];
- u64 f = 0;
-
if (unlikely(dctx->buflen)) {
dctx->buf[dctx->buflen++] = 1;
memset(dctx->buf + dctx->buflen, 0,
@@ -189,18 +186,7 @@ void poly1305_final_arch(struct poly1305
poly1305_blocks_arm(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
}
- poly1305_emit_arm(&dctx->h, digest, dctx->s);
-
- /* mac = (h + s) % (2^128) */
- f = (f >> 32) + le32_to_cpu(digest[0]);
- put_unaligned_le32(f, dst);
- f = (f >> 32) + le32_to_cpu(digest[1]);
- put_unaligned_le32(f, dst + 4);
- f = (f >> 32) + le32_to_cpu(digest[2]);
- put_unaligned_le32(f, dst + 8);
- f = (f >> 32) + le32_to_cpu(digest[3]);
- put_unaligned_le32(f, dst + 12);
-
+ poly1305_emit_arm(&dctx->h, dst, dctx->s);
*dctx = (struct poly1305_desc_ctx){};
}
EXPORT_SYMBOL(poly1305_final_arch);
--- a/arch/arm64/crypto/poly1305-glue.c
+++ b/arch/arm64/crypto/poly1305-glue.c
@@ -21,7 +21,7 @@
asmlinkage void poly1305_init_arm64(void *state, const u8 *key);
asmlinkage void poly1305_blocks(void *state, const u8 *src, u32 len, u32 hibit);
asmlinkage void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
-asmlinkage void poly1305_emit(void *state, __le32 *digest, const u32 *nonce);
+asmlinkage void poly1305_emit(void *state, u8 *digest, const u32 *nonce);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
@@ -162,9 +162,6 @@ EXPORT_SYMBOL(poly1305_update_arch);
void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
{
- __le32 digest[4];
- u64 f = 0;
-
if (unlikely(dctx->buflen)) {
dctx->buf[dctx->buflen++] = 1;
memset(dctx->buf + dctx->buflen, 0,
@@ -172,18 +169,7 @@ void poly1305_final_arch(struct poly1305
poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
}
- poly1305_emit(&dctx->h, digest, dctx->s);
-
- /* mac = (h + s) % (2^128) */
- f = (f >> 32) + le32_to_cpu(digest[0]);
- put_unaligned_le32(f, dst);
- f = (f >> 32) + le32_to_cpu(digest[1]);
- put_unaligned_le32(f, dst + 4);
- f = (f >> 32) + le32_to_cpu(digest[2]);
- put_unaligned_le32(f, dst + 8);
- f = (f >> 32) + le32_to_cpu(digest[3]);
- put_unaligned_le32(f, dst + 12);
-
+ poly1305_emit(&dctx->h, dst, dctx->s);
*dctx = (struct poly1305_desc_ctx){};
}
EXPORT_SYMBOL(poly1305_final_arch);
--- a/arch/mips/crypto/poly1305-glue.c
+++ b/arch/mips/crypto/poly1305-glue.c
@@ -15,7 +15,7 @@
asmlinkage void poly1305_init_mips(void *state, const u8 *key);
asmlinkage void poly1305_blocks_mips(void *state, const u8 *src, u32 len, u32 hibit);
-asmlinkage void poly1305_emit_mips(void *state, __le32 *digest, const u32 *nonce);
+asmlinkage void poly1305_emit_mips(void *state, u8 *digest, const u32 *nonce);
void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
{
@@ -134,9 +134,6 @@ EXPORT_SYMBOL(poly1305_update_arch);
void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
{
- __le32 digest[4];
- u64 f = 0;
-
if (unlikely(dctx->buflen)) {
dctx->buf[dctx->buflen++] = 1;
memset(dctx->buf + dctx->buflen, 0,
@@ -144,18 +141,7 @@ void poly1305_final_arch(struct poly1305
poly1305_blocks_mips(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
}
- poly1305_emit_mips(&dctx->h, digest, dctx->s);
-
- /* mac = (h + s) % (2^128) */
- f = (f >> 32) + le32_to_cpu(digest[0]);
- put_unaligned_le32(f, dst);
- f = (f >> 32) + le32_to_cpu(digest[1]);
- put_unaligned_le32(f, dst + 4);
- f = (f >> 32) + le32_to_cpu(digest[2]);
- put_unaligned_le32(f, dst + 8);
- f = (f >> 32) + le32_to_cpu(digest[3]);
- put_unaligned_le32(f, dst + 12);
-
+ poly1305_emit_mips(&dctx->h, dst, dctx->s);
*dctx = (struct poly1305_desc_ctx){};
}
EXPORT_SYMBOL(poly1305_final_arch);

View File

@ -0,0 +1,102 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 8 Jan 2020 12:37:35 +0800
Subject: [PATCH] crypto: curve25519 - Fix selftest build error
commit a8bdf2c42ee4d1ee42af1f3601f85de94e70a421 upstream.
If CRYPTO_CURVE25519 is y, CRYPTO_LIB_CURVE25519_GENERIC will be
y, but CRYPTO_LIB_CURVE25519 may be set to m, which causes build
errors:
lib/crypto/curve25519-selftest.o: In function `curve25519':
curve25519-selftest.c:(.text.unlikely+0xc): undefined reference to `curve25519_arch'
lib/crypto/curve25519-selftest.o: In function `curve25519_selftest':
curve25519-selftest.c:(.init.text+0x17e): undefined reference to `curve25519_base_arch'
This is because the curve25519 self-test code is being controlled
by the GENERIC option rather than the overall CURVE25519 option,
as is the case with blake2s. To recap, the GENERIC and ARCH options
for CURVE25519 are internal only and selected by users such as
the Crypto API, or the externally visible CURVE25519 option which
in turn is selected by wireguard. The self-test is specific to the
external CURVE25519 option and should not be enabled by the
Crypto API.
This patch fixes this by splitting the GENERIC module from the
CURVE25519 module with the latter now containing just the self-test.
Reported-by: Hulk Robot <hulkci@huawei.com>
Fixes: aa127963f1ca ("crypto: lib/curve25519 - re-add selftests")
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Reviewed-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
lib/crypto/Makefile | 9 ++++++---
lib/crypto/curve25519-generic.c | 24 ++++++++++++++++++++++++
lib/crypto/curve25519.c | 7 -------
3 files changed, 30 insertions(+), 10 deletions(-)
create mode 100644 lib/crypto/curve25519-generic.c
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -19,9 +19,12 @@ libblake2s-y += blake2s.o
obj-$(CONFIG_CRYPTO_LIB_CHACHA20POLY1305) += libchacha20poly1305.o
libchacha20poly1305-y += chacha20poly1305.o
-obj-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC) += libcurve25519.o
-libcurve25519-y := curve25519-fiat32.o
-libcurve25519-$(CONFIG_ARCH_SUPPORTS_INT128) := curve25519-hacl64.o
+obj-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC) += libcurve25519-generic.o
+libcurve25519-generic-y := curve25519-fiat32.o
+libcurve25519-generic-$(CONFIG_ARCH_SUPPORTS_INT128) := curve25519-hacl64.o
+libcurve25519-generic-y += curve25519-generic.o
+
+obj-$(CONFIG_CRYPTO_LIB_CURVE25519) += libcurve25519.o
libcurve25519-y += curve25519.o
obj-$(CONFIG_CRYPTO_LIB_DES) += libdes.o
--- /dev/null
+++ b/lib/crypto/curve25519-generic.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This is an implementation of the Curve25519 ECDH algorithm, using either
+ * a 32-bit implementation or a 64-bit implementation with 128-bit integers,
+ * depending on what is supported by the target compiler.
+ *
+ * Information: https://cr.yp.to/ecdh.html
+ */
+
+#include <crypto/curve25519.h>
+#include <linux/module.h>
+
+const u8 curve25519_null_point[CURVE25519_KEY_SIZE] __aligned(32) = { 0 };
+const u8 curve25519_base_point[CURVE25519_KEY_SIZE] __aligned(32) = { 9 };
+
+EXPORT_SYMBOL(curve25519_null_point);
+EXPORT_SYMBOL(curve25519_base_point);
+EXPORT_SYMBOL(curve25519_generic);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Curve25519 scalar multiplication");
+MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
--- a/lib/crypto/curve25519.c
+++ b/lib/crypto/curve25519.c
@@ -15,13 +15,6 @@
bool curve25519_selftest(void);
-const u8 curve25519_null_point[CURVE25519_KEY_SIZE] __aligned(32) = { 0 };
-const u8 curve25519_base_point[CURVE25519_KEY_SIZE] __aligned(32) = { 9 };
-
-EXPORT_SYMBOL(curve25519_null_point);
-EXPORT_SYMBOL(curve25519_base_point);
-EXPORT_SYMBOL(curve25519_generic);
-
static int __init mod_init(void)
{
if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) &&

View File

@ -0,0 +1,23 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Thu, 16 Jan 2020 18:23:55 +0100
Subject: [PATCH] crypto: x86/poly1305 - fix .gitignore typo
commit 1f6868995326cc82102049e349d8dbd116bdb656 upstream.
Amidst the kbuild robot-induced changes, the .gitignore file for the
generated file wasn't updated with the non-clashing filename. This
commit adjusts that.
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/x86/crypto/.gitignore | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/x86/crypto/.gitignore
+++ b/arch/x86/crypto/.gitignore
@@ -1 +1 @@
-poly1305-x86_64.S
+poly1305-x86_64-cryptogams.S

View File

@ -0,0 +1,36 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Fri, 17 Jan 2020 11:42:22 +0100
Subject: [PATCH] crypto: x86/poly1305 - emit does base conversion itself
commit f9e7fe32a792726186301423ff63a465d63386e1 upstream.
The emit code does optional base conversion itself in assembly, so we
don't need to do that here. Also, neither one of these functions uses
SIMD instructions, so checking for that doesn't make sense either.
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/x86/crypto/poly1305_glue.c | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -123,13 +123,9 @@ static void poly1305_simd_blocks(void *c
static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
const u32 nonce[4])
{
- struct poly1305_arch_internal *state = ctx;
-
- if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) ||
- !state->is_base2_26 || !crypto_simd_usable()) {
- convert_to_base2_64(ctx);
+ if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx))
poly1305_emit_x86_64(ctx, mac, nonce);
- } else
+ else
poly1305_emit_avx(ctx, mac, nonce);
}

View File

@ -0,0 +1,58 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 17 Jan 2020 17:43:18 +0100
Subject: [PATCH] crypto: arm/chacha - fix build failure when kernel mode NEON
is disabled
commit 0bc81767c5bd9d005fae1099fb39eb3688370cb1 upstream.
When the ARM accelerated ChaCha driver is built as part of a configuration
that has kernel mode NEON disabled, we expect the compiler to propagate
the build time constant expression IS_ENABLED(CONFIG_KERNEL_MODE_NEON) in
a way that eliminates all the cross-object references to the actual NEON
routines, which allows the chacha-neon-core.o object to be omitted from
the build entirely.
Unfortunately, this fails to work as expected in some cases, and we may
end up with a build error such as
chacha-glue.c:(.text+0xc0): undefined reference to `chacha_4block_xor_neon'
caused by the fact that chacha_doneon() has not been eliminated from the
object code, even though it will never be called in practice.
Let's fix this by adding some IS_ENABLED(CONFIG_KERNEL_MODE_NEON) tests
that are not strictly needed from a logical point of view, but should
help the compiler infer that the NEON code paths are unreachable in
those cases.
Fixes: b36d8c09e710c71f ("crypto: arm/chacha - remove dependency on generic ...")
Reported-by: Russell King <linux@armlinux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/arm/crypto/chacha-glue.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/arch/arm/crypto/chacha-glue.c
+++ b/arch/arm/crypto/chacha-glue.c
@@ -115,7 +115,7 @@ static int chacha_stream_xor(struct skci
if (nbytes < walk.total)
nbytes = round_down(nbytes, walk.stride);
- if (!neon) {
+ if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon) {
chacha_doarm(walk.dst.virt.addr, walk.src.virt.addr,
nbytes, state, ctx->nrounds);
state[12] += DIV_ROUND_UP(nbytes, CHACHA_BLOCK_SIZE);
@@ -159,7 +159,7 @@ static int do_xchacha(struct skcipher_re
chacha_init_generic(state, ctx->key, req->iv);
- if (!neon) {
+ if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon) {
hchacha_block_arm(state, subctx.key, ctx->nrounds);
} else {
kernel_neon_begin();
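
The trick relied on here is ordinary constant folding: IS_ENABLED(CONFIG_KERNEL_MODE_NEON) evaluates to 0 or 1 at compile time, so when NEON support is configured out the compiler can prove the NEON branch dead and drop the reference to the missing object file. A generic sketch of the pattern, with hypothetical transform_scalar()/transform_neon() helpers standing in for the real routines:

static void do_transform(u8 *dst, const u8 *src, unsigned int len, bool neon)
{
	if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon) {
		/* Always compiled in; the only path when NEON is disabled. */
		transform_scalar(dst, src, len);
		return;
	}

	kernel_neon_begin();
	transform_neon(dst, src, len);	/* may live in an omitted object file */
	kernel_neon_end();
}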

View File

@ -0,0 +1,40 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Fri, 17 Jan 2020 12:01:36 +0100
Subject: [PATCH] crypto: Kconfig - allow tests to be disabled when manager is
disabled
commit 2343d1529aff8b552589f622c23932035ed7a05d upstream.
The library code uses CRYPTO_MANAGER_DISABLE_TESTS to conditionalize its
tests, but the library code can also exist without CRYPTO_MANAGER. That
means on minimal configs, the test code winds up being built with no way
to disable it.
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
crypto/Kconfig | 4 ----
1 file changed, 4 deletions(-)
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -136,8 +136,6 @@ config CRYPTO_USER
Userspace configuration for cryptographic instantiations such as
cbc(aes).
-if CRYPTO_MANAGER2
-
config CRYPTO_MANAGER_DISABLE_TESTS
bool "Disable run-time self tests"
default y
@@ -155,8 +153,6 @@ config CRYPTO_MANAGER_EXTRA_TESTS
This is intended for developer use only, as these tests take much
longer to run than the normal self tests.
-endif # if CRYPTO_MANAGER2
-
config CRYPTO_GF128MUL
tristate

View File

@ -0,0 +1,40 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Thu, 6 Feb 2020 12:42:01 +0100
Subject: [PATCH] crypto: chacha20poly1305 - prevent integer overflow on large
input
commit c9cc0517bba9f0213f1e55172feceb99e5512daf upstream.
This code assigns src_len (size_t) to sl (int), which causes problems
when src_len is very large. Probably nobody in the kernel should be
passing this much data to chacha20poly1305 all in one go anyway, so I
don't think we need to change the algorithm or introduce larger types
or anything. But we should at least error out early in this case and
print a warning so that we get reports if this does happen and can look
into why anybody is possibly passing it that much data or if they're
accidentally passing -1 or similar.
Fixes: d95312a3ccc0 ("crypto: lib/chacha20poly1305 - reimplement crypt_from_sg() routine")
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: stable@vger.kernel.org # 5.5+
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
lib/crypto/chacha20poly1305.c | 3 +++
1 file changed, 3 insertions(+)
--- a/lib/crypto/chacha20poly1305.c
+++ b/lib/crypto/chacha20poly1305.c
@@ -235,6 +235,9 @@ bool chacha20poly1305_crypt_sg_inplace(s
__le64 lens[2];
} b __aligned(16);
+ if (WARN_ON(src_len > INT_MAX))
+ return false;
+
chacha_load_key(b.k, key);
b.iv[0] = 0;
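
A quick userspace demonstration (illustration only, unrelated to the kernel code itself) of the truncation the WARN_ON above guards against: the scatterlist walker keeps its running length in an int, so a size_t above INT_MAX does not survive the assignment.

#include <limits.h>
#include <stddef.h>
#include <stdio.h>

int main(void)
{
	size_t src_len = (size_t)INT_MAX + 1;	/* one past what an int can hold */
	int sl = src_len;	/* implementation-defined conversion; usually wraps negative */

	printf("src_len=%zu sl=%d\n", src_len, sl);	/* sl is now nonsensical */
	return 0;
}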

View File

@ -0,0 +1,84 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Sun, 1 Mar 2020 22:52:35 +0800
Subject: [PATCH] crypto: x86/curve25519 - support assemblers with no adx
support
commit 1579f1bc3b753d17a44de3457d5c6f4a5b14c752 upstream.
Some older versions of GAS do not support the ADX instructions, similarly
to how they also don't support AVX and such. This commit adds the same
build-time detection mechanisms we use for AVX and others for ADX, and
then makes sure that the curve25519 library dispatcher calls the right
functions.
Reported-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/x86/Makefile | 5 +++--
arch/x86/crypto/Makefile | 7 ++++++-
include/crypto/curve25519.h | 6 ++++--
3 files changed, 13 insertions(+), 5 deletions(-)
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -198,9 +198,10 @@ avx2_instr :=$(call as-instr,vpbroadcast
avx512_instr :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,-DCONFIG_AS_AVX512=1)
sha1_ni_instr :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA1_NI=1)
sha256_ni_instr :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA256_NI=1)
+adx_instr := $(call as-instr,adox %r10$(comma)%r10,-DCONFIG_AS_ADX=1)
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr) $(adx_instr)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr) $(adx_instr)
KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE)
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -11,6 +11,7 @@ avx2_supported := $(call as-instr,vpgath
avx512_supported :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,yes,no)
sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no)
sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no)
+adx_supported := $(call as-instr,adox %r10$(comma)%r10,yes,no)
obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
@@ -39,7 +40,11 @@ obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2)
obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o
obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o
-obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o
+
+# These modules require the assembler to support ADX.
+ifeq ($(adx_supported),yes)
+ obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o
+endif
# These modules require assembler to support AVX.
ifeq ($(avx_supported),yes)
--- a/include/crypto/curve25519.h
+++ b/include/crypto/curve25519.h
@@ -33,7 +33,8 @@ bool __must_check curve25519(u8 mypublic
const u8 secret[CURVE25519_KEY_SIZE],
const u8 basepoint[CURVE25519_KEY_SIZE])
{
- if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519))
+ if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519) &&
+ (!IS_ENABLED(CONFIG_CRYPTO_CURVE25519_X86) || IS_ENABLED(CONFIG_AS_ADX)))
curve25519_arch(mypublic, secret, basepoint);
else
curve25519_generic(mypublic, secret, basepoint);
@@ -49,7 +50,8 @@ __must_check curve25519_generate_public(
CURVE25519_KEY_SIZE)))
return false;
- if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519))
+ if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519) &&
+ (!IS_ENABLED(CONFIG_CRYPTO_CURVE25519_X86) || IS_ENABLED(CONFIG_AS_ADX)))
curve25519_base_arch(pub, secret);
else
curve25519_generic(pub, secret, curve25519_base_point);

View File

@ -0,0 +1,68 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Wed, 18 Mar 2020 20:27:32 -0600
Subject: [PATCH] crypto: arm64/chacha - correctly walk through blocks
commit c8cfcb78c65877313cda7bcbace624d3dbd1f3b3 upstream.
Previously, passing in chunks of 2, 3, or 4, followed by any additional
chunks would result in the chacha state counter getting out of sync,
resulting in incorrect encryption/decryption, which is a pretty nasty
crypto vuln: "why do images look weird on webpages?" WireGuard users
never experienced this prior, because we have always, out of tree, used
a different crypto library, until the recent Frankenzinc addition. This
commit fixes the issue by advancing the pointers and state counter by
the actual size processed. It also fixes up a bug in the (optional,
costly) stride test that prevented it from running on arm64.
Fixes: b3aad5bad26a ("crypto: arm64/chacha - expose arm64 ChaCha routine as library function")
Reported-and-tested-by: Emil Renner Berthing <kernel@esmil.dk>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: stable@vger.kernel.org # v5.5+
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/arm64/crypto/chacha-neon-glue.c | 8 ++++----
lib/crypto/chacha20poly1305-selftest.c | 11 ++++++++---
2 files changed, 12 insertions(+), 7 deletions(-)
--- a/arch/arm64/crypto/chacha-neon-glue.c
+++ b/arch/arm64/crypto/chacha-neon-glue.c
@@ -55,10 +55,10 @@ static void chacha_doneon(u32 *state, u8
break;
}
chacha_4block_xor_neon(state, dst, src, nrounds, l);
- bytes -= CHACHA_BLOCK_SIZE * 5;
- src += CHACHA_BLOCK_SIZE * 5;
- dst += CHACHA_BLOCK_SIZE * 5;
- state[12] += 5;
+ bytes -= l;
+ src += l;
+ dst += l;
+ state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
}
}
--- a/lib/crypto/chacha20poly1305-selftest.c
+++ b/lib/crypto/chacha20poly1305-selftest.c
@@ -9028,10 +9028,15 @@ bool __init chacha20poly1305_selftest(vo
&& total_len <= 1 << 10; ++total_len) {
for (i = 0; i <= total_len; ++i) {
for (j = i; j <= total_len; ++j) {
+ k = 0;
sg_init_table(sg_src, 3);
- sg_set_buf(&sg_src[0], input, i);
- sg_set_buf(&sg_src[1], input + i, j - i);
- sg_set_buf(&sg_src[2], input + j, total_len - j);
+ if (i)
+ sg_set_buf(&sg_src[k++], input, i);
+ if (j - i)
+ sg_set_buf(&sg_src[k++], input + i, j - i);
+ if (total_len - j)
+ sg_set_buf(&sg_src[k++], input + j, total_len - j);
+ sg_init_marker(sg_src, k);
memset(computed_output, 0, total_len);
memset(input, 0, total_len);

View File

@ -0,0 +1,376 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Sun, 1 Mar 2020 16:06:56 +0800
Subject: [PATCH] crypto: x86/curve25519 - leave r12 as spare register
commit dc7fc3a53ae158263196b1892b672aedf67796c5 upstream.
This updates to the newer register selection proved by HACL*, which
leads to a more compact instruction encoding, and saves around 100
cycles.
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/x86/crypto/curve25519-x86_64.c | 110 ++++++++++++++--------------
1 file changed, 55 insertions(+), 55 deletions(-)
--- a/arch/x86/crypto/curve25519-x86_64.c
+++ b/arch/x86/crypto/curve25519-x86_64.c
@@ -167,28 +167,28 @@ static inline void fmul(u64 *out, const
" movq 0(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
- " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;"
+ " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;"
/* Compute src1[1] * src2 */
" movq 8(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 16(%0);"
- " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
+ " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);"
+ " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
/* Compute src1[2] * src2 */
" movq 16(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 24(%0);"
- " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
+ " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);"
+ " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
/* Compute src1[3] * src2 */
" movq 24(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 32(%0);"
- " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " movq %%r12, 40(%0);" " mov $0, %%r8;"
+ " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);"
+ " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 56(%0);"
/* Line up pointers */
@@ -202,11 +202,11 @@ static inline void fmul(u64 *out, const
" mulxq 32(%1), %%r8, %%r13;"
" xor %3, %3;"
" adoxq 0(%1), %%r8;"
- " mulxq 40(%1), %%r9, %%r12;"
+ " mulxq 40(%1), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
" adoxq 8(%1), %%r9;"
" mulxq 48(%1), %%r10, %%r13;"
- " adcx %%r12, %%r10;"
+ " adcx %%rbx, %%r10;"
" adoxq 16(%1), %%r10;"
" mulxq 56(%1), %%r11, %%rax;"
" adcx %%r13, %%r11;"
@@ -231,7 +231,7 @@ static inline void fmul(u64 *out, const
" movq %%r8, 0(%0);"
: "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
:
- : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "memory", "cc"
+ : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "memory", "cc"
);
}
@@ -248,28 +248,28 @@ static inline void fmul2(u64 *out, const
" movq 0(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
- " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;"
+ " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;"
/* Compute src1[1] * src2 */
" movq 8(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 16(%0);"
- " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
+ " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);"
+ " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
/* Compute src1[2] * src2 */
" movq 16(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 24(%0);"
- " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
+ " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);"
+ " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
/* Compute src1[3] * src2 */
" movq 24(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 32(%0);"
- " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " movq %%r12, 40(%0);" " mov $0, %%r8;"
+ " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);"
+ " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 56(%0);"
@@ -279,28 +279,28 @@ static inline void fmul2(u64 *out, const
" movq 32(%1), %%rdx;"
" mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 64(%0);"
" mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 72(%0);"
- " mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;"
+ " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;"
/* Compute src1[1] * src2 */
" movq 40(%1), %%rdx;"
" mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 72(%0), %%r8;" " movq %%r8, 72(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 80(%0);"
- " mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
+ " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 80(%0);"
+ " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
/* Compute src1[2] * src2 */
" movq 48(%1), %%rdx;"
" mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 80(%0), %%r8;" " movq %%r8, 80(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 88(%0);"
- " mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
+ " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 88(%0);"
+ " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
/* Compute src1[3] * src2 */
" movq 56(%1), %%rdx;"
" mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 88(%0), %%r8;" " movq %%r8, 88(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 96(%0);"
- " mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " movq %%r12, 104(%0);" " mov $0, %%r8;"
+ " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 96(%0);"
+ " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 104(%0);" " mov $0, %%r8;"
" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 112(%0);" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 120(%0);"
/* Line up pointers */
@@ -314,11 +314,11 @@ static inline void fmul2(u64 *out, const
" mulxq 32(%1), %%r8, %%r13;"
" xor %3, %3;"
" adoxq 0(%1), %%r8;"
- " mulxq 40(%1), %%r9, %%r12;"
+ " mulxq 40(%1), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
" adoxq 8(%1), %%r9;"
" mulxq 48(%1), %%r10, %%r13;"
- " adcx %%r12, %%r10;"
+ " adcx %%rbx, %%r10;"
" adoxq 16(%1), %%r10;"
" mulxq 56(%1), %%r11, %%rax;"
" adcx %%r13, %%r11;"
@@ -347,11 +347,11 @@ static inline void fmul2(u64 *out, const
" mulxq 96(%1), %%r8, %%r13;"
" xor %3, %3;"
" adoxq 64(%1), %%r8;"
- " mulxq 104(%1), %%r9, %%r12;"
+ " mulxq 104(%1), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
" adoxq 72(%1), %%r9;"
" mulxq 112(%1), %%r10, %%r13;"
- " adcx %%r12, %%r10;"
+ " adcx %%rbx, %%r10;"
" adoxq 80(%1), %%r10;"
" mulxq 120(%1), %%r11, %%rax;"
" adcx %%r13, %%r11;"
@@ -376,7 +376,7 @@ static inline void fmul2(u64 *out, const
" movq %%r8, 32(%0);"
: "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
:
- : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "memory", "cc"
+ : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "memory", "cc"
);
}
@@ -388,11 +388,11 @@ static inline void fmul_scalar(u64 *out,
asm volatile(
/* Compute the raw multiplication of f1*f2 */
" mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */
- " mulxq 8(%2), %%r9, %%r12;" /* f1[1]*f2 */
+ " mulxq 8(%2), %%r9, %%rbx;" /* f1[1]*f2 */
" add %%rcx, %%r9;"
" mov $0, %%rcx;"
" mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */
- " adcx %%r12, %%r10;"
+ " adcx %%rbx, %%r10;"
" mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */
" adcx %%r13, %%r11;"
" adcx %%rcx, %%rax;"
@@ -419,7 +419,7 @@ static inline void fmul_scalar(u64 *out,
" movq %%r8, 0(%1);"
: "+&r" (f2_r)
: "r" (out), "r" (f1)
- : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "memory", "cc"
+ : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "memory", "cc"
);
}
@@ -520,8 +520,8 @@ static inline void fsqr(u64 *out, const
" mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
" mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
" movq 24(%1), %%rdx;" /* f[3] */
- " mulxq 8(%1), %%r11, %%r12;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
- " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%r12;" /* f[2]*f[3] */
+ " mulxq 8(%1), %%r11, %%rbx;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%rbx;" /* f[2]*f[3] */
" movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */
" mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
@@ -531,12 +531,12 @@ static inline void fsqr(u64 *out, const
" adcx %%r8, %%r8;"
" adox %%rcx, %%r11;"
" adcx %%r9, %%r9;"
- " adox %%r15, %%r12;"
+ " adox %%r15, %%rbx;"
" adcx %%r10, %%r10;"
" adox %%r15, %%r13;"
" adcx %%r11, %%r11;"
" adox %%r15, %%r14;"
- " adcx %%r12, %%r12;"
+ " adcx %%rbx, %%rbx;"
" adcx %%r13, %%r13;"
" adcx %%r14, %%r14;"
@@ -549,7 +549,7 @@ static inline void fsqr(u64 *out, const
" adcx %%rcx, %%r10;" " movq %%r10, 24(%0);"
" movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
" adcx %%rax, %%r11;" " movq %%r11, 32(%0);"
- " adcx %%rcx, %%r12;" " movq %%r12, 40(%0);"
+ " adcx %%rcx, %%rbx;" " movq %%rbx, 40(%0);"
" movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
" adcx %%rax, %%r13;" " movq %%r13, 48(%0);"
" adcx %%rcx, %%r14;" " movq %%r14, 56(%0);"
@@ -565,11 +565,11 @@ static inline void fsqr(u64 *out, const
" mulxq 32(%1), %%r8, %%r13;"
" xor %%rcx, %%rcx;"
" adoxq 0(%1), %%r8;"
- " mulxq 40(%1), %%r9, %%r12;"
+ " mulxq 40(%1), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
" adoxq 8(%1), %%r9;"
" mulxq 48(%1), %%r10, %%r13;"
- " adcx %%r12, %%r10;"
+ " adcx %%rbx, %%r10;"
" adoxq 16(%1), %%r10;"
" mulxq 56(%1), %%r11, %%rax;"
" adcx %%r13, %%r11;"
@@ -594,7 +594,7 @@ static inline void fsqr(u64 *out, const
" movq %%r8, 0(%0);"
: "+&r" (tmp), "+&r" (f), "+&r" (out)
:
- : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory", "cc"
+ : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "%r15", "memory", "cc"
);
}
@@ -611,8 +611,8 @@ static inline void fsqr2(u64 *out, const
" mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
" mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
" movq 24(%1), %%rdx;" /* f[3] */
- " mulxq 8(%1), %%r11, %%r12;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
- " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%r12;" /* f[2]*f[3] */
+ " mulxq 8(%1), %%r11, %%rbx;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%rbx;" /* f[2]*f[3] */
" movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */
" mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
@@ -622,12 +622,12 @@ static inline void fsqr2(u64 *out, const
" adcx %%r8, %%r8;"
" adox %%rcx, %%r11;"
" adcx %%r9, %%r9;"
- " adox %%r15, %%r12;"
+ " adox %%r15, %%rbx;"
" adcx %%r10, %%r10;"
" adox %%r15, %%r13;"
" adcx %%r11, %%r11;"
" adox %%r15, %%r14;"
- " adcx %%r12, %%r12;"
+ " adcx %%rbx, %%rbx;"
" adcx %%r13, %%r13;"
" adcx %%r14, %%r14;"
@@ -640,7 +640,7 @@ static inline void fsqr2(u64 *out, const
" adcx %%rcx, %%r10;" " movq %%r10, 24(%0);"
" movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
" adcx %%rax, %%r11;" " movq %%r11, 32(%0);"
- " adcx %%rcx, %%r12;" " movq %%r12, 40(%0);"
+ " adcx %%rcx, %%rbx;" " movq %%rbx, 40(%0);"
" movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
" adcx %%rax, %%r13;" " movq %%r13, 48(%0);"
" adcx %%rcx, %%r14;" " movq %%r14, 56(%0);"
@@ -651,8 +651,8 @@ static inline void fsqr2(u64 *out, const
" mulxq 48(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
" mulxq 56(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
" movq 56(%1), %%rdx;" /* f[3] */
- " mulxq 40(%1), %%r11, %%r12;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
- " mulxq 48(%1), %%rax, %%r13;" " adcx %%rax, %%r12;" /* f[2]*f[3] */
+ " mulxq 40(%1), %%r11, %%rbx;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 48(%1), %%rax, %%r13;" " adcx %%rax, %%rbx;" /* f[2]*f[3] */
" movq 40(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */
" mulxq 48(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
@@ -662,12 +662,12 @@ static inline void fsqr2(u64 *out, const
" adcx %%r8, %%r8;"
" adox %%rcx, %%r11;"
" adcx %%r9, %%r9;"
- " adox %%r15, %%r12;"
+ " adox %%r15, %%rbx;"
" adcx %%r10, %%r10;"
" adox %%r15, %%r13;"
" adcx %%r11, %%r11;"
" adox %%r15, %%r14;"
- " adcx %%r12, %%r12;"
+ " adcx %%rbx, %%rbx;"
" adcx %%r13, %%r13;"
" adcx %%r14, %%r14;"
@@ -680,7 +680,7 @@ static inline void fsqr2(u64 *out, const
" adcx %%rcx, %%r10;" " movq %%r10, 88(%0);"
" movq 48(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
" adcx %%rax, %%r11;" " movq %%r11, 96(%0);"
- " adcx %%rcx, %%r12;" " movq %%r12, 104(%0);"
+ " adcx %%rcx, %%rbx;" " movq %%rbx, 104(%0);"
" movq 56(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
" adcx %%rax, %%r13;" " movq %%r13, 112(%0);"
" adcx %%rcx, %%r14;" " movq %%r14, 120(%0);"
@@ -694,11 +694,11 @@ static inline void fsqr2(u64 *out, const
" mulxq 32(%1), %%r8, %%r13;"
" xor %%rcx, %%rcx;"
" adoxq 0(%1), %%r8;"
- " mulxq 40(%1), %%r9, %%r12;"
+ " mulxq 40(%1), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
" adoxq 8(%1), %%r9;"
" mulxq 48(%1), %%r10, %%r13;"
- " adcx %%r12, %%r10;"
+ " adcx %%rbx, %%r10;"
" adoxq 16(%1), %%r10;"
" mulxq 56(%1), %%r11, %%rax;"
" adcx %%r13, %%r11;"
@@ -727,11 +727,11 @@ static inline void fsqr2(u64 *out, const
" mulxq 96(%1), %%r8, %%r13;"
" xor %%rcx, %%rcx;"
" adoxq 64(%1), %%r8;"
- " mulxq 104(%1), %%r9, %%r12;"
+ " mulxq 104(%1), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
" adoxq 72(%1), %%r9;"
" mulxq 112(%1), %%r10, %%r13;"
- " adcx %%r12, %%r10;"
+ " adcx %%rbx, %%r10;"
" adoxq 80(%1), %%r10;"
" mulxq 120(%1), %%r11, %%rax;"
" adcx %%r13, %%r11;"
@@ -756,7 +756,7 @@ static inline void fsqr2(u64 *out, const
" movq %%r8, 32(%0);"
: "+&r" (tmp), "+&r" (f), "+&r" (out)
:
- : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory", "cc"
+ : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "%r15", "memory", "cc"
);
}

View File

@ -0,0 +1,35 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Thu, 19 Mar 2020 11:56:17 -0600
Subject: [PATCH] crypto: arm[64]/poly1305 - add artifact to .gitignore files
commit 6e4e00d8b68ca7eb30d08afb740033e0d36abe55 upstream.
The .S_shipped yields a .S, and the pattern in these directories is to
add that to .gitignore so that git-status doesn't raise a fuss.
Fixes: a6b803b3ddc7 ("crypto: arm/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
Fixes: f569ca164751 ("crypto: arm64/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
Reported-by: Emil Renner Berthing <kernel@esmil.dk>
Cc: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/arm/crypto/.gitignore | 1 +
arch/arm64/crypto/.gitignore | 1 +
2 files changed, 2 insertions(+)
--- a/arch/arm/crypto/.gitignore
+++ b/arch/arm/crypto/.gitignore
@@ -1,3 +1,4 @@
aesbs-core.S
sha256-core.S
sha512-core.S
+poly1305-core.S
--- a/arch/arm64/crypto/.gitignore
+++ b/arch/arm64/crypto/.gitignore
@@ -1,2 +1,3 @@
sha256-core.S
sha512-core.S
+poly1305-core.S

View File

@ -0,0 +1,243 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Thu, 23 Apr 2020 15:54:04 -0600
Subject: [PATCH] crypto: arch/lib - limit simd usage to 4k chunks
commit 706024a52c614b478b63f7728d202532ce6591a9 upstream.
The initial Zinc patchset, after some mailing list discussion, contained
code to ensure that kernel_fpu_begin() would not be kept on for more than
a 4k chunk, since it disables preemption. The choice of 4k isn't totally
scientific, but it's not a bad guess either, and it's what's used in
the x86 poly1305, blake2s, and nhpoly1305 code already (in the form
of PAGE_SIZE, which this commit corrects to be explicitly 4k for the
former two).
Ard did some back-of-the-envelope calculations and found that
at 5 cycles/byte (an overestimate) on a 1 GHz processor (pretty slow), 4k
means a maximum preemption-disabled window of roughly 20 µs, which Sebastian
confirmed was probably a good limit.
Unfortunately the chunking appears to have been left out of the final
patchset that added the glue code. So, this commit adds it back in.
Fixes: 84e03fa39fbe ("crypto: x86/chacha - expose SIMD ChaCha routine as library function")
Fixes: b3aad5bad26a ("crypto: arm64/chacha - expose arm64 ChaCha routine as library function")
Fixes: a44a3430d71b ("crypto: arm/chacha - expose ARM ChaCha routine as library function")
Fixes: d7d7b8535662 ("crypto: x86/poly1305 - wire up faster implementations for kernel")
Fixes: f569ca164751 ("crypto: arm64/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
Fixes: a6b803b3ddc7 ("crypto: arm/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
Fixes: ed0356eda153 ("crypto: blake2s - x86_64 SIMD implementation")
Cc: Eric Biggers <ebiggers@google.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: stable@vger.kernel.org
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/arm/crypto/chacha-glue.c | 14 +++++++++++---
arch/arm/crypto/poly1305-glue.c | 15 +++++++++++----
arch/arm64/crypto/chacha-neon-glue.c | 14 +++++++++++---
arch/arm64/crypto/poly1305-glue.c | 15 +++++++++++----
arch/x86/crypto/blake2s-glue.c | 10 ++++------
arch/x86/crypto/chacha_glue.c | 14 +++++++++++---
arch/x86/crypto/poly1305_glue.c | 13 ++++++-------
7 files changed, 65 insertions(+), 30 deletions(-)
--- a/arch/arm/crypto/chacha-glue.c
+++ b/arch/arm/crypto/chacha-glue.c
@@ -91,9 +91,17 @@ void chacha_crypt_arch(u32 *state, u8 *d
return;
}
- kernel_neon_begin();
- chacha_doneon(state, dst, src, bytes, nrounds);
- kernel_neon_end();
+ do {
+ unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+ kernel_neon_begin();
+ chacha_doneon(state, dst, src, todo, nrounds);
+ kernel_neon_end();
+
+ bytes -= todo;
+ src += todo;
+ dst += todo;
+ } while (bytes);
}
EXPORT_SYMBOL(chacha_crypt_arch);
--- a/arch/arm/crypto/poly1305-glue.c
+++ b/arch/arm/crypto/poly1305-glue.c
@@ -160,13 +160,20 @@ void poly1305_update_arch(struct poly130
unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
if (static_branch_likely(&have_neon) && do_neon) {
- kernel_neon_begin();
- poly1305_blocks_neon(&dctx->h, src, len, 1);
- kernel_neon_end();
+ do {
+ unsigned int todo = min_t(unsigned int, len, SZ_4K);
+
+ kernel_neon_begin();
+ poly1305_blocks_neon(&dctx->h, src, todo, 1);
+ kernel_neon_end();
+
+ len -= todo;
+ src += todo;
+ } while (len);
} else {
poly1305_blocks_arm(&dctx->h, src, len, 1);
+ src += len;
}
- src += len;
nbytes %= POLY1305_BLOCK_SIZE;
}
--- a/arch/arm64/crypto/chacha-neon-glue.c
+++ b/arch/arm64/crypto/chacha-neon-glue.c
@@ -87,9 +87,17 @@ void chacha_crypt_arch(u32 *state, u8 *d
!crypto_simd_usable())
return chacha_crypt_generic(state, dst, src, bytes, nrounds);
- kernel_neon_begin();
- chacha_doneon(state, dst, src, bytes, nrounds);
- kernel_neon_end();
+ do {
+ unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+ kernel_neon_begin();
+ chacha_doneon(state, dst, src, todo, nrounds);
+ kernel_neon_end();
+
+ bytes -= todo;
+ src += todo;
+ dst += todo;
+ } while (bytes);
}
EXPORT_SYMBOL(chacha_crypt_arch);
--- a/arch/arm64/crypto/poly1305-glue.c
+++ b/arch/arm64/crypto/poly1305-glue.c
@@ -143,13 +143,20 @@ void poly1305_update_arch(struct poly130
unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
- kernel_neon_begin();
- poly1305_blocks_neon(&dctx->h, src, len, 1);
- kernel_neon_end();
+ do {
+ unsigned int todo = min_t(unsigned int, len, SZ_4K);
+
+ kernel_neon_begin();
+ poly1305_blocks_neon(&dctx->h, src, todo, 1);
+ kernel_neon_end();
+
+ len -= todo;
+ src += todo;
+ } while (len);
} else {
poly1305_blocks(&dctx->h, src, len, 1);
+ src += len;
}
- src += len;
nbytes %= POLY1305_BLOCK_SIZE;
}
--- a/arch/x86/crypto/blake2s-glue.c
+++ b/arch/x86/crypto/blake2s-glue.c
@@ -32,16 +32,16 @@ void blake2s_compress_arch(struct blake2
const u32 inc)
{
/* SIMD disables preemption, so relax after processing each page. */
- BUILD_BUG_ON(PAGE_SIZE / BLAKE2S_BLOCK_SIZE < 8);
+ BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) {
blake2s_compress_generic(state, block, nblocks, inc);
return;
}
- for (;;) {
+ do {
const size_t blocks = min_t(size_t, nblocks,
- PAGE_SIZE / BLAKE2S_BLOCK_SIZE);
+ SZ_4K / BLAKE2S_BLOCK_SIZE);
kernel_fpu_begin();
if (IS_ENABLED(CONFIG_AS_AVX512) &&
@@ -52,10 +52,8 @@ void blake2s_compress_arch(struct blake2
kernel_fpu_end();
nblocks -= blocks;
- if (!nblocks)
- break;
block += blocks * BLAKE2S_BLOCK_SIZE;
- }
+ } while (nblocks);
}
EXPORT_SYMBOL(blake2s_compress_arch);
--- a/arch/x86/crypto/chacha_glue.c
+++ b/arch/x86/crypto/chacha_glue.c
@@ -154,9 +154,17 @@ void chacha_crypt_arch(u32 *state, u8 *d
bytes <= CHACHA_BLOCK_SIZE)
return chacha_crypt_generic(state, dst, src, bytes, nrounds);
- kernel_fpu_begin();
- chacha_dosimd(state, dst, src, bytes, nrounds);
- kernel_fpu_end();
+ do {
+ unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+ kernel_fpu_begin();
+ chacha_dosimd(state, dst, src, todo, nrounds);
+ kernel_fpu_end();
+
+ bytes -= todo;
+ src += todo;
+ dst += todo;
+ } while (bytes);
}
EXPORT_SYMBOL(chacha_crypt_arch);
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -91,8 +91,8 @@ static void poly1305_simd_blocks(void *c
struct poly1305_arch_internal *state = ctx;
/* SIMD disables preemption, so relax after processing each page. */
- BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE ||
- PAGE_SIZE % POLY1305_BLOCK_SIZE);
+ BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE ||
+ SZ_4K % POLY1305_BLOCK_SIZE);
if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) ||
(len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) ||
@@ -102,8 +102,8 @@ static void poly1305_simd_blocks(void *c
return;
}
- for (;;) {
- const size_t bytes = min_t(size_t, len, PAGE_SIZE);
+ do {
+ const size_t bytes = min_t(size_t, len, SZ_4K);
kernel_fpu_begin();
if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512))
@@ -113,11 +113,10 @@ static void poly1305_simd_blocks(void *c
else
poly1305_blocks_avx(ctx, inp, bytes, padbit);
kernel_fpu_end();
+
len -= bytes;
- if (!len)
- break;
inp += bytes;
- }
+ } while (len);
}
static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
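The hunks above all apply the same transformation; a minimal C sketch of the pattern, reusing the helper names from the x86 glue code above (illustrative only, not a drop-in replacement, and it assumes bytes > 0 on entry as the callers do):

/* SIMD regions run with preemption disabled, so long inputs are split
 * into at most 4 KiB pieces and the FPU section is reopened for each
 * piece, bounding scheduling latency. */
static void xor_stream_in_chunks(u32 *state, u8 *dst, const u8 *src,
				 unsigned int bytes, int nrounds)
{
	do {
		unsigned int todo = min_t(unsigned int, bytes, SZ_4K);

		kernel_fpu_begin();		/* preemption off from here ... */
		chacha_dosimd(state, dst, src, todo, nrounds);
		kernel_fpu_end();		/* ... until here */

		bytes -= todo;
		src += todo;
		dst += todo;
	} while (bytes);
}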

View File

@ -0,0 +1,38 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 8 Jul 2020 12:41:13 +1000
Subject: [PATCH] crypto: lib/chacha20poly1305 - Add missing function
declaration
commit 06cc2afbbdf9a9e8df3e2f8db724997dd6e1b4ac upstream.
This patch adds a declaration for chacha20poly1305_selftest to
silence a sparse warning.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
include/crypto/chacha20poly1305.h | 2 ++
lib/crypto/chacha20poly1305.c | 2 --
2 files changed, 2 insertions(+), 2 deletions(-)
--- a/include/crypto/chacha20poly1305.h
+++ b/include/crypto/chacha20poly1305.h
@@ -45,4 +45,6 @@ bool chacha20poly1305_decrypt_sg_inplace
const u64 nonce,
const u8 key[CHACHA20POLY1305_KEY_SIZE]);
+bool chacha20poly1305_selftest(void);
+
#endif /* __CHACHA20POLY1305_H */
--- a/lib/crypto/chacha20poly1305.c
+++ b/lib/crypto/chacha20poly1305.c
@@ -21,8 +21,6 @@
#define CHACHA_KEY_WORDS (CHACHA_KEY_SIZE / sizeof(u32))
-bool __init chacha20poly1305_selftest(void);
-
static void chacha_load_key(u32 *k, const u8 *in)
{
k[0] = get_unaligned_le32(in);

View File

@ -0,0 +1,147 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Wed, 8 Jul 2020 12:11:18 +0300
Subject: [PATCH] crypto: x86/chacha-sse3 - use unaligned loads for state array
commit e79a31715193686e92dadb4caedfbb1f5de3659c upstream.
Due to the fact that the x86 port does not support allocating objects
on the stack with an alignment that exceeds 8 bytes, we have a rather
ugly hack in the x86 code for ChaCha to ensure that the state array is
aligned to 16 bytes, allowing the SSE3 implementation of the algorithm
to use aligned loads.
Given that the performance benefit of using of aligned loads appears to
be limited (~0.25% for 1k blocks using tcrypt on a Corei7-8650U), and
the fact that this hack has leaked into generic ChaCha code, let's just
remove it.
Cc: Martin Willi <martin@strongswan.org>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Martin Willi <martin@strongswan.org>
Reviewed-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/x86/crypto/chacha-ssse3-x86_64.S | 16 ++++++++--------
arch/x86/crypto/chacha_glue.c | 17 ++---------------
include/crypto/chacha.h | 4 ----
3 files changed, 10 insertions(+), 27 deletions(-)
--- a/arch/x86/crypto/chacha-ssse3-x86_64.S
+++ b/arch/x86/crypto/chacha-ssse3-x86_64.S
@@ -120,10 +120,10 @@ ENTRY(chacha_block_xor_ssse3)
FRAME_BEGIN
# x0..3 = s0..3
- movdqa 0x00(%rdi),%xmm0
- movdqa 0x10(%rdi),%xmm1
- movdqa 0x20(%rdi),%xmm2
- movdqa 0x30(%rdi),%xmm3
+ movdqu 0x00(%rdi),%xmm0
+ movdqu 0x10(%rdi),%xmm1
+ movdqu 0x20(%rdi),%xmm2
+ movdqu 0x30(%rdi),%xmm3
movdqa %xmm0,%xmm8
movdqa %xmm1,%xmm9
movdqa %xmm2,%xmm10
@@ -205,10 +205,10 @@ ENTRY(hchacha_block_ssse3)
# %edx: nrounds
FRAME_BEGIN
- movdqa 0x00(%rdi),%xmm0
- movdqa 0x10(%rdi),%xmm1
- movdqa 0x20(%rdi),%xmm2
- movdqa 0x30(%rdi),%xmm3
+ movdqu 0x00(%rdi),%xmm0
+ movdqu 0x10(%rdi),%xmm1
+ movdqu 0x20(%rdi),%xmm2
+ movdqu 0x30(%rdi),%xmm3
mov %edx,%r8d
call chacha_permute
--- a/arch/x86/crypto/chacha_glue.c
+++ b/arch/x86/crypto/chacha_glue.c
@@ -14,8 +14,6 @@
#include <linux/module.h>
#include <asm/simd.h>
-#define CHACHA_STATE_ALIGN 16
-
asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
@@ -125,8 +123,6 @@ static void chacha_dosimd(u32 *state, u8
void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
{
- state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
-
if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) {
hchacha_block_generic(state, stream, nrounds);
} else {
@@ -139,8 +135,6 @@ EXPORT_SYMBOL(hchacha_block_arch);
void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
{
- state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
-
chacha_init_generic(state, key, iv);
}
EXPORT_SYMBOL(chacha_init_arch);
@@ -148,8 +142,6 @@ EXPORT_SYMBOL(chacha_init_arch);
void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
int nrounds)
{
- state = PTR_ALIGN(state, CHACHA_STATE_ALIGN);
-
if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() ||
bytes <= CHACHA_BLOCK_SIZE)
return chacha_crypt_generic(state, dst, src, bytes, nrounds);
@@ -171,15 +163,12 @@ EXPORT_SYMBOL(chacha_crypt_arch);
static int chacha_simd_stream_xor(struct skcipher_request *req,
const struct chacha_ctx *ctx, const u8 *iv)
{
- u32 *state, state_buf[16 + 2] __aligned(8);
+ u32 state[CHACHA_STATE_WORDS] __aligned(8);
struct skcipher_walk walk;
int err;
err = skcipher_walk_virt(&walk, req, false);
- BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
- state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
-
chacha_init_generic(state, ctx->key, iv);
while (walk.nbytes > 0) {
@@ -218,12 +207,10 @@ static int xchacha_simd(struct skcipher_
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
- u32 *state, state_buf[16 + 2] __aligned(8);
+ u32 state[CHACHA_STATE_WORDS] __aligned(8);
struct chacha_ctx subctx;
u8 real_iv[16];
- BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
- state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
chacha_init_generic(state, ctx->key, req->iv);
if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) {
--- a/include/crypto/chacha.h
+++ b/include/crypto/chacha.h
@@ -25,11 +25,7 @@
#define CHACHA_BLOCK_SIZE 64
#define CHACHAPOLY_IV_SIZE 12
-#ifdef CONFIG_X86_64
-#define CHACHA_STATE_WORDS ((CHACHA_BLOCK_SIZE + 12) / sizeof(u32))
-#else
#define CHACHA_STATE_WORDS (CHACHA_BLOCK_SIZE / sizeof(u32))
-#endif
/* 192-bit nonce, then 64-bit stream position */
#define XCHACHA_IV_SIZE 32
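For reference, the stack trick being removed works as sketched below (simplified from the old glue code); two spare u32 words are enough because an 8-byte-aligned buffer is at most 8 bytes away from the next 16-byte boundary:

/* Old approach: over-allocate by 8 bytes, then round the pointer up so
 * the 64-byte state is safe for aligned movdqa loads. */
static void old_aligned_state(void)
{
	u32 state_buf[16 + 2] __aligned(8);	/* 8 bytes of headroom */
	u32 *state = PTR_ALIGN(state_buf, 16);	/* at most 8 bytes in, still in bounds */

	memset(state, 0, 16 * sizeof(*state));	/* movdqa-safe 16-word state */
}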

View File

@ -0,0 +1,46 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 23 Jul 2020 17:50:48 +1000
Subject: [PATCH] crypto: x86/curve25519 - Remove unused carry variables
commit 054a5540fb8f7268e2c79e9deab4242db15c8cba upstream.
The carry variables are assigned but never used, which upsets
the compiler. This patch removes them.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Reviewed-by: Karthikeyan Bhargavan <karthik.bhargavan@gmail.com>
Acked-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/x86/crypto/curve25519-x86_64.c | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
--- a/arch/x86/crypto/curve25519-x86_64.c
+++ b/arch/x86/crypto/curve25519-x86_64.c
@@ -948,10 +948,8 @@ static void store_felem(u64 *b, u64 *f)
{
u64 f30 = f[3U];
u64 top_bit0 = f30 >> (u32)63U;
- u64 carry0;
u64 f31;
u64 top_bit;
- u64 carry;
u64 f0;
u64 f1;
u64 f2;
@@ -970,11 +968,11 @@ static void store_felem(u64 *b, u64 *f)
u64 o2;
u64 o3;
f[3U] = f30 & (u64)0x7fffffffffffffffU;
- carry0 = add_scalar(f, f, (u64)19U * top_bit0);
+ add_scalar(f, f, (u64)19U * top_bit0);
f31 = f[3U];
top_bit = f31 >> (u32)63U;
f[3U] = f31 & (u64)0x7fffffffffffffffU;
- carry = add_scalar(f, f, (u64)19U * top_bit);
+ add_scalar(f, f, (u64)19U * top_bit);
f0 = f[0U];
f1 = f[1U];
f2 = f[2U];

View File

@ -0,0 +1,36 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Fabio Estevam <festevam@gmail.com>
Date: Mon, 24 Aug 2020 11:09:53 -0300
Subject: [PATCH] crypto: arm/curve25519 - include <linux/scatterlist.h>
commit 6779d0e6b0fe193ab3010ea201782ca6f75a3862 upstream.
Building ARM allmodconfig leads to the following warnings:
arch/arm/crypto/curve25519-glue.c:73:12: error: implicit declaration of function 'sg_copy_to_buffer' [-Werror=implicit-function-declaration]
arch/arm/crypto/curve25519-glue.c:74:9: error: implicit declaration of function 'sg_nents_for_len' [-Werror=implicit-function-declaration]
arch/arm/crypto/curve25519-glue.c:88:11: error: implicit declaration of function 'sg_copy_from_buffer' [-Werror=implicit-function-declaration]
Include <linux/scatterlist.h> to fix such warnings
Reported-by: Olof's autobuilder <build@lixom.net>
Fixes: 0c3dc787a62a ("crypto: algapi - Remove skbuff.h inclusion")
Signed-off-by: Fabio Estevam <festevam@gmail.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/arm/crypto/curve25519-glue.c | 1 +
1 file changed, 1 insertion(+)
--- a/arch/arm/crypto/curve25519-glue.c
+++ b/arch/arm/crypto/curve25519-glue.c
@@ -16,6 +16,7 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/jump_label.h>
+#include <linux/scatterlist.h>
#include <crypto/curve25519.h>
asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE],

View File

@ -0,0 +1,33 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 25 Aug 2020 11:23:00 +1000
Subject: [PATCH] crypto: arm/poly1305 - Add prototype for poly1305_blocks_neon
commit 51982ea02aef972132eb35c583d3e4c5b83166e5 upstream.
This patch adds a prototype for poly1305_blocks_neon to silence
a compiler warning:
CC [M] arch/arm/crypto/poly1305-glue.o
../arch/arm/crypto/poly1305-glue.c:25:13: warning: no previous prototype for `poly1305_blocks_neon' [-Wmissing-prototypes]
void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit)
^~~~~~~~~~~~~~~~~~~~
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Acked-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/arm/crypto/poly1305-glue.c | 1 +
1 file changed, 1 insertion(+)
--- a/arch/arm/crypto/poly1305-glue.c
+++ b/arch/arm/crypto/poly1305-glue.c
@@ -20,6 +20,7 @@
void poly1305_init_arm(void *state, const u8 *key);
void poly1305_blocks_arm(void *state, const u8 *src, u32 len, u32 hibit);
+void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
void poly1305_emit_arm(void *state, u8 *digest, const u32 *nonce);
void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit)

View File

@ -0,0 +1,261 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Thu, 27 Aug 2020 19:30:58 +0200
Subject: [PATCH] crypto: curve25519-x86_64 - Use XORL r32,32
commit db719539fd3889836900bf912755aa30a5985e9a upstream.
x86_64 zero extends 32bit operations, so for 64bit operands,
XORL r32,r32 is functionally equal to XORL r64,r64, but avoids
a REX prefix byte when legacy registers are used.
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "David S. Miller" <davem@davemloft.net>
Acked-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/x86/crypto/curve25519-x86_64.c | 68 ++++++++++++++---------------
1 file changed, 34 insertions(+), 34 deletions(-)
--- a/arch/x86/crypto/curve25519-x86_64.c
+++ b/arch/x86/crypto/curve25519-x86_64.c
@@ -45,11 +45,11 @@ static inline u64 add_scalar(u64 *out, c
asm volatile(
/* Clear registers to propagate the carry bit */
- " xor %%r8, %%r8;"
- " xor %%r9, %%r9;"
- " xor %%r10, %%r10;"
- " xor %%r11, %%r11;"
- " xor %1, %1;"
+ " xor %%r8d, %%r8d;"
+ " xor %%r9d, %%r9d;"
+ " xor %%r10d, %%r10d;"
+ " xor %%r11d, %%r11d;"
+ " xor %k1, %k1;"
/* Begin addition chain */
" addq 0(%3), %0;"
@@ -93,7 +93,7 @@ static inline void fadd(u64 *out, const
" cmovc %0, %%rax;"
/* Step 2: Add carry*38 to the original sum */
- " xor %%rcx, %%rcx;"
+ " xor %%ecx, %%ecx;"
" add %%rax, %%r8;"
" adcx %%rcx, %%r9;"
" movq %%r9, 8(%1);"
@@ -165,28 +165,28 @@ static inline void fmul(u64 *out, const
/* Compute src1[0] * src2 */
" movq 0(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 0(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;"
/* Compute src1[1] * src2 */
" movq 8(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);"
" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
/* Compute src1[2] * src2 */
" movq 16(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);"
" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
/* Compute src1[3] * src2 */
" movq 24(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);"
" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
@@ -200,7 +200,7 @@ static inline void fmul(u64 *out, const
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 32(%1), %%r8, %%r13;"
- " xor %3, %3;"
+ " xor %k3, %k3;"
" adoxq 0(%1), %%r8;"
" mulxq 40(%1), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
@@ -246,28 +246,28 @@ static inline void fmul2(u64 *out, const
/* Compute src1[0] * src2 */
" movq 0(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 0(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;"
/* Compute src1[1] * src2 */
" movq 8(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);"
" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
/* Compute src1[2] * src2 */
" movq 16(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);"
" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
/* Compute src1[3] * src2 */
" movq 24(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);"
" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
@@ -277,29 +277,29 @@ static inline void fmul2(u64 *out, const
/* Compute src1[0] * src2 */
" movq 32(%1), %%rdx;"
- " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 64(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 72(%0);"
+ " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 64(%0);"
+ " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 72(%0);"
" mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;"
/* Compute src1[1] * src2 */
" movq 40(%1), %%rdx;"
- " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 72(%0), %%r8;" " movq %%r8, 72(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 80(%0);"
+ " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 72(%0), %%r8;" " movq %%r8, 72(%0);"
+ " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 80(%0);"
" mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
/* Compute src1[2] * src2 */
" movq 48(%1), %%rdx;"
- " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 80(%0), %%r8;" " movq %%r8, 80(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 88(%0);"
+ " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 80(%0), %%r8;" " movq %%r8, 80(%0);"
+ " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 88(%0);"
" mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
/* Compute src1[3] * src2 */
" movq 56(%1), %%rdx;"
- " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 88(%0), %%r8;" " movq %%r8, 88(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 96(%0);"
+ " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 88(%0), %%r8;" " movq %%r8, 88(%0);"
+ " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 96(%0);"
" mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 104(%0);" " mov $0, %%r8;"
" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 112(%0);" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 120(%0);"
@@ -312,7 +312,7 @@ static inline void fmul2(u64 *out, const
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 32(%1), %%r8, %%r13;"
- " xor %3, %3;"
+ " xor %k3, %k3;"
" adoxq 0(%1), %%r8;"
" mulxq 40(%1), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
@@ -345,7 +345,7 @@ static inline void fmul2(u64 *out, const
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 96(%1), %%r8, %%r13;"
- " xor %3, %3;"
+ " xor %k3, %k3;"
" adoxq 64(%1), %%r8;"
" mulxq 104(%1), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
@@ -516,7 +516,7 @@ static inline void fsqr(u64 *out, const
/* Step 1: Compute all partial products */
" movq 0(%1), %%rdx;" /* f[0] */
- " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
+ " mulxq 8(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
" mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
" mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
" movq 24(%1), %%rdx;" /* f[3] */
@@ -526,7 +526,7 @@ static inline void fsqr(u64 *out, const
" mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
/* Step 2: Compute two parallel carry chains */
- " xor %%r15, %%r15;"
+ " xor %%r15d, %%r15d;"
" adox %%rax, %%r10;"
" adcx %%r8, %%r8;"
" adox %%rcx, %%r11;"
@@ -563,7 +563,7 @@ static inline void fsqr(u64 *out, const
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 32(%1), %%r8, %%r13;"
- " xor %%rcx, %%rcx;"
+ " xor %%ecx, %%ecx;"
" adoxq 0(%1), %%r8;"
" mulxq 40(%1), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
@@ -607,7 +607,7 @@ static inline void fsqr2(u64 *out, const
asm volatile(
/* Step 1: Compute all partial products */
" movq 0(%1), %%rdx;" /* f[0] */
- " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
+ " mulxq 8(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
" mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
" mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
" movq 24(%1), %%rdx;" /* f[3] */
@@ -617,7 +617,7 @@ static inline void fsqr2(u64 *out, const
" mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
/* Step 2: Compute two parallel carry chains */
- " xor %%r15, %%r15;"
+ " xor %%r15d, %%r15d;"
" adox %%rax, %%r10;"
" adcx %%r8, %%r8;"
" adox %%rcx, %%r11;"
@@ -647,7 +647,7 @@ static inline void fsqr2(u64 *out, const
/* Step 1: Compute all partial products */
" movq 32(%1), %%rdx;" /* f[0] */
- " mulxq 40(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
+ " mulxq 40(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
" mulxq 48(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
" mulxq 56(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
" movq 56(%1), %%rdx;" /* f[3] */
@@ -657,7 +657,7 @@ static inline void fsqr2(u64 *out, const
" mulxq 48(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
/* Step 2: Compute two parallel carry chains */
- " xor %%r15, %%r15;"
+ " xor %%r15d, %%r15d;"
" adox %%rax, %%r10;"
" adcx %%r8, %%r8;"
" adox %%rcx, %%r11;"
@@ -692,7 +692,7 @@ static inline void fsqr2(u64 *out, const
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 32(%1), %%r8, %%r13;"
- " xor %%rcx, %%rcx;"
+ " xor %%ecx, %%ecx;"
" adoxq 0(%1), %%r8;"
" mulxq 40(%1), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
@@ -725,7 +725,7 @@ static inline void fsqr2(u64 *out, const
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 96(%1), %%r8, %%r13;"
- " xor %%rcx, %%rcx;"
+ " xor %%ecx, %%ecx;"
" adoxq 64(%1), %%r8;"
" mulxq 104(%1), %%r9, %%rbx;"
" adcx %%r13, %%r9;"

View File

@ -0,0 +1,59 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Thu, 27 Aug 2020 19:38:31 +0200
Subject: [PATCH] crypto: poly1305-x86_64 - Use XORL r32,32
commit 7dfd1e01b3dfc13431b1b25720cf2692a7e111ef upstream.
x86_64 zero extends 32bit operations, so for 64bit operands,
XORL r32,r32 is functionally equal to XORQ r64,r64, but avoids
a REX prefix byte when legacy registers are used.
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "David S. Miller" <davem@davemloft.net>
Acked-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/x86/crypto/poly1305-x86_64-cryptogams.pl | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
--- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
+++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
@@ -246,7 +246,7 @@ $code.=<<___ if (!$kernel);
___
&declare_function("poly1305_init_x86_64", 32, 3);
$code.=<<___;
- xor %rax,%rax
+ xor %eax,%eax
mov %rax,0($ctx) # initialize hash value
mov %rax,8($ctx)
mov %rax,16($ctx)
@@ -2869,7 +2869,7 @@ $code.=<<___;
.type poly1305_init_base2_44,\@function,3
.align 32
poly1305_init_base2_44:
- xor %rax,%rax
+ xor %eax,%eax
mov %rax,0($ctx) # initialize hash value
mov %rax,8($ctx)
mov %rax,16($ctx)
@@ -3963,7 +3963,7 @@ xor128_decrypt_n_pad:
mov \$16,$len
sub %r10,$len
xor %eax,%eax
- xor %r11,%r11
+ xor %r11d,%r11d
.Loop_dec_byte:
mov ($inp,$otp),%r11b
mov ($otp),%al
@@ -4101,7 +4101,7 @@ avx_handler:
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
- xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ xor %ecx,%ecx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry

View File

@ -0,0 +1,29 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 24 Sep 2020 13:29:04 +1000
Subject: [PATCH] crypto: x86/poly1305 - Remove assignments with no effect
commit 4a0c1de64bf9d9027a6f19adfba89fc27893db23 upstream.
This patch removes a few ineffectual assignments from the function
crypto_poly1305_setdctxkey.
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/x86/crypto/poly1305_glue.c | 3 ---
1 file changed, 3 deletions(-)
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -157,9 +157,6 @@ static unsigned int crypto_poly1305_setd
dctx->s[1] = get_unaligned_le32(&inp[4]);
dctx->s[2] = get_unaligned_le32(&inp[8]);
dctx->s[3] = get_unaligned_le32(&inp[12]);
- inp += POLY1305_BLOCK_SIZE;
- len -= POLY1305_BLOCK_SIZE;
- acc += POLY1305_BLOCK_SIZE;
dctx->sset = true;
}
}

View File

@ -0,0 +1,33 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 23 Oct 2020 15:27:48 -0700
Subject: [PATCH] crypto: x86/poly1305 - add back a needed assignment
commit c3a98c3ad5c0dc60a1ac66bf91147a3f39cac96b upstream.
One of the assignments that was removed by commit 4a0c1de64bf9 ("crypto:
x86/poly1305 - Remove assignments with no effect") is actually needed,
since it affects the return value.
This fixes the following crypto self-test failure:
alg: shash: poly1305-simd test failed (wrong result) on test vector 2, cfg="init+update+final aligned buffer"
Fixes: 4a0c1de64bf9 ("crypto: x86/poly1305 - Remove assignments with no effect")
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/x86/crypto/poly1305_glue.c | 1 +
1 file changed, 1 insertion(+)
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -157,6 +157,7 @@ static unsigned int crypto_poly1305_setd
dctx->s[1] = get_unaligned_le32(&inp[4]);
dctx->s[2] = get_unaligned_le32(&inp[8]);
dctx->s[3] = get_unaligned_le32(&inp[12]);
+ acc += POLY1305_BLOCK_SIZE;
dctx->sset = true;
}
}
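The reason this single assignment matters is the return value; a hypothetical, simplified sketch of the helper (field and function names abbreviated, not the exact kernel code):

/* The helper returns how many input bytes it consumed as key material,
 * and the caller advances src/len by that amount before hashing the
 * rest. Dropping the acc update under-reports consumption. */
static unsigned int consume_key_bytes(struct poly1305_desc_ctx *dctx,
				      const u8 *inp, unsigned int len)
{
	unsigned int acc = 0;

	if (!dctx->sset && len >= POLY1305_BLOCK_SIZE) {
		dctx->s[0] = get_unaligned_le32(&inp[0]);
		dctx->s[1] = get_unaligned_le32(&inp[4]);
		dctx->s[2] = get_unaligned_le32(&inp[8]);
		dctx->s[3] = get_unaligned_le32(&inp[12]);
		acc += POLY1305_BLOCK_SIZE;	/* the assignment this fix restores */
		dctx->sset = true;
	}
	return acc;
}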

View File

@ -0,0 +1,33 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Mon, 2 Nov 2020 14:48:15 +0100
Subject: [PATCH] crypto: Kconfig - CRYPTO_MANAGER_EXTRA_TESTS requires the
manager
commit 6569e3097f1c4a490bdf2b23d326855e04942dfd upstream.
The extra tests in the manager actually require the manager to be
selected too. Otherwise the linker gives errors like:
ld: arch/x86/crypto/chacha_glue.o: in function `chacha_simd_stream_xor':
chacha_glue.c:(.text+0x422): undefined reference to `crypto_simd_disabled_for_test'
Fixes: 2343d1529aff ("crypto: Kconfig - allow tests to be disabled when manager is disabled")
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
crypto/Kconfig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -145,7 +145,7 @@ config CRYPTO_MANAGER_DISABLE_TESTS
config CRYPTO_MANAGER_EXTRA_TESTS
bool "Enable extra run-time crypto self tests"
- depends on DEBUG_KERNEL && !CRYPTO_MANAGER_DISABLE_TESTS
+ depends on DEBUG_KERNEL && !CRYPTO_MANAGER_DISABLE_TESTS && CRYPTO_MANAGER
help
Enable extra run-time self tests of registered crypto algorithms,
including randomized fuzz tests.

View File

@ -0,0 +1,272 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Tue, 3 Nov 2020 17:28:09 +0100
Subject: [PATCH] crypto: arm/chacha-neon - optimize for non-block size
multiples
commit 86cd97ec4b943af35562a74688bc4e909b32c3d1 upstream.
The current NEON based ChaCha implementation for ARM is optimized for
multiples of 4x the ChaCha block size (64 bytes). This makes sense for
block encryption, but given that ChaCha is also often used in the
context of networking, it makes sense to consider arbitrary length
inputs as well.
For example, WireGuard typically uses 1420 byte packets, and performing
ChaCha encryption involves 5 invocations of chacha_4block_xor_neon()
and 3 invocations of chacha_block_xor_neon(), where the last one also
involves a memcpy() using a buffer on the stack to process the final
chunk of 1420 % 64 == 12 bytes.
Let's optimize for this case as well, by letting chacha_4block_xor_neon()
deal with any input size between 64 and 256 bytes, using NEON permutation
instructions and overlapping loads and stores. This way, the 140 byte
tail of a 1420 byte input buffer can simply be processed in one go.
This results in the following performance improvements for 1420 byte
blocks, without significant impact on power-of-2 input sizes. (Note
that Raspberry Pi is widely used in combination with a 32-bit kernel,
even though the core is 64-bit capable)
Cortex-A8 (BeagleBone) : 7%
Cortex-A15 (Calxeda Midway) : 21%
Cortex-A53 (Raspberry Pi 3) : 3%
Cortex-A72 (Raspberry Pi 4) : 19%
Cc: Eric Biggers <ebiggers@google.com>
Cc: "Jason A . Donenfeld" <Jason@zx2c4.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/arm/crypto/chacha-glue.c | 34 +++++------
arch/arm/crypto/chacha-neon-core.S | 97 +++++++++++++++++++++++++++---
2 files changed, 107 insertions(+), 24 deletions(-)
--- a/arch/arm/crypto/chacha-glue.c
+++ b/arch/arm/crypto/chacha-glue.c
@@ -23,7 +23,7 @@
asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
int nrounds);
asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
- int nrounds);
+ int nrounds, unsigned int nbytes);
asmlinkage void hchacha_block_arm(const u32 *state, u32 *out, int nrounds);
asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
@@ -42,24 +42,24 @@ static void chacha_doneon(u32 *state, u8
{
u8 buf[CHACHA_BLOCK_SIZE];
- while (bytes >= CHACHA_BLOCK_SIZE * 4) {
- chacha_4block_xor_neon(state, dst, src, nrounds);
- bytes -= CHACHA_BLOCK_SIZE * 4;
- src += CHACHA_BLOCK_SIZE * 4;
- dst += CHACHA_BLOCK_SIZE * 4;
- state[12] += 4;
- }
- while (bytes >= CHACHA_BLOCK_SIZE) {
- chacha_block_xor_neon(state, dst, src, nrounds);
- bytes -= CHACHA_BLOCK_SIZE;
- src += CHACHA_BLOCK_SIZE;
- dst += CHACHA_BLOCK_SIZE;
- state[12]++;
+ while (bytes > CHACHA_BLOCK_SIZE) {
+ unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U);
+
+ chacha_4block_xor_neon(state, dst, src, nrounds, l);
+ bytes -= l;
+ src += l;
+ dst += l;
+ state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
}
if (bytes) {
- memcpy(buf, src, bytes);
- chacha_block_xor_neon(state, buf, buf, nrounds);
- memcpy(dst, buf, bytes);
+ const u8 *s = src;
+ u8 *d = dst;
+
+ if (bytes != CHACHA_BLOCK_SIZE)
+ s = d = memcpy(buf, src, bytes);
+ chacha_block_xor_neon(state, d, s, nrounds);
+ if (d != dst)
+ memcpy(dst, buf, bytes);
}
}
--- a/arch/arm/crypto/chacha-neon-core.S
+++ b/arch/arm/crypto/chacha-neon-core.S
@@ -47,6 +47,7 @@
*/
#include <linux/linkage.h>
+#include <asm/cache.h>
.text
.fpu neon
@@ -205,7 +206,7 @@ ENDPROC(hchacha_block_neon)
.align 5
ENTRY(chacha_4block_xor_neon)
- push {r4-r5}
+ push {r4, lr}
mov r4, sp // preserve the stack pointer
sub ip, sp, #0x20 // allocate a 32 byte buffer
bic ip, ip, #0x1f // aligned to 32 bytes
@@ -229,10 +230,10 @@ ENTRY(chacha_4block_xor_neon)
vld1.32 {q0-q1}, [r0]
vld1.32 {q2-q3}, [ip]
- adr r5, .Lctrinc
+ adr lr, .Lctrinc
vdup.32 q15, d7[1]
vdup.32 q14, d7[0]
- vld1.32 {q4}, [r5, :128]
+ vld1.32 {q4}, [lr, :128]
vdup.32 q13, d6[1]
vdup.32 q12, d6[0]
vdup.32 q11, d5[1]
@@ -455,7 +456,7 @@ ENTRY(chacha_4block_xor_neon)
// Re-interleave the words in the first two rows of each block (x0..7).
// Also add the counter values 0-3 to x12[0-3].
- vld1.32 {q8}, [r5, :128] // load counter values 0-3
+ vld1.32 {q8}, [lr, :128] // load counter values 0-3
vzip.32 q0, q1 // => (0 1 0 1) (0 1 0 1)
vzip.32 q2, q3 // => (2 3 2 3) (2 3 2 3)
vzip.32 q4, q5 // => (4 5 4 5) (4 5 4 5)
@@ -493,6 +494,8 @@ ENTRY(chacha_4block_xor_neon)
// Re-interleave the words in the last two rows of each block (x8..15).
vld1.32 {q8-q9}, [sp, :256]
+ mov sp, r4 // restore original stack pointer
+ ldr r4, [r4, #8] // load number of bytes
vzip.32 q12, q13 // => (12 13 12 13) (12 13 12 13)
vzip.32 q14, q15 // => (14 15 14 15) (14 15 14 15)
vzip.32 q8, q9 // => (8 9 8 9) (8 9 8 9)
@@ -520,41 +523,121 @@ ENTRY(chacha_4block_xor_neon)
// XOR the rest of the data with the keystream
vld1.8 {q0-q1}, [r2]!
+ subs r4, r4, #96
veor q0, q0, q8
veor q1, q1, q12
+ ble .Lle96
vst1.8 {q0-q1}, [r1]!
vld1.8 {q0-q1}, [r2]!
+ subs r4, r4, #32
veor q0, q0, q2
veor q1, q1, q6
+ ble .Lle128
vst1.8 {q0-q1}, [r1]!
vld1.8 {q0-q1}, [r2]!
+ subs r4, r4, #32
veor q0, q0, q10
veor q1, q1, q14
+ ble .Lle160
vst1.8 {q0-q1}, [r1]!
vld1.8 {q0-q1}, [r2]!
+ subs r4, r4, #32
veor q0, q0, q4
veor q1, q1, q5
+ ble .Lle192
vst1.8 {q0-q1}, [r1]!
vld1.8 {q0-q1}, [r2]!
+ subs r4, r4, #32
veor q0, q0, q9
veor q1, q1, q13
+ ble .Lle224
vst1.8 {q0-q1}, [r1]!
vld1.8 {q0-q1}, [r2]!
+ subs r4, r4, #32
veor q0, q0, q3
veor q1, q1, q7
+ blt .Llt256
+.Lout:
vst1.8 {q0-q1}, [r1]!
vld1.8 {q0-q1}, [r2]
- mov sp, r4 // restore original stack pointer
veor q0, q0, q11
veor q1, q1, q15
vst1.8 {q0-q1}, [r1]
- pop {r4-r5}
- bx lr
+ pop {r4, pc}
+
+.Lle192:
+ vmov q4, q9
+ vmov q5, q13
+
+.Lle160:
+ // nothing to do
+
+.Lfinalblock:
+ // Process the final block if processing less than 4 full blocks.
+ // Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
+ // previous 32 byte output block that still needs to be written at
+ // [r1] in q0-q1.
+ beq .Lfullblock
+
+.Lpartialblock:
+ adr lr, .Lpermute + 32
+ add r2, r2, r4
+ add lr, lr, r4
+ add r4, r4, r1
+
+ vld1.8 {q2-q3}, [lr]
+ vld1.8 {q6-q7}, [r2]
+
+ add r4, r4, #32
+
+ vtbl.8 d4, {q4-q5}, d4
+ vtbl.8 d5, {q4-q5}, d5
+ vtbl.8 d6, {q4-q5}, d6
+ vtbl.8 d7, {q4-q5}, d7
+
+ veor q6, q6, q2
+ veor q7, q7, q3
+
+ vst1.8 {q6-q7}, [r4] // overlapping stores
+ vst1.8 {q0-q1}, [r1]
+ pop {r4, pc}
+
+.Lfullblock:
+ vmov q11, q4
+ vmov q15, q5
+ b .Lout
+.Lle96:
+ vmov q4, q2
+ vmov q5, q6
+ b .Lfinalblock
+.Lle128:
+ vmov q4, q10
+ vmov q5, q14
+ b .Lfinalblock
+.Lle224:
+ vmov q4, q3
+ vmov q5, q7
+ b .Lfinalblock
+.Llt256:
+ vmov q4, q11
+ vmov q5, q15
+ b .Lpartialblock
ENDPROC(chacha_4block_xor_neon)
+
+ .align L1_CACHE_SHIFT
+.Lpermute:
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+ .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+ .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+ .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+ .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
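A worked example of the new dispatch for the 1420-byte case mentioned above (illustrative only; the real NEON call is shown as a comment):

static unsigned int chunk_schedule_1420(void)
{
	unsigned int bytes = 1420, calls = 0;

	while (bytes > 64) {			/* bytes > CHACHA_BLOCK_SIZE */
		unsigned int l = bytes < 256 ? bytes : 256;	/* min(bytes, 4 blocks) */

		/* chacha_4block_xor_neon(state, dst, src, nrounds, l); */
		bytes -= l;
		calls++;
	}
	return calls;	/* 6 calls: five 256-byte chunks plus the 140-byte tail */
}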

View File

@ -0,0 +1,324 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 6 Nov 2020 17:39:38 +0100
Subject: [PATCH] crypto: arm64/chacha - simplify tail block handling
commit c4fc6328d6c67690a7e6e03f43a5a976a13120ef upstream.
Based on lessons learnt from optimizing the 32-bit version of this driver,
we can simplify the arm64 version considerably, by reordering the final
two stores when the last block is not a multiple of 64 bytes. This removes
the need to use permutation instructions to calculate the elements that are
clobbered by the final overlapping store, given that the store of the
penultimate block now follows it, and that one carries the correct values
for those elements already.
While at it, simplify the overlapping loads as well, by calculating the
address of the final overlapping load upfront, and switching to this
address for every load that would otherwise extend past the end of the
source buffer.
There is no impact on performance, but the resulting code is substantially
smaller and easier to follow.
Cc: Eric Biggers <ebiggers@google.com>
Cc: "Jason A . Donenfeld" <Jason@zx2c4.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/arm64/crypto/chacha-neon-core.S | 193 ++++++++++-----------------
1 file changed, 69 insertions(+), 124 deletions(-)
--- a/arch/arm64/crypto/chacha-neon-core.S
+++ b/arch/arm64/crypto/chacha-neon-core.S
@@ -195,7 +195,6 @@ ENTRY(chacha_4block_xor_neon)
adr_l x10, .Lpermute
and x5, x4, #63
add x10, x10, x5
- add x11, x10, #64
//
// This function encrypts four consecutive ChaCha blocks by loading
@@ -645,11 +644,11 @@ CPU_BE( rev a15, a15 )
zip2 v31.4s, v14.4s, v15.4s
eor a15, a15, w9
- mov x3, #64
+ add x3, x2, x4
+ sub x3, x3, #128 // start of last block
+
subs x5, x4, #128
- add x6, x5, x2
- csel x3, x3, xzr, ge
- csel x2, x2, x6, ge
+ csel x2, x2, x3, ge
// interleave 64-bit words in state n, n+2
zip1 v0.2d, v16.2d, v18.2d
@@ -658,13 +657,10 @@ CPU_BE( rev a15, a15 )
zip1 v8.2d, v17.2d, v19.2d
zip2 v12.2d, v17.2d, v19.2d
stp a2, a3, [x1, #-56]
- ld1 {v16.16b-v19.16b}, [x2], x3
subs x6, x4, #192
- ccmp x3, xzr, #4, lt
- add x7, x6, x2
- csel x3, x3, xzr, eq
- csel x2, x2, x7, eq
+ ld1 {v16.16b-v19.16b}, [x2], #64
+ csel x2, x2, x3, ge
zip1 v1.2d, v20.2d, v22.2d
zip2 v5.2d, v20.2d, v22.2d
@@ -672,13 +668,10 @@ CPU_BE( rev a15, a15 )
zip1 v9.2d, v21.2d, v23.2d
zip2 v13.2d, v21.2d, v23.2d
stp a6, a7, [x1, #-40]
- ld1 {v20.16b-v23.16b}, [x2], x3
subs x7, x4, #256
- ccmp x3, xzr, #4, lt
- add x8, x7, x2
- csel x3, x3, xzr, eq
- csel x2, x2, x8, eq
+ ld1 {v20.16b-v23.16b}, [x2], #64
+ csel x2, x2, x3, ge
zip1 v2.2d, v24.2d, v26.2d
zip2 v6.2d, v24.2d, v26.2d
@@ -686,12 +679,10 @@ CPU_BE( rev a15, a15 )
zip1 v10.2d, v25.2d, v27.2d
zip2 v14.2d, v25.2d, v27.2d
stp a10, a11, [x1, #-24]
- ld1 {v24.16b-v27.16b}, [x2], x3
subs x8, x4, #320
- ccmp x3, xzr, #4, lt
- add x9, x8, x2
- csel x2, x2, x9, eq
+ ld1 {v24.16b-v27.16b}, [x2], #64
+ csel x2, x2, x3, ge
zip1 v3.2d, v28.2d, v30.2d
zip2 v7.2d, v28.2d, v30.2d
@@ -699,151 +690,105 @@ CPU_BE( rev a15, a15 )
zip1 v11.2d, v29.2d, v31.2d
zip2 v15.2d, v29.2d, v31.2d
stp a14, a15, [x1, #-8]
+
+ tbnz x5, #63, .Lt128
ld1 {v28.16b-v31.16b}, [x2]
// xor with corresponding input, write to output
- tbnz x5, #63, 0f
eor v16.16b, v16.16b, v0.16b
eor v17.16b, v17.16b, v1.16b
eor v18.16b, v18.16b, v2.16b
eor v19.16b, v19.16b, v3.16b
- st1 {v16.16b-v19.16b}, [x1], #64
- cbz x5, .Lout
- tbnz x6, #63, 1f
+ tbnz x6, #63, .Lt192
+
eor v20.16b, v20.16b, v4.16b
eor v21.16b, v21.16b, v5.16b
eor v22.16b, v22.16b, v6.16b
eor v23.16b, v23.16b, v7.16b
- st1 {v20.16b-v23.16b}, [x1], #64
- cbz x6, .Lout
- tbnz x7, #63, 2f
+ st1 {v16.16b-v19.16b}, [x1], #64
+ tbnz x7, #63, .Lt256
+
eor v24.16b, v24.16b, v8.16b
eor v25.16b, v25.16b, v9.16b
eor v26.16b, v26.16b, v10.16b
eor v27.16b, v27.16b, v11.16b
- st1 {v24.16b-v27.16b}, [x1], #64
- cbz x7, .Lout
- tbnz x8, #63, 3f
+ st1 {v20.16b-v23.16b}, [x1], #64
+ tbnz x8, #63, .Lt320
+
eor v28.16b, v28.16b, v12.16b
eor v29.16b, v29.16b, v13.16b
eor v30.16b, v30.16b, v14.16b
eor v31.16b, v31.16b, v15.16b
+
+ st1 {v24.16b-v27.16b}, [x1], #64
st1 {v28.16b-v31.16b}, [x1]
.Lout: frame_pop
ret
- // fewer than 128 bytes of in/output
-0: ld1 {v8.16b}, [x10]
- ld1 {v9.16b}, [x11]
- movi v10.16b, #16
- sub x2, x1, #64
- add x1, x1, x5
- ld1 {v16.16b-v19.16b}, [x2]
- tbl v4.16b, {v0.16b-v3.16b}, v8.16b
- tbx v20.16b, {v16.16b-v19.16b}, v9.16b
- add v8.16b, v8.16b, v10.16b
- add v9.16b, v9.16b, v10.16b
- tbl v5.16b, {v0.16b-v3.16b}, v8.16b
- tbx v21.16b, {v16.16b-v19.16b}, v9.16b
- add v8.16b, v8.16b, v10.16b
- add v9.16b, v9.16b, v10.16b
- tbl v6.16b, {v0.16b-v3.16b}, v8.16b
- tbx v22.16b, {v16.16b-v19.16b}, v9.16b
- add v8.16b, v8.16b, v10.16b
- add v9.16b, v9.16b, v10.16b
- tbl v7.16b, {v0.16b-v3.16b}, v8.16b
- tbx v23.16b, {v16.16b-v19.16b}, v9.16b
-
- eor v20.16b, v20.16b, v4.16b
- eor v21.16b, v21.16b, v5.16b
- eor v22.16b, v22.16b, v6.16b
- eor v23.16b, v23.16b, v7.16b
- st1 {v20.16b-v23.16b}, [x1]
- b .Lout
-
// fewer than 192 bytes of in/output
-1: ld1 {v8.16b}, [x10]
- ld1 {v9.16b}, [x11]
- movi v10.16b, #16
- add x1, x1, x6
- tbl v0.16b, {v4.16b-v7.16b}, v8.16b
- tbx v20.16b, {v16.16b-v19.16b}, v9.16b
- add v8.16b, v8.16b, v10.16b
- add v9.16b, v9.16b, v10.16b
- tbl v1.16b, {v4.16b-v7.16b}, v8.16b
- tbx v21.16b, {v16.16b-v19.16b}, v9.16b
- add v8.16b, v8.16b, v10.16b
- add v9.16b, v9.16b, v10.16b
- tbl v2.16b, {v4.16b-v7.16b}, v8.16b
- tbx v22.16b, {v16.16b-v19.16b}, v9.16b
- add v8.16b, v8.16b, v10.16b
- add v9.16b, v9.16b, v10.16b
- tbl v3.16b, {v4.16b-v7.16b}, v8.16b
- tbx v23.16b, {v16.16b-v19.16b}, v9.16b
-
- eor v20.16b, v20.16b, v0.16b
- eor v21.16b, v21.16b, v1.16b
- eor v22.16b, v22.16b, v2.16b
- eor v23.16b, v23.16b, v3.16b
- st1 {v20.16b-v23.16b}, [x1]
+.Lt192: cbz x5, 1f // exactly 128 bytes?
+ ld1 {v28.16b-v31.16b}, [x10]
+ add x5, x5, x1
+ tbl v28.16b, {v4.16b-v7.16b}, v28.16b
+ tbl v29.16b, {v4.16b-v7.16b}, v29.16b
+ tbl v30.16b, {v4.16b-v7.16b}, v30.16b
+ tbl v31.16b, {v4.16b-v7.16b}, v31.16b
+
+0: eor v20.16b, v20.16b, v28.16b
+ eor v21.16b, v21.16b, v29.16b
+ eor v22.16b, v22.16b, v30.16b
+ eor v23.16b, v23.16b, v31.16b
+ st1 {v20.16b-v23.16b}, [x5] // overlapping stores
+1: st1 {v16.16b-v19.16b}, [x1]
b .Lout
+ // fewer than 128 bytes of in/output
+.Lt128: ld1 {v28.16b-v31.16b}, [x10]
+ add x5, x5, x1
+ sub x1, x1, #64
+ tbl v28.16b, {v0.16b-v3.16b}, v28.16b
+ tbl v29.16b, {v0.16b-v3.16b}, v29.16b
+ tbl v30.16b, {v0.16b-v3.16b}, v30.16b
+ tbl v31.16b, {v0.16b-v3.16b}, v31.16b
+ ld1 {v16.16b-v19.16b}, [x1] // reload first output block
+ b 0b
+
// fewer than 256 bytes of in/output
-2: ld1 {v4.16b}, [x10]
- ld1 {v5.16b}, [x11]
- movi v6.16b, #16
- add x1, x1, x7
+.Lt256: cbz x6, 2f // exactly 192 bytes?
+ ld1 {v4.16b-v7.16b}, [x10]
+ add x6, x6, x1
tbl v0.16b, {v8.16b-v11.16b}, v4.16b
- tbx v24.16b, {v20.16b-v23.16b}, v5.16b
- add v4.16b, v4.16b, v6.16b
- add v5.16b, v5.16b, v6.16b
- tbl v1.16b, {v8.16b-v11.16b}, v4.16b
- tbx v25.16b, {v20.16b-v23.16b}, v5.16b
- add v4.16b, v4.16b, v6.16b
- add v5.16b, v5.16b, v6.16b
- tbl v2.16b, {v8.16b-v11.16b}, v4.16b
- tbx v26.16b, {v20.16b-v23.16b}, v5.16b
- add v4.16b, v4.16b, v6.16b
- add v5.16b, v5.16b, v6.16b
- tbl v3.16b, {v8.16b-v11.16b}, v4.16b
- tbx v27.16b, {v20.16b-v23.16b}, v5.16b
-
- eor v24.16b, v24.16b, v0.16b
- eor v25.16b, v25.16b, v1.16b
- eor v26.16b, v26.16b, v2.16b
- eor v27.16b, v27.16b, v3.16b
- st1 {v24.16b-v27.16b}, [x1]
+ tbl v1.16b, {v8.16b-v11.16b}, v5.16b
+ tbl v2.16b, {v8.16b-v11.16b}, v6.16b
+ tbl v3.16b, {v8.16b-v11.16b}, v7.16b
+
+ eor v28.16b, v28.16b, v0.16b
+ eor v29.16b, v29.16b, v1.16b
+ eor v30.16b, v30.16b, v2.16b
+ eor v31.16b, v31.16b, v3.16b
+ st1 {v28.16b-v31.16b}, [x6] // overlapping stores
+2: st1 {v20.16b-v23.16b}, [x1]
b .Lout
// fewer than 320 bytes of in/output
-3: ld1 {v4.16b}, [x10]
- ld1 {v5.16b}, [x11]
- movi v6.16b, #16
- add x1, x1, x8
+.Lt320: cbz x7, 3f // exactly 256 bytes?
+ ld1 {v4.16b-v7.16b}, [x10]
+ add x7, x7, x1
tbl v0.16b, {v12.16b-v15.16b}, v4.16b
- tbx v28.16b, {v24.16b-v27.16b}, v5.16b
- add v4.16b, v4.16b, v6.16b
- add v5.16b, v5.16b, v6.16b
- tbl v1.16b, {v12.16b-v15.16b}, v4.16b
- tbx v29.16b, {v24.16b-v27.16b}, v5.16b
- add v4.16b, v4.16b, v6.16b
- add v5.16b, v5.16b, v6.16b
- tbl v2.16b, {v12.16b-v15.16b}, v4.16b
- tbx v30.16b, {v24.16b-v27.16b}, v5.16b
- add v4.16b, v4.16b, v6.16b
- add v5.16b, v5.16b, v6.16b
- tbl v3.16b, {v12.16b-v15.16b}, v4.16b
- tbx v31.16b, {v24.16b-v27.16b}, v5.16b
+ tbl v1.16b, {v12.16b-v15.16b}, v5.16b
+ tbl v2.16b, {v12.16b-v15.16b}, v6.16b
+ tbl v3.16b, {v12.16b-v15.16b}, v7.16b
eor v28.16b, v28.16b, v0.16b
eor v29.16b, v29.16b, v1.16b
eor v30.16b, v30.16b, v2.16b
eor v31.16b, v31.16b, v3.16b
- st1 {v28.16b-v31.16b}, [x1]
+ st1 {v28.16b-v31.16b}, [x7] // overlapping stores
+3: st1 {v24.16b-v27.16b}, [x1]
b .Lout
ENDPROC(chacha_4block_xor_neon)
@@ -851,7 +796,7 @@ ENDPROC(chacha_4block_xor_neon)
.align L1_CACHE_SHIFT
.Lpermute:
.set .Li, 0
- .rept 192
+ .rept 128
.byte (.Li - 64)
.set .Li, .Li + 1
.endr

View File

@ -0,0 +1,37 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Fri, 15 Jan 2021 20:30:12 +0100
Subject: [PATCH] crypto: lib/chacha20poly1305 - define empty module exit
function
commit ac88c322d0f2917d41d13553c69e9d7f043c8b6f upstream.
With no mod_exit function, users are unable to unload the module after
use. I'm not aware of any reason why module unloading should be
prohibited for this one, so this commit simply adds an empty exit
function.
Reported-and-tested-by: John Donnelly <john.p.donnelly@oracle.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
lib/crypto/chacha20poly1305.c | 5 +++++
1 file changed, 5 insertions(+)
--- a/lib/crypto/chacha20poly1305.c
+++ b/lib/crypto/chacha20poly1305.c
@@ -364,7 +364,12 @@ static int __init mod_init(void)
return 0;
}
+static void __exit mod_exit(void)
+{
+}
+
module_init(mod_init);
+module_exit(mod_exit);
MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("ChaCha20Poly1305 AEAD construction");
MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");

View File

@ -0,0 +1,38 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Sun, 13 Dec 2020 15:39:29 +0100
Subject: [PATCH] crypto: arm/chacha-neon - add missing counter increment
commit fd16931a2f518a32753920ff20895e5cf04c8ff1 upstream.
Commit 86cd97ec4b943af3 ("crypto: arm/chacha-neon - optimize for non-block
size multiples") refactored the chacha block handling in the glue code in
a way that may result in the counter increment to be omitted when calling
chacha_block_xor_neon() to process a full block. This violates the skcipher
API, which requires that the output IV is suitable for handling more input
as long as the preceding input has been presented in round multiples of the
block size. Also, the same code is exposed via the chacha library interface
whose callers may actually rely on this increment to occur even for final
blocks that are smaller than the chacha block size.
So increment the counter after calling chacha_block_xor_neon().
Fixes: 86cd97ec4b943af3 ("crypto: arm/chacha-neon - optimize for non-block size multiples")
Reported-by: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/arm/crypto/chacha-glue.c | 1 +
1 file changed, 1 insertion(+)
--- a/arch/arm/crypto/chacha-glue.c
+++ b/arch/arm/crypto/chacha-glue.c
@@ -60,6 +60,7 @@ static void chacha_doneon(u32 *state, u8
chacha_block_xor_neon(state, d, s, nrounds);
if (d != dst)
memcpy(dst, buf, bytes);
+ state[12]++;
}
}
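A small check of the contract being restored, using the library interface shown in the earlier hunks (sketch only; the surrounding test scaffolding is assumed):

/* After encrypting exactly one 64-byte block through the tail path,
 * the saved counter word must have advanced by one so that a follow-up
 * call continues the keystream instead of reusing it. */
static bool counter_advances(u32 state[16])
{
	u32 before = state[12];
	u8 in[64] = { 0 }, out[64];

	chacha_crypt_arch(state, out, in, sizeof(in), 20);

	return state[12] == before + 1;
}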

View File

@ -0,0 +1,30 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Sun, 15 Dec 2019 22:08:01 +0100
Subject: [PATCH] wireguard: Kconfig: select parent dependency for crypto
commit d7c68a38bb4f9b7c1a2e4a772872c752ee5c44a6 upstream.
This fixes the crypto selection submenu dependencies. Otherwise, we'd
wind up issuing warnings in which certain dependencies we also select
couldn't be satisfied. This condition was triggered by the addition of
the test suite autobuilder in the previous commit.
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
drivers/net/Kconfig | 2 ++
1 file changed, 2 insertions(+)
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -85,6 +85,8 @@ config WIREGUARD
select CRYPTO_POLY1305_X86_64 if X86 && 64BIT
select CRYPTO_BLAKE2S_X86 if X86 && 64BIT
select CRYPTO_CURVE25519_X86 if X86 && 64BIT
+ select ARM_CRYPTO if ARM
+ select ARM64_CRYPTO if ARM64
select CRYPTO_CHACHA20_NEON if (ARM || ARM64) && KERNEL_MODE_NEON
select CRYPTO_POLY1305_NEON if ARM64 && KERNEL_MODE_NEON
select CRYPTO_POLY1305_ARM if ARM

View File

@ -0,0 +1,66 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Josh Soref <jsoref@gmail.com>
Date: Sun, 15 Dec 2019 22:08:02 +0100
Subject: [PATCH] wireguard: global: fix spelling mistakes in comments
commit a2ec8b5706944d228181c8b91d815f41d6dd8e7b upstream.
This fixes two spelling errors in source code comments.
Signed-off-by: Josh Soref <jsoref@gmail.com>
[Jason: rewrote commit message]
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
drivers/net/wireguard/receive.c | 2 +-
include/uapi/linux/wireguard.h | 8 ++++----
2 files changed, 5 insertions(+), 5 deletions(-)
--- a/drivers/net/wireguard/receive.c
+++ b/drivers/net/wireguard/receive.c
@@ -380,7 +380,7 @@ static void wg_packet_consume_data_done(
/* We've already verified the Poly1305 auth tag, which means this packet
* was not modified in transit. We can therefore tell the networking
* stack that all checksums of every layer of encapsulation have already
- * been checked "by the hardware" and therefore is unneccessary to check
+ * been checked "by the hardware" and therefore is unnecessary to check
* again in software.
*/
skb->ip_summed = CHECKSUM_UNNECESSARY;
--- a/include/uapi/linux/wireguard.h
+++ b/include/uapi/linux/wireguard.h
@@ -18,13 +18,13 @@
* one but not both of:
*
* WGDEVICE_A_IFINDEX: NLA_U32
- * WGDEVICE_A_IFNAME: NLA_NUL_STRING, maxlen IFNAMESIZ - 1
+ * WGDEVICE_A_IFNAME: NLA_NUL_STRING, maxlen IFNAMSIZ - 1
*
* The kernel will then return several messages (NLM_F_MULTI) containing the
* following tree of nested items:
*
* WGDEVICE_A_IFINDEX: NLA_U32
- * WGDEVICE_A_IFNAME: NLA_NUL_STRING, maxlen IFNAMESIZ - 1
+ * WGDEVICE_A_IFNAME: NLA_NUL_STRING, maxlen IFNAMSIZ - 1
* WGDEVICE_A_PRIVATE_KEY: NLA_EXACT_LEN, len WG_KEY_LEN
* WGDEVICE_A_PUBLIC_KEY: NLA_EXACT_LEN, len WG_KEY_LEN
* WGDEVICE_A_LISTEN_PORT: NLA_U16
@@ -77,7 +77,7 @@
* WGDEVICE_A_IFINDEX and WGDEVICE_A_IFNAME:
*
* WGDEVICE_A_IFINDEX: NLA_U32
- * WGDEVICE_A_IFNAME: NLA_NUL_STRING, maxlen IFNAMESIZ - 1
+ * WGDEVICE_A_IFNAME: NLA_NUL_STRING, maxlen IFNAMSIZ - 1
* WGDEVICE_A_FLAGS: NLA_U32, 0 or WGDEVICE_F_REPLACE_PEERS if all current
* peers should be removed prior to adding the list below.
* WGDEVICE_A_PRIVATE_KEY: len WG_KEY_LEN, all zeros to remove
@@ -121,7 +121,7 @@
* filling in information not contained in the prior. Note that if
* WGDEVICE_F_REPLACE_PEERS is specified in the first message, it probably
* should not be specified in fragments that come after, so that the list
- * of peers is only cleared the first time but appened after. Likewise for
+ * of peers is only cleared the first time but appended after. Likewise for
* peers, if WGPEER_F_REPLACE_ALLOWEDIPS is specified in the first message
* of a peer, it likely should not be specified in subsequent fragments.
*

View File

@ -0,0 +1,28 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Sun, 15 Dec 2019 22:08:03 +0100
Subject: [PATCH] wireguard: main: remove unused include <linux/version.h>
commit 43967b6ff91e53bcce5ae08c16a0588a475b53a1 upstream.
Remove <linux/version.h> from the includes for main.c, which is unused.
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
[Jason: reworded commit message]
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
drivers/net/wireguard/main.c | 1 -
1 file changed, 1 deletion(-)
--- a/drivers/net/wireguard/main.c
+++ b/drivers/net/wireguard/main.c
@@ -12,7 +12,6 @@
#include <uapi/linux/wireguard.h>
-#include <linux/version.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/genetlink.h>

View File

@ -0,0 +1,41 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Sun, 15 Dec 2019 22:08:04 +0100
Subject: [PATCH] wireguard: allowedips: use kfree_rcu() instead of call_rcu()
commit d89ee7d5c73af15c1c6f12b016cdf469742b5726 upstream.
The callback function of call_rcu() just calls a kfree(), so we
can use kfree_rcu() instead of call_rcu() + callback function.
Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
drivers/net/wireguard/allowedips.c | 7 +------
1 file changed, 1 insertion(+), 6 deletions(-)
--- a/drivers/net/wireguard/allowedips.c
+++ b/drivers/net/wireguard/allowedips.c
@@ -31,11 +31,6 @@ static void copy_and_assign_cidr(struct
#define CHOOSE_NODE(parent, key) \
parent->bit[(key[parent->bit_at_a] >> parent->bit_at_b) & 1]
-static void node_free_rcu(struct rcu_head *rcu)
-{
- kfree(container_of(rcu, struct allowedips_node, rcu));
-}
-
static void push_rcu(struct allowedips_node **stack,
struct allowedips_node __rcu *p, unsigned int *len)
{
@@ -112,7 +107,7 @@ static void walk_remove_by_peer(struct a
if (!node->bit[0] || !node->bit[1]) {
rcu_assign_pointer(*nptr, DEREF(
&node->bit[!REF(node->bit[0])]));
- call_rcu(&node->rcu, node_free_rcu);
+ kfree_rcu(node, rcu);
node = DEREF(nptr);
}
}
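The general pattern, shown outside the wireguard code as a sketch (struct and function names are made up for illustration):

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_node {
	int value;
	struct rcu_head rcu;
};

/* Old style: a callback whose only job is to free the enclosing object. */
static void demo_node_free_rcu(struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct demo_node, rcu));
}

static void demo_release_old(struct demo_node *node)
{
	call_rcu(&node->rcu, demo_node_free_rcu);
}

/* New style: kfree_rcu() takes the pointer and the name of the struct's
 * rcu_head member, making the single-purpose callback unnecessary. */
static void demo_release_new(struct demo_node *node)
{
	kfree_rcu(node, rcu);
}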

View File

@ -0,0 +1,373 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Thu, 2 Jan 2020 17:47:49 +0100
Subject: [PATCH] wireguard: selftests: remove ancient kernel compatibility
code
commit 9a69a4c8802adf642bc4a13d471b5a86b44ed434 upstream.
Quite a bit of the test suite was designed to work with ancient kernels.
Thankfully we no longer have to deal with this. This commit updates
things that we can finally update and removes things that we can finally
remove, to avoid the build-up of the last several years as a result of
having to support ancient kernels. We can finally rely on suppress_
prefixlength being available. On the build side of things, the no-PIE
hack is no longer required, and we can bump some of the tools, repair
our m68k and i686-kvm support, and get better coverage of the static
branches used in the crypto lib and in udp_tunnel.
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
tools/testing/selftests/wireguard/netns.sh | 11 +--
.../testing/selftests/wireguard/qemu/Makefile | 82 ++++++++++---------
.../selftests/wireguard/qemu/arch/m68k.config | 2 +-
tools/testing/selftests/wireguard/qemu/init.c | 1 +
.../selftests/wireguard/qemu/kernel.config | 2 +
5 files changed, 50 insertions(+), 48 deletions(-)
--- a/tools/testing/selftests/wireguard/netns.sh
+++ b/tools/testing/selftests/wireguard/netns.sh
@@ -37,7 +37,7 @@ n2() { pretty 2 "$*"; maybe_exec ip netn
ip0() { pretty 0 "ip $*"; ip -n $netns0 "$@"; }
ip1() { pretty 1 "ip $*"; ip -n $netns1 "$@"; }
ip2() { pretty 2 "ip $*"; ip -n $netns2 "$@"; }
-sleep() { read -t "$1" -N 0 || true; }
+sleep() { read -t "$1" -N 1 || true; }
waitiperf() { pretty "${1//*-}" "wait for iperf:5201"; while [[ $(ss -N "$1" -tlp 'sport = 5201') != *iperf3* ]]; do sleep 0.1; done; }
waitncatudp() { pretty "${1//*-}" "wait for udp:1111"; while [[ $(ss -N "$1" -ulp 'sport = 1111') != *ncat* ]]; do sleep 0.1; done; }
waitncattcp() { pretty "${1//*-}" "wait for tcp:1111"; while [[ $(ss -N "$1" -tlp 'sport = 1111') != *ncat* ]]; do sleep 0.1; done; }
@@ -294,12 +294,9 @@ ip1 -6 rule add table main suppress_pref
ip1 -4 route add default dev wg0 table 51820
ip1 -4 rule add not fwmark 51820 table 51820
ip1 -4 rule add table main suppress_prefixlength 0
-# suppress_prefixlength only got added in 3.12, and we want to support 3.10+.
-if [[ $(ip1 -4 rule show all) == *suppress_prefixlength* ]]; then
- # Flood the pings instead of sending just one, to trigger routing table reference counting bugs.
- n1 ping -W 1 -c 100 -f 192.168.99.7
- n1 ping -W 1 -c 100 -f abab::1111
-fi
+# Flood the pings instead of sending just one, to trigger routing table reference counting bugs.
+n1 ping -W 1 -c 100 -f 192.168.99.7
+n1 ping -W 1 -c 100 -f abab::1111
n0 iptables -t nat -F
ip0 link del vethrc
--- a/tools/testing/selftests/wireguard/qemu/Makefile
+++ b/tools/testing/selftests/wireguard/qemu/Makefile
@@ -5,6 +5,7 @@
PWD := $(shell pwd)
CHOST := $(shell gcc -dumpmachine)
+HOST_ARCH := $(firstword $(subst -, ,$(CHOST)))
ifneq (,$(ARCH))
CBUILD := $(subst -gcc,,$(lastword $(subst /, ,$(firstword $(wildcard $(foreach bindir,$(subst :, ,$(PATH)),$(bindir)/$(ARCH)-*-gcc))))))
ifeq (,$(CBUILD))
@@ -37,19 +38,19 @@ endef
define file_download =
$(DISTFILES_PATH)/$(1):
mkdir -p $(DISTFILES_PATH)
- flock -x $$@.lock -c '[ -f $$@ ] && exit 0; wget -O $$@.tmp $(MIRROR)$(1) || wget -t inf --retry-on-http-error=404 -O $$@.tmp $(2)$(1) || rm -f $$@.tmp'
+ flock -x $$@.lock -c '[ -f $$@ ] && exit 0; wget -O $$@.tmp $(MIRROR)$(1) || wget -O $$@.tmp $(2)$(1) || rm -f $$@.tmp'
if echo "$(3) $$@.tmp" | sha256sum -c -; then mv $$@.tmp $$@; else rm -f $$@.tmp; exit 71; fi
endef
-$(eval $(call tar_download,MUSL,musl,1.1.20,.tar.gz,https://www.musl-libc.org/releases/,44be8771d0e6c6b5f82dd15662eb2957c9a3173a19a8b49966ac0542bbd40d61))
+$(eval $(call tar_download,MUSL,musl,1.1.24,.tar.gz,https://www.musl-libc.org/releases/,1370c9a812b2cf2a7d92802510cca0058cc37e66a7bedd70051f0a34015022a3))
$(eval $(call tar_download,LIBMNL,libmnl,1.0.4,.tar.bz2,https://www.netfilter.org/projects/libmnl/files/,171f89699f286a5854b72b91d06e8f8e3683064c5901fb09d954a9ab6f551f81))
-$(eval $(call tar_download,IPERF,iperf,3.1.7,.tar.gz,http://downloads.es.net/pub/iperf/,a4ef73406fe92250602b8da2ae89ec53211f805df97a1d1d629db5a14043734f))
+$(eval $(call tar_download,IPERF,iperf,3.7,.tar.gz,https://downloads.es.net/pub/iperf/,d846040224317caf2f75c843d309a950a7db23f9b44b94688ccbe557d6d1710c))
$(eval $(call tar_download,BASH,bash,5.0,.tar.gz,https://ftp.gnu.org/gnu/bash/,b4a80f2ac66170b2913efbfb9f2594f1f76c7b1afd11f799e22035d63077fb4d))
-$(eval $(call tar_download,IPROUTE2,iproute2,5.1.0,.tar.gz,https://www.kernel.org/pub/linux/utils/net/iproute2/,9b43707d6075ecdca14803ca8ce0c8553848c49fa1586d12fd508d66577243f2))
-$(eval $(call tar_download,IPTABLES,iptables,1.6.1,.tar.bz2,https://www.netfilter.org/projects/iptables/files/,0fc2d7bd5d7be11311726466789d4c65fb4c8e096c9182b56ce97440864f0cf5))
-$(eval $(call tar_download,NMAP,nmap,7.60,.tar.bz2,https://nmap.org/dist/,a8796ecc4fa6c38aad6139d9515dc8113023a82e9d787e5a5fb5fa1b05516f21))
-$(eval $(call tar_download,IPUTILS,iputils,s20161105,.tar.gz,https://github.com/iputils/iputils/archive/s20161105.tar.gz/#,f813092f03d17294fd23544b129b95cdb87fe19f7970a51908a6b88509acad8a))
-$(eval $(call tar_download,WIREGUARD_TOOLS,WireGuard,0.0.20191212,.tar.xz,https://git.zx2c4.com/WireGuard/snapshot/,b0d718380f7a8822b2f12d75e462fa4eafa3a77871002981f367cd4fe2a1b071))
+$(eval $(call tar_download,IPROUTE2,iproute2,5.4.0,.tar.xz,https://www.kernel.org/pub/linux/utils/net/iproute2/,fe97aa60a0d4c5ac830be18937e18dc3400ca713a33a89ad896ff1e3d46086ae))
+$(eval $(call tar_download,IPTABLES,iptables,1.8.4,.tar.bz2,https://www.netfilter.org/projects/iptables/files/,993a3a5490a544c2cbf2ef15cf7e7ed21af1845baf228318d5c36ef8827e157c))
+$(eval $(call tar_download,NMAP,nmap,7.80,.tar.bz2,https://nmap.org/dist/,fcfa5a0e42099e12e4bf7a68ebe6fde05553383a682e816a7ec9256ab4773faa))
+$(eval $(call tar_download,IPUTILS,iputils,s20190709,.tar.gz,https://github.com/iputils/iputils/archive/s20190709.tar.gz/#,a15720dd741d7538dd2645f9f516d193636ae4300ff7dbc8bfca757bf166490a))
+$(eval $(call tar_download,WIREGUARD_TOOLS,wireguard-tools,1.0.20191226,.tar.xz,https://git.zx2c4.com/wireguard-tools/snapshot/,aa8af0fdc9872d369d8c890a84dbc2a2466b55795dccd5b47721b2d97644b04f))
KERNEL_BUILD_PATH := $(BUILD_PATH)/kernel$(if $(findstring yes,$(DEBUG_KERNEL)),-debug)
rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
@@ -59,23 +60,21 @@ export CFLAGS ?= -O3 -pipe
export LDFLAGS ?=
export CPPFLAGS := -I$(BUILD_PATH)/include
-ifeq ($(CHOST),$(CBUILD))
+ifeq ($(HOST_ARCH),$(ARCH))
CROSS_COMPILE_FLAG := --host=$(CHOST)
-NOPIE_GCC := gcc -fno-PIE
CFLAGS += -march=native
STRIP := strip
else
$(info Cross compilation: building for $(CBUILD) using $(CHOST))
CROSS_COMPILE_FLAG := --build=$(CBUILD) --host=$(CHOST)
export CROSS_COMPILE=$(CBUILD)-
-NOPIE_GCC := $(CBUILD)-gcc -fno-PIE
STRIP := $(CBUILD)-strip
endif
ifeq ($(ARCH),aarch64)
QEMU_ARCH := aarch64
KERNEL_ARCH := arm64
KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/arm64/boot/Image
-ifeq ($(CHOST),$(CBUILD))
+ifeq ($(HOST_ARCH),$(ARCH))
QEMU_MACHINE := -cpu host -machine virt,gic_version=host,accel=kvm
else
QEMU_MACHINE := -cpu cortex-a53 -machine virt
@@ -85,7 +84,7 @@ else ifeq ($(ARCH),aarch64_be)
QEMU_ARCH := aarch64
KERNEL_ARCH := arm64
KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/arm64/boot/Image
-ifeq ($(CHOST),$(CBUILD))
+ifeq ($(HOST_ARCH),$(ARCH))
QEMU_MACHINE := -cpu host -machine virt,gic_version=host,accel=kvm
else
QEMU_MACHINE := -cpu cortex-a53 -machine virt
@@ -95,7 +94,7 @@ else ifeq ($(ARCH),arm)
QEMU_ARCH := arm
KERNEL_ARCH := arm
KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/arm/boot/zImage
-ifeq ($(CHOST),$(CBUILD))
+ifeq ($(HOST_ARCH),$(ARCH))
QEMU_MACHINE := -cpu host -machine virt,gic_version=host,accel=kvm
else
QEMU_MACHINE := -cpu cortex-a15 -machine virt
@@ -105,7 +104,7 @@ else ifeq ($(ARCH),armeb)
QEMU_ARCH := arm
KERNEL_ARCH := arm
KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/arm/boot/zImage
-ifeq ($(CHOST),$(CBUILD))
+ifeq ($(HOST_ARCH),$(ARCH))
QEMU_MACHINE := -cpu host -machine virt,gic_version=host,accel=kvm
else
QEMU_MACHINE := -cpu cortex-a15 -machine virt
@@ -116,7 +115,7 @@ else ifeq ($(ARCH),x86_64)
QEMU_ARCH := x86_64
KERNEL_ARCH := x86_64
KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/x86/boot/bzImage
-ifeq ($(CHOST),$(CBUILD))
+ifeq ($(HOST_ARCH),$(ARCH))
QEMU_MACHINE := -cpu host -machine q35,accel=kvm
else
QEMU_MACHINE := -cpu Skylake-Server -machine q35
@@ -126,7 +125,7 @@ else ifeq ($(ARCH),i686)
QEMU_ARCH := i386
KERNEL_ARCH := x86
KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/x86/boot/bzImage
-ifeq ($(subst i686,x86_64,$(CBUILD)),$(CHOST))
+ifeq ($(subst x86_64,i686,$(HOST_ARCH)),$(ARCH))
QEMU_MACHINE := -cpu host -machine q35,accel=kvm
else
QEMU_MACHINE := -cpu coreduo -machine q35
@@ -136,7 +135,7 @@ else ifeq ($(ARCH),mips64)
QEMU_ARCH := mips64
KERNEL_ARCH := mips
KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux
-ifeq ($(CHOST),$(CBUILD))
+ifeq ($(HOST_ARCH),$(ARCH))
QEMU_MACHINE := -cpu host -machine malta,accel=kvm
CFLAGS += -EB
else
@@ -147,7 +146,7 @@ else ifeq ($(ARCH),mips64el)
QEMU_ARCH := mips64el
KERNEL_ARCH := mips
KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux
-ifeq ($(CHOST),$(CBUILD))
+ifeq ($(HOST_ARCH),$(ARCH))
QEMU_MACHINE := -cpu host -machine malta,accel=kvm
CFLAGS += -EL
else
@@ -158,7 +157,7 @@ else ifeq ($(ARCH),mips)
QEMU_ARCH := mips
KERNEL_ARCH := mips
KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux
-ifeq ($(CHOST),$(CBUILD))
+ifeq ($(HOST_ARCH),$(ARCH))
QEMU_MACHINE := -cpu host -machine malta,accel=kvm
CFLAGS += -EB
else
@@ -169,7 +168,7 @@ else ifeq ($(ARCH),mipsel)
QEMU_ARCH := mipsel
KERNEL_ARCH := mips
KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux
-ifeq ($(CHOST),$(CBUILD))
+ifeq ($(HOST_ARCH),$(ARCH))
QEMU_MACHINE := -cpu host -machine malta,accel=kvm
CFLAGS += -EL
else
@@ -180,7 +179,7 @@ else ifeq ($(ARCH),powerpc64le)
QEMU_ARCH := ppc64
KERNEL_ARCH := powerpc
KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux
-ifeq ($(CHOST),$(CBUILD))
+ifeq ($(HOST_ARCH),$(ARCH))
QEMU_MACHINE := -cpu host,accel=kvm -machine pseries
else
QEMU_MACHINE := -machine pseries
@@ -190,7 +189,7 @@ else ifeq ($(ARCH),powerpc)
QEMU_ARCH := ppc
KERNEL_ARCH := powerpc
KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/powerpc/boot/uImage
-ifeq ($(CHOST),$(CBUILD))
+ifeq ($(HOST_ARCH),$(ARCH))
QEMU_MACHINE := -cpu host,accel=kvm -machine ppce500
else
QEMU_MACHINE := -machine ppce500
@@ -200,10 +199,11 @@ else ifeq ($(ARCH),m68k)
QEMU_ARCH := m68k
KERNEL_ARCH := m68k
KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux
-ifeq ($(CHOST),$(CBUILD))
-QEMU_MACHINE := -cpu host,accel=kvm -machine q800
+KERNEL_CMDLINE := $(shell sed -n 's/CONFIG_CMDLINE=\(.*\)/\1/p' arch/m68k.config)
+ifeq ($(HOST_ARCH),$(ARCH))
+QEMU_MACHINE := -cpu host,accel=kvm -machine q800 -smp 1 -append $(KERNEL_CMDLINE)
else
-QEMU_MACHINE := -machine q800
+QEMU_MACHINE := -machine q800 -smp 1 -append $(KERNEL_CMDLINE)
endif
else
$(error I only build: x86_64, i686, arm, armeb, aarch64, aarch64_be, mips, mipsel, mips64, mips64el, powerpc64le, powerpc, m68k)
@@ -238,14 +238,14 @@ $(BUILD_PATH)/init-cpio-spec.txt:
echo "nod /dev/console 644 0 0 c 5 1" >> $@
echo "dir /bin 755 0 0" >> $@
echo "file /bin/iperf3 $(IPERF_PATH)/src/iperf3 755 0 0" >> $@
- echo "file /bin/wg $(WIREGUARD_TOOLS_PATH)/src/tools/wg 755 0 0" >> $@
+ echo "file /bin/wg $(WIREGUARD_TOOLS_PATH)/src/wg 755 0 0" >> $@
echo "file /bin/bash $(BASH_PATH)/bash 755 0 0" >> $@
echo "file /bin/ip $(IPROUTE2_PATH)/ip/ip 755 0 0" >> $@
echo "file /bin/ss $(IPROUTE2_PATH)/misc/ss 755 0 0" >> $@
echo "file /bin/ping $(IPUTILS_PATH)/ping 755 0 0" >> $@
echo "file /bin/ncat $(NMAP_PATH)/ncat/ncat 755 0 0" >> $@
- echo "file /bin/xtables-multi $(IPTABLES_PATH)/iptables/xtables-multi 755 0 0" >> $@
- echo "slink /bin/iptables xtables-multi 777 0 0" >> $@
+ echo "file /bin/xtables-legacy-multi $(IPTABLES_PATH)/iptables/xtables-legacy-multi 755 0 0" >> $@
+ echo "slink /bin/iptables xtables-legacy-multi 777 0 0" >> $@
echo "slink /bin/ping6 ping 777 0 0" >> $@
echo "dir /lib 755 0 0" >> $@
echo "file /lib/libc.so $(MUSL_PATH)/lib/libc.so 755 0 0" >> $@
@@ -260,8 +260,8 @@ $(KERNEL_BUILD_PATH)/.config: kernel.con
cd $(KERNEL_BUILD_PATH) && ARCH=$(KERNEL_ARCH) $(KERNEL_PATH)/scripts/kconfig/merge_config.sh -n $(KERNEL_BUILD_PATH)/.config $(KERNEL_BUILD_PATH)/minimal.config
$(if $(findstring yes,$(DEBUG_KERNEL)),cp debug.config $(KERNEL_BUILD_PATH) && cd $(KERNEL_BUILD_PATH) && ARCH=$(KERNEL_ARCH) $(KERNEL_PATH)/scripts/kconfig/merge_config.sh -n $(KERNEL_BUILD_PATH)/.config debug.config,)
-$(KERNEL_BZIMAGE): $(KERNEL_BUILD_PATH)/.config $(BUILD_PATH)/init-cpio-spec.txt $(MUSL_PATH)/lib/libc.so $(IPERF_PATH)/src/iperf3 $(IPUTILS_PATH)/ping $(BASH_PATH)/bash $(IPROUTE2_PATH)/misc/ss $(IPROUTE2_PATH)/ip/ip $(IPTABLES_PATH)/iptables/xtables-multi $(NMAP_PATH)/ncat/ncat $(WIREGUARD_TOOLS_PATH)/src/tools/wg $(BUILD_PATH)/init ../netns.sh $(WIREGUARD_SOURCES)
- $(MAKE) -C $(KERNEL_PATH) O=$(KERNEL_BUILD_PATH) ARCH=$(KERNEL_ARCH) CROSS_COMPILE=$(CROSS_COMPILE) CC="$(NOPIE_GCC)"
+$(KERNEL_BZIMAGE): $(KERNEL_BUILD_PATH)/.config $(BUILD_PATH)/init-cpio-spec.txt $(MUSL_PATH)/lib/libc.so $(IPERF_PATH)/src/iperf3 $(IPUTILS_PATH)/ping $(BASH_PATH)/bash $(IPROUTE2_PATH)/misc/ss $(IPROUTE2_PATH)/ip/ip $(IPTABLES_PATH)/iptables/xtables-legacy-multi $(NMAP_PATH)/ncat/ncat $(WIREGUARD_TOOLS_PATH)/src/wg $(BUILD_PATH)/init ../netns.sh $(WIREGUARD_SOURCES)
+ $(MAKE) -C $(KERNEL_PATH) O=$(KERNEL_BUILD_PATH) ARCH=$(KERNEL_ARCH) CROSS_COMPILE=$(CROSS_COMPILE)
$(BUILD_PATH)/include/linux/.installed: | $(KERNEL_BUILD_PATH)/.config
$(MAKE) -C $(KERNEL_PATH) O=$(KERNEL_BUILD_PATH) INSTALL_HDR_PATH=$(BUILD_PATH) ARCH=$(KERNEL_ARCH) CROSS_COMPILE=$(CROSS_COMPILE) headers_install
@@ -280,7 +280,7 @@ $(BUILD_PATH)/include/.installed: $(MUSL
$(MUSL_CC): $(MUSL_PATH)/lib/libc.so
sh $(MUSL_PATH)/tools/musl-gcc.specs.sh $(BUILD_PATH)/include $(MUSL_PATH)/lib /lib/ld-linux.so.1 > $(BUILD_PATH)/musl-gcc.specs
- printf '#!/bin/sh\nexec "$(REAL_CC)" --specs="$(BUILD_PATH)/musl-gcc.specs" -fno-stack-protector -no-pie "$$@"\n' > $(BUILD_PATH)/musl-gcc
+ printf '#!/bin/sh\nexec "$(REAL_CC)" --specs="$(BUILD_PATH)/musl-gcc.specs" "$$@"\n' > $(BUILD_PATH)/musl-gcc
chmod +x $(BUILD_PATH)/musl-gcc
$(IPERF_PATH)/.installed: $(IPERF_TAR)
@@ -291,7 +291,7 @@ $(IPERF_PATH)/.installed: $(IPERF_TAR)
touch $@
$(IPERF_PATH)/src/iperf3: | $(IPERF_PATH)/.installed $(USERSPACE_DEPS)
- cd $(IPERF_PATH) && CFLAGS="$(CFLAGS) -D_GNU_SOURCE" ./configure --prefix=/ $(CROSS_COMPILE_FLAG) --enable-static --disable-shared
+ cd $(IPERF_PATH) && CFLAGS="$(CFLAGS) -D_GNU_SOURCE" ./configure --prefix=/ $(CROSS_COMPILE_FLAG) --enable-static --disable-shared --with-openssl=no
$(MAKE) -C $(IPERF_PATH)
$(STRIP) -s $@
@@ -308,8 +308,8 @@ $(WIREGUARD_TOOLS_PATH)/.installed: $(WI
flock -s $<.lock tar -C $(BUILD_PATH) -xf $<
touch $@
-$(WIREGUARD_TOOLS_PATH)/src/tools/wg: | $(WIREGUARD_TOOLS_PATH)/.installed $(LIBMNL_PATH)/src/.libs/libmnl.a $(USERSPACE_DEPS)
- LDFLAGS="$(LDFLAGS) -L$(LIBMNL_PATH)/src/.libs" $(MAKE) -C $(WIREGUARD_TOOLS_PATH)/src/tools LIBMNL_CFLAGS="-I$(LIBMNL_PATH)/include" LIBMNL_LDLIBS="-lmnl" wg
+$(WIREGUARD_TOOLS_PATH)/src/wg: | $(WIREGUARD_TOOLS_PATH)/.installed $(LIBMNL_PATH)/src/.libs/libmnl.a $(USERSPACE_DEPS)
+ LDFLAGS="$(LDFLAGS) -L$(LIBMNL_PATH)/src/.libs" $(MAKE) -C $(WIREGUARD_TOOLS_PATH)/src LIBMNL_CFLAGS="-I$(LIBMNL_PATH)/include" LIBMNL_LDLIBS="-lmnl" wg
$(STRIP) -s $@
$(BUILD_PATH)/init: init.c | $(USERSPACE_DEPS)
@@ -323,7 +323,8 @@ $(IPUTILS_PATH)/.installed: $(IPUTILS_TA
touch $@
$(IPUTILS_PATH)/ping: | $(IPUTILS_PATH)/.installed $(USERSPACE_DEPS)
- $(MAKE) -C $(IPUTILS_PATH) USE_CAP=no USE_IDN=no USE_NETTLE=no USE_CRYPTO=no ping
+ sed -i /atexit/d $(IPUTILS_PATH)/ping.c
+ cd $(IPUTILS_PATH) && $(CC) $(CFLAGS) -std=c99 -o $@ ping.c ping_common.c ping6_common.c iputils_common.c -D_GNU_SOURCE -D'IPUTILS_VERSION(f)=f' -lresolv $(LDFLAGS)
$(STRIP) -s $@
$(BASH_PATH)/.installed: $(BASH_TAR)
@@ -357,7 +358,7 @@ $(IPTABLES_PATH)/.installed: $(IPTABLES_
sed -i -e "/nfnetlink=[01]/s:=[01]:=0:" -e "/nfconntrack=[01]/s:=[01]:=0:" $(IPTABLES_PATH)/configure
touch $@
-$(IPTABLES_PATH)/iptables/xtables-multi: | $(IPTABLES_PATH)/.installed $(LIBMNL_PATH)/src/.libs/libmnl.a $(USERSPACE_DEPS)
+$(IPTABLES_PATH)/iptables/xtables-legacy-multi: | $(IPTABLES_PATH)/.installed $(LIBMNL_PATH)/src/.libs/libmnl.a $(USERSPACE_DEPS)
cd $(IPTABLES_PATH) && PKG_CONFIG_LIBDIR="$(LIBMNL_PATH)" ./configure --prefix=/ $(CROSS_COMPILE_FLAG) --enable-static --disable-shared --disable-nftables --disable-bpf-compiler --disable-nfsynproxy --disable-libipq --with-kernel=$(BUILD_PATH)/include
$(MAKE) -C $(IPTABLES_PATH)
$(STRIP) -s $@
@@ -368,8 +369,9 @@ $(NMAP_PATH)/.installed: $(NMAP_TAR)
touch $@
$(NMAP_PATH)/ncat/ncat: | $(NMAP_PATH)/.installed $(USERSPACE_DEPS)
- cd $(NMAP_PATH) && ./configure --prefix=/ $(CROSS_COMPILE_FLAG) --enable-static --disable-shared --without-ndiff --without-zenmap --without-nping --with-libpcap=included --with-libpcre=included --with-libdnet=included --without-liblua --with-liblinear=included --without-nmap-update --without-openssl --with-pcap=linux
- $(MAKE) -C $(NMAP_PATH) build-ncat
+ cd $(NMAP_PATH) && ./configure --prefix=/ $(CROSS_COMPILE_FLAG) --enable-static --disable-shared --without-ndiff --without-zenmap --without-nping --with-libpcap=included --with-libpcre=included --with-libdnet=included --without-liblua --with-liblinear=included --without-nmap-update --without-openssl --with-pcap=linux --without-libssh
+ $(MAKE) -C $(NMAP_PATH)/libpcap
+ $(MAKE) -C $(NMAP_PATH)/ncat
$(STRIP) -s $@
clean:
@@ -379,7 +381,7 @@ distclean: clean
rm -rf $(DISTFILES_PATH)
menuconfig: $(KERNEL_BUILD_PATH)/.config
- $(MAKE) -C $(KERNEL_PATH) O=$(KERNEL_BUILD_PATH) ARCH=$(KERNEL_ARCH) CROSS_COMPILE=$(CROSS_COMPILE) CC="$(NOPIE_GCC)" menuconfig
+ $(MAKE) -C $(KERNEL_PATH) O=$(KERNEL_BUILD_PATH) ARCH=$(KERNEL_ARCH) CROSS_COMPILE=$(CROSS_COMPILE) menuconfig
.PHONY: qemu build clean distclean menuconfig
.DELETE_ON_ERROR:
--- a/tools/testing/selftests/wireguard/qemu/arch/m68k.config
+++ b/tools/testing/selftests/wireguard/qemu/arch/m68k.config
@@ -1,9 +1,9 @@
CONFIG_MMU=y
+CONFIG_M68KCLASSIC=y
CONFIG_M68040=y
CONFIG_MAC=y
CONFIG_SERIAL_PMACZILOG=y
CONFIG_SERIAL_PMACZILOG_TTYS=y
CONFIG_SERIAL_PMACZILOG_CONSOLE=y
-CONFIG_CMDLINE_BOOL=y
CONFIG_CMDLINE="console=ttyS0 wg.success=ttyS1"
CONFIG_FRAME_WARN=1024
--- a/tools/testing/selftests/wireguard/qemu/init.c
+++ b/tools/testing/selftests/wireguard/qemu/init.c
@@ -21,6 +21,7 @@
#include <sys/reboot.h>
#include <sys/utsname.h>
#include <sys/sendfile.h>
+#include <sys/sysmacros.h>
#include <linux/random.h>
#include <linux/version.h>
--- a/tools/testing/selftests/wireguard/qemu/kernel.config
+++ b/tools/testing/selftests/wireguard/qemu/kernel.config
@@ -39,6 +39,7 @@ CONFIG_PRINTK=y
CONFIG_KALLSYMS=y
CONFIG_BUG=y
CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y
+CONFIG_JUMP_LABEL=y
CONFIG_EMBEDDED=n
CONFIG_BASE_FULL=y
CONFIG_FUTEX=y
@@ -55,6 +56,7 @@ CONFIG_NO_HZ_IDLE=y
CONFIG_NO_HZ_FULL=n
CONFIG_HZ_PERIODIC=n
CONFIG_HIGH_RES_TIMERS=y
+CONFIG_COMPAT_32BIT_TIME=y
CONFIG_ARCH_RANDOM=y
CONFIG_FILE_LOCKING=y
CONFIG_POSIX_TIMERS=y

View File

@@ -0,0 +1,39 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Thu, 2 Jan 2020 17:47:50 +0100
Subject: [PATCH] wireguard: queueing: do not account for pfmemalloc when
clearing skb header
commit 04d2ea92a18417619182cbb79063f154892b0150 upstream.
Before 8b7008620b84 ("net: Don't copy pfmemalloc flag in __copy_skb_
header()"), the pfmemalloc flag used to be between headers_start and
headers_end, which is a region we clear when preparing the packet for
encryption/decryption. This is a parameter we certainly want to
preserve, which is why 8b7008620b84 moved it out of there. The code here
was written in a world before 8b7008620b84, though, where we had to
manually account for it. This commit brings things up to speed.
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
drivers/net/wireguard/queueing.h | 3 ---
1 file changed, 3 deletions(-)
--- a/drivers/net/wireguard/queueing.h
+++ b/drivers/net/wireguard/queueing.h
@@ -83,13 +83,10 @@ static inline __be16 wg_skb_examine_untr
static inline void wg_reset_packet(struct sk_buff *skb)
{
- const int pfmemalloc = skb->pfmemalloc;
-
skb_scrub_packet(skb, true);
memset(&skb->headers_start, 0,
offsetof(struct sk_buff, headers_end) -
offsetof(struct sk_buff, headers_start));
- skb->pfmemalloc = pfmemalloc;
skb->queue_mapping = 0;
skb->nohdr = 0;
skb->peeked = 0;
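
The memset() kept by this patch relies on the kernel's headers_start/headers_end marker trick: two zero-size members delimit the region of struct sk_buff that gets wiped, so a field moved outside that window (as pfmemalloc was by 8b7008620b84) no longer needs to be saved and restored by hand. A standalone sketch of the idiom follows, using a made-up demo_pkt struct rather than the real sk_buff layout.

#include <stddef.h>
#include <string.h>

/* Made-up struct for illustration; the real layout lives in sk_buff. */
struct demo_pkt {
	unsigned int refcount;		/* outside the window: untouched  */
	char headers_start[0];		/* zero-size marker: window start */
	unsigned int mark;
	unsigned int queue_mapping;
	char headers_end[0];		/* zero-size marker: window end   */
	void *data;			/* outside the window: untouched  */
};

static void demo_reset(struct demo_pkt *p)
{
	/* Wipe everything between the two markers in one memset(). */
	memset(&p->headers_start, 0,
	       offsetof(struct demo_pkt, headers_end) -
	       offsetof(struct demo_pkt, headers_start));
}

Anything placed between the markers is cleared in one pass; anything that must survive a reset simply lives outside them, which is exactly why the manual pfmemalloc save/restore became unnecessary.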

Some files were not shown because too many files have changed in this diff.