From 90166038fbdd388b46f3dd5f7e6ac5dad45167a6 Mon Sep 17 00:00:00 2001
From: Fabian Keil
Date: Thu, 12 May 2016 14:52:16 +0200
Subject: [PATCH 206/325] GELI: Use a dedicated uma zone for unauthenticated
 writes to onetime devices

... as they are likely to originate from the vm page daemon.

Previously the system could deadlock because the vm daemon was waiting
for pages to be written to disk, while GELI was waiting for the vm
daemon to make room for the buffer GELI needed to actually write the
pages:

(kgdb) where
#0  sched_switch (td=0xfffff800055bf9a0, newtd=0xfffff80002341000,
    flags=<optimized out>) at /usr/src/sys/kern/sched_ule.c:1969
#1  0xffffffff80962635 in mi_switch (flags=<optimized out>, newtd=0x0)
    at /usr/src/sys/kern/kern_synch.c:455
#2  0xffffffff809aaa3a in sleepq_wait (wchan=0x0, pri=0)
    at /usr/src/sys/kern/subr_sleepqueue.c:637
#3  0xffffffff80962038 in _sleep (ident=<optimized out>, lock=<optimized out>,
    priority=<optimized out>, wmesg=0xffffffff80e826ee "vmwait", sbt=0,
    pr=<optimized out>, flags=<optimized out>)
    at /usr/src/sys/kern/kern_synch.c:229
#4  0xffffffff80c1ac6b in vm_wait () at /usr/src/sys/vm/vm_page.c:2705
#5  0xffffffff80c06a9f in kmem_back (object=0xffffffff8144d6f0,
    addr=18446741874805047296, size=69632, flags=<optimized out>)
    at /usr/src/sys/vm/vm_kern.c:356
#6  0xffffffff80c068d2 in kmem_malloc (vmem=0xffffffff813aa500, size=69632,
    flags=2) at /usr/src/sys/vm/vm_kern.c:316
#7  0xffffffff80bfd7d6 in uma_large_malloc (size=69632, wait=2)
    at /usr/src/sys/vm/uma_core.c:1106
#8  0xffffffff8092f614 in malloc (size=<optimized out>,
    mtp=0xffffffff81b4d520, flags=0) at /usr/src/sys/kern/kern_malloc.c:513
#9  0xffffffff81b4ab99 in g_eli_crypto_run (wr=0xfffff80002560040,
    bp=0xfffff80008a86d90)
    at /usr/src/sys/modules/geom/geom_eli/../../../geom/eli/g_eli_privacy.c:262
#10 0xffffffff81b3e860 in g_eli_worker (arg=0xfffff80002560040)
    at /usr/src/sys/modules/geom/geom_eli/../../../geom/eli/g_eli.c:565
#11 0xffffffff80910f5c in fork_exit (callout=0xffffffff81b3e0b0 <g_eli_worker>,
    arg=0xfffff80002560040, frame=0xfffffe005005ec00)
    at /usr/src/sys/kern/kern_fork.c:1034
#12 0xffffffff80c33f0e in fork_trampoline ()
    at /usr/src/sys/amd64/amd64/exception.S:611
#13 0x0000000000000000 in ?? ()

(kgdb) p vm_cnt
$16 = {v_swtch = 0, v_trap = 0, v_syscall = 0, v_intr = 0, v_soft = 0,
  v_vm_faults = 0, v_io_faults = 0, v_cow_faults = 0, v_cow_optim = 0,
  v_zfod = 0, v_ozfod = 0, v_swapin = 0, v_swapout = 0, v_swappgsin = 0,
  v_swappgsout = 0, v_vnodein = 0, v_vnodeout = 0, v_vnodepgsin = 0,
  v_vnodepgsout = 0, v_intrans = 0, v_reactivated = 0, v_pdwakeups = 22197,
  v_pdpages = 0, v_tcached = 0, v_dfree = 0, v_pfree = 0, v_tfree = 0,
  v_page_size = 4096, v_page_count = 247688, v_free_reserved = 372,
  v_free_target = 5320, v_free_min = 1609, v_free_count = 2,
  v_wire_count = 140735, v_active_count = 96194, v_inactive_target = 7980,
  v_inactive_count = 10756, v_cache_count = 0, v_pageout_free_min = 34,
  v_interrupt_free_min = 2, v_free_severe = 990, v_forks = 0, v_vforks = 0,
  v_rforks = 0, v_kthreads = 0, v_forkpages = 0, v_vforkpages = 0,
  v_rforkpages = 0, v_kthreadpages = 0, v_spare = 0xffffffff8144d5ac}

A sysctl (kern.geom.eli.use_uma_for_all_writes) is added to optionally
use the zone for unauthenticated GELI writes in general, without letting
common writes cut into the reserve for onetime writes. This may reduce
latency for larger writes, and since we need to keep a couple of items
in the zone anyway, the impact on the zone size is minor.
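The new sysctls (including kern.geom.eli.uma_reserve, which sizes the
reserve kept for onetime writes) are read-only tunables (CTLFLAG_RDTUN)
and therefore have to be set at boot time, typically from loader.conf.
Purely as an illustration (the values below are made up), enabling the
zone for all unauthenticated writes and raising the reserve would look
like this:

  kern.geom.eli.use_uma_for_all_writes="1"
  kern.geom.eli.uma_reserve="4"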
Currently a single zone with a somewhat humongous item size, sufficient
for unauthenticated GELI writes up to MAXPHYS, is being used. While this
may look a bit wasteful, in practice we don't need a lot of items, so
this seems tolerable for now. For writes that are too big to fit into a
UMA zone item, GELI falls back to using malloc() again.

The UMA zone item size is chosen based on kern.geom.eli.max_uma_bio_length.
The pager never seems to issue writes above MAXPHYS, but ZFS will issue
writes up to the recordsize. The best solution would probably be to only
use the dedicated uma zone for common writes if the size is above 65536
bytes, the largest zone item size internally used by malloc().

Currently the zone isn't used for reads, as those are less time-critical
and are usually small enough for malloc() to succeed right away anyway.
The zone isn't used for authenticated writes either, mainly because they
aren't used by default for swapping and supporting them would complicate
the code without obvious gain.

Example length distribution when reproducing ElectroBSD with -j4 and
1 GB of RAM:

  gpt/swap-ada1.eli BIO_WRITE
           value  ------------- Distribution ------------- count
          < 4096 |                                         0
            4096 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@             4965848
            8192 |@@@@@                                    943980
           12288 |@@                                       362668
           16384 |@                                        161485
           20480 |@                                        120939
           24576 |                                         87827
           28672 |                                         57402
           32768 |                                         40470
           36864 |                                         42243
           40960 |                                         28543
           45056 |                                         20347
           49152 |                                         15235
           53248 |                                         13450
           57344 |                                         9535
           61440 |                                         9952
           65536 |@                                        179360
           69632 |                                         0

  gpt/swap-ada1.eli BIO_READ
           value  ------------- Distribution ------------- count
          < 4096 |                                         0
            4096 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 4645114
            8192 |                                         0
           12288 |                                         0
           16384 |                                         3446
           20480 |                                         0

Note that the GELI overhead is not accounted for here and only the
results for the swap device are shown.

Zone use:

[fk@elektrobier3 ~]$ vmstat -z | egrep 'ITEM|eli' | column -t
ITEM    SIZE     LIMIT  USED  FREE  REQ       FAIL  SLEEP
g_eli:  172032,  0,     0,    14,   8077487,  0,    0

This includes writes to gpt/swap-ada1.eli and gpt/dpool-ada1.eli.

Discussion: While the zone served 8077487 memory requests in total,
14 items were sufficient for this, and therefore the zone only withheld
172032 * 14 bytes (about 2.3 MB) plus zone metadata from the rest of
the system.
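For reference, a minimal userland sketch of the item size calculation
done in g_eli_init(), assuming MAXPHYS is 128 KiB and assuming that
sizeof(struct cryptop) + sizeof(struct cryptodesc) adds up to 160 bytes,
which is what the 172032 byte item size reported by vmstat above implies
for this system; the ASSUMED_* defines are stand-ins for the real kernel
values:

#include <stdio.h>

/* Assumed stand-ins; the kernel uses MAXPHYS and the real struct sizes. */
#define ASSUMED_MAX_BIO_LENGTH  131072  /* MAXPHYS, 128 KiB */
#define ASSUMED_CRYPTO_OVERHEAD 160     /* cryptop + cryptodesc, per sector */

int
main(void)
{
	unsigned int item_size;

	/* Same arithmetic as g_eli_init(): payload plus the overhead of
	 * one crypto descriptor per 512-byte sector. */
	item_size = ASSUMED_MAX_BIO_LENGTH +
	    (ASSUMED_MAX_BIO_LENGTH / 512) * ASSUMED_CRYPTO_OVERHEAD;
	printf("zone item size: %u bytes\n", item_size);
	return (0);
}

Compiled and run, this prints "zone item size: 172032 bytes", matching
the SIZE column in the vmstat output.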
PR:			https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=209759
PR submission date:	2016-05-26
Obtained from:		ElectroBSD
---
 sys/geom/eli/g_eli.c         | 58 ++++++++++++++++++++++++--
 sys/geom/eli/g_eli.h         |  3 ++
 sys/geom/eli/g_eli_privacy.c | 81 +++++++++++++++++++++++++++++++++---
 3 files changed, 134 insertions(+), 8 deletions(-)

diff --git a/sys/geom/eli/g_eli.c b/sys/geom/eli/g_eli.c
index 54f10b2857c5..5aaaf44c92ad 100644
--- a/sys/geom/eli/g_eli.c
+++ b/sys/geom/eli/g_eli.c
@@ -86,6 +86,33 @@ u_int g_eli_batch = 0;
 SYSCTL_UINT(_kern_geom_eli, OID_AUTO, batch, CTLFLAG_RWTUN, &g_eli_batch, 0,
     "Use crypto operations batching");
 
+uma_zone_t g_eli_zone;
+static u_int g_eli_uma_reserve = 1;
+SYSCTL_UINT(_kern_geom_eli, OID_AUTO, uma_reserve, CTLFLAG_RDTUN,
+    &g_eli_uma_reserve, 0, "Items to pre-allocate in dedicated uma zone "
+    "and reserve for unauthenticated writes to onetime disks");
+
+u_int g_eli_all_writes_use_uma = 0;
+SYSCTL_UINT(_kern_geom_eli, OID_AUTO, use_uma_for_all_writes, CTLFLAG_RDTUN,
+    &g_eli_all_writes_use_uma, 0, "Use the dedicated uma zone for all "
+    "unauthenticated writes that fit, not just for writes to onetime "
+    "providers. May reduce write latency but also increases memory usage.");
+
+/*
+ * Items in the dedicated uma zone have a fixed size and need
+ * to be big enough for all bio lengths for which it is being used.
+ *
+ * g_eli_max_uma_bio_length is the largest amount of data geli can receive
+ * in a row and fit into an uma zone item, it is used to calculate
+ * g_eli_zone_item_size which additionally accounts for the encryption
+ * overhead, which depends on the number of sectors.
+ */
+u_int g_eli_max_uma_bio_length = MAXPHYS;
+SYSCTL_UINT(_kern_geom_eli, OID_AUTO, max_uma_bio_length, CTLFLAG_RDTUN,
+    &g_eli_max_uma_bio_length, 0,
+    "Maximum bio length for which the uma zone is being used.");
+u_int g_eli_zone_item_size;
+
 /*
  * Passphrase cached during boot, in order to be more user-friendly if
  * there are multiple providers using the same passphrase.
@@ -303,8 +330,8 @@ g_eli_write_done(struct bio *bp)
 	pbp->bio_inbed++;
 	if (pbp->bio_inbed < pbp->bio_children)
 		return;
-	free(pbp->bio_driver2, M_ELI);
-	pbp->bio_driver2 = NULL;
+	sc = pbp->bio_to->geom->softc;
+	g_eli_free_driver2(pbp, sc->sc_flags);
 	if (pbp->bio_error != 0) {
 		G_ELI_LOGREQ(0, pbp, "%s() failed (error=%d)", __func__,
 		    pbp->bio_error);
@@ -315,7 +342,6 @@ g_eli_write_done(struct bio *bp)
 	/*
 	 * Write is finished, send it up.
 	 */
-	sc = pbp->bio_to->geom->softc;
 	g_io_deliver(pbp, pbp->bio_error);
 	if (sc != NULL)
 		atomic_subtract_int(&sc->sc_inflight, 1);
@@ -1356,6 +1382,31 @@ static void
 g_eli_init(struct g_class *mp)
 {
 
+	/*
+	 * Calculate uma zone item size based on the largest bio length
+	 * we need to use it for.
+	 *
+	 * 512 bytes is the smallest sector size supported and results in the
+	 * largest overhead. If larger sectors are being used, we'll just waste
+	 * a bit more memory.
+	 *
+	 * We currently only use the zone for unauthenticated writes,
+	 * otherwise the item size would have to be even bigger.
+	 */
+	g_eli_zone_item_size = (g_eli_max_uma_bio_length + \
+	    (g_eli_max_uma_bio_length / 512) * \
+	    (sizeof(struct cryptop) + sizeof(struct cryptodesc)));
+
+	G_ELI_DEBUG(3, "Using uma zone item size %d for max bio length %d",
+	    g_eli_zone_item_size, g_eli_max_uma_bio_length);
+	g_eli_zone = uma_zcreate("g_eli", g_eli_zone_item_size, NULL, NULL,
+	    NULL, NULL, 0, UMA_ZONE_NOFREE);
+	if (0 < g_eli_uma_reserve) {
+		/* Increase the chances that items are available when needed. */
+		uma_prealloc(g_eli_zone, g_eli_uma_reserve);
+		uma_zone_reserve(g_eli_zone, g_eli_uma_reserve);
+	}
+
 	g_eli_pre_sync = EVENTHANDLER_REGISTER(shutdown_pre_sync,
 	    g_eli_shutdown_pre_sync, mp, SHUTDOWN_PRI_FIRST);
 	if (g_eli_pre_sync == NULL)
@@ -1366,6 +1417,7 @@ static void
 g_eli_fini(struct g_class *mp)
 {
 
+	uma_zdestroy(g_eli_zone);
 	if (g_eli_pre_sync != NULL)
 		EVENTHANDLER_DEREGISTER(shutdown_pre_sync, g_eli_pre_sync);
 }
diff --git a/sys/geom/eli/g_eli.h b/sys/geom/eli/g_eli.h
index 5141081710a5..5cfb3935d3c4 100644
--- a/sys/geom/eli/g_eli.h
+++ b/sys/geom/eli/g_eli.h
@@ -147,6 +147,8 @@
 #if (MAX_KEY_BYTES < G_ELI_DATAIVKEYLEN)
 #error "MAX_KEY_BYTES is less than G_ELI_DATAKEYLEN"
 #endif
+extern u_int g_eli_max_uma_bio_length;
+extern u_int g_eli_zone_item_size;
 
 extern int g_eli_debug;
 extern u_int g_eli_overwrites;
@@ -722,5 +724,6 @@ void g_eli_key_init(struct g_eli_softc *sc);
 void g_eli_key_destroy(struct g_eli_softc *sc);
 uint8_t *g_eli_key_hold(struct g_eli_softc *sc, off_t offset, size_t blocksize);
 void g_eli_key_drop(struct g_eli_softc *sc, uint8_t *rawkey);
+void g_eli_free_driver2(struct bio *bp, int sc_flags);
 #endif
 #endif	/* !_G_ELI_H_ */
diff --git a/sys/geom/eli/g_eli_privacy.c b/sys/geom/eli/g_eli_privacy.c
index 6dbda7f0ef03..99e60e0e677d 100644
--- a/sys/geom/eli/g_eli_privacy.c
+++ b/sys/geom/eli/g_eli_privacy.c
@@ -51,6 +51,9 @@ __FBSDID("$FreeBSD$");
 #include <geom/eli/g_eli.h>
 #include <geom/eli/pkcs5v2.h>
 
+extern u_int g_eli_all_writes_use_uma;
+extern uma_zone_t g_eli_zone;
+
 /*
  * Code paths:
  * BIO_READ:
@@ -61,6 +64,76 @@ __FBSDID("$FreeBSD$");
 
 MALLOC_DECLARE(M_ELI);
 
+static int
+g_eli_use_uma_zone(const struct bio *bp, int sc_flags)
+{
+
+	return (bp->bio_cmd == BIO_WRITE &&
+	    bp->bio_length <= g_eli_max_uma_bio_length &&
+	    ((sc_flags & G_ELI_FLAG_ONETIME) != 0 || g_eli_all_writes_use_uma) &&
+	    (sc_flags & G_ELI_FLAG_AUTH) == 0);
+}
+
+static void *
+g_eli_alloc_driver2(const struct bio *bp, size_t size, int sc_flags)
+{
+	void *p;
+
+	/*
+	 * While requests from the authentication code path should
+	 * be handled correctly (by never using the uma zone) they
+	 * currently aren't expected to occur at all.
+	 */
+	KASSERT((sc_flags & G_ELI_FLAG_AUTH) == 0,
+	    ("g_eli_alloc_driver2() used for authenticated write"));
+
+	if (g_eli_use_uma_zone(bp, sc_flags)) {
+		int uma_flags;
+
+		KASSERT(size <= g_eli_zone_item_size,
+		    ("Insufficient g_eli_zone_item_size %u < %u",
+		    (unsigned)g_eli_zone_item_size, (unsigned)size));
+		/*
+		 * Writes to onetime providers are likely to originate
+		 * from the page daemon, therefore we try to get the
+		 * memory a bit harder for them to prevent vm deadlocks.
+		 */
+		if ((sc_flags & G_ELI_FLAG_ONETIME) != 0)
+			uma_flags = M_NOWAIT|M_USE_RESERVE;
+		else
+			uma_flags = M_WAITOK;
+
+		G_ELI_DEBUG(3, "Using uma zone for size: %jd, bio_length: %jd",
+		    (intmax_t)size, bp->bio_length);
+		while (NULL == (p = uma_zalloc(g_eli_zone, uma_flags))) {
+			/* Only reachable for onetime providers */
+			pause("g_eli:uma", min(hz/1000, 1));
+		}
+	} else {
+		G_ELI_DEBUG(3, "Using malloc for size: %jd, bio_length: %jd",
+		    (intmax_t)size, bp->bio_length);
+		p = malloc(size, M_ELI, M_WAITOK);
+	}
+
+	return (p);
+}
+
+void
+g_eli_free_driver2(struct bio *bp, int sc_flags)
+{
+
+	if (g_eli_use_uma_zone(bp, sc_flags)) {
+		G_ELI_DEBUG(3, "Using uma_free for bio_length: %jd",
+		    bp->bio_length);
+		uma_zfree(g_eli_zone, bp->bio_driver2);
+	} else {
+		G_ELI_DEBUG(3, "Using free for bio_length: %jd",
+		    bp->bio_length);
+		free(bp->bio_driver2, M_ELI);
+	}
+	bp->bio_driver2 = NULL;
+}
+
 /*
  * The function is called after we read and decrypt data.
  *
@@ -96,8 +169,7 @@ g_eli_crypto_read_done(struct cryptop *crp)
 	 */
 	if (bp->bio_inbed < bp->bio_children)
 		return (0);
-	free(bp->bio_driver2, M_ELI);
-	bp->bio_driver2 = NULL;
+	g_eli_free_driver2(bp, sc->sc_flags);
 	if (bp->bio_error != 0) {
 		G_ELI_LOGREQ(0, bp, "Crypto READ request failed (error=%d).",
 		    bp->bio_error);
@@ -155,8 +227,7 @@ g_eli_crypto_write_done(struct cryptop *crp)
 	if (bp->bio_error != 0) {
 		G_ELI_LOGREQ(0, bp, "Crypto WRITE request failed (error=%d).",
 		    bp->bio_error);
-		free(bp->bio_driver2, M_ELI);
-		bp->bio_driver2 = NULL;
+		g_eli_free_driver2(bp, sc->sc_flags);
 		g_destroy_bio(cbp);
 		g_io_deliver(bp, bp->bio_error);
 		atomic_subtract_int(&sc->sc_inflight, 1);
@@ -261,7 +332,7 @@ g_eli_crypto_run(struct g_eli_worker *wr, struct bio *bp)
 	 */
 	if (bp->bio_cmd == BIO_WRITE)
 		size += bp->bio_length;
-	p = malloc(size, M_ELI, M_WAITOK);
+	p = g_eli_alloc_driver2(bp, size, sc->sc_flags);
 
 	bp->bio_inbed = 0;
 	bp->bio_children = nsec;
-- 
2.32.0