diff --git a/linux-tkg-patches/5.15/0008-5.15-bcachefs.patch b/linux-tkg-patches/5.15/0008-5.15-bcachefs.patch index 68cfa62..d722aad 100644 --- a/linux-tkg-patches/5.15/0008-5.15-bcachefs.patch +++ b/linux-tkg-patches/5.15/0008-5.15-bcachefs.patch @@ -1,5 +1,355 @@ +From 6032ed7e926efcff82d52458c7fd7a42c255cea1 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Sun, 6 Nov 2022 10:52:14 +0100 +Subject: [PATCH] bcachefs-5.15 + +Signed-off-by: Peter Jung +--- + block/bio.c | 34 +- + block/blk-core.c | 13 +- + drivers/block/loop.c | 1 - + drivers/md/bcache/Kconfig | 10 +- + drivers/md/bcache/Makefile | 4 +- + drivers/md/bcache/bcache.h | 2 +- + drivers/md/bcache/super.c | 1 - + drivers/md/bcache/util.h | 3 +- + fs/Kconfig | 1 + + fs/Makefile | 1 + + fs/bcachefs/Kconfig | 52 + + fs/bcachefs/Makefile | 65 + + fs/bcachefs/acl.c | 406 ++ + fs/bcachefs/acl.h | 58 + + fs/bcachefs/alloc_background.c | 1343 +++++++ + fs/bcachefs/alloc_background.h | 139 + + fs/bcachefs/alloc_foreground.c | 1263 ++++++ + fs/bcachefs/alloc_foreground.h | 173 + + fs/bcachefs/alloc_types.h | 87 + + fs/bcachefs/bcachefs.h | 974 +++++ + fs/bcachefs/bcachefs_format.h | 1986 ++++++++++ + fs/bcachefs/bcachefs_ioctl.h | 367 ++ + fs/bcachefs/bkey.c | 1172 ++++++ + fs/bcachefs/bkey.h | 566 +++ + fs/bcachefs/bkey_buf.h | 60 + + fs/bcachefs/bkey_methods.c | 463 +++ + fs/bcachefs/bkey_methods.h | 105 + + fs/bcachefs/bkey_sort.c | 198 + + fs/bcachefs/bkey_sort.h | 44 + + fs/bcachefs/bset.c | 1598 ++++++++ + fs/bcachefs/bset.h | 615 +++ + fs/bcachefs/btree_cache.c | 1160 ++++++ + fs/bcachefs/btree_cache.h | 107 + + fs/bcachefs/btree_gc.c | 2102 ++++++++++ + fs/bcachefs/btree_gc.h | 105 + + fs/bcachefs/btree_io.c | 2111 ++++++++++ + fs/bcachefs/btree_io.h | 222 ++ + fs/bcachefs/btree_iter.c | 3329 ++++++++++++++++ + fs/bcachefs/btree_iter.h | 406 ++ + fs/bcachefs/btree_key_cache.c | 743 ++++ + fs/bcachefs/btree_key_cache.h | 45 + + fs/bcachefs/btree_locking.h | 259 ++ + fs/bcachefs/btree_types.h | 713 ++++ + 
fs/bcachefs/btree_update.h | 141 + + fs/bcachefs/btree_update_interior.c | 2238 +++++++++++ + fs/bcachefs/btree_update_interior.h | 321 ++ + fs/bcachefs/btree_update_leaf.c | 1756 +++++++++ + fs/bcachefs/buckets.c | 2122 ++++++++++ + fs/bcachefs/buckets.h | 298 ++ + fs/bcachefs/buckets_types.h | 104 + + fs/bcachefs/buckets_waiting_for_journal.c | 167 + + fs/bcachefs/buckets_waiting_for_journal.h | 15 + + .../buckets_waiting_for_journal_types.h | 23 + + fs/bcachefs/chardev.c | 761 ++++ + fs/bcachefs/chardev.h | 31 + + fs/bcachefs/checksum.c | 665 ++++ + fs/bcachefs/checksum.h | 204 + + fs/bcachefs/clock.c | 191 + + fs/bcachefs/clock.h | 38 + + fs/bcachefs/clock_types.h | 37 + + fs/bcachefs/compress.c | 641 +++ + fs/bcachefs/compress.h | 18 + + fs/bcachefs/darray.h | 76 + + fs/bcachefs/debug.c | 628 +++ + fs/bcachefs/debug.h | 30 + + fs/bcachefs/dirent.c | 545 +++ + fs/bcachefs/dirent.h | 67 + + fs/bcachefs/disk_groups.c | 506 +++ + fs/bcachefs/disk_groups.h | 90 + + fs/bcachefs/ec.c | 1682 ++++++++ + fs/bcachefs/ec.h | 228 ++ + fs/bcachefs/ec_types.h | 46 + + fs/bcachefs/errcode.h | 12 + + fs/bcachefs/error.c | 185 + + fs/bcachefs/error.h | 238 ++ + fs/bcachefs/extent_update.c | 178 + + fs/bcachefs/extent_update.h | 12 + + fs/bcachefs/extents.c | 1259 ++++++ + fs/bcachefs/extents.h | 688 ++++ + fs/bcachefs/extents_types.h | 40 + + fs/bcachefs/eytzinger.h | 281 ++ + fs/bcachefs/fifo.h | 127 + + fs/bcachefs/fs-common.c | 494 +++ + fs/bcachefs/fs-common.h | 43 + + fs/bcachefs/fs-io.c | 3495 +++++++++++++++++ + fs/bcachefs/fs-io.h | 57 + + fs/bcachefs/fs-ioctl.c | 523 +++ + fs/bcachefs/fs-ioctl.h | 81 + + fs/bcachefs/fs.c | 1940 +++++++++ + fs/bcachefs/fs.h | 208 + + fs/bcachefs/fsck.c | 2356 +++++++++++ + fs/bcachefs/fsck.h | 8 + + fs/bcachefs/inode.c | 720 ++++ + fs/bcachefs/inode.h | 204 + + fs/bcachefs/io.c | 2416 ++++++++++++ + fs/bcachefs/io.h | 189 + + fs/bcachefs/io_types.h | 161 + + fs/bcachefs/journal.c | 1410 +++++++ + fs/bcachefs/journal.h | 522 +++ + 
fs/bcachefs/journal_io.c | 1700 ++++++++ + fs/bcachefs/journal_io.h | 60 + + fs/bcachefs/journal_reclaim.c | 847 ++++ + fs/bcachefs/journal_reclaim.h | 86 + + fs/bcachefs/journal_sb.c | 222 ++ + fs/bcachefs/journal_sb.h | 24 + + fs/bcachefs/journal_seq_blacklist.c | 322 ++ + fs/bcachefs/journal_seq_blacklist.h | 22 + + fs/bcachefs/journal_types.h | 340 ++ + fs/bcachefs/keylist.c | 67 + + fs/bcachefs/keylist.h | 76 + + fs/bcachefs/keylist_types.h | 16 + + fs/bcachefs/lru.c | 203 + + fs/bcachefs/lru.h | 17 + + fs/bcachefs/migrate.c | 196 + + fs/bcachefs/migrate.h | 7 + + fs/bcachefs/move.c | 1130 ++++++ + fs/bcachefs/move.h | 73 + + fs/bcachefs/move_types.h | 19 + + fs/bcachefs/movinggc.c | 424 ++ + fs/bcachefs/movinggc.h | 9 + + fs/bcachefs/opts.c | 560 +++ + fs/bcachefs/opts.h | 517 +++ + fs/bcachefs/quota.c | 852 ++++ + fs/bcachefs/quota.h | 71 + + fs/bcachefs/quota_types.h | 43 + + fs/bcachefs/rebalance.c | 349 ++ + fs/bcachefs/rebalance.h | 28 + + fs/bcachefs/rebalance_types.h | 26 + + fs/bcachefs/recovery.c | 1472 +++++++ + fs/bcachefs/recovery.h | 66 + + fs/bcachefs/reflink.c | 404 ++ + fs/bcachefs/reflink.h | 73 + + fs/bcachefs/replicas.c | 1073 +++++ + fs/bcachefs/replicas.h | 106 + + fs/bcachefs/replicas_types.h | 10 + + fs/bcachefs/siphash.c | 173 + + fs/bcachefs/siphash.h | 87 + + fs/bcachefs/str_hash.h | 351 ++ + fs/bcachefs/subvolume.c | 1075 +++++ + fs/bcachefs/subvolume.h | 124 + + fs/bcachefs/subvolume_types.h | 9 + + fs/bcachefs/super-io.c | 1601 ++++++++ + fs/bcachefs/super-io.h | 126 + + fs/bcachefs/super.c | 1966 ++++++++++ + fs/bcachefs/super.h | 264 ++ + fs/bcachefs/super_types.h | 51 + + fs/bcachefs/sysfs.c | 889 +++++ + fs/bcachefs/sysfs.h | 44 + + fs/bcachefs/tests.c | 947 +++++ + fs/bcachefs/tests.h | 15 + + fs/bcachefs/trace.c | 12 + + fs/bcachefs/util.c | 984 +++++ + fs/bcachefs/util.h | 877 +++++ + fs/bcachefs/varint.c | 120 + + fs/bcachefs/varint.h | 11 + + fs/bcachefs/vstructs.h | 63 + + fs/bcachefs/xattr.c | 629 +++ + 
fs/bcachefs/xattr.h | 50 + + fs/dcache.c | 10 +- + fs/inode.c | 218 +- + include/linux/bio.h | 7 +- + include/linux/blkdev.h | 1 + + .../md/bcache => include/linux}/closure.h | 39 +- + include/linux/compiler_attributes.h | 5 + + include/linux/dcache.h | 1 + + include/linux/exportfs.h | 6 + + include/linux/fs.h | 9 +- + include/linux/generic-radix-tree.h | 6 + + include/linux/list_bl.h | 22 + + include/linux/lockdep.h | 4 + + include/linux/sched.h | 1 + + include/linux/six.h | 203 + + include/linux/vmalloc.h | 1 + + include/trace/events/bcachefs.h | 1034 +++++ + init/init_task.c | 1 + + kernel/Kconfig.locks | 3 + + kernel/locking/Makefile | 1 + + kernel/locking/lockdep.c | 20 + + kernel/locking/six.c | 759 ++++ + kernel/module.c | 4 +- + lib/Kconfig | 3 + + lib/Kconfig.debug | 9 + + lib/Makefile | 2 + + {drivers/md/bcache => lib}/closure.c | 35 +- + lib/generic-radix-tree.c | 17 +- + mm/filemap.c | 1 + + mm/nommu.c | 18 + + mm/vmalloc.c | 21 + + 188 files changed, 78910 insertions(+), 151 deletions(-) + create mode 100644 fs/bcachefs/Kconfig + create mode 100644 fs/bcachefs/Makefile + create mode 100644 fs/bcachefs/acl.c + create mode 100644 fs/bcachefs/acl.h + create mode 100644 fs/bcachefs/alloc_background.c + create mode 100644 fs/bcachefs/alloc_background.h + create mode 100644 fs/bcachefs/alloc_foreground.c + create mode 100644 fs/bcachefs/alloc_foreground.h + create mode 100644 fs/bcachefs/alloc_types.h + create mode 100644 fs/bcachefs/bcachefs.h + create mode 100644 fs/bcachefs/bcachefs_format.h + create mode 100644 fs/bcachefs/bcachefs_ioctl.h + create mode 100644 fs/bcachefs/bkey.c + create mode 100644 fs/bcachefs/bkey.h + create mode 100644 fs/bcachefs/bkey_buf.h + create mode 100644 fs/bcachefs/bkey_methods.c + create mode 100644 fs/bcachefs/bkey_methods.h + create mode 100644 fs/bcachefs/bkey_sort.c + create mode 100644 fs/bcachefs/bkey_sort.h + create mode 100644 fs/bcachefs/bset.c + create mode 100644 fs/bcachefs/bset.h + create mode 100644 
fs/bcachefs/btree_cache.c + create mode 100644 fs/bcachefs/btree_cache.h + create mode 100644 fs/bcachefs/btree_gc.c + create mode 100644 fs/bcachefs/btree_gc.h + create mode 100644 fs/bcachefs/btree_io.c + create mode 100644 fs/bcachefs/btree_io.h + create mode 100644 fs/bcachefs/btree_iter.c + create mode 100644 fs/bcachefs/btree_iter.h + create mode 100644 fs/bcachefs/btree_key_cache.c + create mode 100644 fs/bcachefs/btree_key_cache.h + create mode 100644 fs/bcachefs/btree_locking.h + create mode 100644 fs/bcachefs/btree_types.h + create mode 100644 fs/bcachefs/btree_update.h + create mode 100644 fs/bcachefs/btree_update_interior.c + create mode 100644 fs/bcachefs/btree_update_interior.h + create mode 100644 fs/bcachefs/btree_update_leaf.c + create mode 100644 fs/bcachefs/buckets.c + create mode 100644 fs/bcachefs/buckets.h + create mode 100644 fs/bcachefs/buckets_types.h + create mode 100644 fs/bcachefs/buckets_waiting_for_journal.c + create mode 100644 fs/bcachefs/buckets_waiting_for_journal.h + create mode 100644 fs/bcachefs/buckets_waiting_for_journal_types.h + create mode 100644 fs/bcachefs/chardev.c + create mode 100644 fs/bcachefs/chardev.h + create mode 100644 fs/bcachefs/checksum.c + create mode 100644 fs/bcachefs/checksum.h + create mode 100644 fs/bcachefs/clock.c + create mode 100644 fs/bcachefs/clock.h + create mode 100644 fs/bcachefs/clock_types.h + create mode 100644 fs/bcachefs/compress.c + create mode 100644 fs/bcachefs/compress.h + create mode 100644 fs/bcachefs/darray.h + create mode 100644 fs/bcachefs/debug.c + create mode 100644 fs/bcachefs/debug.h + create mode 100644 fs/bcachefs/dirent.c + create mode 100644 fs/bcachefs/dirent.h + create mode 100644 fs/bcachefs/disk_groups.c + create mode 100644 fs/bcachefs/disk_groups.h + create mode 100644 fs/bcachefs/ec.c + create mode 100644 fs/bcachefs/ec.h + create mode 100644 fs/bcachefs/ec_types.h + create mode 100644 fs/bcachefs/errcode.h + create mode 100644 fs/bcachefs/error.c + create mode 
100644 fs/bcachefs/error.h + create mode 100644 fs/bcachefs/extent_update.c + create mode 100644 fs/bcachefs/extent_update.h + create mode 100644 fs/bcachefs/extents.c + create mode 100644 fs/bcachefs/extents.h + create mode 100644 fs/bcachefs/extents_types.h + create mode 100644 fs/bcachefs/eytzinger.h + create mode 100644 fs/bcachefs/fifo.h + create mode 100644 fs/bcachefs/fs-common.c + create mode 100644 fs/bcachefs/fs-common.h + create mode 100644 fs/bcachefs/fs-io.c + create mode 100644 fs/bcachefs/fs-io.h + create mode 100644 fs/bcachefs/fs-ioctl.c + create mode 100644 fs/bcachefs/fs-ioctl.h + create mode 100644 fs/bcachefs/fs.c + create mode 100644 fs/bcachefs/fs.h + create mode 100644 fs/bcachefs/fsck.c + create mode 100644 fs/bcachefs/fsck.h + create mode 100644 fs/bcachefs/inode.c + create mode 100644 fs/bcachefs/inode.h + create mode 100644 fs/bcachefs/io.c + create mode 100644 fs/bcachefs/io.h + create mode 100644 fs/bcachefs/io_types.h + create mode 100644 fs/bcachefs/journal.c + create mode 100644 fs/bcachefs/journal.h + create mode 100644 fs/bcachefs/journal_io.c + create mode 100644 fs/bcachefs/journal_io.h + create mode 100644 fs/bcachefs/journal_reclaim.c + create mode 100644 fs/bcachefs/journal_reclaim.h + create mode 100644 fs/bcachefs/journal_sb.c + create mode 100644 fs/bcachefs/journal_sb.h + create mode 100644 fs/bcachefs/journal_seq_blacklist.c + create mode 100644 fs/bcachefs/journal_seq_blacklist.h + create mode 100644 fs/bcachefs/journal_types.h + create mode 100644 fs/bcachefs/keylist.c + create mode 100644 fs/bcachefs/keylist.h + create mode 100644 fs/bcachefs/keylist_types.h + create mode 100644 fs/bcachefs/lru.c + create mode 100644 fs/bcachefs/lru.h + create mode 100644 fs/bcachefs/migrate.c + create mode 100644 fs/bcachefs/migrate.h + create mode 100644 fs/bcachefs/move.c + create mode 100644 fs/bcachefs/move.h + create mode 100644 fs/bcachefs/move_types.h + create mode 100644 fs/bcachefs/movinggc.c + create mode 100644 
fs/bcachefs/movinggc.h + create mode 100644 fs/bcachefs/opts.c + create mode 100644 fs/bcachefs/opts.h + create mode 100644 fs/bcachefs/quota.c + create mode 100644 fs/bcachefs/quota.h + create mode 100644 fs/bcachefs/quota_types.h + create mode 100644 fs/bcachefs/rebalance.c + create mode 100644 fs/bcachefs/rebalance.h + create mode 100644 fs/bcachefs/rebalance_types.h + create mode 100644 fs/bcachefs/recovery.c + create mode 100644 fs/bcachefs/recovery.h + create mode 100644 fs/bcachefs/reflink.c + create mode 100644 fs/bcachefs/reflink.h + create mode 100644 fs/bcachefs/replicas.c + create mode 100644 fs/bcachefs/replicas.h + create mode 100644 fs/bcachefs/replicas_types.h + create mode 100644 fs/bcachefs/siphash.c + create mode 100644 fs/bcachefs/siphash.h + create mode 100644 fs/bcachefs/str_hash.h + create mode 100644 fs/bcachefs/subvolume.c + create mode 100644 fs/bcachefs/subvolume.h + create mode 100644 fs/bcachefs/subvolume_types.h + create mode 100644 fs/bcachefs/super-io.c + create mode 100644 fs/bcachefs/super-io.h + create mode 100644 fs/bcachefs/super.c + create mode 100644 fs/bcachefs/super.h + create mode 100644 fs/bcachefs/super_types.h + create mode 100644 fs/bcachefs/sysfs.c + create mode 100644 fs/bcachefs/sysfs.h + create mode 100644 fs/bcachefs/tests.c + create mode 100644 fs/bcachefs/tests.h + create mode 100644 fs/bcachefs/trace.c + create mode 100644 fs/bcachefs/util.c + create mode 100644 fs/bcachefs/util.h + create mode 100644 fs/bcachefs/varint.c + create mode 100644 fs/bcachefs/varint.h + create mode 100644 fs/bcachefs/vstructs.h + create mode 100644 fs/bcachefs/xattr.c + create mode 100644 fs/bcachefs/xattr.h + rename {drivers/md/bcache => include/linux}/closure.h (94%) + create mode 100644 include/linux/six.h + create mode 100644 include/trace/events/bcachefs.h + create mode 100644 kernel/locking/six.c + rename {drivers/md/bcache => lib}/closure.c (88%) + diff --git a/block/bio.c b/block/bio.c -index a6fb6a0b4295..3c9cc0000168 100644 
+index ba9120d4fe49..ba076c1547ff 100644 --- a/block/bio.c +++ b/block/bio.c @@ -526,15 +526,15 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs) @@ -21,7 +371,7 @@ index a6fb6a0b4295..3c9cc0000168 100644 /** * bio_truncate - truncate the bio to small size of @new_size -@@ -1284,17 +1284,28 @@ EXPORT_SYMBOL(bio_advance); +@@ -1265,17 +1265,27 @@ EXPORT_SYMBOL(bio_advance); void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter) { @@ -46,19 +396,18 @@ index a6fb6a0b4295..3c9cc0000168 100644 + memcpy(dst_p + dst_bv.bv_offset, + src_p + src_bv.bv_offset, + bytes); -+ + +- memcpy(dst_buf, src_buf, bytes); + kunmap_atomic(dst_p); + kunmap_atomic(src_p); - -- memcpy(dst_buf, src_buf, bytes); - + - kunmap_local(dst_buf); - kunmap_local(src_buf); + flush_dcache_page(dst_bv.bv_page); - + bio_advance_iter_single(src, src_iter, bytes); bio_advance_iter_single(dst, dst_iter, bytes); -@@ -1366,6 +1378,7 @@ void bio_set_pages_dirty(struct bio *bio) +@@ -1349,6 +1359,7 @@ void bio_set_pages_dirty(struct bio *bio) set_page_dirty_lock(bvec->bv_page); } } @@ -66,7 +415,7 @@ index a6fb6a0b4295..3c9cc0000168 100644 /* * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. 
-@@ -1425,6 +1438,7 @@ void bio_check_pages_dirty(struct bio *bio) +@@ -1408,6 +1419,7 @@ void bio_check_pages_dirty(struct bio *bio) spin_unlock_irqrestore(&bio_dirty_lock, flags); schedule_work(&bio_dirty_work); } @@ -75,10 +424,10 @@ index a6fb6a0b4295..3c9cc0000168 100644 static inline bool bio_remaining_done(struct bio *bio) { diff --git a/block/blk-core.c b/block/blk-core.c -index 4d8f5fe91588..96cf713f03a7 100644 +index 13e1fca1e923..99576a4c0bb1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c -@@ -214,18 +214,23 @@ int blk_status_to_errno(blk_status_t status) +@@ -215,18 +215,23 @@ int blk_status_to_errno(blk_status_t status) } EXPORT_SYMBOL_GPL(blk_status_to_errno); @@ -107,10 +456,10 @@ index 4d8f5fe91588..96cf713f03a7 100644 blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)), req->cmd_flags & ~REQ_OP_MASK, diff --git a/drivers/block/loop.c b/drivers/block/loop.c -index 7bf4686af774..80511131b884 100644 +index 79e485949b60..f5036056a430 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c -@@ -1412,7 +1412,6 @@ static int __loop_clr_fd(struct loop_device *lo, bool release) +@@ -1410,7 +1410,6 @@ static int __loop_clr_fd(struct loop_device *lo, bool release) partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev; lo_number = lo->lo_number; @@ -180,10 +529,10 @@ index 5fc989a6d452..e5e147d0e49a 100644 struct bucket { atomic_t pin; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index f2874c77ff79..7217e05107bf 100644 +index af4fa8071cbc..46cae9a7f7fb 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c -@@ -2910,7 +2910,6 @@ static int __init bcache_init(void) +@@ -2911,7 +2911,6 @@ static int __init bcache_init(void) goto err; bch_debug_init(); @@ -296,10 +645,10 @@ index 000000000000..27742ce276cd + Include some unit and performance tests for the core btree code diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile new file mode 100644 -index 000000000000..71cda24e6d08 +index 000000000000..7ddae26116a0 
--- /dev/null +++ b/fs/bcachefs/Makefile -@@ -0,0 +1,62 @@ +@@ -0,0 +1,65 @@ + +obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o + @@ -318,6 +667,7 @@ index 000000000000..71cda24e6d08 + btree_update_interior.o \ + btree_update_leaf.o \ + buckets.o \ ++ buckets_waiting_for_journal.o \ + chardev.o \ + checksum.o \ + clock.o \ @@ -339,8 +689,10 @@ index 000000000000..71cda24e6d08 + journal.o \ + journal_io.o \ + journal_reclaim.o \ ++ journal_sb.o \ + journal_seq_blacklist.o \ + keylist.o \ ++ lru.o \ + migrate.o \ + move.o \ + movinggc.o \ @@ -840,10 +1192,10 @@ index 000000000000..2d76a4897ba8 +#endif /* _BCACHEFS_ACL_H */ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c new file mode 100644 -index 000000000000..b2735c8591d6 +index 000000000000..e8a34eccac25 --- /dev/null +++ b/fs/bcachefs/alloc_background.c -@@ -0,0 +1,1312 @@ +@@ -0,0 +1,1343 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc_background.h" @@ -855,10 +1207,12 @@ index 000000000000..b2735c8591d6 +#include "btree_update_interior.h" +#include "btree_gc.h" +#include "buckets.h" ++#include "buckets_waiting_for_journal.h" +#include "clock.h" +#include "debug.h" +#include "ec.h" +#include "error.h" ++#include "lru.h" +#include "recovery.h" +#include "varint.h" + @@ -871,12 +1225,7 @@ index 000000000000..b2735c8591d6 +#include +#include + -+const char * const bch2_allocator_states[] = { -+#define x(n) #n, -+ ALLOC_THREAD_STATES() -+#undef x -+ NULL -+}; ++/* Persistent alloc info: */ + +static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { +#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8, @@ -884,7 +1233,28 @@ index 000000000000..b2735c8591d6 +#undef x +}; + -+/* Persistent alloc info: */ ++const char * const bch2_bucket_states[] = { ++ "free", ++ "need gc gens", ++ "need discard", ++ "cached", ++ "dirty", ++ NULL ++}; ++ ++struct bkey_alloc_unpacked { ++ u64 journal_seq; ++ u64 bucket; ++ u8 dev; ++ u8 gen; ++ u8 oldest_gen; ++ u8 data_type; ++ 
bool need_discard:1; ++ bool need_inc_gen:1; ++#define x(_name, _bits) u##_bits _name; ++ BCH_ALLOC_FIELDS_V2() ++#undef x ++}; + +static inline u64 alloc_field_v1_get(const struct bch_alloc *a, + const void **p, unsigned field) @@ -1006,6 +1376,8 @@ index 000000000000..b2735c8591d6 + out->gen = a.v->gen; + out->oldest_gen = a.v->oldest_gen; + out->data_type = a.v->data_type; ++ out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v); ++ out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v); + out->journal_seq = le64_to_cpu(a.v->journal_seq); + +#define x(_name, _bits) \ @@ -1027,47 +1399,7 @@ index 000000000000..b2735c8591d6 + return 0; +} + -+static void bch2_alloc_pack_v3(struct bkey_alloc_buf *dst, -+ const struct bkey_alloc_unpacked src) -+{ -+ struct bkey_i_alloc_v3 *a = bkey_alloc_v3_init(&dst->k); -+ unsigned nr_fields = 0, last_nonzero_fieldnr = 0; -+ u8 *out = a->v.data; -+ u8 *end = (void *) &dst[1]; -+ u8 *last_nonzero_field = out; -+ unsigned bytes; -+ -+ a->k.p = POS(src.dev, src.bucket); -+ a->v.gen = src.gen; -+ a->v.oldest_gen = src.oldest_gen; -+ a->v.data_type = src.data_type; -+ a->v.journal_seq = cpu_to_le64(src.journal_seq); -+ -+#define x(_name, _bits) \ -+ nr_fields++; \ -+ \ -+ if (src._name) { \ -+ out += bch2_varint_encode_fast(out, src._name); \ -+ \ -+ last_nonzero_field = out; \ -+ last_nonzero_fieldnr = nr_fields; \ -+ } else { \ -+ *out++ = 0; \ -+ } -+ -+ BCH_ALLOC_FIELDS_V2() -+#undef x -+ BUG_ON(out > end); -+ -+ out = last_nonzero_field; -+ a->v.nr_fields = last_nonzero_fieldnr; -+ -+ bytes = (u8 *) out - (u8 *) &a->v; -+ set_bkey_val_bytes(&a->k, bytes); -+ memset_u64s_tail(&a->v, 0, bytes); -+} -+ -+struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) ++static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) +{ + struct bkey_alloc_unpacked ret = { + .dev = k.k->p.inode, @@ -1090,11 +1422,71 @@ index 000000000000..b2735c8591d6 + return ret; +} + -+void bch2_alloc_pack(struct bch_fs *c, -+ struct bkey_alloc_buf 
*dst, -+ const struct bkey_alloc_unpacked src) ++void bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) +{ -+ bch2_alloc_pack_v3(dst, src); ++ if (k.k->type == KEY_TYPE_alloc_v4) { ++ *out = *bkey_s_c_to_alloc_v4(k).v; ++ } else { ++ struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); ++ ++ *out = (struct bch_alloc_v4) { ++ .journal_seq = u.journal_seq, ++ .flags = u.need_discard, ++ .gen = u.gen, ++ .oldest_gen = u.oldest_gen, ++ .data_type = u.data_type, ++ .stripe_redundancy = u.stripe_redundancy, ++ .dirty_sectors = u.dirty_sectors, ++ .cached_sectors = u.cached_sectors, ++ .io_time[READ] = u.read_time, ++ .io_time[WRITE] = u.write_time, ++ .stripe = u.stripe, ++ }; ++ } ++} ++ ++struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) ++{ ++ struct bkey_i_alloc_v4 *ret; ++ ++ if (k.k->type == KEY_TYPE_alloc_v4) { ++ ret = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ if (!IS_ERR(ret)) ++ bkey_reassemble(&ret->k_i, k); ++ } else { ++ ret = bch2_trans_kmalloc(trans, sizeof(*ret)); ++ if (!IS_ERR(ret)) { ++ bkey_alloc_v4_init(&ret->k_i); ++ ret->k.p = k.k->p; ++ bch2_alloc_to_v4(k, &ret->v); ++ } ++ } ++ return ret; ++} ++ ++struct bkey_i_alloc_v4 * ++bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, ++ struct bpos pos) ++{ ++ struct bkey_s_c k; ++ struct bkey_i_alloc_v4 *a; ++ int ret; ++ ++ bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos, ++ BTREE_ITER_WITH_UPDATES| ++ BTREE_ITER_CACHED| ++ BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) { ++ bch2_trans_iter_exit(trans, iter); ++ return ERR_PTR(ret); ++ } ++ ++ a = bch2_alloc_to_v4_mut(trans, k); ++ if (IS_ERR(a)) ++ bch2_trans_iter_exit(trans, iter); ++ return a; +} + +static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) @@ -1140,150 +1532,805 @@ index 000000000000..b2735c8591d6 +const char *bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct 
bkey_alloc_unpacked u; ++ struct bch_dev *ca; + + if (k.k->p.inode >= c->sb.nr_devices || + !c->devs[k.k->p.inode]) + return "invalid device"; + ++ ca = bch_dev_bkey_exists(c, k.k->p.inode); ++ ++ if (k.k->p.offset < ca->mi.first_bucket || ++ k.k->p.offset >= ca->mi.nbuckets) ++ return "invalid bucket"; ++ + if (bch2_alloc_unpack_v3(&u, k)) + return "unpack error"; + + return NULL; +} + -+void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) ++const char *bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ -+ struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); -+ -+ pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu", -+ u.gen, u.oldest_gen, bch2_data_types[u.data_type], -+ u.journal_seq); -+#define x(_name, ...) pr_buf(out, " " #_name " %llu", (u64) u._name); -+ BCH_ALLOC_FIELDS_V2() -+#undef x -+} -+ -+static int bch2_alloc_read_fn(struct btree_trans *trans, struct bkey_s_c k) -+{ -+ struct bch_fs *c = trans->c; + struct bch_dev *ca; -+ struct bucket *g; -+ struct bkey_alloc_unpacked u; + -+ if (!bkey_is_alloc(k.k)) -+ return 0; ++ if (k.k->p.inode >= c->sb.nr_devices || ++ !c->devs[k.k->p.inode]) ++ return "invalid device"; + + ca = bch_dev_bkey_exists(c, k.k->p.inode); -+ g = bucket(ca, k.k->p.offset); -+ u = bch2_alloc_unpack(k); + -+ g->_mark.gen = u.gen; -+ g->_mark.data_type = u.data_type; -+ g->_mark.dirty_sectors = u.dirty_sectors; -+ g->_mark.cached_sectors = u.cached_sectors; -+ g->io_time[READ] = u.read_time; -+ g->io_time[WRITE] = u.write_time; -+ g->oldest_gen = u.oldest_gen; -+ g->gen_valid = 1; ++ if (k.k->p.offset < ca->mi.first_bucket || ++ k.k->p.offset >= ca->mi.nbuckets) ++ return "invalid bucket"; + -+ return 0; ++ return NULL; ++} ++ ++void bch2_alloc_v4_swab(struct bkey_s k) ++{ ++ struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v; ++ ++ a->journal_seq = swab64(a->journal_seq); ++ a->flags = swab32(a->flags); ++ a->dirty_sectors = swab32(a->dirty_sectors); ++ a->cached_sectors = 
swab32(a->cached_sectors); ++ a->io_time[0] = swab64(a->io_time[0]); ++ a->io_time[1] = swab64(a->io_time[1]); ++ a->stripe = swab32(a->stripe); ++ a->nr_external_backpointers = swab32(a->nr_external_backpointers); ++} ++ ++void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bch_alloc_v4 a; ++ ++ bch2_alloc_to_v4(k, &a); ++ ++ pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu need_discard %llu", ++ a.gen, a.oldest_gen, bch2_data_types[a.data_type], ++ a.journal_seq, BCH_ALLOC_V4_NEED_DISCARD(&a)); ++ pr_buf(out, " dirty_sectors %u", a.dirty_sectors); ++ pr_buf(out, " cached_sectors %u", a.cached_sectors); ++ pr_buf(out, " stripe %u", a.stripe); ++ pr_buf(out, " stripe_redundancy %u", a.stripe_redundancy); ++ pr_buf(out, " read_time %llu", a.io_time[READ]); ++ pr_buf(out, " write_time %llu", a.io_time[WRITE]); +} + +int bch2_alloc_read(struct bch_fs *c) +{ + struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_alloc_v4 a; ++ struct bch_dev *ca; + int ret; + + bch2_trans_init(&trans, c, 0, 0); -+ down_read(&c->gc_lock); -+ ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_alloc, bch2_alloc_read_fn); -+ up_read(&c->gc_lock); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ ca = bch_dev_bkey_exists(c, k.k->p.inode); ++ bch2_alloc_to_v4(k, &a); ++ ++ *bucket_gen(ca, k.k->p.offset) = a.gen; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ + bch2_trans_exit(&trans); -+ if (ret) { ++ ++ if (ret) + bch_err(c, "error reading alloc info: %i", ret); -+ return ret; ++ ++ return ret; ++} ++ ++/* Free space/discard btree: */ ++ ++static int bch2_bucket_do_index(struct btree_trans *trans, ++ struct bkey_s_c alloc_k, ++ struct bch_alloc_v4 a, ++ bool set) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); ++ struct btree_iter iter; ++ struct bkey_s_c old; ++ struct bkey_i *k; ++ enum 
bucket_state state = bucket_state(a); ++ enum btree_id btree; ++ enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted; ++ enum bch_bkey_type new_type = set ? KEY_TYPE_set : KEY_TYPE_deleted; ++ struct printbuf buf = PRINTBUF; ++ int ret; ++ ++ if (state != BUCKET_free && ++ state != BUCKET_need_discard) ++ return 0; ++ ++ k = bch2_trans_kmalloc(trans, sizeof(*k)); ++ if (IS_ERR(k)) ++ return PTR_ERR(k); ++ ++ bkey_init(&k->k); ++ k->k.type = new_type; ++ ++ switch (state) { ++ case BUCKET_free: ++ btree = BTREE_ID_freespace; ++ k->k.p = alloc_freespace_pos(alloc_k.k->p, a); ++ bch2_key_resize(&k->k, 1); ++ break; ++ case BUCKET_need_discard: ++ btree = BTREE_ID_need_discard; ++ k->k.p = alloc_k.k->p; ++ break; ++ default: ++ return 0; ++ } ++ ++ bch2_trans_iter_init(trans, &iter, btree, ++ bkey_start_pos(&k->k), ++ BTREE_ITER_INTENT); ++ old = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(old); ++ if (ret) ++ goto err; ++ ++ if (ca->mi.freespace_initialized && ++ bch2_fs_inconsistent_on(old.k->type != old_type, c, ++ "incorrect key when %s %s btree (got %s should be %s)\n" ++ " for %s", ++ set ? 
"setting" : "clearing", ++ bch2_btree_ids[btree], ++ bch2_bkey_types[old.k->type], ++ bch2_bkey_types[old_type], ++ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { ++ ret = -EIO; ++ goto err; ++ } ++ ++ ret = bch2_trans_update(trans, &iter, k, 0); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++int bch2_trans_mark_alloc(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_i *new, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_alloc_v4 old_a, *new_a; ++ u64 old_lru, new_lru; ++ int ret = 0; ++ ++ /* ++ * Deletion only happens in the device removal path, with ++ * BTREE_TRIGGER_NORUN: ++ */ ++ BUG_ON(new->k.type != KEY_TYPE_alloc_v4); ++ ++ bch2_alloc_to_v4(old, &old_a); ++ new_a = &bkey_i_to_alloc_v4(new)->v; ++ ++ if (new_a->dirty_sectors > old_a.dirty_sectors || ++ new_a->cached_sectors > old_a.cached_sectors) { ++ new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); ++ new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); ++ SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); ++ SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); ++ } ++ ++ if (old_a.data_type && !new_a->data_type && ++ old_a.gen == new_a->gen && ++ !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) { ++ new_a->gen++; ++ SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); ++ } ++ ++ if (bucket_state(old_a) != bucket_state(*new_a) || ++ (bucket_state(*new_a) == BUCKET_free && ++ alloc_freespace_genbits(old_a) != alloc_freespace_genbits(*new_a))) { ++ ret = bch2_bucket_do_index(trans, old, old_a, false) ?: ++ bch2_bucket_do_index(trans, bkey_i_to_s_c(new), *new_a, true); ++ if (ret) ++ return ret; ++ } ++ ++ old_lru = alloc_lru_idx(old_a); ++ new_lru = alloc_lru_idx(*new_a); ++ ++ if (old_lru != new_lru) { ++ ret = bch2_lru_change(trans, new->k.p.inode, new->k.p.offset, ++ old_lru, &new_lru); ++ if (ret) ++ return ret; ++ ++ if (new_lru && new_a->io_time[READ] != new_lru) ++ 
new_a->io_time[READ] = new_lru; + } + + return 0; +} + -+static int bch2_alloc_write_key(struct btree_trans *trans, -+ struct btree_iter *iter, -+ unsigned flags) ++static int bch2_check_alloc_key(struct btree_trans *trans, ++ struct btree_iter *alloc_iter) +{ + struct bch_fs *c = trans->c; -+ struct bkey_s_c k; -+ struct bch_dev *ca; -+ struct bucket *g; -+ struct bucket_mark m; -+ struct bkey_alloc_unpacked old_u, new_u; -+ struct bkey_alloc_buf a; ++ struct btree_iter discard_iter, freespace_iter, lru_iter; ++ struct bch_alloc_v4 a; ++ unsigned discard_key_type, freespace_key_type; ++ struct bkey_s_c alloc_k, k; ++ struct printbuf buf = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; + int ret; -+retry: -+ bch2_trans_begin(trans); + -+ ret = bch2_btree_key_cache_flush(trans, -+ BTREE_ID_alloc, iter->pos); ++ alloc_k = bch2_btree_iter_peek(alloc_iter); ++ if (!alloc_k.k) ++ return 0; ++ ++ ret = bkey_err(alloc_k); + if (ret) -+ goto err; ++ return ret; + -+ k = bch2_btree_iter_peek_slot(iter); ++ bch2_alloc_to_v4(alloc_k, &a); ++ discard_key_type = bucket_state(a) == BUCKET_need_discard ++ ? KEY_TYPE_set : 0; ++ freespace_key_type = bucket_state(a) == BUCKET_free ++ ? 
KEY_TYPE_set : 0; ++ ++ bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, ++ alloc_k.k->p, 0); ++ bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, ++ alloc_freespace_pos(alloc_k.k->p, a), 0); ++ bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru, ++ POS(alloc_k.k->p.inode, a.io_time[READ]), 0); ++ ++ k = bch2_btree_iter_peek_slot(&discard_iter); + ret = bkey_err(k); + if (ret) + goto err; + -+ old_u = bch2_alloc_unpack(k); ++ if (fsck_err_on(k.k->type != discard_key_type, c, ++ "incorrect key in need_discard btree (got %s should be %s)\n" ++ " %s", ++ bch2_bkey_types[k.k->type], ++ bch2_bkey_types[discard_key_type], ++ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { ++ struct bkey_i *update = ++ bch2_trans_kmalloc(trans, sizeof(*update)); + -+ percpu_down_read(&c->mark_lock); -+ ca = bch_dev_bkey_exists(c, iter->pos.inode); -+ g = bucket(ca, iter->pos.offset); -+ m = READ_ONCE(g->mark); -+ new_u = alloc_mem_to_key(iter, g, m); -+ percpu_up_read(&c->mark_lock); ++ ret = PTR_ERR_OR_ZERO(update); ++ if (ret) ++ goto err; + -+ if (!bkey_alloc_unpacked_cmp(old_u, new_u)) -+ return 0; ++ bkey_init(&update->k); ++ update->k.type = discard_key_type; ++ update->k.p = discard_iter.pos; + -+ bch2_alloc_pack(c, &a, new_u); -+ ret = bch2_trans_update(trans, iter, &a.k, -+ BTREE_TRIGGER_NORUN) ?: -+ bch2_trans_commit(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL|flags); -+err: -+ if (ret == -EINTR) -+ goto retry; -+ return ret; -+} ++ ret = bch2_trans_update(trans, &discard_iter, update, 0) ?: ++ bch2_trans_commit(trans, NULL, NULL, 0); ++ if (ret) ++ goto err; ++ } + -+int bch2_alloc_write(struct bch_fs *c, unsigned flags) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bch_dev *ca; -+ unsigned i; -+ int ret = 0; ++ k = bch2_btree_iter_peek_slot(&freespace_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; + -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, 
POS_MIN, -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ if (fsck_err_on(k.k->type != freespace_key_type, c, ++ "incorrect key in freespace btree (got %s should be %s)\n" ++ " %s", ++ bch2_bkey_types[k.k->type], ++ bch2_bkey_types[freespace_key_type], ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { ++ struct bkey_i *update = ++ bch2_trans_kmalloc(trans, sizeof(*update)); + -+ for_each_member_device(ca, c, i) { -+ bch2_btree_iter_set_pos(&iter, -+ POS(ca->dev_idx, ca->mi.first_bucket)); ++ ret = PTR_ERR_OR_ZERO(update); ++ if (ret) ++ goto err; + -+ while (iter.pos.offset < ca->mi.nbuckets) { -+ ret = bch2_alloc_write_key(&trans, &iter, flags); -+ if (ret) { -+ percpu_ref_put(&ca->ref); ++ bkey_init(&update->k); ++ update->k.type = freespace_key_type; ++ update->k.p = freespace_iter.pos; ++ bch2_key_resize(&update->k, 1); ++ ++ ret = bch2_trans_update(trans, &freespace_iter, update, 0) ?: ++ bch2_trans_commit(trans, NULL, NULL, 0); ++ if (ret) ++ goto err; ++ } ++ ++ if (bucket_state(a) == BUCKET_cached) { ++ k = bch2_btree_iter_peek_slot(&lru_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (fsck_err_on(!a.io_time[READ], c, ++ "cached bucket with read_time 0\n" ++ " %s", ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) || ++ fsck_err_on(k.k->type != KEY_TYPE_lru || ++ le64_to_cpu(bkey_s_c_to_lru(k).v->idx) != alloc_k.k->p.offset, c, ++ "incorrect/missing lru entry\n" ++ " %s\n" ++ " %s", ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), ++ (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { ++ u64 read_time = a.io_time[READ]; ++ ++ if (!a.io_time[READ]) ++ a.io_time[READ] = atomic64_read(&c->io_clock[READ].now); ++ ++ ret = bch2_lru_change(trans, ++ alloc_k.k->p.inode, ++ alloc_k.k->p.offset, ++ 0, &a.io_time[READ]); ++ if (ret) + goto err; ++ ++ if (a.io_time[READ] != read_time) { ++ struct bkey_i_alloc_v4 *a_mut = ++ bch2_alloc_to_v4_mut(trans, alloc_k); ++ ret 
= PTR_ERR_OR_ZERO(a_mut); ++ if (ret) ++ goto err; ++ ++ a_mut->v.io_time[READ] = a.io_time[READ]; ++ ret = bch2_trans_update(trans, alloc_iter, ++ &a_mut->k_i, BTREE_TRIGGER_NORUN); ++ if (ret) ++ goto err; + } -+ bch2_btree_iter_advance(&iter); ++ ++ ret = bch2_trans_commit(trans, NULL, NULL, 0); ++ if (ret) ++ goto err; + } + } +err: ++fsck_err: ++ bch2_trans_iter_exit(trans, &lru_iter); ++ bch2_trans_iter_exit(trans, &freespace_iter); ++ bch2_trans_iter_exit(trans, &discard_iter); ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) ++{ ++ struct bch_dev *ca; ++ ++ if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode]) ++ return false; ++ ++ ca = bch_dev_bkey_exists(c, pos.inode); ++ return pos.offset >= ca->mi.first_bucket && ++ pos.offset < ca->mi.nbuckets; ++} ++ ++static int bch2_check_freespace_key(struct btree_trans *trans, ++ struct btree_iter *freespace_iter, ++ bool initial) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter alloc_iter; ++ struct bkey_s_c k, freespace_k; ++ struct bch_alloc_v4 a; ++ u64 genbits; ++ struct bpos pos; ++ struct bkey_i *update; ++ struct printbuf buf = PRINTBUF; ++ int ret; ++ ++ freespace_k = bch2_btree_iter_peek(freespace_iter); ++ if (!freespace_k.k) ++ return 1; ++ ++ ret = bkey_err(freespace_k); ++ if (ret) ++ return ret; ++ ++ pos = freespace_iter->pos; ++ pos.offset &= ~(~0ULL << 56); ++ genbits = freespace_iter->pos.offset & (~0ULL << 56); ++ ++ bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, pos, 0); ++ ++ if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c, ++ "%llu:%llu set in freespace btree but device or bucket does not exist", ++ pos.inode, pos.offset)) ++ goto delete; ++ ++ k = bch2_btree_iter_peek_slot(&alloc_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ bch2_alloc_to_v4(k, &a); ++ ++ if (fsck_err_on(bucket_state(a) != BUCKET_free || ++ genbits != alloc_freespace_genbits(a), c, ++ 
"%s\n incorrectly set in freespace index (free %u, genbits %llu should be %llu)", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf), ++ bucket_state(a) == BUCKET_free, ++ genbits >> 56, alloc_freespace_genbits(a) >> 56)) ++ goto delete; ++out: ++err: ++fsck_err: ++ bch2_trans_iter_exit(trans, &alloc_iter); ++ printbuf_exit(&buf); ++ return ret; ++delete: ++ update = bch2_trans_kmalloc(trans, sizeof(*update)); ++ ret = PTR_ERR_OR_ZERO(update); ++ if (ret) ++ goto err; ++ ++ bkey_init(&update->k); ++ update->k.p = freespace_iter->pos; ++ bch2_key_resize(&update->k, 1); ++ ++ ret = bch2_trans_update(trans, freespace_iter, update, 0) ?: ++ bch2_trans_commit(trans, NULL, NULL, 0); ++ goto out; ++} ++ ++int bch2_check_alloc_info(struct bch_fs *c, bool initial) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0, last_dev = -1; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ if (k.k->p.inode != last_dev) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); ++ ++ if (!ca->mi.freespace_initialized) { ++ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); ++ continue; ++ } ++ ++ last_dev = k.k->p.inode; ++ } ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_check_alloc_key(&trans, &iter)); ++ if (ret) ++ break; ++ } + bch2_trans_iter_exit(&trans, &iter); ++ ++ if (ret) ++ goto err; ++ ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_freespace, POS_MIN, ++ BTREE_ITER_PREFETCH); ++ while (1) { ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_check_freespace_key(&trans, &iter, initial)); ++ if (ret) ++ break; ++ ++ bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos)); ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++err: + bch2_trans_exit(&trans); ++ return ret < 0 ? 
ret : 0; ++} ++ ++static int bch2_clear_need_discard(struct btree_trans *trans, struct bpos pos, ++ struct bch_dev *ca, bool *discard_done) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_i_alloc_v4 *a; ++ struct printbuf buf = PRINTBUF; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, pos, ++ BTREE_ITER_CACHED); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto out; ++ ++ a = bch2_alloc_to_v4_mut(trans, k); ++ ret = PTR_ERR_OR_ZERO(a); ++ if (ret) ++ goto out; ++ ++ if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { ++ a->v.gen++; ++ SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); ++ goto write; ++ } ++ ++ BUG_ON(a->v.journal_seq > c->journal.flushed_seq_ondisk); ++ ++ if (bch2_fs_inconsistent_on(!BCH_ALLOC_V4_NEED_DISCARD(&a->v), c, ++ "%s\n incorrectly set in need_discard btree", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = -EIO; ++ goto out; ++ } ++ ++ if (!*discard_done && ca->mi.discard && !c->opts.nochanges) { ++ /* ++ * This works without any other locks because this is the only ++ * thread that removes items from the need_discard tree ++ */ ++ bch2_trans_unlock(trans); ++ blkdev_issue_discard(ca->disk_sb.bdev, ++ k.k->p.offset * ca->mi.bucket_size, ++ ca->mi.bucket_size, ++ GFP_KERNEL, 0); ++ *discard_done = true; ++ ++ ret = bch2_trans_relock(trans) ? 
0 : -EINTR; ++ if (ret) ++ goto out; ++ } ++ ++ SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); ++write: ++ ret = bch2_trans_update(trans, &iter, &a->k_i, 0); ++out: ++ bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static void bch2_do_discards_work(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, discard_work); ++ struct bch_dev *ca = NULL; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_need_discard, ++ POS_MIN, 0, k, ret) { ++ bool discard_done = false; ++ ++ if (ca && k.k->p.inode != ca->dev_idx) { ++ percpu_ref_put(&ca->io_ref); ++ ca = NULL; ++ } ++ ++ if (!ca) { ++ ca = bch_dev_bkey_exists(c, k.k->p.inode); ++ if (!percpu_ref_tryget(&ca->io_ref)) { ++ ca = NULL; ++ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); ++ continue; ++ } ++ } ++ ++ seen++; ++ ++ if (bch2_bucket_is_open_safe(c, k.k->p.inode, k.k->p.offset)) { ++ open++; ++ continue; ++ } ++ ++ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, ++ c->journal.flushed_seq_ondisk, ++ k.k->p.inode, k.k->p.offset)) { ++ need_journal_commit++; ++ continue; ++ } ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_NOFAIL, ++ bch2_clear_need_discard(&trans, k.k->p, ca, &discard_done)); ++ if (ret) ++ break; ++ ++ discarded++; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (ca) ++ percpu_ref_put(&ca->io_ref); ++ ++ bch2_trans_exit(&trans); ++ ++ if (need_journal_commit * 2 > seen) ++ bch2_journal_flush_async(&c->journal, NULL); ++ ++ percpu_ref_put(&c->writes); ++ ++ trace_do_discards(c, seen, open, need_journal_commit, discarded, ret); ++} ++ ++void bch2_do_discards(struct bch_fs *c) ++{ ++ if (percpu_ref_tryget(&c->writes) && ++ !queue_work(system_long_wq, 
&c->discard_work)) ++ percpu_ref_put(&c->writes); ++} ++ ++static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter lru_iter, alloc_iter = { NULL }; ++ struct bkey_s_c k; ++ struct bkey_i_alloc_v4 *a; ++ u64 bucket, idx; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru, ++ POS(ca->dev_idx, 0), 0); ++ k = bch2_btree_iter_peek(&lru_iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto out; ++ ++ if (!k.k || k.k->p.inode != ca->dev_idx) ++ goto out; ++ ++ if (bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_lru, c, ++ "non lru key in lru btree")) ++ goto out; ++ ++ idx = k.k->p.offset; ++ bucket = le64_to_cpu(bkey_s_c_to_lru(k).v->idx); ++ ++ a = bch2_trans_start_alloc_update(trans, &alloc_iter, ++ POS(ca->dev_idx, bucket)); ++ ret = PTR_ERR_OR_ZERO(a); ++ if (ret) ++ goto out; ++ ++ if (bch2_fs_inconsistent_on(idx != alloc_lru_idx(a->v), c, ++ "invalidating bucket with wrong lru idx (got %llu should be %llu", ++ idx, alloc_lru_idx(a->v))) ++ goto out; ++ ++ SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); ++ a->v.gen++; ++ a->v.data_type = 0; ++ a->v.dirty_sectors = 0; ++ a->v.cached_sectors = 0; ++ a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); ++ a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now); ++ ++ ret = bch2_trans_update(trans, &alloc_iter, &a->k_i, ++ BTREE_TRIGGER_BUCKET_INVALIDATE); ++out: ++ bch2_trans_iter_exit(trans, &alloc_iter); ++ bch2_trans_iter_exit(trans, &lru_iter); ++ return ret; ++} ++ ++static void bch2_do_invalidates_work(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); ++ struct bch_dev *ca; ++ struct btree_trans trans; ++ unsigned i; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_member_device(ca, c, i) ++ while (!ret && should_invalidate_buckets(ca)) ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_NOFAIL, ++ 
invalidate_one_bucket(&trans, ca)); ++ ++ bch2_trans_exit(&trans); ++ percpu_ref_put(&c->writes); ++} ++ ++void bch2_do_invalidates(struct bch_fs *c) ++{ ++ if (percpu_ref_tryget(&c->writes)) ++ queue_work(system_long_wq, &c->invalidate_work); ++} ++ ++static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_alloc_v4 a; ++ struct bch_member *m; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_alloc, ++ POS(ca->dev_idx, ca->mi.first_bucket), ++ BTREE_ITER_SLOTS| ++ BTREE_ITER_PREFETCH, k, ret) { ++ if (iter.pos.offset >= ca->mi.nbuckets) ++ break; ++ ++ bch2_alloc_to_v4(k, &a); ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_LAZY_RW, ++ bch2_bucket_do_index(&trans, k, a, true)); ++ if (ret) ++ break; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ ++ if (ret) { ++ bch_err(ca, "error initializing free space: %i", ret); ++ return ret; ++ } ++ ++ mutex_lock(&c->sb_lock); ++ m = bch2_sb_get_members(c->disk_sb.sb)->members + ca->dev_idx; ++ SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true); ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++int bch2_fs_freespace_init(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ int ret = 0; ++ bool doing_init = false; ++ ++ /* ++ * We can crash during the device add path, so we need to check this on ++ * every mount: ++ */ ++ ++ for_each_member_device(ca, c, i) { ++ if (ca->mi.freespace_initialized) ++ continue; ++ ++ if (!doing_init) { ++ bch_info(c, "initializing freespace"); ++ doing_init = true; ++ } ++ ++ ret = bch2_dev_freespace_init(c, ca); ++ if (ret) { ++ percpu_ref_put(&ca->ref); ++ return ret; ++ } ++ } ++ ++ if (doing_init) { ++ mutex_lock(&c->sb_lock); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ bch_verbose(c, "done initializing freespace"); ++ } ++ + return ret; +} + @@ -1293,620 
+2340,29 @@ index 000000000000..b2735c8591d6 + size_t bucket_nr, int rw) +{ + struct bch_fs *c = trans->c; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, dev); + struct btree_iter iter; -+ struct bucket *g; -+ struct bkey_alloc_buf *a; -+ struct bkey_alloc_unpacked u; -+ u64 *time, now; ++ struct bkey_i_alloc_v4 *a; ++ u64 now; + int ret = 0; + -+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(dev, bucket_nr), -+ BTREE_ITER_CACHED| -+ BTREE_ITER_CACHED_NOFILL| -+ BTREE_ITER_INTENT); -+ ret = bch2_btree_iter_traverse(&iter); -+ if (ret) -+ goto out; -+ -+ a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); ++ a = bch2_trans_start_alloc_update(trans, &iter, POS(dev, bucket_nr)); + ret = PTR_ERR_OR_ZERO(a); + if (ret) -+ goto out; ++ return ret; + -+ percpu_down_read(&c->mark_lock); -+ g = bucket(ca, bucket_nr); -+ u = alloc_mem_to_key(&iter, g, READ_ONCE(g->mark)); -+ percpu_up_read(&c->mark_lock); -+ -+ time = rw == READ ? &u.read_time : &u.write_time; + now = atomic64_read(&c->io_clock[rw].now); -+ if (*time == now) ++ if (a->v.io_time[rw] == now) + goto out; + -+ *time = now; ++ a->v.io_time[rw] = now; + -+ bch2_alloc_pack(c, a, u); -+ ret = bch2_trans_update(trans, &iter, &a->k, 0) ?: ++ ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: + bch2_trans_commit(trans, NULL, NULL, 0); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + -+/* Background allocator thread: */ -+ -+/* -+ * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens -+ * (marking them as invalidated on disk), then optionally issues discard -+ * commands to the newly free buckets, then puts them on the various freelists. 
-+ */ -+ -+static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, -+ struct bucket_mark m) -+{ -+ u8 gc_gen; -+ -+ if (!is_available_bucket(m)) -+ return false; -+ -+ if (m.owned_by_allocator) -+ return false; -+ -+ if (ca->buckets_nouse && -+ test_bit(b, ca->buckets_nouse)) -+ return false; -+ -+ gc_gen = bucket_gc_gen(bucket(ca, b)); -+ -+ ca->inc_gen_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX / 2; -+ ca->inc_gen_really_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX; -+ -+ return gc_gen < BUCKET_GC_GEN_MAX; -+} -+ -+/* -+ * Determines what order we're going to reuse buckets, smallest bucket_key() -+ * first. -+ */ -+ -+static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m, -+ u64 now, u64 last_seq_ondisk) -+{ -+ unsigned used = bucket_sectors_used(m); -+ -+ if (used) { -+ /* -+ * Prefer to keep buckets that have been read more recently, and -+ * buckets that have more data in them: -+ */ -+ u64 last_read = max_t(s64, 0, now - g->io_time[READ]); -+ u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used)); -+ -+ return -last_read_scaled; -+ } else { -+ /* -+ * Prefer to use buckets with smaller gc_gen so that we don't -+ * have to walk the btree and recalculate oldest_gen - but shift -+ * off the low bits so that buckets will still have equal sort -+ * keys when there's only a small difference, so that we can -+ * keep sequential buckets together: -+ */ -+ return (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)| -+ (bucket_gc_gen(g) >> 4); -+ } -+} -+ -+static inline int bucket_alloc_cmp(alloc_heap *h, -+ struct alloc_heap_entry l, -+ struct alloc_heap_entry r) -+{ -+ return cmp_int(l.key, r.key) ?: -+ cmp_int(r.nr, l.nr) ?: -+ cmp_int(l.bucket, r.bucket); -+} -+ -+static inline int bucket_idx_cmp(const void *_l, const void *_r) -+{ -+ const struct alloc_heap_entry *l = _l, *r = _r; -+ -+ return cmp_int(l->bucket, r->bucket); -+} -+ -+static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct 
bucket_array *buckets; -+ struct alloc_heap_entry e = { 0 }; -+ u64 now, last_seq_ondisk; -+ size_t b, i, nr = 0; -+ -+ down_read(&ca->bucket_lock); -+ -+ buckets = bucket_array(ca); -+ ca->alloc_heap.used = 0; -+ now = atomic64_read(&c->io_clock[READ].now); -+ last_seq_ondisk = c->journal.last_seq_ondisk; -+ -+ /* -+ * Find buckets with lowest read priority, by building a maxheap sorted -+ * by read priority and repeatedly replacing the maximum element until -+ * all buckets have been visited. -+ */ -+ for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { -+ struct bucket *g = &buckets->b[b]; -+ struct bucket_mark m = READ_ONCE(g->mark); -+ unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk); -+ -+ cond_resched(); -+ -+ if (!bch2_can_invalidate_bucket(ca, b, m)) -+ continue; -+ -+ if (e.nr && e.bucket + e.nr == b && e.key == key) { -+ e.nr++; -+ } else { -+ if (e.nr) -+ heap_add_or_replace(&ca->alloc_heap, e, -+ -bucket_alloc_cmp, NULL); -+ -+ e = (struct alloc_heap_entry) { -+ .bucket = b, -+ .nr = 1, -+ .key = key, -+ }; -+ } -+ } -+ -+ if (e.nr) -+ heap_add_or_replace(&ca->alloc_heap, e, -+ -bucket_alloc_cmp, NULL); -+ -+ for (i = 0; i < ca->alloc_heap.used; i++) -+ nr += ca->alloc_heap.data[i].nr; -+ -+ while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) { -+ nr -= ca->alloc_heap.data[0].nr; -+ heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL); -+ } -+ -+ up_read(&ca->bucket_lock); -+} -+ -+static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct bucket_array *buckets = bucket_array(ca); -+ struct bucket_mark m; -+ size_t b, start; -+ -+ if (ca->fifo_last_bucket < ca->mi.first_bucket || -+ ca->fifo_last_bucket >= ca->mi.nbuckets) -+ ca->fifo_last_bucket = ca->mi.first_bucket; -+ -+ start = ca->fifo_last_bucket; -+ -+ do { -+ ca->fifo_last_bucket++; -+ if (ca->fifo_last_bucket == ca->mi.nbuckets) -+ ca->fifo_last_bucket = ca->mi.first_bucket; -+ -+ b = ca->fifo_last_bucket; -+ m = 
READ_ONCE(buckets->b[b].mark); -+ -+ if (bch2_can_invalidate_bucket(ca, b, m)) { -+ struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; -+ -+ heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); -+ if (heap_full(&ca->alloc_heap)) -+ break; -+ } -+ -+ cond_resched(); -+ } while (ca->fifo_last_bucket != start); -+} -+ -+static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct bucket_array *buckets = bucket_array(ca); -+ struct bucket_mark m; -+ size_t checked, i; -+ -+ for (checked = 0; -+ checked < ca->mi.nbuckets / 2; -+ checked++) { -+ size_t b = bch2_rand_range(ca->mi.nbuckets - -+ ca->mi.first_bucket) + -+ ca->mi.first_bucket; -+ -+ m = READ_ONCE(buckets->b[b].mark); -+ -+ if (bch2_can_invalidate_bucket(ca, b, m)) { -+ struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; -+ -+ heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); -+ if (heap_full(&ca->alloc_heap)) -+ break; -+ } -+ -+ cond_resched(); -+ } -+ -+ sort(ca->alloc_heap.data, -+ ca->alloc_heap.used, -+ sizeof(ca->alloc_heap.data[0]), -+ bucket_idx_cmp, NULL); -+ -+ /* remove duplicates: */ -+ for (i = 0; i + 1 < ca->alloc_heap.used; i++) -+ if (ca->alloc_heap.data[i].bucket == -+ ca->alloc_heap.data[i + 1].bucket) -+ ca->alloc_heap.data[i].nr = 0; -+} -+ -+static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) -+{ -+ size_t i, nr = 0; -+ -+ ca->inc_gen_needs_gc = 0; -+ ca->inc_gen_really_needs_gc = 0; -+ -+ switch (ca->mi.replacement) { -+ case BCH_CACHE_REPLACEMENT_lru: -+ find_reclaimable_buckets_lru(c, ca); -+ break; -+ case BCH_CACHE_REPLACEMENT_fifo: -+ find_reclaimable_buckets_fifo(c, ca); -+ break; -+ case BCH_CACHE_REPLACEMENT_random: -+ find_reclaimable_buckets_random(c, ca); -+ break; -+ } -+ -+ heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL); -+ -+ for (i = 0; i < ca->alloc_heap.used; i++) -+ nr += ca->alloc_heap.data[i].nr; -+ -+ return nr; -+} -+ -+/* -+ * returns sequence number of most recent journal entry that 
updated this -+ * bucket: -+ */ -+static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m) -+{ -+ if (m.journal_seq_valid) { -+ u64 journal_seq = atomic64_read(&c->journal.seq); -+ u64 bucket_seq = journal_seq; -+ -+ bucket_seq &= ~((u64) U16_MAX); -+ bucket_seq |= m.journal_seq; -+ -+ if (bucket_seq > journal_seq) -+ bucket_seq -= 1 << 16; -+ -+ return bucket_seq; -+ } else { -+ return 0; -+ } -+} -+ -+static int bucket_invalidate_btree(struct btree_trans *trans, -+ struct bch_dev *ca, u64 b) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_alloc_buf *a; -+ struct bkey_alloc_unpacked u; -+ struct bucket *g; -+ struct bucket_mark m; -+ struct btree_iter iter; -+ int ret; -+ -+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, -+ POS(ca->dev_idx, b), -+ BTREE_ITER_CACHED| -+ BTREE_ITER_CACHED_NOFILL| -+ BTREE_ITER_INTENT); -+ -+ a = bch2_trans_kmalloc(trans, sizeof(*a)); -+ ret = PTR_ERR_OR_ZERO(a); -+ if (ret) -+ goto err; -+ -+ ret = bch2_btree_iter_traverse(&iter); -+ if (ret) -+ goto err; -+ -+ percpu_down_read(&c->mark_lock); -+ g = bucket(ca, b); -+ m = READ_ONCE(g->mark); -+ u = alloc_mem_to_key(&iter, g, m); -+ percpu_up_read(&c->mark_lock); -+ -+ u.gen++; -+ u.data_type = 0; -+ u.dirty_sectors = 0; -+ u.cached_sectors = 0; -+ u.read_time = atomic64_read(&c->io_clock[READ].now); -+ u.write_time = atomic64_read(&c->io_clock[WRITE].now); -+ -+ bch2_alloc_pack(c, a, u); -+ ret = bch2_trans_update(trans, &iter, &a->k, -+ BTREE_TRIGGER_BUCKET_INVALIDATE); -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, -+ u64 *journal_seq, unsigned flags) -+{ -+ struct bucket *g; -+ struct bucket_mark m; -+ size_t b; -+ int ret = 0; -+ -+ BUG_ON(!ca->alloc_heap.used || -+ !ca->alloc_heap.data[0].nr); -+ b = ca->alloc_heap.data[0].bucket; -+ -+ /* first, put on free_inc and mark as owned by allocator: */ -+ percpu_down_read(&c->mark_lock); -+ g = bucket(ca, b); -+ m = 
READ_ONCE(g->mark); -+ -+ BUG_ON(m.dirty_sectors); -+ -+ bch2_mark_alloc_bucket(c, ca, b, true); -+ -+ spin_lock(&c->freelist_lock); -+ verify_not_on_freelist(c, ca, b); -+ BUG_ON(!fifo_push(&ca->free_inc, b)); -+ spin_unlock(&c->freelist_lock); -+ -+ /* -+ * If we're not invalidating cached data, we only increment the bucket -+ * gen in memory here, the incremented gen will be updated in the btree -+ * by bch2_trans_mark_pointer(): -+ */ -+ if (!m.cached_sectors && -+ !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) { -+ BUG_ON(m.data_type); -+ bucket_cmpxchg(g, m, m.gen++); -+ percpu_up_read(&c->mark_lock); -+ goto out; -+ } -+ -+ percpu_up_read(&c->mark_lock); -+ -+ /* -+ * If the read-only path is trying to shut down, we can't be generating -+ * new btree updates: -+ */ -+ if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) { -+ ret = 1; -+ goto out; -+ } -+ -+ ret = bch2_trans_do(c, NULL, journal_seq, -+ BTREE_INSERT_NOCHECK_RW| -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_JOURNAL_RESERVED| -+ flags, -+ bucket_invalidate_btree(&trans, ca, b)); -+out: -+ if (!ret) { -+ /* remove from alloc_heap: */ -+ struct alloc_heap_entry e, *top = ca->alloc_heap.data; -+ -+ top->bucket++; -+ top->nr--; -+ -+ if (!top->nr) -+ heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); -+ -+ /* -+ * Make sure we flush the last journal entry that updated this -+ * bucket (i.e. deleting the last reference) before writing to -+ * this bucket again: -+ */ -+ *journal_seq = max(*journal_seq, bucket_journal_seq(c, m)); -+ } else { -+ size_t b2; -+ -+ /* remove from free_inc: */ -+ percpu_down_read(&c->mark_lock); -+ spin_lock(&c->freelist_lock); -+ -+ bch2_mark_alloc_bucket(c, ca, b, false); -+ -+ BUG_ON(!fifo_pop_back(&ca->free_inc, b2)); -+ BUG_ON(b != b2); -+ -+ spin_unlock(&c->freelist_lock); -+ percpu_up_read(&c->mark_lock); -+ } -+ -+ return ret < 0 ? 
ret : 0; -+} -+ -+/* -+ * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc: -+ */ -+static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) -+{ -+ u64 journal_seq = 0; -+ int ret = 0; -+ -+ /* Only use nowait if we've already invalidated at least one bucket: */ -+ while (!ret && -+ !fifo_full(&ca->free_inc) && -+ ca->alloc_heap.used) { -+ if (kthread_should_stop()) { -+ ret = 1; -+ break; -+ } -+ -+ ret = bch2_invalidate_one_bucket(c, ca, &journal_seq, -+ (!fifo_empty(&ca->free_inc) -+ ? BTREE_INSERT_NOWAIT : 0)); -+ /* -+ * We only want to batch up invalidates when they're going to -+ * require flushing the journal: -+ */ -+ if (!journal_seq) -+ break; -+ } -+ -+ /* If we used NOWAIT, don't return the error: */ -+ if (!fifo_empty(&ca->free_inc)) -+ ret = 0; -+ if (ret < 0) -+ bch_err(ca, "error invalidating buckets: %i", ret); -+ if (ret) -+ return ret; -+ -+ if (journal_seq) -+ ret = bch2_journal_flush_seq(&c->journal, journal_seq); -+ if (ret) { -+ bch_err(ca, "journal error: %i", ret); -+ return ret; -+ } -+ -+ return 0; -+} -+ -+static void alloc_thread_set_state(struct bch_dev *ca, unsigned new_state) -+{ -+ if (ca->allocator_state != new_state) { -+ ca->allocator_state = new_state; -+ closure_wake_up(&ca->fs->freelist_wait); -+ } -+} -+ -+static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) -+{ -+ unsigned i; -+ int ret = 0; -+ -+ spin_lock(&c->freelist_lock); -+ for (i = 0; i < RESERVE_NR; i++) { -+ /* -+ * Don't strand buckets on the copygc freelist until -+ * after recovery is finished: -+ */ -+ if (i == RESERVE_MOVINGGC && -+ !test_bit(BCH_FS_STARTED, &c->flags)) -+ continue; -+ -+ if (fifo_push(&ca->free[i], b)) { -+ fifo_pop(&ca->free_inc, b); -+ ret = 1; -+ break; -+ } -+ } -+ spin_unlock(&c->freelist_lock); -+ -+ ca->allocator_state = ret -+ ? 
ALLOCATOR_running -+ : ALLOCATOR_blocked_full; -+ closure_wake_up(&c->freelist_wait); -+ return ret; -+} -+ -+static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) -+{ -+ if (ca->mi.discard && -+ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) -+ blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, b), -+ ca->mi.bucket_size, GFP_NOFS, 0); -+} -+ -+static bool allocator_thread_running(struct bch_dev *ca) -+{ -+ unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw && -+ test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags) -+ ? ALLOCATOR_running -+ : ALLOCATOR_stopped; -+ alloc_thread_set_state(ca, state); -+ return state == ALLOCATOR_running; -+} -+ -+static int buckets_available(struct bch_dev *ca, unsigned long gc_count) -+{ -+ s64 available = dev_buckets_reclaimable(ca) - -+ (gc_count == ca->fs->gc_count ? ca->inc_gen_really_needs_gc : 0); -+ bool ret = available > 0; -+ -+ alloc_thread_set_state(ca, ret -+ ? ALLOCATOR_running -+ : ALLOCATOR_blocked); -+ return ret; -+} -+ -+/** -+ * bch_allocator_thread - move buckets from free_inc to reserves -+ * -+ * The free_inc FIFO is populated by find_reclaimable_buckets(), and -+ * the reserves are depleted by bucket allocation. When we run out -+ * of free_inc, try to invalidate some buckets and write out -+ * prios and gens. 
-+ */ -+static int bch2_allocator_thread(void *arg) -+{ -+ struct bch_dev *ca = arg; -+ struct bch_fs *c = ca->fs; -+ unsigned long gc_count = c->gc_count; -+ size_t nr; -+ int ret; -+ -+ set_freezable(); -+ -+ while (1) { -+ ret = kthread_wait_freezable(allocator_thread_running(ca)); -+ if (ret) -+ goto stop; -+ -+ while (!ca->alloc_heap.used) { -+ cond_resched(); -+ -+ ret = kthread_wait_freezable(buckets_available(ca, gc_count)); -+ if (ret) -+ goto stop; -+ -+ gc_count = c->gc_count; -+ nr = find_reclaimable_buckets(c, ca); -+ -+ trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc, -+ ca->inc_gen_really_needs_gc); -+ -+ if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) || -+ ca->inc_gen_really_needs_gc) && -+ c->gc_thread) { -+ atomic_inc(&c->kick_gc); -+ wake_up_process(c->gc_thread); -+ } -+ } -+ -+ ret = bch2_invalidate_buckets(c, ca); -+ if (ret) -+ goto stop; -+ -+ while (!fifo_empty(&ca->free_inc)) { -+ u64 b = fifo_peek(&ca->free_inc); -+ -+ discard_one_bucket(c, ca, b); -+ -+ ret = kthread_wait_freezable(push_invalidated_bucket(c, ca, b)); -+ if (ret) -+ goto stop; -+ } -+ } -+stop: -+ alloc_thread_set_state(ca, ALLOCATOR_stopped); -+ return 0; -+} -+ +/* Startup/shutdown (ro/rw): */ + +void bch2_recalc_capacity(struct bch_fs *c) @@ -1915,7 +2371,7 @@ index 000000000000..b2735c8591d6 + u64 capacity = 0, reserved_sectors = 0, gc_reserve; + unsigned bucket_size_max = 0; + unsigned long ra_pages = 0; -+ unsigned i, j; ++ unsigned i; + + lockdep_assert_held(&c->state_lock); + @@ -1946,8 +2402,9 @@ index 000000000000..b2735c8591d6 + * allocations for foreground writes must wait - + * not -ENOSPC calculations. 
+ */ -+ for (j = 0; j < RESERVE_NONE; j++) -+ dev_reserve += ca->free[j].size; ++ ++ dev_reserve += ca->nr_btree_reserve * 2; ++ dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */ + + dev_reserve += 1; /* btree write point */ + dev_reserve += 1; /* copygc write point */ @@ -1990,7 +2447,7 @@ index 000000000000..b2735c8591d6 + ob++) { + spin_lock(&ob->lock); + if (ob->valid && !ob->on_partial_list && -+ ob->ptr.dev == ca->dev_idx) ++ ob->dev == ca->dev_idx) + ret = true; + spin_unlock(&ob->lock); + } @@ -2003,8 +2460,6 @@ index 000000000000..b2735c8591d6 +{ + unsigned i; + -+ BUG_ON(ca->alloc_thread); -+ + /* First, remove device from allocation groups: */ + + for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) @@ -2078,180 +2533,120 @@ index 000000000000..b2735c8591d6 + set_bit(ca->dev_idx, c->rw_devs[i].d); +} + -+void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca) -+{ -+ if (ca->alloc_thread) -+ closure_wait_event(&c->freelist_wait, -+ ca->allocator_state != ALLOCATOR_running); -+} -+ -+/* stop allocator thread: */ -+void bch2_dev_allocator_stop(struct bch_dev *ca) -+{ -+ struct task_struct *p; -+ -+ p = rcu_dereference_protected(ca->alloc_thread, 1); -+ ca->alloc_thread = NULL; -+ -+ /* -+ * We need an rcu barrier between setting ca->alloc_thread = NULL and -+ * the thread shutting down to avoid bch2_wake_allocator() racing: -+ * -+ * XXX: it would be better to have the rcu barrier be asynchronous -+ * instead of blocking us here -+ */ -+ synchronize_rcu(); -+ -+ if (p) { -+ kthread_stop(p); -+ put_task_struct(p); -+ } -+} -+ -+/* start allocator thread: */ -+int bch2_dev_allocator_start(struct bch_dev *ca) -+{ -+ struct task_struct *p; -+ -+ /* -+ * allocator thread already started? 
-+ */ -+ if (ca->alloc_thread) -+ return 0; -+ -+ p = kthread_create(bch2_allocator_thread, ca, -+ "bch-alloc/%s", ca->name); -+ if (IS_ERR(p)) { -+ bch_err(ca->fs, "error creating allocator thread: %li", -+ PTR_ERR(p)); -+ return PTR_ERR(p); -+ } -+ -+ get_task_struct(p); -+ rcu_assign_pointer(ca->alloc_thread, p); -+ wake_up_process(p); -+ return 0; -+} -+ +void bch2_fs_allocator_background_init(struct bch_fs *c) +{ + spin_lock_init(&c->freelist_lock); -+} -+ -+void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ struct open_bucket *ob; -+ -+ for (ob = c->open_buckets; -+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); -+ ob++) { -+ spin_lock(&ob->lock); -+ if (ob->valid && !ob->on_partial_list) { -+ pr_buf(out, "%zu ref %u type %s\n", -+ ob - c->open_buckets, -+ atomic_read(&ob->pin), -+ bch2_data_types[ob->type]); -+ } -+ spin_unlock(&ob->lock); -+ } -+ ++ INIT_WORK(&c->discard_work, bch2_do_discards_work); ++ INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work); +} diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h new file mode 100644 -index 000000000000..370573f8e05d +index 000000000000..da1b650e8017 --- /dev/null +++ b/fs/bcachefs/alloc_background.h -@@ -0,0 +1,143 @@ +@@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_BACKGROUND_H +#define _BCACHEFS_ALLOC_BACKGROUND_H + +#include "bcachefs.h" +#include "alloc_types.h" ++#include "buckets.h" +#include "debug.h" -+ -+extern const char * const bch2_allocator_states[]; -+ -+struct bkey_alloc_unpacked { -+ u64 journal_seq; -+ u64 bucket; -+ u8 dev; -+ u8 gen; -+ u8 oldest_gen; -+ u8 data_type; -+#define x(_name, _bits) u##_bits _name; -+ BCH_ALLOC_FIELDS_V2() -+#undef x -+}; -+ -+struct bkey_alloc_buf { -+ struct bkey_i k; -+ struct bch_alloc_v3 v; -+ -+#define x(_name, _bits) + _bits / 8 -+ u8 _pad[0 + BCH_ALLOC_FIELDS_V2()]; -+#undef x -+} __attribute__((packed, aligned(8))); ++#include "super.h" + +/* How out of date a 
pointer gen is allowed to be: */ +#define BUCKET_GC_GEN_MAX 96U + -+/* returns true if not equal */ -+static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, -+ struct bkey_alloc_unpacked r) ++static inline u8 alloc_gc_gen(struct bch_alloc_v4 a) +{ -+ return l.gen != r.gen || -+ l.oldest_gen != r.oldest_gen || -+ l.data_type != r.data_type -+#define x(_name, ...) || l._name != r._name -+ BCH_ALLOC_FIELDS_V2() -+#undef x -+ ; ++ return a.gen - a.oldest_gen; +} + -+struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); -+void bch2_alloc_pack(struct bch_fs *, struct bkey_alloc_buf *, -+ const struct bkey_alloc_unpacked); ++enum bucket_state { ++ BUCKET_free, ++ BUCKET_need_gc_gens, ++ BUCKET_need_discard, ++ BUCKET_cached, ++ BUCKET_dirty, ++}; ++ ++extern const char * const bch2_bucket_states[]; ++ ++static inline enum bucket_state bucket_state(struct bch_alloc_v4 a) ++{ ++ if (a.dirty_sectors || a.stripe) ++ return BUCKET_dirty; ++ if (a.cached_sectors) ++ return BUCKET_cached; ++ BUG_ON(a.data_type); ++ if (BCH_ALLOC_V4_NEED_DISCARD(&a)) ++ return BUCKET_need_discard; ++ if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX) ++ return BUCKET_need_gc_gens; ++ return BUCKET_free; ++} ++ ++static inline u64 alloc_lru_idx(struct bch_alloc_v4 a) ++{ ++ return bucket_state(a) == BUCKET_cached ? 
a.io_time[READ] : 0; ++} ++ ++static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a) ++{ ++ return ((u64) alloc_gc_gen(a) >> 4) << 56; ++} ++ ++static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_v4 a) ++{ ++ pos.offset |= alloc_freespace_genbits(a); ++ return pos; ++} ++ ++struct bkey_i_alloc_v4 * ++bch2_trans_start_alloc_update(struct btree_trans *, struct btree_iter *, struct bpos); ++ ++void bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *); ++struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s_c); + +int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); + -+static inline struct bkey_alloc_unpacked -+alloc_mem_to_key(struct btree_iter *iter, -+ struct bucket *g, struct bucket_mark m) -+{ -+ return (struct bkey_alloc_unpacked) { -+ .dev = iter->pos.inode, -+ .bucket = iter->pos.offset, -+ .gen = m.gen, -+ .oldest_gen = g->oldest_gen, -+ .data_type = m.data_type, -+ .dirty_sectors = m.dirty_sectors, -+ .cached_sectors = m.cached_sectors, -+ .read_time = g->io_time[READ], -+ .write_time = g->io_time[WRITE], -+ }; -+} -+ +#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) + +const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c); +const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c); +const char *bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c); ++const char *bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c k); ++void bch2_alloc_v4_swab(struct bkey_s); +void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_alloc (struct bkey_ops) { \ + .key_invalid = bch2_alloc_v1_invalid, \ + .val_to_text = bch2_alloc_to_text, \ ++ .trans_trigger = bch2_trans_mark_alloc, \ ++ .atomic_trigger = bch2_mark_alloc, \ +} + +#define bch2_bkey_ops_alloc_v2 (struct bkey_ops) { \ + .key_invalid = bch2_alloc_v2_invalid, \ + .val_to_text = bch2_alloc_to_text, \ ++ 
.trans_trigger = bch2_trans_mark_alloc, \ ++ .atomic_trigger = bch2_mark_alloc, \ +} + +#define bch2_bkey_ops_alloc_v3 (struct bkey_ops) { \ + .key_invalid = bch2_alloc_v3_invalid, \ + .val_to_text = bch2_alloc_to_text, \ ++ .trans_trigger = bch2_trans_mark_alloc, \ ++ .atomic_trigger = bch2_mark_alloc, \ ++} ++ ++#define bch2_bkey_ops_alloc_v4 (struct bkey_ops) { \ ++ .key_invalid = bch2_alloc_v4_invalid, \ ++ .val_to_text = bch2_alloc_to_text, \ ++ .swab = bch2_alloc_v4_swab, \ ++ .trans_trigger = bch2_trans_mark_alloc, \ ++ .atomic_trigger = bch2_mark_alloc, \ +} + +static inline bool bkey_is_alloc(const struct bkey *k) @@ -2263,54 +2658,38 @@ index 000000000000..370573f8e05d + +int bch2_alloc_read(struct bch_fs *); + -+static inline void bch2_wake_allocator(struct bch_dev *ca) -+{ -+ struct task_struct *p; ++int bch2_trans_mark_alloc(struct btree_trans *, struct bkey_s_c, ++ struct bkey_i *, unsigned); ++int bch2_check_alloc_info(struct bch_fs *, bool); ++void bch2_do_discards(struct bch_fs *); + -+ rcu_read_lock(); -+ p = rcu_dereference(ca->alloc_thread); -+ if (p) -+ wake_up_process(p); -+ rcu_read_unlock(); ++static inline bool should_invalidate_buckets(struct bch_dev *ca) ++{ ++ struct bch_dev_usage u = bch2_dev_usage_read(ca); ++ ++ return u.d[BCH_DATA_cached].buckets && ++ u.buckets_unavailable + u.d[BCH_DATA_cached].buckets < ++ ca->mi.nbuckets >> 7; +} + -+static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, -+ size_t bucket) -+{ -+ if (bch2_expensive_debug_checks) { -+ size_t iter; -+ long i; -+ unsigned j; ++void bch2_do_invalidates(struct bch_fs *); + -+ for (j = 0; j < RESERVE_NR; j++) -+ fifo_for_each_entry(i, &ca->free[j], iter) -+ BUG_ON(i == bucket); -+ fifo_for_each_entry(i, &ca->free_inc, iter) -+ BUG_ON(i == bucket); -+ } -+} ++int bch2_fs_freespace_init(struct bch_fs *); + +void bch2_recalc_capacity(struct bch_fs *); + +void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); +void 
bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); + -+void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); -+void bch2_dev_allocator_stop(struct bch_dev *); -+int bch2_dev_allocator_start(struct bch_dev *); -+ -+int bch2_alloc_write(struct bch_fs *, unsigned); +void bch2_fs_allocator_background_init(struct bch_fs *); + -+void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); -+ +#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c new file mode 100644 -index 000000000000..412fed479482 +index 000000000000..4dbab45be5ed --- /dev/null +++ b/fs/bcachefs/alloc_foreground.c -@@ -0,0 +1,960 @@ +@@ -0,0 +1,1263 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2012 Google, Inc. @@ -2327,19 +2706,31 @@ index 000000000000..412fed479482 +#include "bcachefs.h" +#include "alloc_background.h" +#include "alloc_foreground.h" ++#include "btree_iter.h" ++#include "btree_update.h" +#include "btree_gc.h" +#include "buckets.h" ++#include "buckets_waiting_for_journal.h" +#include "clock.h" +#include "debug.h" +#include "disk_groups.h" +#include "ec.h" ++#include "error.h" +#include "io.h" ++#include "journal.h" + +#include +#include +#include +#include + ++const char * const bch2_alloc_reserves[] = { ++#define x(t) #t, ++ BCH_ALLOC_RESERVES() ++#undef x ++ NULL ++}; ++ +/* + * Open buckets represent a bucket that's currently being allocated from. They + * serve two purposes: @@ -2356,9 +2747,32 @@ index 000000000000..412fed479482 + * reference _after_ doing the index update that makes its allocation reachable. 
+ */ + ++static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob) ++{ ++ open_bucket_idx_t idx = ob - c->open_buckets; ++ open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket); ++ ++ ob->hash = *slot; ++ *slot = idx; ++} ++ ++static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob) ++{ ++ open_bucket_idx_t idx = ob - c->open_buckets; ++ open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket); ++ ++ while (*slot != idx) { ++ BUG_ON(!*slot); ++ slot = &c->open_buckets[*slot].hash; ++ } ++ ++ *slot = ob->hash; ++ ob->hash = 0; ++} ++ +void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) +{ -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + + if (ob->ec) { + bch2_ec_bucket_written(c, ob); @@ -2368,14 +2782,15 @@ index 000000000000..412fed479482 + percpu_down_read(&c->mark_lock); + spin_lock(&ob->lock); + -+ bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), false); + ob->valid = false; -+ ob->type = 0; ++ ob->data_type = 0; + + spin_unlock(&ob->lock); + percpu_up_read(&c->mark_lock); + + spin_lock(&c->freelist_lock); ++ bch2_open_bucket_hash_remove(c, ob); ++ + ob->freelist = c->open_buckets_freelist; + c->open_buckets_freelist = ob - c->open_buckets; + @@ -2394,8 +2809,7 @@ index 000000000000..412fed479482 + unsigned i; + + open_bucket_for_each(c, obs, ob, i) -+ if (ob->ptr.dev == dev && -+ ob->ec) ++ if (ob->dev == dev && ob->ec) + bch2_ec_bucket_cancel(c, ob); +} + @@ -2408,7 +2822,7 @@ index 000000000000..412fed479482 + ob = c->open_buckets + c->open_buckets_freelist; + c->open_buckets_freelist = ob->freelist; + atomic_set(&ob->pin, 1); -+ ob->type = 0; ++ ob->data_type = 0; + + c->open_buckets_nr_free--; + return ob; @@ -2418,8 +2832,8 @@ index 000000000000..412fed479482 + struct write_point *wp, + struct open_bucket *ob) +{ -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ bool 
may_realloc = wp->type == BCH_DATA_user; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); ++ bool may_realloc = wp->data_type == BCH_DATA_user; + + BUG_ON(ca->open_buckets_partial_nr > + ARRAY_SIZE(ca->open_buckets_partial)); @@ -2440,85 +2854,62 @@ index 000000000000..412fed479482 + } +} + -+static void verify_not_stale(struct bch_fs *c, const struct open_buckets *obs) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct open_bucket *ob; -+ unsigned i; -+ -+ open_bucket_for_each(c, obs, ob, i) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ -+ BUG_ON(ptr_stale(ca, &ob->ptr)); -+ } -+#endif -+} -+ +/* _only_ for allocating the journal on a new device: */ +long bch2_bucket_alloc_new_fs(struct bch_dev *ca) +{ -+ struct bucket_array *buckets; -+ ssize_t b; ++ while (ca->new_fs_bucket_idx < ca->mi.nbuckets) { ++ u64 b = ca->new_fs_bucket_idx++; + -+ rcu_read_lock(); -+ buckets = bucket_array(ca); ++ if (!is_superblock_bucket(ca, b) && ++ (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse))) ++ return b; ++ } + -+ for (b = buckets->first_bucket; b < buckets->nbuckets; b++) -+ if (is_available_bucket(buckets->b[b].mark) && -+ !buckets->b[b].mark.owned_by_allocator) -+ goto success; -+ b = -1; -+success: -+ rcu_read_unlock(); -+ return b; ++ return -1; +} + +static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) +{ + switch (reserve) { -+ case RESERVE_BTREE: -+ case RESERVE_BTREE_MOVINGGC: ++ case RESERVE_btree: ++ case RESERVE_btree_movinggc: + return 0; -+ case RESERVE_MOVINGGC: ++ case RESERVE_movinggc: + return OPEN_BUCKETS_COUNT / 4; + default: + return OPEN_BUCKETS_COUNT / 2; + } +} + -+/** -+ * bch_bucket_alloc - allocate a single bucket from a specific device -+ * -+ * Returns index of bucket on success, 0 on failure -+ * */ -+struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, -+ enum alloc_reserve reserve, -+ bool may_alloc_partial, -+ struct closure *cl) ++static struct open_bucket 
*__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, ++ u64 bucket, ++ enum alloc_reserve reserve, ++ struct bch_alloc_v4 *a, ++ u64 *skipped_open, ++ u64 *skipped_need_journal_commit, ++ u64 *skipped_nouse, ++ struct closure *cl) +{ + struct open_bucket *ob; -+ long b = 0; ++ ++ if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { ++ (*skipped_nouse)++; ++ return NULL; ++ } ++ ++ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { ++ (*skipped_open)++; ++ return NULL; ++ } ++ ++ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, ++ c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) { ++ (*skipped_need_journal_commit)++; ++ return NULL; ++ } + + spin_lock(&c->freelist_lock); + -+ if (may_alloc_partial) { -+ int i; -+ -+ for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) { -+ ob = c->open_buckets + ca->open_buckets_partial[i]; -+ -+ if (reserve <= ob->alloc_reserve) { -+ array_remove_item(ca->open_buckets_partial, -+ ca->open_buckets_partial_nr, -+ i); -+ ob->on_partial_list = false; -+ ob->alloc_reserve = reserve; -+ spin_unlock(&c->freelist_lock); -+ return ob; -+ } -+ } -+ } -+ + if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { + if (cl) + closure_wait(&c->open_buckets_wait, cl); @@ -2527,36 +2918,18 @@ index 000000000000..412fed479482 + c->blocked_allocate_open_bucket = local_clock(); + + spin_unlock(&c->freelist_lock); -+ trace_open_bucket_alloc_fail(ca, reserve); ++ ++ trace_open_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve]); + return ERR_PTR(-OPEN_BUCKETS_EMPTY); + } + -+ if (likely(fifo_pop(&ca->free[RESERVE_NONE], b))) -+ goto out; -+ -+ switch (reserve) { -+ case RESERVE_BTREE_MOVINGGC: -+ case RESERVE_MOVINGGC: -+ if (fifo_pop(&ca->free[RESERVE_MOVINGGC], b)) -+ goto out; -+ break; -+ default: -+ break; ++ /* Recheck under lock: */ ++ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { ++ spin_unlock(&c->freelist_lock); ++ (*skipped_open)++; ++ return NULL; + } + -+ if (cl) 
-+ closure_wait(&c->freelist_wait, cl); -+ -+ if (!c->blocked_allocate) -+ c->blocked_allocate = local_clock(); -+ -+ spin_unlock(&c->freelist_lock); -+ -+ trace_bucket_alloc_fail(ca, reserve); -+ return ERR_PTR(-FREELIST_EMPTY); -+out: -+ verify_not_on_freelist(c, ca, b); -+ + ob = bch2_open_bucket_alloc(c); + + spin_lock(&ob->lock); @@ -2564,15 +2937,14 @@ index 000000000000..412fed479482 + ob->valid = true; + ob->sectors_free = ca->mi.bucket_size; + ob->alloc_reserve = reserve; -+ ob->ptr = (struct bch_extent_ptr) { -+ .type = 1 << BCH_EXTENT_ENTRY_ptr, -+ .gen = bucket(ca, b)->mark.gen, -+ .offset = bucket_to_sector(ca, b), -+ .dev = ca->dev_idx, -+ }; -+ ++ ob->dev = ca->dev_idx; ++ ob->gen = a->gen; ++ ob->bucket = bucket; + spin_unlock(&ob->lock); + ++ ca->nr_open_buckets++; ++ bch2_open_bucket_hash_add(c, ob); ++ + if (c->blocked_allocate_open_bucket) { + bch2_time_stats_update( + &c->times[BCH_TIME_blocked_allocate_open_bucket], @@ -2587,12 +2959,285 @@ index 000000000000..412fed479482 + c->blocked_allocate = 0; + } + -+ ca->nr_open_buckets++; + spin_unlock(&c->freelist_lock); + -+ bch2_wake_allocator(ca); ++ trace_bucket_alloc(ca, bch2_alloc_reserves[reserve]); ++ return ob; ++} ++ ++static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, ++ enum alloc_reserve reserve, u64 free_entry, ++ u64 *skipped_open, ++ u64 *skipped_need_journal_commit, ++ u64 *skipped_nouse, ++ struct closure *cl) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct open_bucket *ob; ++ struct bch_alloc_v4 a; ++ u64 b = free_entry & ~(~0ULL << 56); ++ unsigned genbits = free_entry >> 56; ++ struct printbuf buf = PRINTBUF; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, b), BTREE_ITER_CACHED); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) { ++ ob = ERR_PTR(ret); ++ goto err; ++ } ++ ++ bch2_alloc_to_v4(k, &a); ++ ++ if 
(bch2_fs_inconsistent_on(bucket_state(a) != BUCKET_free, c, ++ "non free bucket in freespace btree (state %s)\n" ++ " %s\n" ++ " at %llu (genbits %u)", ++ bch2_bucket_states[bucket_state(a)], ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf), ++ free_entry, genbits)) { ++ ob = ERR_PTR(-EIO); ++ goto err; ++ } ++ ++ if (bch2_fs_inconsistent_on(genbits != (alloc_freespace_genbits(a) >> 56), c, ++ "bucket in freespace btree with wrong genbits (got %u should be %llu)\n" ++ " %s", ++ genbits, alloc_freespace_genbits(a) >> 56, ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ob = ERR_PTR(-EIO); ++ goto err; ++ } ++ ++ if (bch2_fs_inconsistent_on(b < ca->mi.first_bucket || b >= ca->mi.nbuckets, c, ++ "freespace btree has bucket outside allowed range (got %llu, valid %u-%llu)", ++ b, ca->mi.first_bucket, ca->mi.nbuckets)) { ++ ob = ERR_PTR(-EIO); ++ goto err; ++ } ++ ++ ob = __try_alloc_bucket(c, ca, b, reserve, &a, ++ skipped_open, ++ skipped_need_journal_commit, ++ skipped_nouse, ++ cl); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf); ++ return ob; ++} ++ ++static struct open_bucket *try_alloc_partial_bucket(struct bch_fs *c, struct bch_dev *ca, ++ enum alloc_reserve reserve) ++{ ++ struct open_bucket *ob; ++ int i; ++ ++ spin_lock(&c->freelist_lock); ++ ++ for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) { ++ ob = c->open_buckets + ca->open_buckets_partial[i]; ++ ++ if (reserve <= ob->alloc_reserve) { ++ array_remove_item(ca->open_buckets_partial, ++ ca->open_buckets_partial_nr, ++ i); ++ ob->on_partial_list = false; ++ ob->alloc_reserve = reserve; ++ spin_unlock(&c->freelist_lock); ++ return ob; ++ } ++ } ++ ++ spin_unlock(&c->freelist_lock); ++ return NULL; ++} ++ ++/* ++ * This path is for before the freespace btree is initialized: ++ * ++ * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock & ++ * journal buckets - journal buckets will be < ca->new_fs_bucket_idx ++ */ ++static noinline struct open_bucket * 
++bch2_bucket_alloc_trans_early(struct btree_trans *trans, ++ struct bch_dev *ca, ++ enum alloc_reserve reserve, ++ u64 *cur_bucket, ++ u64 *buckets_seen, ++ u64 *skipped_open, ++ u64 *skipped_need_journal_commit, ++ u64 *skipped_nouse, ++ struct closure *cl) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct open_bucket *ob = NULL; ++ int ret; ++ ++ *cur_bucket = max_t(u64, *cur_bucket, ca->mi.first_bucket); ++ *cur_bucket = max_t(u64, *cur_bucket, ca->new_fs_bucket_idx); ++ ++ for_each_btree_key(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, *cur_bucket), ++ BTREE_ITER_SLOTS, k, ret) { ++ struct bch_alloc_v4 a; ++ ++ if (bkey_cmp(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) ++ break; ++ ++ if (ca->new_fs_bucket_idx && ++ is_superblock_bucket(ca, k.k->p.offset)) ++ continue; ++ ++ bch2_alloc_to_v4(k, &a); ++ ++ if (bucket_state(a) != BUCKET_free) ++ continue; ++ ++ (*buckets_seen)++; ++ ++ ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, reserve, &a, ++ skipped_open, ++ skipped_need_journal_commit, ++ skipped_nouse, ++ cl); ++ if (ob) ++ break; ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ *cur_bucket = iter.pos.offset; ++ ++ return ob ?: ERR_PTR(ret ?: -FREELIST_EMPTY); ++} ++ ++static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, ++ struct bch_dev *ca, ++ enum alloc_reserve reserve, ++ u64 *cur_bucket, ++ u64 *buckets_seen, ++ u64 *skipped_open, ++ u64 *skipped_need_journal_commit, ++ u64 *skipped_nouse, ++ struct closure *cl) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct open_bucket *ob = NULL; ++ int ret; ++ ++ if (unlikely(!ca->mi.freespace_initialized)) ++ return bch2_bucket_alloc_trans_early(trans, ca, reserve, ++ cur_bucket, ++ buckets_seen, ++ skipped_open, ++ skipped_need_journal_commit, ++ skipped_nouse, ++ cl); ++ ++ BUG_ON(ca->new_fs_bucket_idx); ++ ++ for_each_btree_key(trans, iter, BTREE_ID_freespace, ++ POS(ca->dev_idx, *cur_bucket), 0, k, ret) { ++ if (k.k->p.inode != ca->dev_idx) ++ 
break; ++ ++ for (*cur_bucket = max(*cur_bucket, bkey_start_offset(k.k)); ++ *cur_bucket != k.k->p.offset && !ob; ++ (*cur_bucket)++) { ++ if (btree_trans_too_many_iters(trans)) { ++ ob = ERR_PTR(-EINTR); ++ break; ++ } ++ ++ (*buckets_seen)++; ++ ++ ob = try_alloc_bucket(trans, ca, reserve, ++ *cur_bucket, ++ skipped_open, ++ skipped_need_journal_commit, ++ skipped_nouse, ++ cl); ++ } ++ if (ob) ++ break; ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ob ?: ERR_PTR(ret); ++} ++ ++/** ++ * bch_bucket_alloc - allocate a single bucket from a specific device ++ * ++ * Returns index of bucket on success, 0 on failure ++ * */ ++struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, ++ enum alloc_reserve reserve, ++ bool may_alloc_partial, ++ struct closure *cl) ++{ ++ struct open_bucket *ob = NULL; ++ u64 avail = dev_buckets_available(ca, reserve); ++ u64 cur_bucket = 0; ++ u64 buckets_seen = 0; ++ u64 skipped_open = 0; ++ u64 skipped_need_journal_commit = 0; ++ u64 skipped_nouse = 0; ++ int ret; ++ ++ if (may_alloc_partial) { ++ ob = try_alloc_partial_bucket(c, ca, reserve); ++ if (ob) ++ return ob; ++ } ++again: ++ if (!avail) { ++ if (cl) { ++ closure_wait(&c->freelist_wait, cl); ++ /* recheck after putting ourself on waitlist */ ++ avail = dev_buckets_available(ca, reserve); ++ if (avail) { ++ closure_wake_up(&c->freelist_wait); ++ goto again; ++ } ++ } ++ ++ if (!c->blocked_allocate) ++ c->blocked_allocate = local_clock(); ++ ++ ob = ERR_PTR(-FREELIST_EMPTY); ++ goto err; ++ } ++ ++ ret = bch2_trans_do(c, NULL, NULL, 0, ++ PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, reserve, ++ &cur_bucket, ++ &buckets_seen, ++ &skipped_open, ++ &skipped_need_journal_commit, ++ &skipped_nouse, ++ cl))); ++ ++ if (skipped_need_journal_commit * 2 > avail) ++ bch2_journal_flush_async(&c->journal, NULL); ++err: ++ if (!ob) ++ ob = ERR_PTR(ret ?: -FREELIST_EMPTY); ++ ++ if (IS_ERR(ob)) { ++ trace_bucket_alloc_fail(ca, 
bch2_alloc_reserves[reserve], avail, ++ buckets_seen, ++ skipped_open, ++ skipped_need_journal_commit, ++ skipped_nouse, ++ cl == NULL, PTR_ERR(ob)); ++ atomic_long_inc(&c->bucket_alloc_fail); ++ } + -+ trace_bucket_alloc(ca, reserve); + return ob; +} + @@ -2623,7 +3268,7 @@ index 000000000000..412fed479482 + struct dev_stripe_state *stripe) +{ + u64 *v = stripe->next_alloc + ca->dev_idx; -+ u64 free_space = dev_buckets_available(ca); ++ u64 free_space = dev_buckets_available(ca, RESERVE_none); + u64 free_space_inv = free_space + ? div64_u64(1ULL << 48, free_space) + : 1ULL << 48; @@ -2651,9 +3296,9 @@ index 000000000000..412fed479482 + struct open_bucket *ob) +{ + unsigned durability = -+ bch_dev_bkey_exists(c, ob->ptr.dev)->mi.durability; ++ bch_dev_bkey_exists(c, ob->dev)->mi.durability; + -+ __clear_bit(ob->ptr.dev, devs_may_alloc->d); ++ __clear_bit(ob->dev, devs_may_alloc->d); + *nr_effective += (flags & BUCKET_ALLOC_USE_DURABILITY) + ? durability : 1; + *have_cache |= !durability; @@ -2661,8 +3306,7 @@ index 000000000000..412fed479482 + ob_push(c, ptrs, ob); +} + -+enum bucket_alloc_ret -+bch2_bucket_alloc_set(struct bch_fs *c, ++int bch2_bucket_alloc_set(struct bch_fs *c, + struct open_buckets *ptrs, + struct dev_stripe_state *stripe, + struct bch_devs_mask *devs_may_alloc, @@ -2675,8 +3319,9 @@ index 000000000000..412fed479482 +{ + struct dev_alloc_list devs_sorted = + bch2_dev_alloc_list(c, stripe, devs_may_alloc); ++ unsigned dev; + struct bch_dev *ca; -+ enum bucket_alloc_ret ret = INSUFFICIENT_DEVICES; ++ int ret = -INSUFFICIENT_DEVICES; + unsigned i; + + BUG_ON(*nr_effective >= nr_replicas); @@ -2684,30 +3329,43 @@ index 000000000000..412fed479482 + for (i = 0; i < devs_sorted.nr; i++) { + struct open_bucket *ob; + -+ ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); ++ dev = devs_sorted.devs[i]; ++ ++ rcu_read_lock(); ++ ca = rcu_dereference(c->devs[dev]); ++ if (ca) ++ percpu_ref_get(&ca->ref); ++ rcu_read_unlock(); ++ + if (!ca) + continue; + -+ 
if (!ca->mi.durability && *have_cache) ++ if (!ca->mi.durability && *have_cache) { ++ percpu_ref_put(&ca->ref); + continue; ++ } + + ob = bch2_bucket_alloc(c, ca, reserve, + flags & BUCKET_MAY_ALLOC_PARTIAL, cl); ++ if (!IS_ERR(ob)) ++ bch2_dev_stripe_increment(ca, stripe); ++ percpu_ref_put(&ca->ref); ++ + if (IS_ERR(ob)) { -+ ret = -PTR_ERR(ob); ++ ret = PTR_ERR(ob); + + if (cl) -+ return ret; ++ break; + continue; + } + + add_new_bucket(c, ptrs, devs_may_alloc, + nr_effective, have_cache, flags, ob); + -+ bch2_dev_stripe_increment(ca, stripe); -+ -+ if (*nr_effective >= nr_replicas) -+ return ALLOC_SUCCESS; ++ if (*nr_effective >= nr_replicas) { ++ ret = 0; ++ break; ++ } + } + + return ret; @@ -2721,8 +3379,7 @@ index 000000000000..412fed479482 + * it's to a device we don't want: + */ + -+static enum bucket_alloc_ret -+bucket_alloc_from_stripe(struct bch_fs *c, ++static int bucket_alloc_from_stripe(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, @@ -2765,13 +3422,13 @@ index 000000000000..412fed479482 + continue; + + ob = c->open_buckets + h->s->blocks[ec_idx]; -+ if (ob->ptr.dev == devs_sorted.devs[i] && ++ if (ob->dev == devs_sorted.devs[i] && + !test_and_set_bit(ec_idx, h->s->blocks_allocated)) + goto got_bucket; + } + goto out_put_head; +got_bucket: -+ ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ ca = bch_dev_bkey_exists(c, ob->dev); + + ob->ec_idx = ec_idx; + ob->ec = h->s; @@ -2801,12 +3458,12 @@ index 000000000000..412fed479482 + unsigned i; + + open_bucket_for_each(c, &wp->ptrs, ob, i) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + + if (*nr_effective < nr_replicas && -+ test_bit(ob->ptr.dev, devs_may_alloc->d) && ++ test_bit(ob->dev, devs_may_alloc->d) && + (ca->mi.durability || -+ (wp->type == BCH_DATA_user && !*have_cache)) && ++ (wp->data_type == BCH_DATA_user && !*have_cache)) && + (ob->ec || !need_ec)) { + 
add_new_bucket(c, ptrs, devs_may_alloc, + nr_effective, have_cache, @@ -2818,8 +3475,7 @@ index 000000000000..412fed479482 + wp->ptrs = ptrs_skip; +} + -+static enum bucket_alloc_ret -+open_bucket_add_buckets(struct bch_fs *c, ++static int open_bucket_add_buckets(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_list *devs_have, @@ -2835,11 +3491,11 @@ index 000000000000..412fed479482 + struct bch_devs_mask devs; + struct open_bucket *ob; + struct closure *cl = NULL; -+ enum bucket_alloc_ret ret; ++ int ret; + unsigned i; + + rcu_read_lock(); -+ devs = target_rw_devs(c, wp->type, target); ++ devs = target_rw_devs(c, wp->data_type, target); + rcu_read_unlock(); + + /* Don't allocate from devices we already have pointers to: */ @@ -2847,7 +3503,7 @@ index 000000000000..412fed479482 + __clear_bit(devs_have->devs[i], devs.d); + + open_bucket_for_each(c, ptrs, ob, i) -+ __clear_bit(ob->ptr.dev, devs.d); ++ __clear_bit(ob->dev, devs.d); + + if (erasure_code) { + if (!ec_open_bucket(c, ptrs)) { @@ -2863,8 +3519,8 @@ index 000000000000..412fed479482 + target, erasure_code, + nr_replicas, nr_effective, + have_cache, flags, _cl); -+ if (ret == FREELIST_EMPTY || -+ ret == OPEN_BUCKETS_EMPTY) ++ if (ret == -FREELIST_EMPTY || ++ ret == -OPEN_BUCKETS_EMPTY) + return ret; + if (*nr_effective >= nr_replicas) + return 0; @@ -2877,9 +3533,6 @@ index 000000000000..412fed479482 + if (*nr_effective >= nr_replicas) + return 0; + -+ percpu_down_read(&c->mark_lock); -+ rcu_read_lock(); -+ +retry_blocking: + /* + * Try nonblocking first, so that if one device is full we'll try from @@ -2888,14 +3541,11 @@ index 000000000000..412fed479482 + ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs, + nr_replicas, nr_effective, have_cache, + reserve, flags, cl); -+ if (ret && ret != INSUFFICIENT_DEVICES && !cl && _cl) { ++ if (ret && ret != -INSUFFICIENT_DEVICES && !cl && _cl) { + cl = _cl; + goto retry_blocking; + } + -+ rcu_read_unlock(); -+ 
percpu_up_read(&c->mark_lock); -+ + return ret; +} + @@ -2907,7 +3557,7 @@ index 000000000000..412fed479482 + unsigned i, j; + + open_bucket_for_each(c, obs, ob, i) { -+ bool drop = !ca || ob->ptr.dev == ca->dev_idx; ++ bool drop = !ca || ob->dev == ca->dev_idx; + + if (!drop && ob->ec) { + mutex_lock(&ob->ec->lock); @@ -2916,7 +3566,7 @@ index 000000000000..412fed479482 + continue; + + ob2 = c->open_buckets + ob->ec->blocks[j]; -+ drop |= ob2->ptr.dev == ca->dev_idx; ++ drop |= ob2->dev == ca->dev_idx; + } + mutex_unlock(&ob->ec->lock); + } @@ -3085,7 +3735,7 @@ index 000000000000..412fed479482 + unsigned nr_effective, write_points_nr; + unsigned ob_flags = 0; + bool have_cache; -+ enum bucket_alloc_ret ret; ++ int ret; + int i; + + if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) @@ -3100,11 +3750,11 @@ index 000000000000..412fed479482 + + wp = writepoint_find(c, write_point.v); + -+ if (wp->type == BCH_DATA_user) ++ if (wp->data_type == BCH_DATA_user) + ob_flags |= BUCKET_MAY_ALLOC_PARTIAL; + + /* metadata may not allocate on cache devices: */ -+ if (wp->type != BCH_DATA_user) ++ if (wp->data_type != BCH_DATA_user) + have_cache = true; + + if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { @@ -3134,7 +3784,7 @@ index 000000000000..412fed479482 + if (erasure_code && !ec_open_bucket(c, &ptrs)) + pr_debug("failed to get ec bucket: ret %u", ret); + -+ if (ret == INSUFFICIENT_DEVICES && ++ if (ret == -INSUFFICIENT_DEVICES && + nr_effective >= nr_replicas_required) + ret = 0; + @@ -3154,8 +3804,6 @@ index 000000000000..412fed479482 + + BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); + -+ verify_not_stale(c, &wp->ptrs); -+ + return wp; +err: + open_bucket_for_each(c, &wp->ptrs, ob, i) @@ -3167,27 +3815,42 @@ index 000000000000..412fed479482 + + mutex_unlock(&wp->lock); + -+ if (ret == FREELIST_EMPTY && ++ if (ret == -FREELIST_EMPTY && + try_decrease_writepoints(c, write_points_nr)) + goto retry; + + switch (ret) { -+ case OPEN_BUCKETS_EMPTY: -+ case 
FREELIST_EMPTY: ++ case -OPEN_BUCKETS_EMPTY: ++ case -FREELIST_EMPTY: + return cl ? ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC); -+ case INSUFFICIENT_DEVICES: ++ case -INSUFFICIENT_DEVICES: + return ERR_PTR(-EROFS); + default: -+ BUG(); ++ return ERR_PTR(ret); + } +} + ++struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); ++ ++ return (struct bch_extent_ptr) { ++ .type = 1 << BCH_EXTENT_ENTRY_ptr, ++ .gen = ob->gen, ++ .dev = ob->dev, ++ .offset = bucket_to_sector(ca, ob->bucket) + ++ ca->mi.bucket_size - ++ ob->sectors_free, ++ }; ++} ++ +/* + * Append pointers to the space we just allocated to @k, and mark @sectors space + * as allocated out of @ob + */ +void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, -+ struct bkey_i *k, unsigned sectors) ++ struct bkey_i *k, unsigned sectors, ++ bool cached) + +{ + struct open_bucket *ob; @@ -3197,14 +3860,14 @@ index 000000000000..412fed479482 + wp->sectors_free -= sectors; + + open_bucket_for_each(c, &wp->ptrs, ob, i) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ struct bch_extent_ptr tmp = ob->ptr; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); ++ struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob); + -+ tmp.cached = !ca->mi.durability && -+ wp->type == BCH_DATA_user; ++ ptr.cached = cached || ++ (!ca->mi.durability && ++ wp->data_type == BCH_DATA_user); + -+ tmp.offset += ca->mi.bucket_size - ob->sectors_free; -+ bch2_bkey_append_ptr(k, tmp); ++ bch2_bkey_append_ptr(k, ptr); + + BUG_ON(sectors > ob->sectors_free); + ob->sectors_free -= sectors; @@ -3234,7 +3897,7 @@ index 000000000000..412fed479482 + enum bch_data_type type) +{ + mutex_init(&wp->lock); -+ wp->type = type; ++ wp->data_type = type; +} + +void bch2_fs_allocator_foreground_init(struct bch_fs *c) @@ -3271,12 +3934,31 @@ index 000000000000..412fed479482 + writepoint_hash(c, wp->write_point)); + } +} ++ ++void 
bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ struct open_bucket *ob; ++ ++ for (ob = c->open_buckets; ++ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ++ ob++) { ++ spin_lock(&ob->lock); ++ if (ob->valid && !ob->on_partial_list) { ++ pr_buf(out, "%zu ref %u type %s\n", ++ ob - c->open_buckets, ++ atomic_read(&ob->pin), ++ bch2_data_types[ob->data_type]); ++ } ++ spin_unlock(&ob->lock); ++ } ++ ++} diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h new file mode 100644 -index 000000000000..c658295cb8e0 +index 000000000000..8bc78877f0fc --- /dev/null +++ b/fs/bcachefs/alloc_foreground.h -@@ -0,0 +1,138 @@ +@@ -0,0 +1,173 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_FOREGROUND_H +#define _BCACHEFS_ALLOC_FOREGROUND_H @@ -3291,12 +3973,7 @@ index 000000000000..c658295cb8e0 +struct bch_fs; +struct bch_devs_List; + -+enum bucket_alloc_ret { -+ ALLOC_SUCCESS, -+ OPEN_BUCKETS_EMPTY, -+ FREELIST_EMPTY, /* Allocator thread not keeping up */ -+ INSUFFICIENT_DEVICES, -+}; ++extern const char * const bch2_alloc_reserves[]; + +struct dev_alloc_list { + unsigned nr; @@ -3371,14 +4048,51 @@ index 000000000000..c658295cb8e0 + unsigned i; + + open_bucket_for_each(c, &wp->ptrs, ob, i) { -+ ob->type = wp->type; ++ ob->data_type = wp->data_type; + atomic_inc(&ob->pin); + ob_push(c, ptrs, ob); + } +} + -+enum bucket_alloc_ret -+bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, ++static inline open_bucket_idx_t *open_bucket_hashslot(struct bch_fs *c, ++ unsigned dev, u64 bucket) ++{ ++ return c->open_buckets_hash + ++ (jhash_3words(dev, bucket, bucket >> 32, 0) & ++ (OPEN_BUCKETS_COUNT - 1)); ++} ++ ++static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucket) ++{ ++ open_bucket_idx_t slot = *open_bucket_hashslot(c, dev, bucket); ++ ++ while (slot) { ++ struct open_bucket *ob = &c->open_buckets[slot]; ++ ++ if (ob->dev == dev && ob->bucket == bucket) ++ return true; ++ ++ 
slot = ob->hash; ++ } ++ ++ return false; ++} ++ ++static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket) ++{ ++ bool ret; ++ ++ if (bch2_bucket_is_open(c, dev, bucket)) ++ return true; ++ ++ spin_lock(&c->freelist_lock); ++ ret = bch2_bucket_is_open(c, dev, bucket); ++ spin_unlock(&c->freelist_lock); ++ ++ return ret; ++} ++ ++int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, + struct dev_stripe_state *, struct bch_devs_mask *, + unsigned, unsigned *, bool *, enum alloc_reserve, + unsigned, struct closure *); @@ -3392,8 +4106,9 @@ index 000000000000..c658295cb8e0 + unsigned, + struct closure *); + ++struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *); +void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, -+ struct bkey_i *, unsigned); ++ struct bkey_i *, unsigned, bool); +void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); + +void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *, @@ -3414,13 +4129,15 @@ index 000000000000..c658295cb8e0 + +void bch2_fs_allocator_foreground_init(struct bch_fs *); + ++void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); ++ +#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h new file mode 100644 -index 000000000000..4a1cd8b73d16 +index 000000000000..21b56451bc18 --- /dev/null +++ b/fs/bcachefs/alloc_types.h -@@ -0,0 +1,98 @@ +@@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_TYPES_H +#define _BCACHEFS_ALLOC_TYPES_H @@ -3433,51 +4150,48 @@ index 000000000000..4a1cd8b73d16 + +struct ec_bucket_buf; + -+#define ALLOC_THREAD_STATES() \ -+ x(stopped) \ -+ x(running) \ -+ x(blocked) \ -+ x(blocked_full) -+ -+enum allocator_states { -+#define x(n) ALLOCATOR_##n, -+ ALLOC_THREAD_STATES() -+#undef x -+}; ++#define BCH_ALLOC_RESERVES() \ ++ x(btree_movinggc) \ ++ x(btree) \ ++ x(movinggc) \ ++ x(none) + +enum alloc_reserve 
{ -+ RESERVE_BTREE_MOVINGGC = -2, -+ RESERVE_BTREE = -1, -+ RESERVE_MOVINGGC = 0, -+ RESERVE_NONE = 1, -+ RESERVE_NR = 2, ++#define x(name) RESERVE_##name, ++ BCH_ALLOC_RESERVES() ++#undef x +}; + -+typedef FIFO(long) alloc_fifo; -+ +#define OPEN_BUCKETS_COUNT 1024 + +#define WRITE_POINT_HASH_NR 32 +#define WRITE_POINT_MAX 32 + ++/* ++ * 0 is never a valid open_bucket_idx_t: ++ */ +typedef u16 open_bucket_idx_t; + +struct open_bucket { + spinlock_t lock; + atomic_t pin; + open_bucket_idx_t freelist; ++ open_bucket_idx_t hash; + + /* + * When an open bucket has an ec_stripe attached, this is the index of + * the block in the stripe this open_bucket corresponds to: + */ + u8 ec_idx; -+ u8 type; ++ enum bch_data_type data_type:3; + unsigned valid:1; + unsigned on_partial_list:1; + int alloc_reserve:3; ++ + unsigned sectors_free; -+ struct bch_extent_ptr ptr; ++ u8 dev; ++ u8 gen; ++ u64 bucket; + struct ec_stripe_new *ec; +}; + @@ -3497,7 +4211,7 @@ index 000000000000..4a1cd8b73d16 + struct mutex lock; + u64 last_used; + unsigned long write_point; -+ enum bch_data_type type; ++ enum bch_data_type data_type; + + /* calculated based on how many pointers we're actually going to use: */ + unsigned sectors_free; @@ -3510,21 +4224,13 @@ index 000000000000..4a1cd8b73d16 + unsigned long v; +}; + -+struct alloc_heap_entry { -+ size_t bucket; -+ size_t nr; -+ unsigned long key; -+}; -+ -+typedef HEAP(struct alloc_heap_entry) alloc_heap; -+ +#endif /* _BCACHEFS_ALLOC_TYPES_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h new file mode 100644 -index 000000000000..fdf3a777ae16 +index 000000000000..a13845a23387 --- /dev/null +++ b/fs/bcachefs/bcachefs.h -@@ -0,0 +1,958 @@ +@@ -0,0 +1,974 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_H +#define _BCACHEFS_H @@ -3704,7 +4410,11 @@ index 000000000000..fdf3a777ae16 + */ + +#undef pr_fmt ++#ifdef __KERNEL__ +#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__ ++#else ++#define pr_fmt(fmt) "%s() " fmt 
"\n", __func__ ++#endif + +#include +#include @@ -3727,6 +4437,7 @@ index 000000000000..fdf3a777ae16 +#include + +#include "bcachefs_format.h" ++#include "errcode.h" +#include "fifo.h" +#include "opts.h" +#include "util.h" @@ -3745,8 +4456,8 @@ index 000000000000..fdf3a777ae16 +#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name) +#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum) +#else -+#define bch2_fmt(_c, fmt) "%s: " fmt "\n", ((_c)->name) -+#define bch2_fmt_inum(_c, _inum, fmt) "%s inum %llu: " fmt "\n", ((_c)->name), (_inum) ++#define bch2_fmt(_c, fmt) fmt "\n" ++#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum) +#endif + +#define bch_info(c, fmt, ...) \ @@ -3803,9 +4514,6 @@ index 000000000000..fdf3a777ae16 + "significantly affect performance") \ + BCH_DEBUG_PARAM(debug_check_iterators, \ + "Enables extra verification for btree iterators") \ -+ BCH_DEBUG_PARAM(debug_check_bkeys, \ -+ "Run bkey_debugcheck (primarily checking GC/allocation "\ -+ "information) when iterating over keys") \ + BCH_DEBUG_PARAM(debug_check_btree_accounting, \ + "Verify btree accounting for keys within a node") \ + BCH_DEBUG_PARAM(journal_seq_verify, \ @@ -3847,8 +4555,12 @@ index 000000000000..fdf3a777ae16 +#define BCH_TIME_STATS() \ + x(btree_node_mem_alloc) \ + x(btree_node_split) \ ++ x(btree_node_compact) \ ++ x(btree_node_merge) \ + x(btree_node_sort) \ + x(btree_node_read) \ ++ x(btree_interior_update_foreground) \ ++ x(btree_interior_update_total) \ + x(btree_gc) \ + x(btree_lock_contended_read) \ + x(btree_lock_contended_intent) \ @@ -3856,8 +4568,8 @@ index 000000000000..fdf3a777ae16 + x(data_write) \ + x(data_read) \ + x(data_promote) \ -+ x(journal_write) \ -+ x(journal_delay) \ ++ x(journal_flush_write) \ ++ x(journal_noflush_write) \ + x(journal_flush_seq) \ + x(blocked_journal) \ + x(blocked_allocate) \ @@ -3873,6 +4585,7 @@ index 000000000000..fdf3a777ae16 +#include "alloc_types.h" 
+#include "btree_types.h" +#include "buckets_types.h" ++#include "buckets_waiting_for_journal_types.h" +#include "clock_types.h" +#include "ec_types.h" +#include "journal_types.h" @@ -3911,6 +4624,10 @@ index 000000000000..fdf3a777ae16 + GC_PHASE_BTREE_reflink, + GC_PHASE_BTREE_subvolumes, + GC_PHASE_BTREE_snapshots, ++ GC_PHASE_BTREE_lru, ++ GC_PHASE_BTREE_freespace, ++ GC_PHASE_BTREE_need_discard, ++ GC_PHASE_BTREE_backpointers, + + GC_PHASE_PENDING_DELETE, +}; @@ -3954,6 +4671,7 @@ index 000000000000..fdf3a777ae16 + struct bch_sb_handle disk_sb; + struct bch_sb *sb_read_scratch; + int sb_write_error; ++ dev_t dev; + + struct bch_devs_mask self; + @@ -3966,7 +4684,9 @@ index 000000000000..fdf3a777ae16 + * gc_lock, for device resize - holding any is sufficient for access: + * Or rcu_read_lock(), but only for ptr_stale(): + */ -+ struct bucket_array __rcu *buckets[2]; ++ struct bucket_array __rcu *buckets_gc; ++ struct bucket_gens __rcu *bucket_gens; ++ u8 *oldest_gen; + unsigned long *buckets_nouse; + struct rw_semaphore bucket_lock; + @@ -3975,32 +4695,17 @@ index 000000000000..fdf3a777ae16 + struct bch_dev_usage __percpu *usage_gc; + + /* Allocator: */ -+ struct task_struct __rcu *alloc_thread; ++ u64 new_fs_bucket_idx; + -+ /* -+ * free: Buckets that are ready to be used -+ * -+ * free_inc: Incoming buckets - these are buckets that currently have -+ * cached data in them, and we can't reuse them until after we write -+ * their new gen to disk. 
After prio_write() finishes writing the new -+ * gens/prios, they'll be moved to the free list (and possibly discarded -+ * in the process) -+ */ -+ alloc_fifo free[RESERVE_NR]; -+ alloc_fifo free_inc; + unsigned nr_open_buckets; ++ unsigned nr_btree_reserve; + + open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; + open_bucket_idx_t open_buckets_partial_nr; + -+ size_t fifo_last_bucket; -+ + size_t inc_gen_needs_gc; + size_t inc_gen_really_needs_gc; -+ -+ enum allocator_states allocator_state; -+ -+ alloc_heap alloc_heap; ++ size_t buckets_waiting_on_journal; + + atomic64_t rebalance_work; + @@ -4022,17 +4727,13 @@ index 000000000000..fdf3a777ae16 + +enum { + /* startup: */ -+ BCH_FS_INITIALIZED, -+ BCH_FS_ALLOC_READ_DONE, + BCH_FS_ALLOC_CLEAN, -+ BCH_FS_ALLOCATOR_RUNNING, -+ BCH_FS_ALLOCATOR_STOPPING, + BCH_FS_INITIAL_GC_DONE, + BCH_FS_INITIAL_GC_UNFIXED, + BCH_FS_TOPOLOGY_REPAIR_DONE, -+ BCH_FS_BTREE_INTERIOR_REPLAY_DONE, + BCH_FS_FSCK_DONE, + BCH_FS_STARTED, ++ BCH_FS_MAY_GO_RW, + BCH_FS_RW, + BCH_FS_WAS_RW, + @@ -4050,16 +4751,11 @@ index 000000000000..fdf3a777ae16 + /* misc: */ + BCH_FS_NEED_ANOTHER_GC, + BCH_FS_DELETED_NODES, -+ BCH_FS_NEED_ALLOC_WRITE, + BCH_FS_REBUILD_REPLICAS, -+ BCH_FS_HOLD_BTREE_WRITES, +}; + +struct btree_debug { + unsigned id; -+ struct dentry *btree; -+ struct dentry *btree_format; -+ struct dentry *failed; +}; + +struct bch_fs_pcpu { @@ -4080,6 +4776,7 @@ index 000000000000..fdf3a777ae16 + enum btree_id btree_id:8; + unsigned level:8; + bool allocated; ++ bool overwritten; + struct bkey_i *k; + u32 journal_seq; + u32 journal_offset; @@ -4156,7 +4853,6 @@ index 000000000000..fdf3a777ae16 + + u16 version; + u16 version_min; -+ u16 encoded_extent_max; + + u8 nr_devices; + u8 clean; @@ -4187,7 +4883,7 @@ index 000000000000..fdf3a777ae16 + struct mutex snapshot_table_lock; + struct work_struct snapshot_delete_work; + struct work_struct snapshot_wait_for_pagecache_and_delete_work; -+ struct snapshot_id_list snapshots_unlinked; ++ 
snapshot_id_list snapshots_unlinked; + struct mutex snapshots_unlinked_lock; + + /* BTREE CACHE */ @@ -4227,8 +4923,10 @@ index 000000000000..fdf3a777ae16 + struct btree_path_buf __percpu *btree_paths_bufs; + + struct srcu_struct btree_trans_barrier; ++ bool btree_trans_barrier_initialized; + + struct btree_key_cache btree_key_cache; ++ unsigned btree_key_cache_btrees; + + struct workqueue_struct *btree_update_wq; + struct workqueue_struct *btree_io_complete_wq; @@ -4277,10 +4975,12 @@ index 000000000000..fdf3a777ae16 + struct closure_waitlist freelist_wait; + u64 blocked_allocate; + u64 blocked_allocate_open_bucket; ++ + open_bucket_idx_t open_buckets_freelist; + open_bucket_idx_t open_buckets_nr_free; + struct closure_waitlist open_buckets_wait; + struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; ++ open_bucket_idx_t open_buckets_hash[OPEN_BUCKETS_COUNT]; + + struct write_point btree_write_point; + struct write_point rebalance_write_point; @@ -4290,6 +4990,10 @@ index 000000000000..fdf3a777ae16 + struct mutex write_points_hash_lock; + unsigned write_points_nr; + ++ struct buckets_waiting_for_journal buckets_waiting_for_journal; ++ struct work_struct discard_work; ++ struct work_struct invalidate_work; ++ + /* GARBAGE COLLECTION */ + struct task_struct *gc_thread; + atomic_t kick_gc; @@ -4315,6 +5019,7 @@ index 000000000000..fdf3a777ae16 + * it's not while a gc is in progress. 
+ */ + struct rw_semaphore gc_lock; ++ struct mutex gc_gens_lock; + + /* IO PATH */ + struct semaphore io_in_flight; @@ -4352,7 +5057,8 @@ index 000000000000..fdf3a777ae16 + struct mutex data_progress_lock; + + /* STRIPES: */ -+ GENRADIX(struct stripe) stripes[2]; ++ GENRADIX(struct stripe) stripes; ++ GENRADIX(struct gc_stripe) gc_stripes; + + ec_stripes_heap ec_stripes_heap; + spinlock_t ec_stripes_heap_lock; @@ -4376,7 +5082,6 @@ index 000000000000..fdf3a777ae16 + u64 reflink_hint; + reflink_gc_table reflink_gc_table; + size_t reflink_gc_nr; -+ size_t reflink_gc_idx; + + /* VFS IO PATH - fs-io.c */ + struct bio_set writepage_bioset; @@ -4397,7 +5102,8 @@ index 000000000000..fdf3a777ae16 + struct bch_memquota_type quotas[QTYP_NR]; + + /* DEBUG JUNK */ -+ struct dentry *debug; ++ struct dentry *fs_debug_dir; ++ struct dentry *btree_debug_dir; + struct btree_debug btree_debug[BTREE_ID_NR]; + struct btree *verify_data; + struct btree_node *verify_ondisk; @@ -4425,6 +5131,7 @@ index 000000000000..fdf3a777ae16 + atomic_long_t read_realloc_races; + atomic_long_t extent_migrate_done; + atomic_long_t extent_migrate_raced; ++ atomic_long_t bucket_alloc_fail; + + unsigned btree_gc_periodic:1; + unsigned copy_gc_enabled:1; @@ -4448,10 +5155,25 @@ index 000000000000..fdf3a777ae16 + +static inline unsigned block_bytes(const struct bch_fs *c) +{ -+ return c->opts.block_size << 9; ++ return c->opts.block_size; +} + -+static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, s64 time) ++static inline unsigned block_sectors(const struct bch_fs *c) ++{ ++ return c->opts.block_size >> 9; ++} ++ ++static inline size_t btree_sectors(const struct bch_fs *c) ++{ ++ return c->opts.btree_node_size >> 9; ++} ++ ++static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree) ++{ ++ return c->btree_key_cache_btrees & (1U << btree); ++} ++ ++static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time) +{ + struct timespec64 t; + s32 
rem; @@ -4463,13 +5185,13 @@ index 000000000000..fdf3a777ae16 + return t; +} + -+static inline s64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts) ++static inline s64 timespec_to_bch2_time(const struct bch_fs *c, struct timespec64 ts) +{ + return (ts.tv_sec * c->sb.time_units_per_sec + + (int) ts.tv_nsec / c->sb.nsec_per_time_unit) - c->sb.time_base_lo; +} + -+static inline s64 bch2_current_time(struct bch_fs *c) ++static inline s64 bch2_current_time(const struct bch_fs *c) +{ + struct timespec64 now; + @@ -4485,10 +5207,10 @@ index 000000000000..fdf3a777ae16 +#endif /* _BCACHEFS_H */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h new file mode 100644 -index 000000000000..b115bd1fa5a3 +index 000000000000..8312018e1ed5 --- /dev/null +++ b/fs/bcachefs/bcachefs_format.h -@@ -0,0 +1,1893 @@ +@@ -0,0 +1,1986 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FORMAT_H +#define _BCACHEFS_FORMAT_H @@ -4567,6 +5289,22 @@ index 000000000000..b115bd1fa5a3 +#include +#include +#include ++#include "vstructs.h" ++ ++#define BITMASK(name, type, field, offset, end) \ ++static const unsigned name##_OFFSET = offset; \ ++static const unsigned name##_BITS = (end - offset); \ ++ \ ++static inline __u64 name(const type *k) \ ++{ \ ++ return (k->field >> offset) & ~(~0ULL << (end - offset)); \ ++} \ ++ \ ++static inline void SET_##name(type *k, __u64 v) \ ++{ \ ++ k->field &= ~(~(~0ULL << (end - offset)) << offset); \ ++ k->field |= (v & ~(~0ULL << (end - offset))) << offset; \ ++} + +#define LE_BITMASK(_bits, name, type, field, offset, end) \ +static const unsigned name##_OFFSET = offset; \ @@ -4837,7 +5575,10 @@ index 000000000000..b115bd1fa5a3 + x(subvolume, 21) \ + x(snapshot, 22) \ + x(inode_v2, 23) \ -+ x(alloc_v3, 24) ++ x(alloc_v3, 24) \ ++ x(set, 25) \ ++ x(lru, 26) \ ++ x(alloc_v4, 27) + +enum bch_bkey_type { +#define x(name, nr) KEY_TYPE_##name = nr, @@ -4867,6 +5608,10 @@ index 000000000000..b115bd1fa5a3 + struct bch_val v; 
+}; + ++struct bch_set { ++ struct bch_val v; ++}; ++ +/* Extents */ + +/* @@ -5367,8 +6112,8 @@ index 000000000000..b115bd1fa5a3 +#define BCH_ALLOC_FIELDS_V2() \ + x(read_time, 64) \ + x(write_time, 64) \ -+ x(dirty_sectors, 16) \ -+ x(cached_sectors, 16) \ ++ x(dirty_sectors, 32) \ ++ x(cached_sectors, 32) \ + x(stripe, 32) \ + x(stripe_redundancy, 8) + @@ -5383,11 +6128,34 @@ index 000000000000..b115bd1fa5a3 + __u8 data[]; +} __attribute__((packed, aligned(8))); + ++struct bch_alloc_v4 { ++ struct bch_val v; ++ __u64 journal_seq; ++ __u32 flags; ++ __u8 gen; ++ __u8 oldest_gen; ++ __u8 data_type; ++ __u8 stripe_redundancy; ++ __u32 dirty_sectors; ++ __u32 cached_sectors; ++ __u64 io_time[2]; ++ __u32 stripe; ++ __u32 nr_external_backpointers; ++ struct bpos backpointers[0]; ++} __attribute__((packed, aligned(8))); ++ ++LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) ++LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) ++ ++BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) ++BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2) ++BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8) ++BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14) ++ +enum { +#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, + BCH_ALLOC_FIELDS_V1() +#undef x -+ BCH_ALLOC_FIELD_NR +}; + +/* Quotas: */ @@ -5505,6 +6273,15 @@ index 000000000000..b115bd1fa5a3 +/* True if a subvolume points to this snapshot node: */ +LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) + ++/* LRU btree: */ ++ ++struct bch_lru { ++ struct bch_val v; ++ __le64 idx; ++} __attribute__((packed, aligned(8))); ++ ++#define LRU_ID_STRIPES (1U << 16) ++ +/* Optional/variable size superblock sections: */ + +struct bch_sb_field { @@ -5513,16 +6290,17 @@ index 000000000000..b115bd1fa5a3 + __le32 type; +}; + -+#define BCH_SB_FIELDS() \ -+ x(journal, 0) \ -+ x(members, 1) \ -+ x(crypt, 2) \ 
-+ x(replicas_v0, 3) \ -+ x(quota, 4) \ -+ x(disk_groups, 5) \ -+ x(clean, 6) \ -+ x(replicas, 7) \ -+ x(journal_seq_blacklist, 8) ++#define BCH_SB_FIELDS() \ ++ x(journal, 0) \ ++ x(members, 1) \ ++ x(crypt, 2) \ ++ x(replicas_v0, 3) \ ++ x(quota, 4) \ ++ x(disk_groups, 5) \ ++ x(clean, 6) \ ++ x(replicas, 7) \ ++ x(journal_seq_blacklist, 8) \ ++ x(journal_v2, 9) + +enum bch_sb_field_type { +#define x(f, nr) BCH_SB_FIELD_##f = nr, @@ -5531,6 +6309,14 @@ index 000000000000..b115bd1fa5a3 + BCH_SB_FIELD_NR +}; + ++/* ++ * Most superblock fields are replicated in all device's superblocks - a few are ++ * not: ++ */ ++#define BCH_SINGLE_DEVICE_SB_FIELDS \ ++ ((1U << BCH_SB_FIELD_journal)| \ ++ (1U << BCH_SB_FIELD_journal_v2)) ++ +/* BCH_SB_FIELD_journal: */ + +struct bch_sb_field_journal { @@ -5538,6 +6324,15 @@ index 000000000000..b115bd1fa5a3 + __le64 buckets[0]; +}; + ++struct bch_sb_field_journal_v2 { ++ struct bch_sb_field field; ++ ++ struct bch_sb_field_journal_v2_entry { ++ __le64 start; ++ __le64 nr; ++ } d[0]; ++}; ++ +/* BCH_SB_FIELD_members: */ + +#define BCH_MIN_NR_NBUCKETS (1 << 6) @@ -5554,12 +6349,13 @@ index 000000000000..b115bd1fa5a3 +}; + +LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4) -+/* 4-10 unused, was TIER, HAS_(META)DATA */ -+LE64_BITMASK(BCH_MEMBER_REPLACEMENT, struct bch_member, flags[0], 10, 14) ++/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */ +LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15) +LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20) +LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28) +LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30) ++LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, ++ struct bch_member, flags[0], 30, 31) + +#if 0 +LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); @@ -5579,18 +6375,6 @@ index 000000000000..b115bd1fa5a3 + BCH_MEMBER_STATE_NR +}; + -+#define 
BCH_CACHE_REPLACEMENT_POLICIES() \ -+ x(lru, 0) \ -+ x(fifo, 1) \ -+ x(random, 2) -+ -+enum bch_cache_replacement_policies { -+#define x(t, n) BCH_CACHE_REPLACEMENT_##t = n, -+ BCH_CACHE_REPLACEMENT_POLICIES() -+#undef x -+ BCH_CACHE_REPLACEMENT_NR -+}; -+ +struct bch_sb_field_members { + struct bch_sb_field field; + struct bch_member members[0]; @@ -5778,19 +6562,25 @@ index 000000000000..b115bd1fa5a3 +#define BCH_JSET_VERSION_OLD 2 +#define BCH_BSET_VERSION_OLD 3 + ++#define BCH_METADATA_VERSIONS() \ ++ x(bkey_renumber, 10) \ ++ x(inode_btree_change, 11) \ ++ x(snapshot, 12) \ ++ x(inode_backpointers, 13) \ ++ x(btree_ptr_sectors_written, 14) \ ++ x(snapshot_2, 15) \ ++ x(reflink_p_fix, 16) \ ++ x(subvol_dirent, 17) \ ++ x(inode_v2, 18) \ ++ x(freespace, 19) \ ++ x(alloc_v4, 20) ++ +enum bcachefs_metadata_version { -+ bcachefs_metadata_version_min = 9, -+ bcachefs_metadata_version_new_versioning = 10, -+ bcachefs_metadata_version_bkey_renumber = 10, -+ bcachefs_metadata_version_inode_btree_change = 11, -+ bcachefs_metadata_version_snapshot = 12, -+ bcachefs_metadata_version_inode_backpointers = 13, -+ bcachefs_metadata_version_btree_ptr_sectors_written = 14, -+ bcachefs_metadata_version_snapshot_2 = 15, -+ bcachefs_metadata_version_reflink_p_fix = 16, -+ bcachefs_metadata_version_subvol_dirent = 17, -+ bcachefs_metadata_version_inode_v2 = 18, -+ bcachefs_metadata_version_max = 19, ++ bcachefs_metadata_version_min = 9, ++#define x(t, n) bcachefs_metadata_version_##t = n, ++ BCH_METADATA_VERSIONS() ++#undef x ++ bcachefs_metadata_version_max +}; + +#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) @@ -5927,6 +6717,10 @@ index 000000000000..b115bd1fa5a3 +LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); +LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29); +LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); ++LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 
62); ++LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); ++LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); ++LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); + +/* + * Features: @@ -6161,7 +6955,8 @@ index 000000000000..b115bd1fa5a3 + x(usage, 5) \ + x(data_usage, 6) \ + x(clock, 7) \ -+ x(dev_usage, 8) ++ x(dev_usage, 8) \ ++ x(log, 9) + +enum { +#define x(f, nr) BCH_JSET_ENTRY_##f = nr, @@ -6191,11 +6986,16 @@ index 000000000000..b115bd1fa5a3 + __le64 end; +}; + ++#define BCH_FS_USAGE_TYPES() \ ++ x(reserved, 0) \ ++ x(inodes, 1) \ ++ x(key_version, 2) ++ +enum { -+ FS_USAGE_RESERVED = 0, -+ FS_USAGE_INODES = 1, -+ FS_USAGE_KEY_VERSION = 2, -+ FS_USAGE_NR = 3 ++#define x(f, nr) BCH_FS_USAGE_##f = nr, ++ BCH_FS_USAGE_TYPES() ++#undef x ++ BCH_FS_USAGE_NR +}; + +struct jset_entry_usage { @@ -6233,6 +7033,17 @@ index 000000000000..b115bd1fa5a3 + struct jset_entry_dev_usage_type d[]; +} __attribute__((packed)); + ++static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u) ++{ ++ return (vstruct_bytes(&u->entry) - sizeof(struct jset_entry_dev_usage)) / ++ sizeof(struct jset_entry_dev_usage_type); ++} ++ ++struct jset_entry_log { ++ struct jset_entry entry; ++ u8 d[]; ++} __attribute__((packed)); ++ +/* + * On disk format for a journal entry: + * seq is monotonically increasing; every journal entry has its own unique @@ -6286,7 +7097,11 @@ index 000000000000..b115bd1fa5a3 + x(stripes, 6) \ + x(reflink, 7) \ + x(subvolumes, 8) \ -+ x(snapshots, 9) ++ x(snapshots, 9) \ ++ x(lru, 10) \ ++ x(freespace, 11) \ ++ x(need_discard, 12) \ ++ x(backpointers, 13) + +enum btree_id { +#define x(kwd, val) BTREE_ID_##kwd = val, @@ -6757,10 +7572,10 @@ index 000000000000..930981ad5535 +#endif /* _BCACHEFS_IOCTL_H */ diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c new file mode 100644 -index 000000000000..946dd27f09fc +index 000000000000..4b01ab3029a2 --- /dev/null +++ 
b/fs/bcachefs/bkey.c -@@ -0,0 +1,1171 @@ +@@ -0,0 +1,1172 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -6820,11 +7635,12 @@ index 000000000000..946dd27f09fc + tmp = __bch2_bkey_unpack_key(format, packed); + + if (memcmp(&tmp, unpacked, sizeof(struct bkey))) { -+ char buf1[160], buf2[160]; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; + char buf3[160], buf4[160]; + -+ bch2_bkey_to_text(&PBUF(buf1), unpacked); -+ bch2_bkey_to_text(&PBUF(buf2), &tmp); ++ bch2_bkey_to_text(&buf1, unpacked); ++ bch2_bkey_to_text(&buf2, &tmp); + bch2_to_binary(buf3, (void *) unpacked, 80); + bch2_to_binary(buf4, high_word(format, packed), 80); + @@ -6835,7 +7651,7 @@ index 000000000000..946dd27f09fc + format->bits_per_field[2], + format->bits_per_field[3], + format->bits_per_field[4], -+ buf1, buf2, buf3, buf4); ++ buf1.buf, buf2.buf, buf3, buf4); + } +} + @@ -8572,10 +9388,10 @@ index 000000000000..0d7c67a959af +#endif /* _BCACHEFS_BKEY_BUF_H */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c new file mode 100644 -index 000000000000..5c900cf8a8a2 +index 000000000000..0eac86e5e776 --- /dev/null +++ b/fs/bcachefs/bkey_methods.c -@@ -0,0 +1,450 @@ +@@ -0,0 +1,463 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -8587,6 +9403,7 @@ index 000000000000..5c900cf8a8a2 +#include "error.h" +#include "extents.h" +#include "inode.h" ++#include "lru.h" +#include "quota.h" +#include "reflink.h" +#include "subvolume.h" @@ -8663,6 +9480,24 @@ index 000000000000..5c900cf8a8a2 + .val_to_text = key_type_inline_data_to_text, \ +} + ++static const char *key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ if (bkey_val_bytes(k.k)) ++ return "nonempty value"; ++ return NULL; ++} ++ ++static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) ++{ ++ bch2_key_resize(l.k, l.k->size + r.k->size); ++ return true; ++} ++ ++#define bch2_bkey_ops_set (struct bkey_ops) { \ ++ .key_invalid 
= key_type_set_invalid, \ ++ .key_merge = key_type_set_merge, \ ++} ++ +const struct bkey_ops bch2_bkey_ops[] = { +#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, + BCH_BKEY_TYPES() @@ -8708,7 +9543,8 @@ index 000000000000..5c900cf8a8a2 + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_alloc)| + (1U << KEY_TYPE_alloc_v2)| -+ (1U << KEY_TYPE_alloc_v3), ++ (1U << KEY_TYPE_alloc_v3)| ++ (1U << KEY_TYPE_alloc_v4), + [BKEY_TYPE_quotas] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_quota), @@ -8725,6 +9561,15 @@ index 000000000000..5c900cf8a8a2 + [BKEY_TYPE_snapshots] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_snapshot), ++ [BKEY_TYPE_lru] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_lru), ++ [BKEY_TYPE_freespace] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_set), ++ [BKEY_TYPE_need_discard] = ++ (1U << KEY_TYPE_deleted)| ++ (1U << KEY_TYPE_set), + [BKEY_TYPE_btree] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_btree_ptr)| @@ -8790,22 +9635,6 @@ index 000000000000..5c900cf8a8a2 + return NULL; +} + -+void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) -+{ -+ const char *invalid; -+ -+ BUG_ON(!k.k->u64s); -+ -+ invalid = bch2_bkey_invalid(c, k, btree_node_type(b)) ?: -+ bch2_bkey_in_btree_node(b, k); -+ if (invalid) { -+ char buf[160]; -+ -+ bch2_bkey_val_to_text(&PBUF(buf), c, k); -+ bch2_fs_inconsistent(c, "invalid bkey %s: %s", buf, invalid); -+ } -+} -+ +void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) +{ + if (!bpos_cmp(pos, POS_MIN)) @@ -9028,10 +9857,10 @@ index 000000000000..5c900cf8a8a2 +} diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h new file mode 100644 -index 000000000000..3012035db1a3 +index 000000000000..2289a09d98fc --- /dev/null +++ b/fs/bcachefs/bkey_methods.h -@@ -0,0 +1,80 @@ +@@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_METHODS_H +#define _BCACHEFS_BKEY_METHODS_H @@ -9040,6 +9869,7 @@ index 000000000000..3012035db1a3 + 
+struct bch_fs; +struct btree; ++struct btree_trans; +struct bkey; +enum btree_node_type; + @@ -9054,6 +9884,10 @@ index 000000000000..3012035db1a3 + void (*swab)(struct bkey_s); + bool (*key_normalize)(struct bch_fs *, struct bkey_s); + bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c); ++ int (*trans_trigger)(struct btree_trans *, struct bkey_s_c, ++ struct bkey_i *, unsigned); ++ int (*atomic_trigger)(struct btree_trans *, struct bkey_s_c, ++ struct bkey_s_c, unsigned); + void (*compat)(enum btree_id id, unsigned version, + unsigned big_endian, int write, + struct bkey_s); @@ -9068,8 +9902,6 @@ index 000000000000..3012035db1a3 + enum btree_node_type); +const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c); + -+void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); -+ +void bch2_bpos_to_text(struct printbuf *, struct bpos); +void bch2_bkey_to_text(struct printbuf *, const struct bkey *); +void bch2_val_to_text(struct printbuf *, struct bch_fs *, @@ -9093,6 +9925,28 @@ index 000000000000..3012035db1a3 + +bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + ++static inline int bch2_mark_key(struct btree_trans *trans, ++ struct bkey_s_c old, ++ struct bkey_s_c new, ++ unsigned flags) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new.k->type]; ++ ++ return ops->atomic_trigger ++ ? ops->atomic_trigger(trans, old, new, flags) ++ : 0; ++} ++ ++static inline int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, ++ struct bkey_i *new, unsigned flags) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new->k.type]; ++ ++ return ops->trans_trigger ++ ? 
ops->trans_trigger(trans, old, new, flags) ++ : 0; ++} ++ +void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); + +void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned, @@ -9114,10 +9968,10 @@ index 000000000000..3012035db1a3 +#endif /* _BCACHEFS_BKEY_METHODS_H */ diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c new file mode 100644 -index 000000000000..537ab7919e88 +index 000000000000..b1385a77da11 --- /dev/null +++ b/fs/bcachefs/bkey_sort.c -@@ -0,0 +1,253 @@ +@@ -0,0 +1,198 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "bkey_buf.h" @@ -9237,23 +10091,6 @@ index 000000000000..537ab7919e88 + return nr; +} + -+static void extent_sort_append(struct bch_fs *c, -+ struct bkey_format *f, -+ struct btree_nr_keys *nr, -+ struct bkey_packed **out, -+ struct bkey_s k) -+{ -+ if (!bkey_deleted(k.k)) { -+ if (!bch2_bkey_pack_key(*out, k.k, f)) -+ memcpy_u64s_small(*out, k.k, BKEY_U64s); -+ -+ memcpy_u64s_small(bkeyp_val(f, *out), k.v, bkey_val_u64s(k.k)); -+ -+ btree_keys_account_key_add(nr, 0, *out); -+ *out = bkey_next(*out); -+ } -+} -+ +/* Sort + repack in a new format: */ +struct btree_nr_keys +bch2_sort_repack(struct bset *dst, struct btree *src, @@ -9264,6 +10101,7 @@ index 000000000000..537ab7919e88 + struct bkey_format *in_f = &src->format; + struct bkey_packed *in, *out = vstruct_last(dst); + struct btree_nr_keys nr; ++ bool transform = memcmp(out_f, &src->format, sizeof(*out_f)); + + memset(&nr, 0, sizeof(nr)); + @@ -9271,8 +10109,10 @@ index 000000000000..537ab7919e88 + if (filter_whiteouts && bkey_deleted(in)) + continue; + -+ if (bch2_bkey_transform(out_f, out, bkey_packed(in) -+ ? in_f : &bch2_bkey_format_current, in)) ++ if (!transform) ++ bkey_copy(out, in); ++ else if (bch2_bkey_transform(out_f, out, bkey_packed(in) ++ ? 
in_f : &bch2_bkey_format_current, in)) + out->format = KEY_FORMAT_LOCAL_BTREE; + else + bch2_bkey_unpack(src, (void *) out, in); @@ -9285,47 +10125,6 @@ index 000000000000..537ab7919e88 + return nr; +} + -+/* Sort, repack, and call bch2_bkey_normalize() to drop stale pointers: */ -+struct btree_nr_keys -+bch2_sort_repack_merge(struct bch_fs *c, -+ struct bset *dst, struct btree *src, -+ struct btree_node_iter *iter, -+ struct bkey_format *out_f, -+ bool filter_whiteouts) -+{ -+ struct bkey_packed *out = vstruct_last(dst), *k_packed; -+ struct bkey_buf k; -+ struct btree_nr_keys nr; -+ -+ memset(&nr, 0, sizeof(nr)); -+ bch2_bkey_buf_init(&k); -+ -+ while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) { -+ if (filter_whiteouts && bkey_deleted(k_packed)) -+ continue; -+ -+ /* -+ * NOTE: -+ * bch2_bkey_normalize may modify the key we pass it (dropping -+ * stale pointers) and we don't have a write lock on the src -+ * node; we have to make a copy of the entire key before calling -+ * normalize -+ */ -+ bch2_bkey_buf_realloc(&k, c, k_packed->u64s + BKEY_U64s); -+ bch2_bkey_unpack(src, k.k, k_packed); -+ -+ if (filter_whiteouts && -+ bch2_bkey_normalize(c, bkey_i_to_s(k.k))) -+ continue; -+ -+ extent_sort_append(c, out_f, &nr, &out, bkey_i_to_s(k.k)); -+ } -+ -+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); -+ bch2_bkey_buf_exit(&k, c); -+ return nr; -+} -+ +static inline int sort_keys_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) @@ -9373,10 +10172,10 @@ index 000000000000..537ab7919e88 +} diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h new file mode 100644 -index 000000000000..1059996dac78 +index 000000000000..79cf11d1b4e7 --- /dev/null +++ b/fs/bcachefs/bkey_sort.h -@@ -0,0 +1,49 @@ +@@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_SORT_H +#define _BCACHEFS_BKEY_SORT_H @@ -9416,11 +10215,6 @@ index 000000000000..1059996dac78 +bch2_sort_repack(struct bset *, struct btree *, + struct 
btree_node_iter *, + struct bkey_format *, bool); -+struct btree_nr_keys -+bch2_sort_repack_merge(struct bch_fs *, -+ struct bset *, struct btree *, -+ struct btree_node_iter *, -+ struct bkey_format *, bool); + +unsigned bch2_sort_keys(struct bkey_packed *, + struct sort_iter *, bool); @@ -9428,10 +10222,10 @@ index 000000000000..1059996dac78 +#endif /* _BCACHEFS_BKEY_SORT_H */ diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c new file mode 100644 -index 000000000000..59e4c1d1a2a5 +index 000000000000..c7a41d0dc781 --- /dev/null +++ b/fs/bcachefs/bset.c -@@ -0,0 +1,1712 @@ +@@ -0,0 +1,1598 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Code for working with individual keys, and sorted sets of keys with in a @@ -9504,7 +10298,7 @@ index 000000000000..59e4c1d1a2a5 + struct bkey_packed *_k, *_n; + struct bkey uk, n; + struct bkey_s_c k; -+ char buf[200]; ++ struct printbuf buf = PRINTBUF; + + if (!i->u64s) + return; @@ -9515,12 +10309,14 @@ index 000000000000..59e4c1d1a2a5 + _n = bkey_next(_k); + + k = bkey_disassemble(b, _k, &uk); ++ ++ printbuf_reset(&buf); + if (c) -+ bch2_bkey_val_to_text(&PBUF(buf), c, k); ++ bch2_bkey_val_to_text(&buf, c, k); + else -+ bch2_bkey_to_text(&PBUF(buf), k.k); ++ bch2_bkey_to_text(&buf, k.k); + printk(KERN_ERR "block %u key %5zu: %s\n", set, -+ _k->_data - i->_data, buf); ++ _k->_data - i->_data, buf.buf); + + if (_n == vstruct_last(i)) + continue; @@ -9536,6 +10332,8 @@ index 000000000000..59e4c1d1a2a5 + !bpos_cmp(n.p, k.k->p)) + printk(KERN_ERR "Duplicate keys\n"); + } ++ ++ printbuf_exit(&buf); +} + +void bch2_dump_btree_node(struct bch_fs *c, struct btree *b) @@ -9552,6 +10350,7 @@ index 000000000000..59e4c1d1a2a5 + struct btree_node_iter *iter) +{ + struct btree_node_iter_set *set; ++ struct printbuf buf = PRINTBUF; + + printk(KERN_ERR "btree node iter with %u/%u sets:\n", + __btree_node_iter_used(iter), b->nsets); @@ -9560,12 +10359,14 @@ index 000000000000..59e4c1d1a2a5 + struct bkey_packed *k = 
__btree_node_offset_to_key(b, set->k); + struct bset_tree *t = bch2_bkey_to_bset(b, k); + struct bkey uk = bkey_unpack_key(b, k); -+ char buf[100]; + -+ bch2_bkey_to_text(&PBUF(buf), &uk); ++ printbuf_reset(&buf); ++ bch2_bkey_to_text(&buf, &uk); + printk(KERN_ERR "set %zu key %u: %s\n", -+ t - b->set, set->k, buf); ++ t - b->set, set->k, buf.buf); + } ++ ++ printbuf_exit(&buf); +} + +#ifdef CONFIG_BCACHEFS_DEBUG @@ -9601,13 +10402,14 @@ index 000000000000..59e4c1d1a2a5 + struct btree_node_iter_set *set; + struct bkey ku = bkey_unpack_key(b, k); + struct bkey nu = bkey_unpack_key(b, n); -+ char buf1[80], buf2[80]; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; + + bch2_dump_btree_node(NULL, b); -+ bch2_bkey_to_text(&PBUF(buf1), &ku); -+ bch2_bkey_to_text(&PBUF(buf2), &nu); ++ bch2_bkey_to_text(&buf1, &ku); ++ bch2_bkey_to_text(&buf2, &nu); + printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n", -+ buf1, buf2); ++ buf1.buf, buf2.buf); + printk(KERN_ERR "iter was:"); + + btree_node_iter_for_each(_iter, set) { @@ -9672,6 +10474,8 @@ index 000000000000..59e4c1d1a2a5 + struct bset_tree *t = bch2_bkey_to_bset(b, where); + struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); + struct bkey_packed *next = (void *) (where->_data + clobber_u64s); ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; +#if 0 + BUG_ON(prev && + bkey_iter_cmp(b, prev, insert) > 0); @@ -9680,17 +10484,15 @@ index 000000000000..59e4c1d1a2a5 + bkey_iter_cmp(b, prev, insert) > 0) { + struct bkey k1 = bkey_unpack_key(b, prev); + struct bkey k2 = bkey_unpack_key(b, insert); -+ char buf1[100]; -+ char buf2[100]; + + bch2_dump_btree_node(NULL, b); -+ bch2_bkey_to_text(&PBUF(buf1), &k1); -+ bch2_bkey_to_text(&PBUF(buf2), &k2); ++ bch2_bkey_to_text(&buf1, &k1); ++ bch2_bkey_to_text(&buf2, &k2); + + panic("prev > insert:\n" + "prev key %s\n" + "insert key %s\n", -+ buf1, buf2); ++ buf1.buf, buf2.buf); + } +#endif +#if 0 @@ -9701,17 +10503,15 @@ index 
000000000000..59e4c1d1a2a5 + bkey_iter_cmp(b, insert, next) > 0) { + struct bkey k1 = bkey_unpack_key(b, insert); + struct bkey k2 = bkey_unpack_key(b, next); -+ char buf1[100]; -+ char buf2[100]; + + bch2_dump_btree_node(NULL, b); -+ bch2_bkey_to_text(&PBUF(buf1), &k1); -+ bch2_bkey_to_text(&PBUF(buf2), &k2); ++ bch2_bkey_to_text(&buf1, &k1); ++ bch2_bkey_to_text(&buf2, &k2); + + panic("insert > next:\n" + "insert key %s\n" + "next key %s\n", -+ buf1, buf2); ++ buf1.buf, buf2.buf); + } +#endif +} @@ -9907,7 +10707,7 @@ index 000000000000..59e4c1d1a2a5 + unsigned j) +{ + return cacheline_to_bkey(b, t, -+ __eytzinger1_to_inorder(j, t->size, t->extra), ++ __eytzinger1_to_inorder(j, t->size - 1, t->extra), + bkey_float(b, t, j)->key_offset); +} + @@ -10041,10 +10841,10 @@ index 000000000000..59e4c1d1a2a5 +} + +__always_inline -+static inline void __make_bfloat(struct btree *b, struct bset_tree *t, -+ unsigned j, -+ struct bkey_packed *min_key, -+ struct bkey_packed *max_key) ++static inline void make_bfloat(struct btree *b, struct bset_tree *t, ++ unsigned j, ++ struct bkey_packed *min_key, ++ struct bkey_packed *max_key) +{ + struct bkey_float *f = bkey_float(b, t, j); + struct bkey_packed *m = tree_to_bkey(b, t, j); @@ -10113,34 +10913,6 @@ index 000000000000..59e4c1d1a2a5 + f->mantissa = mantissa; +} + -+static void make_bfloat(struct btree *b, struct bset_tree *t, -+ unsigned j, -+ struct bkey_packed *min_key, -+ struct bkey_packed *max_key) -+{ -+ struct bkey_i *k; -+ -+ if (is_power_of_2(j) && -+ !min_key->u64s) { -+ if (!bkey_pack_pos(min_key, b->data->min_key, b)) { -+ k = (void *) min_key; -+ bkey_init(&k->k); -+ k->k.p = b->data->min_key; -+ } -+ } -+ -+ if (is_power_of_2(j + 1) && -+ !max_key->u64s) { -+ if (!bkey_pack_pos(max_key, b->data->max_key, b)) { -+ k = (void *) max_key; -+ bkey_init(&k->k); -+ k->k.p = b->data->max_key; -+ } -+ } -+ -+ __make_bfloat(b, t, j, min_key, max_key); -+} -+ +/* bytes remaining - only valid for last bset: */ +static 
unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t) +{ @@ -10197,7 +10969,7 @@ index 000000000000..59e4c1d1a2a5 + t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1; + + /* First we figure out where the first key in each cacheline is */ -+ eytzinger1_for_each(j, t->size) { ++ eytzinger1_for_each(j, t->size - 1) { + while (bkey_to_cacheline(b, t, k) < cacheline) + prev = k, k = bkey_next(k); + @@ -10229,10 +11001,10 @@ index 000000000000..59e4c1d1a2a5 + } + + /* Then we build the tree */ -+ eytzinger1_for_each(j, t->size) -+ __make_bfloat(b, t, j, -+ bkey_to_packed(&min_key), -+ bkey_to_packed(&max_key)); ++ eytzinger1_for_each(j, t->size - 1) ++ make_bfloat(b, t, j, ++ bkey_to_packed(&min_key), ++ bkey_to_packed(&max_key)); +} + +static void bset_alloc_tree(struct btree *b, struct bset_tree *t) @@ -10331,7 +11103,7 @@ index 000000000000..59e4c1d1a2a5 + do { + p = j ? tree_to_bkey(b, t, + __inorder_to_eytzinger1(j--, -+ t->size, t->extra)) ++ t->size - 1, t->extra)) + : btree_bkey_first(b, t); + } while (p >= k); + break; @@ -10377,91 +11149,6 @@ index 000000000000..59e4c1d1a2a5 + +/* Insert */ + -+static void rw_aux_tree_fix_invalidated_key(struct btree *b, -+ struct bset_tree *t, -+ struct bkey_packed *k) -+{ -+ unsigned offset = __btree_node_key_to_offset(b, k); -+ unsigned j = rw_aux_tree_bsearch(b, t, offset); -+ -+ if (j < t->size && -+ rw_aux_tree(b, t)[j].offset == offset) -+ rw_aux_tree_set(b, t, j, k); -+ -+ bch2_bset_verify_rw_aux_tree(b, t); -+} -+ -+static void ro_aux_tree_fix_invalidated_key(struct btree *b, -+ struct bset_tree *t, -+ struct bkey_packed *k) -+{ -+ struct bkey_packed min_key, max_key; -+ unsigned inorder, j; -+ -+ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); -+ -+ /* signal to make_bfloat() that they're uninitialized: */ -+ min_key.u64s = max_key.u64s = 0; -+ -+ if (bkey_next(k) == btree_bkey_last(b, t)) { -+ for (j = 1; j < t->size; j = j * 2 + 1) -+ make_bfloat(b, t, j, &min_key, 
&max_key); -+ } -+ -+ inorder = bkey_to_cacheline(b, t, k); -+ -+ if (inorder && -+ inorder < t->size) { -+ j = __inorder_to_eytzinger1(inorder, t->size, t->extra); -+ -+ if (k == tree_to_bkey(b, t, j)) { -+ /* Fix the node this key corresponds to */ -+ make_bfloat(b, t, j, &min_key, &max_key); -+ -+ /* Children for which this key is the right boundary */ -+ for (j = eytzinger1_left_child(j); -+ j < t->size; -+ j = eytzinger1_right_child(j)) -+ make_bfloat(b, t, j, &min_key, &max_key); -+ } -+ } -+ -+ if (inorder + 1 < t->size) { -+ j = __inorder_to_eytzinger1(inorder + 1, t->size, t->extra); -+ -+ if (k == tree_to_prev_bkey(b, t, j)) { -+ make_bfloat(b, t, j, &min_key, &max_key); -+ -+ /* Children for which this key is the left boundary */ -+ for (j = eytzinger1_right_child(j); -+ j < t->size; -+ j = eytzinger1_left_child(j)) -+ make_bfloat(b, t, j, &min_key, &max_key); -+ } -+ } -+} -+ -+/** -+ * bch2_bset_fix_invalidated_key() - given an existing key @k that has been -+ * modified, fix any auxiliary search tree by remaking all the nodes in the -+ * auxiliary search tree that @k corresponds to -+ */ -+void bch2_bset_fix_invalidated_key(struct btree *b, struct bkey_packed *k) -+{ -+ struct bset_tree *t = bch2_bkey_to_bset(b, k); -+ -+ switch (bset_aux_tree_type(t)) { -+ case BSET_NO_AUX_TREE: -+ break; -+ case BSET_RO_AUX_TREE: -+ ro_aux_tree_fix_invalidated_key(b, t, k); -+ break; -+ case BSET_RW_AUX_TREE: -+ rw_aux_tree_fix_invalidated_key(b, t, k); -+ break; -+ } -+} -+ +static void bch2_bset_fix_lookup_table(struct btree *b, + struct bset_tree *t, + struct bkey_packed *_where, @@ -10696,7 +11383,7 @@ index 000000000000..59e4c1d1a2a5 + n = n * 2 + (cmp < 0); + } while (n < t->size); + -+ inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra); ++ inorder = __eytzinger1_to_inorder(n >> 1, t->size - 1, t->extra); + + /* + * n would have been the node we recursed to - the low bit tells us if @@ -10707,7 +11394,7 @@ index 000000000000..59e4c1d1a2a5 + if 
(unlikely(!inorder)) + return btree_bkey_first(b, t); + -+ f = &base->f[eytzinger1_prev(n >> 1, t->size)]; ++ f = &base->f[eytzinger1_prev(n >> 1, t->size - 1)]; + } + + return cacheline_to_bkey(b, t, inorder, f->key_offset); @@ -10981,10 +11668,6 @@ index 000000000000..59e4c1d1a2a5 + + EBUG_ON(iter->data->k > iter->data->end); + -+ while (!__btree_node_iter_set_end(iter, 0) && -+ !__bch2_btree_node_iter_peek_all(iter, b)->u64s) -+ iter->data->k++; -+ + if (unlikely(__btree_node_iter_set_end(iter, 0))) { + bch2_btree_node_iter_set_drop(iter, iter->data); + return; @@ -11118,9 +11801,6 @@ index 000000000000..59e4c1d1a2a5 + struct bkey uk; + unsigned j, inorder; + -+ if (out->pos != out->end) -+ *out->pos = '\0'; -+ + if (!bset_has_ro_aux_tree(t)) + return; + @@ -11128,7 +11808,7 @@ index 000000000000..59e4c1d1a2a5 + if (!inorder || inorder >= t->size) + return; + -+ j = __inorder_to_eytzinger1(inorder, t->size, t->extra); ++ j = __inorder_to_eytzinger1(inorder, t->size - 1, t->extra); + if (k != tree_to_bkey(b, t, j)) + return; + @@ -11146,10 +11826,10 @@ index 000000000000..59e4c1d1a2a5 +} diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h new file mode 100644 -index 000000000000..e42f866cf2ec +index 000000000000..0d46534c3dcd --- /dev/null +++ b/fs/bcachefs/bset.h -@@ -0,0 +1,616 @@ +@@ -0,0 +1,615 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BSET_H +#define _BCACHEFS_BSET_H @@ -11513,7 +12193,6 @@ index 000000000000..e42f866cf2ec +void bch2_bset_init_next(struct bch_fs *, struct btree *, + struct btree_node_entry *); +void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); -+void bch2_bset_fix_invalidated_key(struct btree *, struct bkey_packed *); + +void bch2_bset_insert(struct btree *, struct btree_node_iter *, + struct bkey_packed *, struct bkey_i *, unsigned); @@ -11768,10 +12447,10 @@ index 000000000000..e42f866cf2ec +#endif /* _BCACHEFS_BSET_H */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c new file 
mode 100644 -index 000000000000..5ae61e5d3923 +index 000000000000..0dcdc30c6888 --- /dev/null +++ b/fs/bcachefs/btree_cache.c -@@ -0,0 +1,1095 @@ +@@ -0,0 +1,1160 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -11789,6 +12468,13 @@ index 000000000000..5ae61e5d3923 + +struct lock_class_key bch2_btree_node_lock_key; + ++const char * const bch2_btree_node_flags[] = { ++#define x(f) #f, ++ BTREE_FLAGS() ++#undef x ++ NULL ++}; ++ +void bch2_recalc_btree_reserve(struct bch_fs *c) +{ + unsigned i, reserve = 16; @@ -11809,6 +12495,14 @@ index 000000000000..5ae61e5d3923 + return max_t(int, 0, bc->used - bc->reserve); +} + ++static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b) ++{ ++ if (b->c.lock.readers) ++ list_move(&b->list, &bc->freed_pcpu); ++ else ++ list_move(&b->list, &bc->freed_nonpcpu); ++} ++ +static void btree_node_data_free(struct bch_fs *c, struct btree *b) +{ + struct btree_cache *bc = &c->btree_cache; @@ -11825,7 +12519,8 @@ index 000000000000..5ae61e5d3923 + b->aux_data = NULL; + + bc->used--; -+ list_move(&b->list, &bc->freed); ++ ++ btree_node_to_freedlist(bc, b); +} + +static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, @@ -11857,6 +12552,8 @@ index 000000000000..5ae61e5d3923 + b->aux_data = mmap(NULL, btree_aux_data_bytes(b), + PROT_READ|PROT_WRITE|PROT_EXEC, + MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); ++ if (b->aux_data == MAP_FAILED) ++ b->aux_data = NULL; +#endif + if (!b->aux_data) { + kvpfree(b->data, btree_bytes(c)); @@ -11928,11 +12625,6 @@ index 000000000000..5ae61e5d3923 + b->c.level = level; + b->c.btree_id = id; + -+ if (level) -+ six_lock_pcpu_alloc(&b->c.lock); -+ else -+ six_lock_pcpu_free_rcu(&b->c.lock); -+ + mutex_lock(&bc->lock); + ret = __bch2_btree_node_hash_insert(bc, b); + if (!ret) @@ -11989,15 +12681,13 @@ index 000000000000..5ae61e5d3923 + goto wait_on_io; + } + -+ if (btree_node_noevict(b)) -+ goto out_unlock; -+ -+ if (!btree_node_may_write(b)) ++ if 
(btree_node_noevict(b) || ++ btree_node_write_blocked(b) || ++ btree_node_will_make_reachable(b)) + goto out_unlock; + + if (btree_node_dirty(b)) { -+ if (!flush || -+ test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) ++ if (!flush) + goto out_unlock; + /* + * Using the underscore version because we don't want to compact @@ -12006,9 +12696,9 @@ index 000000000000..5ae61e5d3923 + * the post write cleanup: + */ + if (bch2_verify_btree_ondisk) -+ bch2_btree_node_write(c, b, SIX_LOCK_intent); ++ bch2_btree_node_write(c, b, SIX_LOCK_intent, 0); + else -+ __bch2_btree_node_write(c, b, false); ++ __bch2_btree_node_write(c, b, 0); + + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); @@ -12048,6 +12738,7 @@ index 000000000000..5ae61e5d3923 + unsigned long touched = 0; + unsigned long freed = 0; + unsigned i, flags; ++ unsigned long ret = SHRINK_STOP; + + if (bch2_btree_shrinker_disabled) + return SHRINK_STOP; @@ -12056,7 +12747,7 @@ index 000000000000..5ae61e5d3923 + if (sc->gfp_mask & __GFP_FS) + mutex_lock(&bc->lock); + else if (!mutex_trylock(&bc->lock)) -+ return -1; ++ goto out_norestore; + + flags = memalloc_nofs_save(); + @@ -12073,13 +12764,19 @@ index 000000000000..5ae61e5d3923 + + i = 0; + list_for_each_entry_safe(b, t, &bc->freeable, list) { ++ /* ++ * Leave a few nodes on the freeable list, so that a btree split ++ * won't have to hit the system allocator: ++ */ ++ if (++i <= 3) ++ continue; ++ + touched++; + + if (touched >= nr) + break; + -+ if (++i > 3 && -+ !btree_node_reclaim(c, b)) { ++ if (!btree_node_reclaim(c, b)) { + btree_node_data_free(c, b); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); @@ -12088,17 +12785,13 @@ index 000000000000..5ae61e5d3923 + } +restart: + list_for_each_entry_safe(b, t, &bc->live, list) { -+ touched++; -+ -+ if (touched >= nr) { -+ /* Save position */ -+ if (&t->list != &bc->live) -+ list_move_tail(&bc->live, &t->list); -+ break; ++ /* tweak this */ ++ if (btree_node_accessed(b)) { ++ 
clear_btree_node_accessed(b); ++ goto touched; + } + -+ if (!btree_node_accessed(b) && -+ !btree_node_reclaim(c, b)) { ++ if (!btree_node_reclaim(c, b)) { + /* can't call bch2_btree_node_hash_remove under lock */ + freed++; + if (&t->list != &bc->live) @@ -12119,14 +12812,30 @@ index 000000000000..5ae61e5d3923 + else if (!mutex_trylock(&bc->lock)) + goto out; + goto restart; -+ } else -+ clear_btree_node_accessed(b); ++ } else { ++ continue; ++ } ++touched: ++ touched++; ++ ++ if (touched >= nr) { ++ /* Save position */ ++ if (&t->list != &bc->live) ++ list_move_tail(&bc->live, &t->list); ++ break; ++ } + } + + mutex_unlock(&bc->lock); +out: ++ ret = (unsigned long) freed * btree_pages(c); + memalloc_nofs_restore(flags); -+ return (unsigned long) freed * btree_pages(c); ++out_norestore: ++ trace_btree_cache_scan(sc->nr_to_scan, ++ sc->nr_to_scan / btree_pages(c), ++ btree_cache_can_free(bc), ++ ret); ++ return ret; +} + +static unsigned long bch2_btree_cache_count(struct shrinker *shrink, @@ -12174,15 +12883,17 @@ index 000000000000..5ae61e5d3923 + + if (btree_node_dirty(b)) + bch2_btree_complete_write(c, b, btree_current_write(b)); -+ clear_btree_node_dirty(c, b); ++ clear_btree_node_dirty_acct(c, b); + + btree_node_data_free(c, b); + } + + BUG_ON(atomic_read(&c->btree_cache.dirty)); + -+ while (!list_empty(&bc->freed)) { -+ b = list_first_entry(&bc->freed, struct btree, list); ++ list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu); ++ ++ while (!list_empty(&bc->freed_nonpcpu)) { ++ b = list_first_entry(&bc->freed_nonpcpu, struct btree, list); + list_del(&b->list); + six_lock_pcpu_free(&b->c.lock); + kfree(b); @@ -12236,7 +12947,8 @@ index 000000000000..5ae61e5d3923 + mutex_init(&bc->lock); + INIT_LIST_HEAD(&bc->live); + INIT_LIST_HEAD(&bc->freeable); -+ INIT_LIST_HEAD(&bc->freed); ++ INIT_LIST_HEAD(&bc->freed_pcpu); ++ INIT_LIST_HEAD(&bc->freed_nonpcpu); +} + +/* @@ -12311,10 +13023,13 @@ index 000000000000..5ae61e5d3923 + } +} + -+struct btree 
*bch2_btree_node_mem_alloc(struct bch_fs *c) ++struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c, bool pcpu_read_locks) +{ + struct btree_cache *bc = &c->btree_cache; -+ struct btree *b; ++ struct list_head *freed = pcpu_read_locks ++ ? &bc->freed_pcpu ++ : &bc->freed_nonpcpu; ++ struct btree *b, *b2; + u64 start_time = local_clock(); + unsigned flags; + @@ -12322,44 +13037,49 @@ index 000000000000..5ae61e5d3923 + mutex_lock(&bc->lock); + + /* -+ * btree_free() doesn't free memory; it sticks the node on the end of -+ * the list. Check if there's any freed nodes there: -+ */ -+ list_for_each_entry(b, &bc->freeable, list) -+ if (!btree_node_reclaim(c, b)) -+ goto got_node; -+ -+ /* + * We never free struct btree itself, just the memory that holds the on + * disk node. Check the freed list before allocating a new one: + */ -+ list_for_each_entry(b, &bc->freed, list) -+ if (!btree_node_reclaim(c, b)) ++ list_for_each_entry(b, freed, list) ++ if (!btree_node_reclaim(c, b)) { ++ list_del_init(&b->list); + goto got_node; ++ } + -+ b = NULL; ++ b = __btree_node_mem_alloc(c); ++ if (!b) ++ goto err_locked; ++ ++ if (pcpu_read_locks) ++ six_lock_pcpu_alloc(&b->c.lock); ++ ++ BUG_ON(!six_trylock_intent(&b->c.lock)); ++ BUG_ON(!six_trylock_write(&b->c.lock)); +got_node: -+ if (b) -+ list_del_init(&b->list); ++ ++ /* ++ * btree_free() doesn't free memory; it sticks the node on the end of ++ * the list. 
Check if there's any freed nodes there: ++ */ ++ list_for_each_entry(b2, &bc->freeable, list) ++ if (!btree_node_reclaim(c, b2)) { ++ swap(b->data, b2->data); ++ swap(b->aux_data, b2->aux_data); ++ btree_node_to_freedlist(bc, b2); ++ six_unlock_write(&b2->c.lock); ++ six_unlock_intent(&b2->c.lock); ++ goto got_mem; ++ } ++ + mutex_unlock(&bc->lock); + -+ if (!b) { -+ b = __btree_node_mem_alloc(c); -+ if (!b) -+ goto err; ++ if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) ++ goto err; + -+ BUG_ON(!six_trylock_intent(&b->c.lock)); -+ BUG_ON(!six_trylock_write(&b->c.lock)); -+ } -+ -+ if (!b->data) { -+ if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) -+ goto err; -+ -+ mutex_lock(&bc->lock); -+ bc->used++; -+ mutex_unlock(&bc->lock); -+ } ++ mutex_lock(&bc->lock); ++ bc->used++; ++got_mem: ++ mutex_unlock(&bc->lock); + + BUG_ON(btree_node_hashed(b)); + BUG_ON(btree_node_dirty(b)); @@ -12381,20 +13101,24 @@ index 000000000000..5ae61e5d3923 + return b; +err: + mutex_lock(&bc->lock); -+ -+ if (b) { -+ list_add(&b->list, &bc->freed); -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+ } -+ ++err_locked: + /* Try to cannibalize another cached btree node: */ + if (bc->alloc_lock == current) { -+ b = btree_node_cannibalize(c); -+ list_del_init(&b->list); -+ mutex_unlock(&bc->lock); ++ b2 = btree_node_cannibalize(c); ++ bch2_btree_node_hash_remove(bc, b2); + -+ bch2_btree_node_hash_remove(bc, b); ++ if (b) { ++ swap(b->data, b2->data); ++ swap(b->aux_data, b2->aux_data); ++ btree_node_to_freedlist(bc, b2); ++ six_unlock_write(&b2->c.lock); ++ six_unlock_intent(&b2->c.lock); ++ } else { ++ b = b2; ++ list_del_init(&b->list); ++ } ++ ++ mutex_unlock(&bc->lock); + + trace_btree_node_cannibalize(c); + goto out; @@ -12425,11 +13149,22 @@ index 000000000000..5ae61e5d3923 + * been freed: + */ + if (trans && !bch2_btree_node_relock(trans, path, level + 1)) { ++ trace_trans_restart_relock_parent_for_fill(trans->fn, ++ _THIS_IP_, btree_id, 
&path->pos); ++ btree_trans_restart(trans); ++ return ERR_PTR(-EINTR); ++ } ++ ++ b = bch2_btree_node_mem_alloc(c, level != 0); ++ ++ if (trans && b == ERR_PTR(-ENOMEM)) { ++ trans->memory_allocation_failure = true; ++ trace_trans_restart_memory_allocation_failure(trans->fn, ++ _THIS_IP_, btree_id, &path->pos); + btree_trans_restart(trans); + return ERR_PTR(-EINTR); + } + -+ b = bch2_btree_node_mem_alloc(c); + if (IS_ERR(b)) + return b; + @@ -12472,6 +13207,8 @@ index 000000000000..5ae61e5d3923 + } + + if (!six_relock_type(&b->c.lock, lock_type, seq)) { ++ trace_trans_restart_relock_after_fill(trans->fn, _THIS_IP_, ++ btree_id, &path->pos); + btree_trans_restart(trans); + return ERR_PTR(-EINTR); + } @@ -12489,14 +13226,16 @@ index 000000000000..5ae61e5d3923 + +static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) +{ -+ char buf1[200], buf2[100], buf3[100]; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ struct printbuf buf3 = PRINTBUF; + + if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) + return; + -+ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&b->key)); -+ bch2_bpos_to_text(&PBUF(buf2), b->data->min_key); -+ bch2_bpos_to_text(&PBUF(buf3), b->data->max_key); ++ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&b->key)); ++ bch2_bpos_to_text(&buf2, b->data->min_key); ++ bch2_bpos_to_text(&buf3, b->data->max_key); + + bch2_fs_inconsistent(c, "btree node header doesn't match ptr\n" + "btree %s level %u\n" @@ -12504,10 +13243,14 @@ index 000000000000..5ae61e5d3923 + "header: btree %s level %llu\n" + "min %s max %s\n", + bch2_btree_ids[b->c.btree_id], b->c.level, -+ buf1, ++ buf1.buf, + bch2_btree_ids[BTREE_NODE_ID(b->data)], + BTREE_NODE_LEVEL(b->data), -+ buf2, buf3); ++ buf2.buf, buf3.buf); ++ ++ printbuf_exit(&buf3); ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); +} + +static inline void btree_check_header(struct bch_fs *c, struct btree *b) @@ -12542,16 +13285,17 @@ index 000000000000..5ae61e5d3923 + + 
EBUG_ON(level >= BTREE_MAX_DEPTH); + -+ if (c->opts.btree_node_mem_ptr_optimization) { -+ b = btree_node_mem_ptr(k); -+ /* -+ * Check b->hash_val _before_ calling btree_node_lock() - this -+ * might not be the node we want anymore, and trying to lock the -+ * wrong node could cause an unneccessary transaction restart: -+ */ -+ if (b && b->hash_val == btree_ptr_hash_val(k)) ++ b = btree_node_mem_ptr(k); ++ ++ /* ++ * Check b->hash_val _before_ calling btree_node_lock() - this might not ++ * be the node we want anymore, and trying to lock the wrong node could ++ * cause an unneccessary transaction restart: ++ */ ++ if (likely(c->opts.btree_node_mem_ptr_optimization && ++ b && ++ b->hash_val == btree_ptr_hash_val(k))) + goto lock_node; -+ } +retry: + b = btree_cache_find(bc, k); + if (unlikely(!b)) { @@ -12616,7 +13360,7 @@ index 000000000000..5ae61e5d3923 + if (bch2_btree_node_relock(trans, path, level + 1)) + goto retry; + -+ trace_trans_restart_btree_node_reused(trans->ip, ++ trace_trans_restart_btree_node_reused(trans->fn, + trace_ip, + path->btree_id, + &path->pos); @@ -12798,7 +13542,7 @@ index 000000000000..5ae61e5d3923 + six_lock_write(&b->c.lock, NULL, NULL); + + if (btree_node_dirty(b)) { -+ __bch2_btree_node_write(c, b, false); ++ __bch2_btree_node_write(c, b, 0); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + goto wait_on_io; @@ -12869,10 +13613,10 @@ index 000000000000..5ae61e5d3923 +} diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h new file mode 100644 -index 000000000000..402cec1802bc +index 000000000000..25906127c023 --- /dev/null +++ b/fs/bcachefs/btree_cache.h -@@ -0,0 +1,105 @@ +@@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_CACHE_H +#define _BCACHEFS_BTREE_CACHE_H @@ -12882,6 +13626,8 @@ index 000000000000..402cec1802bc + +extern struct lock_class_key bch2_btree_node_lock_key; + ++extern const char * const bch2_btree_node_flags[]; ++ +struct btree_iter; + +void 
bch2_recalc_btree_reserve(struct bch_fs *); @@ -12895,7 +13641,7 @@ index 000000000000..402cec1802bc +int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); + +struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); -+struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); ++struct btree *bch2_btree_node_mem_alloc(struct bch_fs *, bool); + +struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *, + const struct bkey_i *, unsigned, @@ -12946,7 +13692,7 @@ index 000000000000..402cec1802bc + +static inline size_t btree_bytes(struct bch_fs *c) +{ -+ return c->opts.btree_node_size << 9; ++ return c->opts.btree_node_size; +} + +static inline size_t btree_max_u64s(struct bch_fs *c) @@ -12961,7 +13707,7 @@ index 000000000000..402cec1802bc + +static inline unsigned btree_blocks(struct bch_fs *c) +{ -+ return c->opts.btree_node_size >> c->block_bits; ++ return btree_sectors(c) >> c->block_bits; +} + +#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3) @@ -12980,10 +13726,10 @@ index 000000000000..402cec1802bc +#endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c new file mode 100644 -index 000000000000..091bddee575d +index 000000000000..e19991796c82 --- /dev/null +++ b/fs/bcachefs/btree_gc.c -@@ -0,0 +1,1952 @@ +@@ -0,0 +1,2102 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Kent Overstreet @@ -12995,6 +13741,7 @@ index 000000000000..091bddee575d +#include "alloc_foreground.h" +#include "bkey_methods.h" +#include "bkey_buf.h" ++#include "btree_key_cache.h" +#include "btree_locking.h" +#include "btree_update_interior.h" +#include "btree_io.h" @@ -13055,23 +13802,23 @@ index 000000000000..091bddee575d + struct bpos expected_start = bkey_deleted(&prev->k->k) + ? 
node_start + : bpos_successor(prev->k->k.p); -+ char buf1[200], buf2[200]; ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + int ret = 0; + + if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) { + struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k); + -+ if (bkey_deleted(&prev->k->k)) { -+ struct printbuf out = PBUF(buf1); -+ pr_buf(&out, "start of node: "); -+ bch2_bpos_to_text(&out, node_start); -+ } else { -+ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k)); -+ } -+ + if (bpos_cmp(expected_start, bp->v.min_key)) { + bch2_topology_error(c); + ++ if (bkey_deleted(&prev->k->k)) { ++ pr_buf(&buf1, "start of node: "); ++ bch2_bpos_to_text(&buf1, node_start); ++ } else { ++ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k)); ++ } ++ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k)); ++ + if (__fsck_err(c, + FSCK_CAN_FIX| + FSCK_CAN_IGNORE| @@ -13080,11 +13827,11 @@ index 000000000000..091bddee575d + " prev %s\n" + " cur %s", + bch2_btree_ids[b->c.btree_id], b->c.level, -+ buf1, -+ (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2)) && ++ buf1.buf, buf2.buf) && + !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { + bch_info(c, "Halting mark and sweep to start topology repair pass"); -+ return FSCK_ERR_START_TOPOLOGY_REPAIR; ++ ret = FSCK_ERR_START_TOPOLOGY_REPAIR; ++ goto err; + } else { + set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); + } @@ -13094,6 +13841,12 @@ index 000000000000..091bddee575d + if (is_last && bpos_cmp(cur.k->k.p, node_end)) { + bch2_topology_error(c); + ++ printbuf_reset(&buf1); ++ printbuf_reset(&buf2); ++ ++ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k)); ++ bch2_bpos_to_text(&buf2, node_end); ++ + if (__fsck_err(c, + FSCK_CAN_FIX| + FSCK_CAN_IGNORE| @@ -13102,18 +13855,21 @@ index 000000000000..091bddee575d + " %s\n" + " expected %s", + bch2_btree_ids[b->c.btree_id], b->c.level, -+ (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1), -+ (bch2_bpos_to_text(&PBUF(buf2), 
node_end), buf2)) && ++ buf1.buf, buf2.buf) && + !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { + bch_info(c, "Halting mark and sweep to start topology repair pass"); -+ return FSCK_ERR_START_TOPOLOGY_REPAIR; ++ ret = FSCK_ERR_START_TOPOLOGY_REPAIR; ++ goto err; + } else { + set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); + } + } + + bch2_bkey_buf_copy(prev, c, cur.k); ++err: +fsck_err: ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); + return ret; +} + @@ -13141,6 +13897,34 @@ index 000000000000..091bddee575d + } +} + ++static void bch2_btree_node_update_key_early(struct bch_fs *c, ++ enum btree_id btree, unsigned level, ++ struct bkey_s_c old, struct bkey_i *new) ++{ ++ struct btree *b; ++ struct bkey_buf tmp; ++ int ret; ++ ++ bch2_bkey_buf_init(&tmp); ++ bch2_bkey_buf_reassemble(&tmp, c, old); ++ ++ b = bch2_btree_node_get_noiter(c, tmp.k, btree, level, true); ++ if (!IS_ERR_OR_NULL(b)) { ++ mutex_lock(&c->btree_cache.lock); ++ ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ ++ bkey_copy(&b->key, new); ++ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); ++ BUG_ON(ret); ++ ++ mutex_unlock(&c->btree_cache.lock); ++ six_unlock_read(&b->c.lock); ++ } ++ ++ bch2_bkey_buf_exit(&tmp, c); ++} ++ +static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) +{ + struct bkey_i_btree_ptr_v2 *new; @@ -13155,7 +13939,7 @@ index 000000000000..091bddee575d + new->v.min_key = new_min; + SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); + -+ ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i); ++ ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i); + if (ret) { + kfree(new); + return ret; @@ -13184,7 +13968,7 @@ index 000000000000..091bddee575d + new->k.p = new_max; + SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); + -+ ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i); ++ ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i); + if (ret) { + 
kfree(new); + return ret; @@ -13208,18 +13992,17 @@ index 000000000000..091bddee575d + struct bpos expected_start = !prev + ? b->data->min_key + : bpos_successor(prev->key.k.p); -+ char buf1[200], buf2[200]; ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + int ret = 0; + + if (!prev) { -+ struct printbuf out = PBUF(buf1); -+ pr_buf(&out, "start of node: "); -+ bch2_bpos_to_text(&out, b->data->min_key); ++ pr_buf(&buf1, "start of node: "); ++ bch2_bpos_to_text(&buf1, b->data->min_key); + } else { -+ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&prev->key)); ++ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key)); + } + -+ bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&cur->key)); ++ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key)); + + if (prev && + bpos_cmp(expected_start, cur->data->min_key) > 0 && @@ -13232,8 +14015,10 @@ index 000000000000..091bddee575d + " node %s\n" + " next %s", + bch2_btree_ids[b->c.btree_id], b->c.level, -+ buf1, buf2)) -+ return DROP_PREV_NODE; ++ buf1.buf, buf2.buf)) { ++ ret = DROP_PREV_NODE; ++ goto out; ++ } + + if (mustfix_fsck_err_on(bpos_cmp(prev->key.k.p, + bpos_predecessor(cur->data->min_key)), c, @@ -13241,7 +14026,7 @@ index 000000000000..091bddee575d + " node %s\n" + " next %s", + bch2_btree_ids[b->c.btree_id], b->c.level, -+ buf1, buf2)) ++ buf1.buf, buf2.buf)) + ret = set_node_max(c, prev, + bpos_predecessor(cur->data->min_key)); + } else { @@ -13253,39 +14038,49 @@ index 000000000000..091bddee575d + " prev %s\n" + " node %s", + bch2_btree_ids[b->c.btree_id], b->c.level, -+ buf1, buf2)) -+ return DROP_THIS_NODE; ++ buf1.buf, buf2.buf)) { ++ ret = DROP_THIS_NODE; ++ goto out; ++ } + + if (mustfix_fsck_err_on(bpos_cmp(expected_start, cur->data->min_key), c, + "btree node with incorrect min_key at btree %s level %u:\n" + " prev %s\n" + " node %s", + bch2_btree_ids[b->c.btree_id], b->c.level, -+ buf1, buf2)) ++ buf1.buf, buf2.buf)) + ret = set_node_min(c, cur, expected_start); + } ++out: 
+fsck_err: ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); + return ret; +} + +static int btree_repair_node_end(struct bch_fs *c, struct btree *b, + struct btree *child) +{ -+ char buf1[200], buf2[200]; ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + int ret = 0; + ++ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key)); ++ bch2_bpos_to_text(&buf2, b->key.k.p); ++ + if (mustfix_fsck_err_on(bpos_cmp(child->key.k.p, b->key.k.p), c, + "btree node with incorrect max_key at btree %s level %u:\n" + " %s\n" + " expected %s", + bch2_btree_ids[b->c.btree_id], b->c.level, -+ (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&child->key)), buf1), -+ (bch2_bpos_to_text(&PBUF(buf2), b->key.k.p), buf2))) { ++ buf1.buf, buf2.buf)) { + ret = set_node_max(c, child, b->key.k.p); + if (ret) -+ return ret; ++ goto err; + } ++err: +fsck_err: ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); + return ret; +} + @@ -13296,7 +14091,7 @@ index 000000000000..091bddee575d + struct bkey_buf prev_k, cur_k; + struct btree *prev = NULL, *cur = NULL; + bool have_child, dropped_children = false; -+ char buf[200]; ++ struct printbuf buf; + int ret = 0; + + if (!b->c.level) @@ -13320,12 +14115,15 @@ index 000000000000..091bddee575d + false); + ret = PTR_ERR_OR_ZERO(cur); + ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); ++ + if (mustfix_fsck_err_on(ret == -EIO, c, + "Unreadable btree node at btree %s level %u:\n" + " %s", + bch2_btree_ids[b->c.btree_id], + b->c.level - 1, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur_k.k)), buf))) { ++ buf.buf)) { + bch2_btree_node_evict(c, cur_k.k); + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur_k.k->k.p); @@ -13425,12 +14223,14 @@ index 000000000000..091bddee575d + have_child = true; + } + ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); ++ + if (mustfix_fsck_err_on(!have_child, c, + "empty interior btree node at btree %s level %u\n" + " %s", 
+ bch2_btree_ids[b->c.btree_id], -+ b->c.level, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key)), buf))) ++ b->c.level, buf.buf)) + ret = DROP_THIS_NODE; +err: +fsck_err: @@ -13446,6 +14246,7 @@ index 000000000000..091bddee575d + if (!ret && dropped_children) + goto again; + ++ printbuf_exit(&buf); + return ret; +} + @@ -13481,7 +14282,7 @@ index 000000000000..091bddee575d + const union bch_extent_entry *entry; + struct extent_ptr_decoded p = { 0 }; + bool do_update = false; -+ char buf[200]; ++ struct printbuf buf = PRINTBUF; + int ret = 0; + + /* @@ -13490,8 +14291,7 @@ index 000000000000..091bddee575d + */ + bkey_for_each_ptr_decode(k->k, ptrs, p, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); -+ struct bucket *g = PTR_BUCKET(ca, &p.ptr, true); -+ struct bucket *g2 = PTR_BUCKET(ca, &p.ptr, false); ++ struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr); + + if (fsck_err_on(!g->gen_valid, c, @@ -13500,103 +14300,94 @@ index 000000000000..091bddee575d + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { + if (!p.ptr.cached) { -+ g2->_mark.gen = g->_mark.gen = p.ptr.gen; -+ g2->gen_valid = g->gen_valid = true; -+ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); ++ g->gen_valid = true; ++ g->gen = p.ptr.gen; + } else { + do_update = true; + } + } + -+ if (fsck_err_on(data_type == BCH_DATA_btree && -+ g->mark.gen != p.ptr.gen, c, -+ "bucket %u:%zu data type %s has metadata but wrong gen: %u != %u\n" -+ "while marking %s", -+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), -+ bch2_data_types[ptr_data_type(k->k, &p.ptr)], -+ p.ptr.gen, g->mark.gen, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { -+ g2->_mark.data_type = g->_mark.data_type = data_type; -+ g2->gen_valid = g->gen_valid = true; -+ 
set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); -+ } -+ -+ if (fsck_err_on(gen_cmp(p.ptr.gen, g->mark.gen) > 0, c, ++ if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, c, + "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], -+ p.ptr.gen, g->mark.gen, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { ++ p.ptr.gen, g->gen, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { + if (!p.ptr.cached) { -+ g2->_mark.gen = g->_mark.gen = p.ptr.gen; -+ g2->gen_valid = g->gen_valid = true; -+ g2->_mark.data_type = 0; -+ g2->_mark.dirty_sectors = 0; -+ g2->_mark.cached_sectors = 0; ++ g->gen_valid = true; ++ g->gen = p.ptr.gen; ++ g->data_type = 0; ++ g->dirty_sectors = 0; ++ g->cached_sectors = 0; + set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); -+ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); + } else { + do_update = true; + } + } + -+ if (fsck_err_on(gen_cmp(g->mark.gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c, ++ if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c, + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", -+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->mark.gen, ++ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) + do_update = true; + + if (fsck_err_on(!p.ptr.cached && -+ gen_cmp(p.ptr.gen, g->mark.gen) < 0, c, ++ gen_cmp(p.ptr.gen, g->gen) < 0, c, + "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], -+ p.ptr.gen, g->mark.gen, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) ++ p.ptr.gen, g->gen, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) + do_update = 
true; + -+ if (p.ptr.gen != g->mark.gen) ++ if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) + continue; + -+ if (fsck_err_on(g->mark.data_type && -+ g->mark.data_type != data_type, c, ++ if (fsck_err_on(g->data_type && ++ g->data_type != data_type, c, + "bucket %u:%zu different types of data in same bucket: %s, %s\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), -+ bch2_data_types[g->mark.data_type], ++ bch2_data_types[g->data_type], + bch2_data_types[data_type], -+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { + if (data_type == BCH_DATA_btree) { -+ g2->_mark.data_type = g->_mark.data_type = data_type; -+ g2->gen_valid = g->gen_valid = true; -+ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); ++ g->data_type = data_type; ++ set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + } else { + do_update = true; + } + } + + if (p.has_ec) { -+ struct stripe *m = genradix_ptr(&c->stripes[true], p.ec.idx); ++ struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); + + if (fsck_err_on(!m || !m->alive, c, + "pointer to nonexistent stripe %llu\n" + "while marking %s", + (u64) p.ec.idx, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) + do_update = true; + + if (fsck_err_on(!bch2_ptr_matches_stripe_m(m, p), c, + "pointer does not match stripe %llu\n" + "while marking %s", + (u64) p.ec.idx, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) + do_update = true; + } + } @@ -13609,13 +14400,15 @@ index 000000000000..091bddee575d + + if (is_root) { + bch_err(c, "cannot update btree roots yet"); -+ return -EINVAL; ++ ret = -EINVAL; ++ goto err; + } + + new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); + if (!new) { + bch_err(c, "%s: error allocating new key", __func__); -+ return -ENOMEM; ++ ret = -ENOMEM; ++ goto err; + } + + 
bkey_reassemble(new, *k); @@ -13629,29 +14422,29 @@ index 000000000000..091bddee575d + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct bucket *g = PTR_BUCKET(ca, ptr, true); ++ struct bucket *g = PTR_GC_BUCKET(ca, ptr); + -+ ptr->gen = g->mark.gen; ++ ptr->gen = g->gen; + } + } else { + bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct bucket *g = PTR_BUCKET(ca, ptr, true); ++ struct bucket *g = PTR_GC_BUCKET(ca, ptr); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr); + + (ptr->cached && -+ (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) || ++ (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) || + (!ptr->cached && -+ gen_cmp(ptr->gen, g->mark.gen) < 0) || -+ gen_cmp(g->mark.gen, ptr->gen) > BUCKET_GC_GEN_MAX || -+ (g->mark.data_type && -+ g->mark.data_type != data_type); ++ gen_cmp(ptr->gen, g->gen) < 0) || ++ gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX || ++ (g->data_type && ++ g->data_type != data_type); + })); +again: + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_extent_entry_for_each(ptrs, entry) { + if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { -+ struct stripe *m = genradix_ptr(&c->stripes[true], ++ struct gc_stripe *m = genradix_ptr(&c->gc_stripes, + entry->stripe_ptr.idx); + union bch_extent_entry *next_ptr; + @@ -13676,13 +14469,28 @@ index 000000000000..091bddee575d + } + } + -+ ret = bch2_journal_key_insert(c, btree_id, level, new); -+ if (ret) ++ ret = bch2_journal_key_insert_take(c, btree_id, level, new); ++ if (ret) { + kfree(new); -+ else -+ *k = bkey_i_to_s_c(new); ++ goto err; ++ } ++ ++ if (level) ++ bch2_btree_node_update_key_early(c, btree_id, level - 1, *k, new); ++ ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, *k); ++ bch_info(c, "updated %s", buf.buf); ++ ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, 
bkey_i_to_s_c(new)); ++ bch_info(c, "new key %s", buf.buf); ++ ++ *k = bkey_i_to_s_c(new); + } ++err: +fsck_err: ++ printbuf_exit(&buf); + return ret; +} + @@ -13691,20 +14499,21 @@ index 000000000000..091bddee575d +static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, + unsigned level, bool is_root, + struct bkey_s_c *k, -+ u8 *max_stale, bool initial) ++ bool initial) +{ + struct bch_fs *c = trans->c; -+ struct bkey_ptrs_c ptrs; -+ const struct bch_extent_ptr *ptr; ++ struct bkey deleted = KEY(0, 0, 0); ++ struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; + unsigned flags = + BTREE_TRIGGER_GC| + (initial ? BTREE_TRIGGER_NOATOMIC : 0); -+ char buf[200]; + int ret = 0; + ++ deleted.p = k->k->p; ++ + if (initial) { + BUG_ON(bch2_journal_seq_verify && -+ k->k->version.lo > journal_cur_seq(&c->journal)); ++ k->k->version.lo > atomic64_read(&c->journal.seq)); + + ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, k); + if (ret) @@ -13715,32 +14524,10 @@ index 000000000000..091bddee575d + k->k->version.lo, + atomic64_read(&c->key_version))) + atomic64_set(&c->key_version, k->k->version.lo); -+ -+ if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || -+ fsck_err_on(!bch2_bkey_replicas_marked(c, *k), c, -+ "superblock not marked as containing replicas\n" -+ " while marking %s", -+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { -+ ret = bch2_mark_bkey_replicas(c, *k); -+ if (ret) { -+ bch_err(c, "error marking bkey replicas: %i", ret); -+ goto err; -+ } -+ } + } + -+ ptrs = bch2_bkey_ptrs_c(*k); -+ bkey_for_each_ptr(ptrs, ptr) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct bucket *g = PTR_BUCKET(ca, ptr, true); -+ -+ if (gen_after(g->oldest_gen, ptr->gen)) -+ g->oldest_gen = ptr->gen; -+ -+ *max_stale = max(*max_stale, ptr_stale(ca, ptr)); -+ } -+ -+ ret = bch2_mark_key(trans, *k, flags); ++ ret = __bch2_trans_do(trans, NULL, NULL, 0, ++ bch2_mark_key(trans, old, *k, flags)); +fsck_err: +err: + if (ret) @@ 
-13748,8 +14535,7 @@ index 000000000000..091bddee575d + return ret; +} + -+static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, u8 *max_stale, -+ bool initial) ++static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial) +{ + struct bch_fs *c = trans->c; + struct btree_node_iter iter; @@ -13758,8 +14544,6 @@ index 000000000000..091bddee575d + struct bkey_buf prev, cur; + int ret = 0; + -+ *max_stale = 0; -+ + if (!btree_node_type_needs_gc(btree_node_type(b))) + return 0; + @@ -13770,7 +14554,7 @@ index 000000000000..091bddee575d + + while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { + ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, -+ &k, max_stale, initial); ++ &k, initial); + if (ret) + break; + @@ -13801,7 +14585,6 @@ index 000000000000..091bddee575d + : bch2_expensive_debug_checks ? 0 + : !btree_node_type_needs_gc(btree_id) ? 1 + : 0; -+ u8 max_stale = 0; + int ret = 0; + + gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); @@ -13812,21 +14595,9 @@ index 000000000000..091bddee575d + + gc_pos_set(c, gc_pos_btree_node(b)); + -+ ret = btree_gc_mark_node(trans, b, &max_stale, initial); ++ ret = btree_gc_mark_node(trans, b, initial); + if (ret) + break; -+ -+ if (!initial) { -+ if (max_stale > 64) -+ bch2_btree_node_rewrite(trans, &iter, b, -+ BTREE_INSERT_NOWAIT| -+ BTREE_INSERT_GC_LOCK_HELD); -+ else if (!bch2_btree_gc_rewrite_disabled && -+ (bch2_btree_gc_always_rewrite || max_stale > 16)) -+ bch2_btree_node_rewrite(trans, &iter, -+ b, BTREE_INSERT_NOWAIT| -+ BTREE_INSERT_GC_LOCK_HELD); -+ } + } + bch2_trans_iter_exit(trans, &iter); + @@ -13838,8 +14609,8 @@ index 000000000000..091bddee575d + if (!btree_node_fake(b)) { + struct bkey_s_c k = bkey_i_to_s_c(&b->key); + -+ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, true, -+ &k, &max_stale, initial); ++ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, ++ true, &k, initial); + } + gc_pos_set(c, 
gc_pos_btree_root(b->c.btree_id)); + mutex_unlock(&c->btree_root_lock); @@ -13854,8 +14625,7 @@ index 000000000000..091bddee575d + struct btree_and_journal_iter iter; + struct bkey_s_c k; + struct bkey_buf cur, prev; -+ u8 max_stale = 0; -+ char buf[200]; ++ struct printbuf buf = PRINTBUF; + int ret = 0; + + bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); @@ -13867,8 +14637,8 @@ index 000000000000..091bddee575d + BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0); + BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); + -+ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, -+ &k, &max_stale, true); ++ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, ++ false, &k, true); + if (ret) { + bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret); + goto fsck_err; @@ -13916,7 +14686,8 @@ index 000000000000..091bddee575d + " %s", + bch2_btree_ids[b->c.btree_id], + b->c.level - 1, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur.k)), buf)) && ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) && + !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { + ret = FSCK_ERR_START_TOPOLOGY_REPAIR; + bch_info(c, "Halting mark and sweep to start topology repair pass"); @@ -13946,6 +14717,7 @@ index 000000000000..091bddee575d + bch2_bkey_buf_exit(&cur, c); + bch2_bkey_buf_exit(&prev, c); + bch2_btree_and_journal_iter_exit(&iter); ++ printbuf_exit(&buf); + return ret; +} + @@ -13959,8 +14731,7 @@ index 000000000000..091bddee575d + : bch2_expensive_debug_checks ? 0 + : !btree_node_type_needs_gc(btree_id) ? 
1 + : 0; -+ u8 max_stale = 0; -+ char buf[100]; ++ struct printbuf buf = PRINTBUF; + int ret = 0; + + b = c->btree_roots[btree_id].b; @@ -13969,17 +14740,19 @@ index 000000000000..091bddee575d + return 0; + + six_lock_read(&b->c.lock, NULL, NULL); ++ printbuf_reset(&buf); ++ bch2_bpos_to_text(&buf, b->data->min_key); + if (mustfix_fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c, -+ "btree root with incorrect min_key: %s", -+ (bch2_bpos_to_text(&PBUF(buf), b->data->min_key), buf))) { ++ "btree root with incorrect min_key: %s", buf.buf)) { + bch_err(c, "repair unimplemented"); + ret = FSCK_ERR_EXIT; + goto fsck_err; + } + ++ printbuf_reset(&buf); ++ bch2_bpos_to_text(&buf, b->data->max_key); + if (mustfix_fsck_err_on(bpos_cmp(b->data->max_key, SPOS_MAX), c, -+ "btree root with incorrect max_key: %s", -+ (bch2_bpos_to_text(&PBUF(buf), b->data->max_key), buf))) { ++ "btree root with incorrect max_key: %s", buf.buf)) { + bch_err(c, "repair unimplemented"); + ret = FSCK_ERR_EXIT; + goto fsck_err; @@ -13992,13 +14765,14 @@ index 000000000000..091bddee575d + struct bkey_s_c k = bkey_i_to_s_c(&b->key); + + ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, true, -+ &k, &max_stale, true); ++ &k, true); + } +fsck_err: + six_unlock_read(&b->c.lock); + + if (ret < 0) + bch_err(c, "%s: ret %i", __func__, ret); ++ printbuf_exit(&buf); + return ret; +} + @@ -14017,6 +14791,9 @@ index 000000000000..091bddee575d + + bch2_trans_init(&trans, c, 0, 0); + ++ if (initial) ++ trans.is_initial_gc = true; ++ + for (i = 0; i < BTREE_ID_NR; i++) + ids[i] = i; + bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); @@ -14051,23 +14828,13 @@ index 000000000000..091bddee575d + } while (start < end); +} + -+void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, -+ unsigned flags) ++static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, ++ unsigned flags) +{ + struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + unsigned i; + u64 b; + -+ /* -+ * This 
conditional is kind of gross, but we may be called from the -+ * device add path, before the new device has actually been added to the -+ * running filesystem: -+ */ -+ if (c) { -+ lockdep_assert_held(&c->sb_lock); -+ percpu_down_read(&c->mark_lock); -+ } -+ + for (i = 0; i < layout->nr_superblocks; i++) { + u64 offset = le64_to_cpu(layout->sb_offset[i]); + @@ -14086,9 +14853,6 @@ index 000000000000..091bddee575d + ca->mi.bucket_size, + gc_phase(GC_PHASE_SB), flags); + } -+ -+ if (c) -+ percpu_up_read(&c->mark_lock); +} + +static void bch2_mark_superblocks(struct bch_fs *c) @@ -14127,13 +14891,14 @@ index 000000000000..091bddee575d + struct bch_dev *ca; + unsigned i; + -+ genradix_free(&c->stripes[1]); ++ genradix_free(&c->reflink_gc_table); ++ genradix_free(&c->gc_stripes); + + for_each_member_device(ca, c, i) { -+ kvpfree(rcu_dereference_protected(ca->buckets[1], 1), ++ kvpfree(rcu_dereference_protected(ca->buckets_gc, 1), + sizeof(struct bucket_array) + + ca->mi.nbuckets * sizeof(struct bucket)); -+ ca->buckets[1] = NULL; ++ ca->buckets_gc = NULL; + + free_percpu(ca->usage_gc); + ca->usage_gc = NULL; @@ -14147,18 +14912,20 @@ index 000000000000..091bddee575d + bool initial, bool metadata_only) +{ + struct bch_dev *ca = NULL; ++ struct printbuf buf = PRINTBUF; + bool verify = !metadata_only && (!initial || + (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); + unsigned i, dev; + int ret = 0; + ++ percpu_down_write(&c->mark_lock); ++ +#define copy_field(_f, _msg, ...) \ + if (dst->_f != src->_f) { \ + if (verify) \ + fsck_err(c, _msg ": got %llu, should be %llu" \ + , ##__VA_ARGS__, dst->_f, src->_f); \ + dst->_f = src->_f; \ -+ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ + } +#define copy_stripe_field(_f, _msg, ...) 
\ + if (dst->_f != src->_f) { \ @@ -14168,85 +14935,28 @@ index 000000000000..091bddee575d + iter.pos, ##__VA_ARGS__, \ + dst->_f, src->_f); \ + dst->_f = src->_f; \ -+ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ -+ } -+#define copy_bucket_field(_f) \ -+ if (dst->b[b].mark._f != src->b[b].mark._f) { \ -+ if (verify) \ -+ fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \ -+ ": got %u, should be %u", dev, b, \ -+ dst->b[b].mark.gen, \ -+ bch2_data_types[dst->b[b].mark.data_type],\ -+ dst->b[b].mark._f, src->b[b].mark._f); \ -+ dst->b[b]._mark._f = src->b[b].mark._f; \ -+ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ + } +#define copy_dev_field(_f, _msg, ...) \ + copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) +#define copy_fs_field(_f, _msg, ...) \ + copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) + -+ if (!metadata_only) { -+ struct genradix_iter iter = genradix_iter_init(&c->stripes[1], 0); -+ struct stripe *dst, *src; -+ -+ while ((src = genradix_iter_peek(&iter, &c->stripes[1]))) { -+ dst = genradix_ptr_alloc(&c->stripes[0], iter.pos, GFP_KERNEL); -+ -+ if (dst->alive != src->alive || -+ dst->sectors != src->sectors || -+ dst->algorithm != src->algorithm || -+ dst->nr_blocks != src->nr_blocks || -+ dst->nr_redundant != src->nr_redundant) { -+ bch_err(c, "unexpected stripe inconsistency at bch2_gc_done, confused"); -+ ret = -EINVAL; -+ goto fsck_err; -+ } -+ -+ for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++) -+ copy_stripe_field(block_sectors[i], -+ "block_sectors[%u]", i); -+ -+ dst->blocks_nonempty = 0; -+ for (i = 0; i < dst->nr_blocks; i++) -+ dst->blocks_nonempty += dst->block_sectors[i] != 0; -+ -+ genradix_iter_advance(&iter, &c->stripes[1]); -+ } -+ } -+ + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + bch2_fs_usage_acc_to_base(c, i); + + for_each_member_device(ca, c, dev) { -+ struct bucket_array *dst = __bucket_array(ca, 0); -+ struct bucket_array *src = __bucket_array(ca, 1); -+ size_t b; ++ struct bch_dev_usage 
*dst = ca->usage_base; ++ struct bch_dev_usage *src = (void *) ++ bch2_acc_percpu_u64s((void *) ca->usage_gc, ++ dev_usage_u64s()); + -+ for (b = 0; b < src->nbuckets; b++) { -+ copy_bucket_field(gen); -+ copy_bucket_field(data_type); -+ copy_bucket_field(stripe); -+ copy_bucket_field(dirty_sectors); -+ copy_bucket_field(cached_sectors); ++ copy_dev_field(buckets_ec, "buckets_ec"); ++ copy_dev_field(buckets_unavailable, "buckets_unavailable"); + -+ dst->b[b].oldest_gen = src->b[b].oldest_gen; -+ } -+ -+ { -+ struct bch_dev_usage *dst = ca->usage_base; -+ struct bch_dev_usage *src = (void *) -+ bch2_acc_percpu_u64s((void *) ca->usage_gc, -+ dev_usage_u64s()); -+ -+ copy_dev_field(buckets_ec, "buckets_ec"); -+ copy_dev_field(buckets_unavailable, "buckets_unavailable"); -+ -+ for (i = 0; i < BCH_DATA_NR; i++) { -+ copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); -+ copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); -+ copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]); -+ } ++ for (i = 0; i < BCH_DATA_NR; i++) { ++ copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); ++ copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); ++ copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]); + } + }; + @@ -14273,22 +14983,21 @@ index 000000000000..091bddee575d + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); -+ char buf[80]; + + if (metadata_only && + (e->data_type == BCH_DATA_user || + e->data_type == BCH_DATA_cached)) + continue; + -+ bch2_replicas_entry_to_text(&PBUF(buf), e); ++ printbuf_reset(&buf); ++ bch2_replicas_entry_to_text(&buf, e); + -+ copy_fs_field(replicas[i], "%s", buf); ++ copy_fs_field(replicas[i], "%s", buf.buf); + } + } + +#undef copy_fs_field +#undef copy_dev_field -+#undef copy_bucket_field +#undef copy_stripe_field +#undef copy_field +fsck_err: @@ -14296,6 +15005,9 @@ index 000000000000..091bddee575d + 
percpu_ref_put(&ca->ref); + if (ret) + bch_err(c, "%s: ret %i", __func__, ret); ++ ++ percpu_up_write(&c->mark_lock); ++ printbuf_exit(&buf); + return ret; +} + @@ -14304,7 +15016,6 @@ index 000000000000..091bddee575d +{ + struct bch_dev *ca = NULL; + unsigned i; -+ int ret; + + BUG_ON(c->usage_gc); + @@ -14316,18 +15027,9 @@ index 000000000000..091bddee575d + } + + for_each_member_device(ca, c, i) { -+ BUG_ON(ca->buckets[1]); ++ BUG_ON(ca->buckets_gc); + BUG_ON(ca->usage_gc); + -+ ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) + -+ ca->mi.nbuckets * sizeof(struct bucket), -+ GFP_KERNEL|__GFP_ZERO); -+ if (!ca->buckets[1]) { -+ percpu_ref_put(&ca->ref); -+ bch_err(c, "error allocating ca->buckets[gc]"); -+ return -ENOMEM; -+ } -+ + ca->usage_gc = alloc_percpu(struct bch_dev_usage); + if (!ca->usage_gc) { + bch_err(c, "error allocating ca->usage_gc"); @@ -14336,110 +15038,215 @@ index 000000000000..091bddee575d + } + } + -+ ret = bch2_ec_mem_alloc(c, true); -+ if (ret) { -+ bch_err(c, "error allocating ec gc mem"); -+ return ret; -+ } -+ -+ percpu_down_write(&c->mark_lock); -+ -+ /* -+ * indicate to stripe code that we need to allocate for the gc stripes -+ * radix tree, too -+ */ -+ gc_pos_set(c, gc_phase(GC_PHASE_START)); -+ -+ for_each_member_device(ca, c, i) { -+ struct bucket_array *dst = __bucket_array(ca, 1); -+ struct bucket_array *src = __bucket_array(ca, 0); -+ size_t b; -+ -+ dst->first_bucket = src->first_bucket; -+ dst->nbuckets = src->nbuckets; -+ -+ for (b = 0; b < src->nbuckets; b++) { -+ struct bucket *d = &dst->b[b]; -+ struct bucket *s = &src->b[b]; -+ -+ d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen; -+ d->gen_valid = s->gen_valid; -+ -+ if (metadata_only && -+ (s->mark.data_type == BCH_DATA_user || -+ s->mark.data_type == BCH_DATA_cached)) -+ d->_mark = s->mark; -+ } -+ }; -+ -+ percpu_up_write(&c->mark_lock); -+ + return 0; +} + -+static int bch2_gc_reflink_done_initial_fn(struct btree_trans *trans, -+ struct bkey_s_c k) ++/* 
returns true if not equal */ ++static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l, ++ struct bch_alloc_v4 r) ++{ ++ return l.gen != r.gen || ++ l.oldest_gen != r.oldest_gen || ++ l.data_type != r.data_type || ++ l.dirty_sectors != r.dirty_sectors || ++ l.cached_sectors != r.cached_sectors || ++ l.stripe_redundancy != r.stripe_redundancy || ++ l.stripe != r.stripe; ++} ++ ++static int bch2_alloc_write_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ bool metadata_only) +{ + struct bch_fs *c = trans->c; -+ struct reflink_gc *r; -+ const __le64 *refcount = bkey_refcount_c(k); -+ char buf[200]; -+ int ret = 0; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); ++ struct bucket gc; ++ struct bkey_s_c k; ++ struct bkey_i_alloc_v4 *a; ++ struct bch_alloc_v4 old, new; ++ int ret; + -+ if (!refcount) ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ return ret; ++ ++ bch2_alloc_to_v4(k, &old); ++ new = old; ++ ++ percpu_down_read(&c->mark_lock); ++ gc = *gc_bucket(ca, iter->pos.offset); ++ percpu_up_read(&c->mark_lock); ++ ++ if (metadata_only && ++ gc.data_type != BCH_DATA_sb && ++ gc.data_type != BCH_DATA_journal && ++ gc.data_type != BCH_DATA_btree) + return 0; + -+ r = genradix_ptr(&c->reflink_gc_table, c->reflink_gc_idx++); -+ if (!r) -+ return -ENOMEM; ++ if (gen_after(old.gen, gc.gen)) ++ return 0; + -+ if (!r || -+ r->offset != k.k->p.offset || -+ r->size != k.k->size) { -+ bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); -+ return -EINVAL; -+ } ++#define copy_bucket_field(_f) \ ++ if (fsck_err_on(new._f != gc._f, c, \ ++ "bucket %llu:%llu gen %u data type %s has wrong " #_f \ ++ ": got %u, should be %u", \ ++ iter->pos.inode, iter->pos.offset, \ ++ gc.gen, \ ++ bch2_data_types[gc.data_type], \ ++ new._f, gc._f)) \ ++ new._f = gc._f; \ + -+ if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c, -+ "reflink key has wrong refcount:\n" -+ " %s\n" -+ " should be %u", -+ 
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), -+ r->refcount)) { -+ struct bkey_i *new; ++ copy_bucket_field(gen); ++ copy_bucket_field(data_type); ++ copy_bucket_field(dirty_sectors); ++ copy_bucket_field(cached_sectors); ++ copy_bucket_field(stripe_redundancy); ++ copy_bucket_field(stripe); ++#undef copy_bucket_field + -+ new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); -+ if (!new) { -+ ret = -ENOMEM; -+ goto fsck_err; -+ } ++ if (!bch2_alloc_v4_cmp(old, new)) ++ return 0; + -+ bkey_reassemble(new, k); ++ a = bch2_alloc_to_v4_mut(trans, k); ++ ret = PTR_ERR_OR_ZERO(a); ++ if (ret) ++ return ret; + -+ if (!r->refcount) { -+ new->k.type = KEY_TYPE_deleted; -+ new->k.size = 0; -+ } else { -+ *bkey_refcount(new) = cpu_to_le64(r->refcount); -+ } ++ a->v = new; + -+ ret = bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new); -+ if (ret) -+ kfree(new); -+ } ++ ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN); +fsck_err: + return ret; +} + -+static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, -+ bool metadata_only) ++static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_dev *ca; ++ unsigned i; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_member_device(ca, c, i) { ++ for_each_btree_key(&trans, iter, BTREE_ID_alloc, ++ POS(ca->dev_idx, ca->mi.first_bucket), ++ BTREE_ITER_SLOTS| ++ BTREE_ITER_PREFETCH, k, ret) { ++ if (bkey_cmp(iter.pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) ++ break; ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_LAZY_RW, ++ bch2_alloc_write_key(&trans, &iter, ++ metadata_only)); ++ if (ret) ++ break; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (ret) { ++ bch_err(c, "error writing alloc info: %i", ret); ++ percpu_ref_put(&ca->ref); ++ break; ++ } ++ } ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int bch2_gc_alloc_start(struct bch_fs *c, bool 
metadata_only) ++{ ++ struct bch_dev *ca; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bucket *g; ++ struct bch_alloc_v4 a; ++ unsigned i; ++ int ret; ++ ++ for_each_member_device(ca, c, i) { ++ struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) + ++ ca->mi.nbuckets * sizeof(struct bucket), ++ GFP_KERNEL|__GFP_ZERO); ++ if (!buckets) { ++ percpu_ref_put(&ca->ref); ++ bch_err(c, "error allocating ca->buckets[gc]"); ++ return -ENOMEM; ++ } ++ ++ buckets->first_bucket = ca->mi.first_bucket; ++ buckets->nbuckets = ca->mi.nbuckets; ++ rcu_assign_pointer(ca->buckets_gc, buckets); ++ }; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ ca = bch_dev_bkey_exists(c, k.k->p.inode); ++ g = gc_bucket(ca, k.k->p.offset); ++ ++ bch2_alloc_to_v4(k, &a); ++ ++ g->gen_valid = 1; ++ g->gen = a.gen; ++ ++ if (metadata_only && ++ (a.data_type == BCH_DATA_user || ++ a.data_type == BCH_DATA_cached || ++ a.data_type == BCH_DATA_parity)) { ++ g->data_type = a.data_type; ++ g->dirty_sectors = a.dirty_sectors; ++ g->cached_sectors = a.cached_sectors; ++ g->stripe = a.stripe; ++ g->stripe_redundancy = a.stripe_redundancy; ++ } ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ bch_err(c, "error reading alloc info at gc start: %i", ret); ++ ++ return ret; ++} ++ ++static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ for_each_member_device(ca, c, i) { ++ struct bucket_array *buckets = gc_bucket_array(ca); ++ struct bucket *g; ++ ++ for_each_bucket(g, buckets) { ++ if (metadata_only && ++ (g->data_type == BCH_DATA_user || ++ g->data_type == BCH_DATA_cached || ++ g->data_type == BCH_DATA_parity)) ++ continue; ++ g->dirty_sectors = 0; ++ g->cached_sectors = 0; ++ } ++ }; ++} ++ ++static int bch2_gc_reflink_done(struct bch_fs *c, bool 
metadata_only) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct reflink_gc *r; + size_t idx = 0; -+ char buf[200]; ++ struct printbuf buf = PRINTBUF; + int ret = 0; + + if (metadata_only) @@ -14447,14 +15254,6 @@ index 000000000000..091bddee575d + + bch2_trans_init(&trans, c, 0, 0); + -+ if (initial) { -+ c->reflink_gc_idx = 0; -+ -+ ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink, -+ bch2_gc_reflink_done_initial_fn); -+ goto out; -+ } -+ + for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + const __le64 *refcount = bkey_refcount_c(k); @@ -14462,7 +15261,7 @@ index 000000000000..091bddee575d + if (!refcount) + continue; + -+ r = genradix_ptr(&c->reflink_gc_table, idx); ++ r = genradix_ptr(&c->reflink_gc_table, idx++); + if (!r || + r->offset != k.k->p.offset || + r->size != k.k->size) { @@ -14475,7 +15274,8 @@ index 000000000000..091bddee575d + "reflink key has wrong refcount:\n" + " %s\n" + " should be %u", -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf), + r->refcount)) { + struct bkey_i *new; + @@ -14493,7 +15293,7 @@ index 000000000000..091bddee575d + *bkey_refcount(new) = cpu_to_le64(r->refcount); + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ __bch2_btree_insert(&trans, BTREE_ID_reflink, new)); ++ __bch2_btree_insert(&trans, BTREE_ID_reflink, new)); + kfree(new); + + if (ret) @@ -14502,36 +15302,13 @@ index 000000000000..091bddee575d + } +fsck_err: + bch2_trans_iter_exit(&trans, &iter); -+out: -+ genradix_free(&c->reflink_gc_table); + c->reflink_gc_nr = 0; + bch2_trans_exit(&trans); ++ printbuf_exit(&buf); + return ret; +} + -+static int bch2_gc_reflink_start_initial_fn(struct btree_trans *trans, -+ struct bkey_s_c k) -+{ -+ -+ struct bch_fs *c = trans->c; -+ struct reflink_gc *r; -+ const __le64 *refcount = bkey_refcount_c(k); -+ -+ if (!refcount) -+ return 0; -+ -+ r = 
genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, -+ GFP_KERNEL); -+ if (!r) -+ return -ENOMEM; -+ -+ r->offset = k.k->p.offset; -+ r->size = k.k->size; -+ r->refcount = 0; -+ return 0; -+} -+ -+static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, ++static int bch2_gc_reflink_start(struct bch_fs *c, + bool metadata_only) +{ + struct btree_trans trans; @@ -14544,15 +15321,8 @@ index 000000000000..091bddee575d + return 0; + + bch2_trans_init(&trans, c, 0, 0); -+ genradix_free(&c->reflink_gc_table); + c->reflink_gc_nr = 0; + -+ if (initial) { -+ ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink, -+ bch2_gc_reflink_start_initial_fn); -+ goto out; -+ } -+ + for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + const __le64 *refcount = bkey_refcount_c(k); @@ -14572,11 +15342,88 @@ index 000000000000..091bddee575d + r->refcount = 0; + } + bch2_trans_iter_exit(&trans, &iter); -+out: ++ + bch2_trans_exit(&trans); + return ret; +} + ++static void bch2_gc_reflink_reset(struct bch_fs *c, bool metadata_only) ++{ ++ struct genradix_iter iter; ++ struct reflink_gc *r; ++ ++ genradix_for_each(&c->reflink_gc_table, iter, r) ++ r->refcount = 0; ++} ++ ++static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct gc_stripe *m; ++ const struct bch_stripe *s; ++ struct printbuf buf = PRINTBUF; ++ unsigned i; ++ int ret = 0; ++ ++ if (metadata_only) ++ return 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ if (k.k->type != KEY_TYPE_stripe) ++ continue; ++ ++ s = bkey_s_c_to_stripe(k).v; ++ m = genradix_ptr(&c->gc_stripes, k.k->p.offset); ++ ++ for (i = 0; i < s->nr_blocks; i++) ++ if (stripe_blockcount_get(s, i) != (m ? 
m->block_sectors[i] : 0)) ++ goto inconsistent; ++ continue; ++inconsistent: ++ if (fsck_err_on(true, c, ++ "stripe has wrong block sector count %u:\n" ++ " %s\n" ++ " should be %u", i, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf), ++ m ? m->block_sectors[i] : 0)) { ++ struct bkey_i_stripe *new; ++ ++ new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); ++ if (!new) { ++ ret = -ENOMEM; ++ break; ++ } ++ ++ bkey_reassemble(&new->k_i, k); ++ ++ for (i = 0; i < new->v.nr_blocks; i++) ++ stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0); ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ __bch2_btree_insert(&trans, BTREE_ID_reflink, &new->k_i)); ++ kfree(new); ++ } ++ } ++fsck_err: ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only) ++{ ++ genradix_free(&c->gc_stripes); ++} ++ +/** + * bch2_gc - walk _all_ references to buckets, and recompute them: + * @@ -14597,9 +15444,8 @@ index 000000000000..091bddee575d + */ +int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) +{ -+ struct bch_dev *ca; + u64 start_time = local_clock(); -+ unsigned i, iter = 0; ++ unsigned iter = 0; + int ret; + + lockdep_assert_held(&c->state_lock); @@ -14610,11 +15456,14 @@ index 000000000000..091bddee575d + /* flush interior btree updates: */ + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_nr_pending(c)); -+again: ++ + ret = bch2_gc_start(c, metadata_only) ?: -+ bch2_gc_reflink_start(c, initial, metadata_only); ++ bch2_gc_alloc_start(c, metadata_only) ?: ++ bch2_gc_reflink_start(c, metadata_only); + if (ret) + goto out; ++again: ++ gc_pos_set(c, gc_phase(GC_PHASE_START)); + + bch2_mark_superblocks(c); + @@ -14652,39 +15501,40 @@ index 000000000000..091bddee575d + + if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) || + (!iter && bch2_test_restart_gc)) { ++ if (iter++ > 2) { ++ 
bch_info(c, "Unable to fix bucket gens, looping"); ++ ret = -EINVAL; ++ goto out; ++ } ++ + /* + * XXX: make sure gens we fixed got saved + */ -+ if (iter++ <= 2) { -+ bch_info(c, "Second GC pass needed, restarting:"); -+ clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); -+ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); ++ bch_info(c, "Second GC pass needed, restarting:"); ++ clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); ++ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); + -+ percpu_down_write(&c->mark_lock); -+ bch2_gc_free(c); -+ percpu_up_write(&c->mark_lock); -+ /* flush fsck errors, reset counters */ -+ bch2_flush_fsck_errs(c); ++ bch2_gc_stripes_reset(c, metadata_only); ++ bch2_gc_alloc_reset(c, metadata_only); ++ bch2_gc_reflink_reset(c, metadata_only); + -+ goto again; -+ } -+ -+ bch_info(c, "Unable to fix bucket gens, looping"); -+ ret = -EINVAL; ++ /* flush fsck errors, reset counters */ ++ bch2_flush_fsck_errs(c); ++ goto again; + } +out: + if (!ret) { + bch2_journal_block(&c->journal); + -+ percpu_down_write(&c->mark_lock); -+ ret = bch2_gc_reflink_done(c, initial, metadata_only) ?: ++ ret = bch2_gc_stripes_done(c, metadata_only) ?: ++ bch2_gc_reflink_done(c, metadata_only) ?: ++ bch2_gc_alloc_done(c, metadata_only) ?: + bch2_gc_done(c, initial, metadata_only); + + bch2_journal_unblock(&c->journal); -+ } else { -+ percpu_down_write(&c->mark_lock); + } + ++ percpu_down_write(&c->mark_lock); + /* Indicates that gc is no longer in progress: */ + __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); + @@ -14697,13 +15547,6 @@ index 000000000000..091bddee575d + bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); + + /* -+ * Wake up allocator in case it was waiting for buckets -+ * because of not being able to inc gens -+ */ -+ for_each_member_device(ca, c, i) -+ bch2_wake_allocator(ca); -+ -+ /* + * At startup, allocations can happen directly instead of via the + * allocator thread - issue wakeup in case they blocked on gc_lock: + */ @@ -14719,9 
+15562,8 @@ index 000000000000..091bddee575d + percpu_down_read(&c->mark_lock); + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct bucket *g = PTR_BUCKET(ca, ptr, false); + -+ if (gen_after(g->mark.gen, ptr->gen) > 16) { ++ if (ptr_stale(ca, ptr) > 16) { + percpu_up_read(&c->mark_lock); + return true; + } @@ -14729,10 +15571,10 @@ index 000000000000..091bddee575d + + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct bucket *g = PTR_BUCKET(ca, ptr, false); ++ u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)]; + -+ if (gen_after(g->gc_gen, ptr->gen)) -+ g->gc_gen = ptr->gen; ++ if (gen_after(*gen, ptr->gen)) ++ *gen = ptr->gen; + } + percpu_up_read(&c->mark_lock); + @@ -14743,23 +15585,22 @@ index 000000000000..091bddee575d + * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree + * node pointers currently never have cached pointers that can become stale: + */ -+static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) ++static int bch2_gc_btree_gens(struct btree_trans *trans, enum btree_id btree_id) +{ -+ struct btree_trans trans; ++ struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_buf sk; + int ret = 0, commit_err = 0; + + bch2_bkey_buf_init(&sk); -+ bch2_trans_init(&trans, c, 0, 0); + -+ bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN, ++ bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN, + BTREE_ITER_PREFETCH| + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + -+ while ((bch2_trans_begin(&trans), ++ while ((bch2_trans_begin(trans), + k = bch2_btree_iter_peek(&iter)).k) { + ret = bkey_err(k); + @@ -14775,10 +15616,10 @@ index 000000000000..091bddee575d + bch2_extent_normalize(c, bkey_i_to_s(sk.k)); + + commit_err = -+ bch2_trans_update(&trans, &iter, sk.k, 0) ?: -+ bch2_trans_commit(&trans, NULL, NULL, -+ BTREE_INSERT_NOWAIT| -+ BTREE_INSERT_NOFAIL); ++ 
bch2_trans_update(trans, &iter, sk.k, 0) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOWAIT| ++ BTREE_INSERT_NOFAIL); + if (commit_err == -EINTR) { + commit_err = 0; + continue; @@ -14787,19 +15628,48 @@ index 000000000000..091bddee575d + + bch2_btree_iter_advance(&iter); + } -+ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_iter_exit(trans, &iter); + -+ bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); + + return ret; +} + ++static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode); ++ struct bkey_s_c k; ++ struct bch_alloc_v4 a; ++ struct bkey_i_alloc_v4 *a_mut; ++ int ret; ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ return ret; ++ ++ bch2_alloc_to_v4(k, &a); ++ ++ if (a.oldest_gen == ca->oldest_gen[iter->pos.offset]) ++ return 0; ++ ++ a_mut = bch2_alloc_to_v4_mut(trans, k); ++ ret = PTR_ERR_OR_ZERO(a_mut); ++ if (ret) ++ return ret; ++ ++ a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset]; ++ ++ return bch2_trans_update(trans, iter, &a_mut->k_i, 0); ++} ++ +int bch2_gc_gens(struct bch_fs *c) +{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; + struct bch_dev *ca; -+ struct bucket_array *buckets; -+ struct bucket *g; ++ u64 b, start_time = local_clock(); + unsigned i; + int ret; + @@ -14808,43 +15678,69 @@ index 000000000000..091bddee575d + * introduces a deadlock in the RO path - we currently take the state + * lock at the start of going RO, thus the gc thread may get stuck: + */ ++ if (!mutex_trylock(&c->gc_gens_lock)) ++ return 0; ++ + down_read(&c->gc_lock); ++ bch2_trans_init(&trans, c, 0, 0); + + for_each_member_device(ca, c, i) { -+ down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); ++ struct bucket_gens *gens; + -+ for_each_bucket(g, buckets) -+ g->gc_gen = g->mark.gen; -+ up_read(&ca->bucket_lock); ++ BUG_ON(ca->oldest_gen); ++ ++ ca->oldest_gen 
= kvmalloc(ca->mi.nbuckets, GFP_KERNEL); ++ if (!ca->oldest_gen) { ++ percpu_ref_put(&ca->ref); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ gens = bucket_gens(ca); ++ ++ for (b = gens->first_bucket; ++ b < gens->nbuckets; b++) ++ ca->oldest_gen[b] = gens->b[b]; + } + + for (i = 0; i < BTREE_ID_NR; i++) + if ((1 << i) & BTREE_ID_HAS_PTRS) { + c->gc_gens_btree = i; + c->gc_gens_pos = POS_MIN; -+ ret = bch2_gc_btree_gens(c, i); ++ ret = bch2_gc_btree_gens(&trans, i); + if (ret) { + bch_err(c, "error recalculating oldest_gen: %i", ret); + goto err; + } + } + -+ for_each_member_device(ca, c, i) { -+ down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); -+ -+ for_each_bucket(g, buckets) -+ g->oldest_gen = g->gc_gen; -+ up_read(&ca->bucket_lock); ++ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL, ++ bch2_alloc_write_oldest_gen(&trans, &iter)); ++ if (ret) { ++ bch_err(c, "error writing oldest_gen: %i", ret); ++ break; ++ } + } ++ bch2_trans_iter_exit(&trans, &iter); + + c->gc_gens_btree = 0; + c->gc_gens_pos = POS_MIN; + + c->gc_count++; ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); +err: ++ for_each_member_device(ca, c, i) { ++ kvfree(ca->oldest_gen); ++ ca->oldest_gen = NULL; ++ } ++ ++ bch2_trans_exit(&trans); + up_read(&c->gc_lock); ++ mutex_unlock(&c->gc_gens_lock); + return ret; +} + @@ -14938,10 +15834,10 @@ index 000000000000..091bddee575d +} diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h new file mode 100644 -index 000000000000..59dfb069e699 +index 000000000000..0665f5941fcc --- /dev/null +++ b/fs/bcachefs/btree_gc.h -@@ -0,0 +1,106 @@ +@@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_GC_H +#define _BCACHEFS_BTREE_GC_H @@ -14952,7 +15848,6 @@ index 000000000000..59dfb069e699 +int bch2_gc_gens(struct bch_fs *); +void bch2_gc_thread_stop(struct bch_fs *); +int 
bch2_gc_thread_start(struct bch_fs *); -+void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned); + +/* + * For concurrent mark and sweep (with other index updates), we define a total @@ -15050,10 +15945,10 @@ index 000000000000..59dfb069e699 +#endif /* _BCACHEFS_BTREE_GC_H */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c new file mode 100644 -index 000000000000..f11fcab61902 +index 000000000000..4b880ea59cad --- /dev/null +++ b/fs/bcachefs/btree_io.c -@@ -0,0 +1,2124 @@ +@@ -0,0 +1,2111 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -15447,16 +16342,10 @@ index 000000000000..f11fcab61902 + + bch2_btree_node_iter_init_from_start(&src_iter, src); + -+ if (btree_node_is_extents(src)) -+ nr = bch2_sort_repack_merge(c, btree_bset_first(dst), -+ src, &src_iter, -+ &dst->format, -+ true); -+ else -+ nr = bch2_sort_repack(btree_bset_first(dst), -+ src, &src_iter, -+ &dst->format, -+ true); ++ nr = bch2_sort_repack(btree_bset_first(dst), ++ src, &src_iter, ++ &dst->format, ++ true); + + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], + start_time); @@ -15539,7 +16428,7 @@ index 000000000000..f11fcab61902 + }; + + if (log_u64s[1] >= (log_u64s[0] + log_u64s[2]) / 2) { -+ bch2_btree_node_write(c, b, SIX_LOCK_write); ++ bch2_btree_node_write(c, b, SIX_LOCK_write, 0); + reinit_iter = true; + } + } @@ -15602,13 +16491,7 @@ index 000000000000..f11fcab61902 +#define btree_err(type, c, ca, b, i, msg, ...) 
\ +({ \ + __label__ out; \ -+ char _buf[300]; \ -+ char *_buf2 = _buf; \ -+ struct printbuf out = PBUF(_buf); \ -+ \ -+ _buf2 = kmalloc(4096, GFP_ATOMIC); \ -+ if (_buf2) \ -+ out = _PBUF(_buf2, 4986); \ ++ struct printbuf out = PRINTBUF; \ + \ + btree_err_msg(&out, c, ca, b, i, b->written, write); \ + pr_buf(&out, ": " msg, ##__VA_ARGS__); \ @@ -15616,13 +16499,13 @@ index 000000000000..f11fcab61902 + if (type == BTREE_ERR_FIXABLE && \ + write == READ && \ + !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \ -+ mustfix_fsck_err(c, "%s", _buf2); \ ++ mustfix_fsck_err(c, "%s", out.buf); \ + goto out; \ + } \ + \ + switch (write) { \ + case READ: \ -+ bch_err(c, "%s", _buf2); \ ++ bch_err(c, "%s", out.buf); \ + \ + switch (type) { \ + case BTREE_ERR_FIXABLE: \ @@ -15643,7 +16526,7 @@ index 000000000000..f11fcab61902 + } \ + break; \ + case WRITE: \ -+ bch_err(c, "corrupt metadata before write: %s", _buf2); \ ++ bch_err(c, "corrupt metadata before write: %s", out.buf);\ + \ + if (bch2_fs_inconsistent(c)) { \ + ret = BCH_FSCK_ERRORS_NOT_FIXED; \ @@ -15652,8 +16535,7 @@ index 000000000000..f11fcab61902 + break; \ + } \ +out: \ -+ if (_buf2 != _buf) \ -+ kfree(_buf2); \ ++ printbuf_exit(&out); \ + true; \ +}) + @@ -15714,8 +16596,8 @@ index 000000000000..f11fcab61902 +{ + unsigned version = le16_to_cpu(i->version); + const char *err; -+ char buf1[100]; -+ char buf2[100]; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; + int ret = 0; + + btree_err_on((version != BCH_BSET_VERSION_OLD && @@ -15748,11 +16630,12 @@ index 000000000000..f11fcab61902 + BTREE_ERR_FATAL, c, ca, b, i, + "BSET_SEPARATE_WHITEOUTS no longer supported"); + -+ if (btree_err_on(offset + sectors > c->opts.btree_node_size, ++ if (btree_err_on(offset + sectors > btree_sectors(c), + BTREE_ERR_FIXABLE, c, ca, b, i, + "bset past end of btree node")) { + i->u64s = 0; -+ return 0; ++ ret = 0; ++ goto out; + } + + btree_err_on(offset && !i->u64s, @@ -15803,14 +16686,17 @@ index 
000000000000..f11fcab61902 + btree_err_on(bpos_cmp(b->data->min_key, bp->min_key), + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + "incorrect min_key: got %s should be %s", -+ (bch2_bpos_to_text(&PBUF(buf1), bn->min_key), buf1), -+ (bch2_bpos_to_text(&PBUF(buf2), bp->min_key), buf2)); ++ (printbuf_reset(&buf1), ++ bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf), ++ (printbuf_reset(&buf2), ++ bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf)); + } + + btree_err_on(bpos_cmp(bn->max_key, b->key.k.p), + BTREE_ERR_MUST_RETRY, c, ca, b, i, + "incorrect max key %s", -+ (bch2_bpos_to_text(&PBUF(buf1), bn->max_key), buf1)); ++ (printbuf_reset(&buf1), ++ bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf)); + + if (write) + compat_btree_node(b->c.level, b->c.btree_id, version, @@ -15825,7 +16711,10 @@ index 000000000000..f11fcab61902 + BSET_BIG_ENDIAN(i), write, + &bn->format); + } ++out: +fsck_err: ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); + return ret; +} + @@ -15835,6 +16724,8 @@ index 000000000000..f11fcab61902 +{ + unsigned version = le16_to_cpu(i->version); + struct bkey_packed *k, *prev = NULL; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; + bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && + BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); + int ret = 0; @@ -15873,11 +16764,10 @@ index 000000000000..f11fcab61902 + (!updated_range ? bch2_bkey_in_btree_node(b, u.s_c) : NULL) ?: + (write ? 
bch2_bkey_val_invalid(c, u.s_c) : NULL); + if (invalid) { -+ char buf[160]; -+ -+ bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); ++ printbuf_reset(&buf1); ++ bch2_bkey_val_to_text(&buf1, c, u.s_c); + btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, -+ "invalid bkey: %s\n%s", invalid, buf); ++ "invalid bkey: %s\n%s", invalid, buf1.buf); + + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_next(k), @@ -15891,18 +16781,18 @@ index 000000000000..f11fcab61902 + &b->format, k); + + if (prev && bkey_iter_cmp(b, prev, k) > 0) { -+ char buf1[80]; -+ char buf2[80]; + struct bkey up = bkey_unpack_key(b, prev); + -+ bch2_bkey_to_text(&PBUF(buf1), &up); -+ bch2_bkey_to_text(&PBUF(buf2), u.k); ++ printbuf_reset(&buf1); ++ bch2_bkey_to_text(&buf1, &up); ++ printbuf_reset(&buf2); ++ bch2_bkey_to_text(&buf2, u.k); + + bch2_dump_bset(c, b, i, 0); + + if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, + "keys out of order: %s > %s", -+ buf1, buf2)) { ++ buf1.buf, buf2.buf)) { + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_next(k), + (u64 *) vstruct_end(i) - (u64 *) k); @@ -15914,6 +16804,8 @@ index 000000000000..f11fcab61902 + k = bkey_next(k); + } +fsck_err: ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); + return ret; +} + @@ -15946,11 +16838,12 @@ index 000000000000..f11fcab61902 + + btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, -+ "bad magic"); ++ "bad magic: want %llx, got %llx", ++ bset_magic(c), le64_to_cpu(b->data->magic)); + + btree_err_on(!b->data->keys.seq, + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, -+ "bad btree header"); ++ "bad btree header: seq 0"); + + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { + struct bch_btree_ptr_v2 *bp = @@ -15962,7 +16855,7 @@ index 000000000000..f11fcab61902 + b->data->keys.seq, bp->seq); + } + -+ while (b->written < (ptr_written ?: c->opts.btree_node_size)) { ++ while (b->written < (ptr_written ?: btree_sectors(c))) { + unsigned 
sectors, whiteout_u64s = 0; + struct nonce nonce; + struct bch_csum csum; @@ -15983,9 +16876,12 @@ index 000000000000..f11fcab61902 + BTREE_ERR_WANT_RETRY, c, ca, b, i, + "invalid checksum"); + -+ bset_encrypt(c, i, b->written << 9); ++ ret = bset_encrypt(c, i, b->written << 9); ++ if (bch2_fs_fatal_err_on(ret, c, ++ "error decrypting btree node: %i", ret)) ++ goto fsck_err; + -+ btree_err_on(btree_node_is_extents(b) && ++ btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && + !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), + BTREE_ERR_FATAL, c, NULL, b, NULL, + "btree node does not have NEW_EXTENT_OVERWRITE set"); @@ -16010,7 +16906,10 @@ index 000000000000..f11fcab61902 + BTREE_ERR_WANT_RETRY, c, ca, b, i, + "invalid checksum"); + -+ bset_encrypt(c, i, b->written << 9); ++ ret = bset_encrypt(c, i, b->written << 9); ++ if (bch2_fs_fatal_err_on(ret, c, ++ "error decrypting btree node: %i\n", ret)) ++ goto fsck_err; + + sectors = vstruct_sectors(bne, c->block_bits); + } @@ -16033,19 +16932,23 @@ index 000000000000..f11fcab61902 + + SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); + -+ b->written += sectors; -+ + blacklisted = bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(i->journal_seq), + true); + + btree_err_on(blacklisted && first, + BTREE_ERR_FIXABLE, c, ca, b, i, -+ "first btree node bset has blacklisted journal seq"); ++ "first btree node bset has blacklisted journal seq (%llu)", ++ le64_to_cpu(i->journal_seq)); + + btree_err_on(blacklisted && ptr_written, + BTREE_ERR_FIXABLE, c, ca, b, i, -+ "found blacklisted bset in btree node with sectors_written"); ++ "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u", ++ le64_to_cpu(i->journal_seq), ++ b->written, b->written + sectors, ptr_written); ++ ++ b->written += sectors; ++ + if (blacklisted && !first) + continue; + @@ -16118,11 +17021,12 @@ index 000000000000..f11fcab61902 + if (invalid || + (bch2_inject_invalid_keys && + !bversion_cmp(u.k->version, MAX_VERSION))) { -+ char buf[160]; ++ 
struct printbuf buf = PRINTBUF; + -+ bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); ++ bch2_bkey_val_to_text(&buf, c, u.s_c); + btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, -+ "invalid bkey %s: %s", buf, invalid); ++ "invalid bkey %s: %s", buf.buf, invalid); ++ printbuf_exit(&buf); + + btree_keys_account_key_drop(&b->nr, 0, k); + @@ -16179,8 +17083,7 @@ index 000000000000..f11fcab61902 + struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + struct bio *bio = &rb->bio; + struct bch_io_failures failed = { .nr = 0 }; -+ char buf[200]; -+ struct printbuf out; ++ struct printbuf buf = PRINTBUF; + bool saw_error = false; + bool can_retry; + @@ -16201,10 +17104,10 @@ index 000000000000..f11fcab61902 + bio->bi_status = BLK_STS_REMOVED; + } +start: -+ out = PBUF(buf); -+ btree_pos_to_text(&out, c, b); ++ printbuf_reset(&buf); ++ btree_pos_to_text(&buf, c, b); + bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s", -+ bch2_blk_status_to_str(bio->bi_status), buf); ++ bch2_blk_status_to_str(bio->bi_status), buf.buf); + if (rb->have_ioref) + percpu_ref_put(&ca->io_ref); + rb->have_ioref = false; @@ -16230,6 +17133,7 @@ index 000000000000..f11fcab61902 + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], + rb->start_time); + bio_put(&rb->bio); ++ printbuf_exit(&buf); + + if (saw_error && !btree_node_read_error(b)) + bch2_btree_node_rewrite_async(c, b); @@ -16271,7 +17175,7 @@ index 000000000000..f11fcab61902 + if (le64_to_cpu(bn->magic) != bset_magic(c)) + return 0; + -+ while (offset < c->opts.btree_node_size) { ++ while (offset < btree_sectors(c)) { + if (!offset) { + offset += vstruct_sectors(bn, c->block_bits); + } else { @@ -16293,7 +17197,7 @@ index 000000000000..f11fcab61902 + if (!offset) + return false; + -+ while (offset < c->opts.btree_node_size) { ++ while (offset < btree_sectors(c)) { + bne = data + (offset << 9); + if (bne->keys.seq == bn->keys.seq) + return true; @@ -16310,6 +17214,7 @@ index 000000000000..f11fcab61902 + 
container_of(cl, struct btree_node_read_all, cl); + struct bch_fs *c = ra->c; + struct btree *b = ra->b; ++ struct printbuf buf = PRINTBUF; + bool dump_bset_maps = false; + bool have_retry = false; + int ret = 0, best = -1, write = READ; @@ -16353,8 +17258,6 @@ index 000000000000..f11fcab61902 +fsck_err: + if (dump_bset_maps) { + for (i = 0; i < ra->nr; i++) { -+ char buf[200]; -+ struct printbuf out = PBUF(buf); + struct btree_node *bn = ra->buf[i]; + struct btree_node_entry *bne = NULL; + unsigned offset = 0, sectors; @@ -16363,7 +17266,9 @@ index 000000000000..f11fcab61902 + if (ra->err[i]) + continue; + -+ while (offset < c->opts.btree_node_size) { ++ printbuf_reset(&buf); ++ ++ while (offset < btree_sectors(c)) { + if (!offset) { + sectors = vstruct_sectors(bn, c->block_bits); + } else { @@ -16373,30 +17278,30 @@ index 000000000000..f11fcab61902 + sectors = vstruct_sectors(bne, c->block_bits); + } + -+ pr_buf(&out, " %u-%u", offset, offset + sectors); ++ pr_buf(&buf, " %u-%u", offset, offset + sectors); + if (bne && bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(bne->keys.journal_seq), false)) -+ pr_buf(&out, "*"); ++ pr_buf(&buf, "*"); + offset += sectors; + } + -+ while (offset < c->opts.btree_node_size) { ++ while (offset < btree_sectors(c)) { + bne = ra->buf[i] + (offset << 9); + if (bne->keys.seq == bn->keys.seq) { + if (!gap) -+ pr_buf(&out, " GAP"); ++ pr_buf(&buf, " GAP"); + gap = true; + + sectors = vstruct_sectors(bne, c->block_bits); -+ pr_buf(&out, " %u-%u", offset, offset + sectors); ++ pr_buf(&buf, " %u-%u", offset, offset + sectors); + if (bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(bne->keys.journal_seq), false)) -+ pr_buf(&out, "*"); ++ pr_buf(&buf, "*"); + } + offset++; + } + -+ bch_err(c, "replica %u:%s", i, buf); ++ bch_err(c, "replica %u:%s", i, buf.buf); + } + } + @@ -16417,6 +17322,7 @@ index 000000000000..f11fcab61902 + + closure_debug_destroy(&ra->cl); + kfree(ra); ++ printbuf_exit(&buf); + + clear_btree_node_read_in_flight(b); 
+ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); @@ -16516,23 +17422,23 @@ index 000000000000..f11fcab61902 + struct btree_read_bio *rb; + struct bch_dev *ca; + struct bio *bio; -+ char buf[200]; ++ struct printbuf buf = PRINTBUF; + int ret; + -+ btree_pos_to_text(&PBUF(buf), c, b); ++ btree_pos_to_text(&buf, c, b); + trace_btree_read(c, b); + + if (bch2_verify_all_btree_replicas && + !btree_node_read_all_replicas(c, b, sync)) -+ return; ++ goto out; + + ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), + NULL, &pick); + if (bch2_fs_fatal_err_on(ret <= 0, c, + "btree node read error: no device to read from\n" -+ " at %s", buf)) { ++ " at %s", buf.buf)) { + set_btree_node_read_error(b); -+ return; ++ goto out; + } + + ca = bch_dev_bkey_exists(c, pick.ptr.dev); @@ -16573,6 +17479,8 @@ index 000000000000..f11fcab61902 + else + queue_work(c->io_complete_wq, &rb->work); + } ++out: ++ printbuf_exit(&buf); +} + +int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, @@ -16589,7 +17497,7 @@ index 000000000000..f11fcab61902 + closure_sync(&cl); + } while (ret); + -+ b = bch2_btree_node_mem_alloc(c); ++ b = bch2_btree_node_mem_alloc(c, level != 0); + bch2_btree_cache_cannibalize_unlock(c); + + BUG_ON(IS_ERR(b)); @@ -16639,7 +17547,7 @@ index 000000000000..f11fcab61902 + bch2_journal_pin_drop(&c->journal, &w->journal); +} + -+static void btree_node_write_done(struct bch_fs *c, struct btree *b) ++static void __btree_node_write_done(struct bch_fs *c, struct btree *b) +{ + struct btree_write *w = btree_prev_write(b); + unsigned long old, new, v; @@ -16650,26 +17558,11 @@ index 000000000000..f11fcab61902 + do { + old = new = v; + -+ if (old & (1U << BTREE_NODE_need_write)) -+ goto do_write; -+ -+ new &= ~(1U << BTREE_NODE_write_in_flight); -+ new &= ~(1U << BTREE_NODE_write_in_flight_inner); -+ } while ((v = cmpxchg(&b->flags, old, new)) != old); -+ -+ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); -+ return; -+ -+do_write: -+ six_lock_read(&b->c.lock, 
NULL, NULL); -+ v = READ_ONCE(b->flags); -+ do { -+ old = new = v; -+ + if ((old & (1U << BTREE_NODE_dirty)) && + (old & (1U << BTREE_NODE_need_write)) && + !(old & (1U << BTREE_NODE_never_write)) && -+ btree_node_may_write(b)) { ++ !(old & (1U << BTREE_NODE_write_blocked)) && ++ !(old & (1U << BTREE_NODE_will_make_reachable))) { + new &= ~(1U << BTREE_NODE_dirty); + new &= ~(1U << BTREE_NODE_need_write); + new |= (1U << BTREE_NODE_write_in_flight); @@ -16683,8 +17576,15 @@ index 000000000000..f11fcab61902 + } while ((v = cmpxchg(&b->flags, old, new)) != old); + + if (new & (1U << BTREE_NODE_write_in_flight)) -+ __bch2_btree_node_write(c, b, true); ++ __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED); ++ else ++ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); ++} + ++static void btree_node_write_done(struct bch_fs *c, struct btree *b) ++{ ++ six_lock_read(&b->c.lock, NULL, NULL); ++ __btree_node_write_done(c, b); + six_unlock_read(&b->c.lock); +} + @@ -16799,7 +17699,7 @@ index 000000000000..f11fcab61902 + bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &tmp.k); +} + -+void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_started) ++void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) +{ + struct btree_write_bio *wbio; + struct bset_tree *t; @@ -16814,13 +17714,11 @@ index 000000000000..f11fcab61902 + unsigned long old, new; + bool validate_before_checksum = false; + void *data; ++ int ret; + -+ if (already_started) ++ if (flags & BTREE_WRITE_ALREADY_STARTED) + goto do_write; + -+ if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) -+ return; -+ + /* + * We may only have a read lock on the btree node - the dirty bit is our + * "lock" against racing with other threads that may be trying to start @@ -16834,13 +17732,21 @@ index 000000000000..f11fcab61902 + if (!(old & (1 << BTREE_NODE_dirty))) + return; + -+ if (!btree_node_may_write(b)) ++ if ((flags & BTREE_WRITE_ONLY_IF_NEED) && ++ 
!(old & (1 << BTREE_NODE_need_write))) + return; + -+ if (old & (1 << BTREE_NODE_never_write)) ++ if (old & ++ ((1 << BTREE_NODE_never_write)| ++ (1 << BTREE_NODE_write_blocked))) + return; + -+ BUG_ON(old & (1 << BTREE_NODE_write_in_flight)); ++ if (b->written && ++ (old & (1 << BTREE_NODE_will_make_reachable))) ++ return; ++ ++ if (old & (1 << BTREE_NODE_write_in_flight)) ++ return; + + new &= ~(1 << BTREE_NODE_dirty); + new &= ~(1 << BTREE_NODE_need_write); @@ -16858,8 +17764,8 @@ index 000000000000..f11fcab61902 + BUG_ON(btree_node_fake(b)); + BUG_ON((b->will_make_reachable != 0) != !b->written); + -+ BUG_ON(b->written >= c->opts.btree_node_size); -+ BUG_ON(b->written & (c->opts.block_size - 1)); ++ BUG_ON(b->written >= btree_sectors(c)); ++ BUG_ON(b->written & (block_sectors(c) - 1)); + BUG_ON(bset_written(b, btree_bset_last(b))); + BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c)); + BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format))); @@ -16932,11 +17838,11 @@ index 000000000000..f11fcab61902 + memset(data + bytes_to_write, 0, + (sectors_to_write << 9) - bytes_to_write); + -+ BUG_ON(b->written + sectors_to_write > c->opts.btree_node_size); ++ BUG_ON(b->written + sectors_to_write > btree_sectors(c)); + BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); + BUG_ON(i->seq != b->data->keys.seq); + -+ i->version = c->sb.version < bcachefs_metadata_version_new_versioning ++ i->version = c->sb.version < bcachefs_metadata_version_bkey_renumber + ? 
cpu_to_le16(BCH_BSET_VERSION_OLD) + : cpu_to_le16(c->sb.version); + SET_BSET_OFFSET(i, b->written); @@ -16954,7 +17860,10 @@ index 000000000000..f11fcab61902 + validate_bset_for_write(c, b, i, sectors_to_write)) + goto err; + -+ bset_encrypt(c, i, b->written << 9); ++ ret = bset_encrypt(c, i, b->written << 9); ++ if (bch2_fs_fatal_err_on(ret, c, ++ "error encrypting btree node: %i\n", ret)) ++ goto err; + + nonce = btree_nonce(i, b->written << 9); + @@ -17037,7 +17946,7 @@ index 000000000000..f11fcab61902 + b->written += sectors_to_write; +nowrite: + btree_bounce_free(c, bytes, used_mempool, data); -+ btree_node_write_done(c, b); ++ __btree_node_write_done(c, b); +} + +/* @@ -17100,12 +18009,13 @@ index 000000000000..f11fcab61902 + * Use this one if the node is intent locked: + */ +void bch2_btree_node_write(struct bch_fs *c, struct btree *b, -+ enum six_lock_type lock_type_held) ++ enum six_lock_type lock_type_held, ++ unsigned flags) +{ + if (lock_type_held == SIX_LOCK_intent || + (lock_type_held == SIX_LOCK_read && + six_lock_tryupgrade(&b->c.lock))) { -+ __bch2_btree_node_write(c, b, false); ++ __bch2_btree_node_write(c, b, flags); + + /* don't cycle lock unnecessarily: */ + if (btree_node_just_written(b) && @@ -17117,7 +18027,7 @@ index 000000000000..f11fcab61902 + if (lock_type_held == SIX_LOCK_read) + six_lock_downgrade(&b->c.lock); + } else { -+ __bch2_btree_node_write(c, b, false); ++ __bch2_btree_node_write(c, b, flags); + if (lock_type_held == SIX_LOCK_write && + btree_node_just_written(b)) + bch2_btree_post_write_cleanup(c, b); @@ -17137,7 +18047,6 @@ index 000000000000..f11fcab61902 + rcu_read_unlock(); + wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE); + goto restart; -+ + } + rcu_read_unlock(); +} @@ -17151,39 +18060,12 @@ index 000000000000..f11fcab61902 +{ + __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); +} -+ -+void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ struct bucket_table *tbl; -+ struct 
rhash_head *pos; -+ struct btree *b; -+ unsigned i; -+ -+ rcu_read_lock(); -+ for_each_cached_btree(b, c, tbl, i, pos) { -+ unsigned long flags = READ_ONCE(b->flags); -+ -+ if (!(flags & (1 << BTREE_NODE_dirty))) -+ continue; -+ -+ pr_buf(out, "%p d %u n %u l %u w %u b %u r %u:%lu\n", -+ b, -+ (flags & (1 << BTREE_NODE_dirty)) != 0, -+ (flags & (1 << BTREE_NODE_need_write)) != 0, -+ b->c.level, -+ b->written, -+ !list_empty_careful(&b->write_blocked), -+ b->will_make_reachable != 0, -+ b->will_make_reachable & 1); -+ } -+ rcu_read_unlock(); -+} diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h new file mode 100644 -index 000000000000..0f20224e2a77 +index 000000000000..d818d87661e8 --- /dev/null +++ b/fs/bcachefs/btree_io.h -@@ -0,0 +1,248 @@ +@@ -0,0 +1,222 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_IO_H +#define _BCACHEFS_BTREE_IO_H @@ -17201,18 +18083,13 @@ index 000000000000..0f20224e2a77 +struct btree_iter; +struct btree_node_read_all; + -+static inline bool btree_node_dirty(struct btree *b) -+{ -+ return test_bit(BTREE_NODE_dirty, &b->flags); -+} -+ -+static inline void set_btree_node_dirty(struct bch_fs *c, struct btree *b) ++static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) +{ + if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags)) + atomic_inc(&c->btree_cache.dirty); +} + -+static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b) ++static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) +{ + if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags)) + atomic_dec(&c->btree_cache.dirty); @@ -17253,12 +18130,6 @@ index 000000000000..0f20224e2a77 +void bch2_btree_node_wait_on_read(struct btree *); +void bch2_btree_node_wait_on_write(struct btree *); + -+static inline bool btree_node_may_write(struct btree *b) -+{ -+ return list_empty_careful(&b->write_blocked) && -+ (!b->written || !b->will_make_reachable); -+} -+ +enum compact_mode { + COMPACT_LAZY, + 
COMPACT_ALL, @@ -17297,22 +18168,25 @@ index 000000000000..0f20224e2a77 + }}; +} + -+static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) ++static inline int bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) +{ + struct nonce nonce = btree_nonce(i, offset); ++ int ret; + + if (!offset) { + struct btree_node *bn = container_of(i, struct btree_node, keys); + unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; + -+ bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, -+ bytes); ++ ret = bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, ++ &bn->flags, bytes); ++ if (ret) ++ return ret; + + nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); + } + -+ bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, -+ vstruct_end(i) - (void *) i->_data); ++ return bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, ++ vstruct_end(i) - (void *) i->_data); +} + +void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); @@ -17331,41 +18205,23 @@ index 000000000000..0f20224e2a77 +void bch2_btree_complete_write(struct bch_fs *, struct btree *, + struct btree_write *); + -+void __bch2_btree_node_write(struct bch_fs *, struct btree *, bool); +bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); + ++#define BTREE_WRITE_ONLY_IF_NEED (1U << 0) ++#define BTREE_WRITE_ALREADY_STARTED (1U << 1) ++ ++void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned); +void bch2_btree_node_write(struct bch_fs *, struct btree *, -+ enum six_lock_type); ++ enum six_lock_type, unsigned); + +static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, + enum six_lock_type lock_held) +{ -+ if (b->written && -+ btree_node_need_write(b) && -+ btree_node_may_write(b) && -+ !btree_node_write_in_flight(b)) -+ bch2_btree_node_write(c, b, lock_held); ++ bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED); +} + -+#define bch2_btree_node_write_cond(_c, _b, cond) \ -+do { \ -+ unsigned long 
old, new, v = READ_ONCE((_b)->flags); \ -+ \ -+ do { \ -+ old = new = v; \ -+ \ -+ if (!(old & (1 << BTREE_NODE_dirty)) || !(cond)) \ -+ break; \ -+ \ -+ new |= (1 << BTREE_NODE_need_write); \ -+ } while ((v = cmpxchg(&(_b)->flags, old, new)) != old); \ -+ \ -+ btree_node_write_if_need(_c, _b, SIX_LOCK_read); \ -+} while (0) -+ +void bch2_btree_flush_all_reads(struct bch_fs *); +void bch2_btree_flush_all_writes(struct bch_fs *); -+void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *); + +static inline void compat_bformat(unsigned level, enum btree_id btree_id, + unsigned version, unsigned big_endian, @@ -17434,10 +18290,10 @@ index 000000000000..0f20224e2a77 +#endif /* _BCACHEFS_BTREE_IO_H */ diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c new file mode 100644 -index 000000000000..f43044e6fa37 +index 000000000000..25d254ee9eac --- /dev/null +++ b/fs/bcachefs/btree_iter.c -@@ -0,0 +1,2960 @@ +@@ -0,0 +1,3329 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -17452,6 +18308,7 @@ index 000000000000..f43044e6fa37 +#include "error.h" +#include "extents.h" +#include "journal.h" ++#include "recovery.h" +#include "replicas.h" +#include "subvolume.h" + @@ -17459,12 +18316,21 @@ index 000000000000..f43044e6fa37 +#include + +static void btree_trans_verify_sorted(struct btree_trans *); -+static void btree_path_check_sort(struct btree_trans *, struct btree_path *, int); ++inline void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int); + +static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *); +static inline void btree_path_list_add(struct btree_trans *, struct btree_path *, + struct btree_path *); + ++static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ return iter->ip_allocated; ++#else ++ return 0; ++#endif ++} ++ +static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *); + +/* @@ 
-17488,6 +18354,9 @@ index 000000000000..f43044e6fa37 + struct bpos r_pos, + unsigned r_level) +{ ++ /* ++ * Must match lock ordering as defined by __bch2_btree_node_lock: ++ */ + return cmp_int(l->btree_id, r_btree_id) ?: + cmp_int((int) l->cached, (int) r_cached) ?: + bpos_cmp(l->pos, r_pos) ?: @@ -17586,11 +18455,19 @@ index 000000000000..f43044e6fa37 + * goes to 0, and it's safe because we have the node intent + * locked: + */ -+ atomic64_sub(__SIX_VAL(read_lock, readers), -+ &b->c.lock.state.counter); -+ btree_node_lock_type(trans->c, b, SIX_LOCK_write); -+ atomic64_add(__SIX_VAL(read_lock, readers), -+ &b->c.lock.state.counter); ++ if (!b->c.lock.readers) ++ atomic64_sub(__SIX_VAL(read_lock, readers), ++ &b->c.lock.state.counter); ++ else ++ this_cpu_sub(*b->c.lock.readers, readers); ++ ++ six_lock_write(&b->c.lock, NULL, NULL); ++ ++ if (!b->c.lock.readers) ++ atomic64_add(__SIX_VAL(read_lock, readers), ++ &b->c.lock.state.counter); ++ else ++ this_cpu_add(*b->c.lock.readers, readers); +} + +bool __bch2_btree_node_relock(struct btree_trans *trans, @@ -17600,19 +18477,25 @@ index 000000000000..f43044e6fa37 + int want = __btree_lock_want(path, level); + + if (!is_btree_node(path, level)) -+ return false; ++ goto fail; + + if (race_fault()) -+ return false; ++ goto fail; + + if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || + (btree_node_lock_seq_matches(path, b, level) && + btree_node_lock_increment(trans, b, level, want))) { -+ mark_btree_node_locked(path, level, want); ++ mark_btree_node_locked(trans, path, level, want); + return true; -+ } else { -+ return false; + } ++fail: ++ trace_btree_node_relock_fail(trans->fn, _RET_IP_, ++ path->btree_id, ++ &path->pos, ++ (unsigned long) b, ++ path->l[level].lock_seq, ++ is_btree_node(path, level) ? 
b->c.lock.state.seq : 0); ++ return false; +} + +bool bch2_btree_node_upgrade(struct btree_trans *trans, @@ -17653,13 +18536,13 @@ index 000000000000..f43044e6fa37 + + return false; +success: -+ mark_btree_node_intent_locked(path, level); ++ mark_btree_node_intent_locked(trans, path, level); + return true; +} + +static inline bool btree_path_get_locks(struct btree_trans *trans, + struct btree_path *path, -+ bool upgrade, unsigned long trace_ip) ++ bool upgrade) +{ + unsigned l = path->level; + int fail_idx = -1; @@ -17716,10 +18599,8 @@ index 000000000000..f43044e6fa37 + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) +{ -+ struct btree_path *linked, *deadlock_path = NULL; -+ u64 start_time = local_clock(); -+ unsigned reason = 9; -+ bool ret; ++ struct btree_path *linked; ++ unsigned reason; + + /* Check if it's safe to block: */ + trans_for_each_path(trans, linked) { @@ -17740,28 +18621,28 @@ index 000000000000..f43044e6fa37 + */ + if (type == SIX_LOCK_intent && + linked->nodes_locked != linked->nodes_intent_locked) { -+ deadlock_path = linked; + reason = 1; ++ goto deadlock; + } + + if (linked->btree_id != path->btree_id) { -+ if (linked->btree_id > path->btree_id) { -+ deadlock_path = linked; -+ reason = 3; -+ } -+ continue; ++ if (linked->btree_id < path->btree_id) ++ continue; ++ ++ reason = 3; ++ goto deadlock; + } + + /* -+ * Within the same btree, cached paths come before non -+ * cached paths: ++ * Within the same btree, non-cached paths come before cached ++ * paths: + */ + if (linked->cached != path->cached) { -+ if (path->cached) { -+ deadlock_path = linked; -+ reason = 4; -+ } -+ continue; ++ if (!linked->cached) ++ continue; ++ ++ reason = 4; ++ goto deadlock; + } + + /* @@ -17770,53 +18651,33 @@ index 000000000000..f43044e6fa37 + * we're about to lock, it must have the ancestors locked too: + */ + if (level > __fls(linked->nodes_locked)) { -+ deadlock_path = linked; + reason = 5; ++ goto deadlock; + } + + /* Must lock btree 
nodes in key order: */ + if (btree_node_locked(linked, level) && + bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b, + linked->cached)) <= 0) { -+ deadlock_path = linked; -+ reason = 7; + BUG_ON(trans->in_traverse_all); ++ reason = 7; ++ goto deadlock; + } + } + -+ if (unlikely(deadlock_path)) { -+ trace_trans_restart_would_deadlock(trans->ip, ip, -+ trans->in_traverse_all, reason, -+ deadlock_path->btree_id, -+ deadlock_path->cached, -+ &deadlock_path->pos, -+ path->btree_id, -+ path->cached, -+ &pos); -+ btree_trans_restart(trans); -+ return false; -+ } -+ -+ if (six_trylock_type(&b->c.lock, type)) -+ return true; -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ trans->locking_path_idx = path->idx; -+ trans->locking_pos = pos; -+ trans->locking_btree_id = path->btree_id; -+ trans->locking_level = level; -+ trans->locking = b; -+#endif -+ -+ ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0; -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ trans->locking = NULL; -+#endif -+ if (ret) -+ bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)], -+ start_time); -+ return ret; ++ return btree_node_lock_type(trans, path, b, pos, level, ++ type, should_sleep_fn, p); ++deadlock: ++ trace_trans_restart_would_deadlock(trans->fn, ip, ++ trans->in_traverse_all, reason, ++ linked->btree_id, ++ linked->cached, ++ &linked->pos, ++ path->btree_id, ++ path->cached, ++ &pos); ++ btree_trans_restart(trans); ++ return false; +} + +/* Btree iterator locking: */ @@ -17865,6 +18726,8 @@ index 000000000000..f43044e6fa37 + if (!bch2_btree_node_relock(trans, path, l)) { + __bch2_btree_path_unlock(path); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); ++ trace_trans_restart_relock_path_intent(trans->fn, _RET_IP_, ++ path->btree_id, &path->pos); + btree_trans_restart(trans); + return false; + } @@ -17877,10 +18740,13 @@ index 000000000000..f43044e6fa37 +static bool bch2_btree_path_relock(struct btree_trans *trans, + struct btree_path *path, unsigned long trace_ip) +{ -+ bool 
ret = btree_path_get_locks(trans, path, false, trace_ip); ++ bool ret = btree_path_get_locks(trans, path, false); + -+ if (!ret) ++ if (!ret) { ++ trace_trans_restart_relock_path(trans->fn, trace_ip, ++ path->btree_id, &path->pos); + btree_trans_restart(trans); ++ } + return ret; +} + @@ -17894,7 +18760,7 @@ index 000000000000..f43044e6fa37 + + path->locks_want = new_locks_want; + -+ if (btree_path_get_locks(trans, path, true, _THIS_IP_)) ++ if (btree_path_get_locks(trans, path, true)) + return true; + + /* @@ -17916,14 +18782,15 @@ index 000000000000..f43044e6fa37 + * before interior nodes - now that's handled by + * bch2_btree_path_traverse_all(). + */ -+ trans_for_each_path(trans, linked) -+ if (linked != path && -+ linked->cached == path->cached && -+ linked->btree_id == path->btree_id && -+ linked->locks_want < new_locks_want) { -+ linked->locks_want = new_locks_want; -+ btree_path_get_locks(trans, linked, true, _THIS_IP_); -+ } ++ if (!path->cached && !trans->in_traverse_all) ++ trans_for_each_path(trans, linked) ++ if (linked != path && ++ linked->cached == path->cached && ++ linked->btree_id == path->btree_id && ++ linked->locks_want < new_locks_want) { ++ linked->locks_want = new_locks_want; ++ btree_path_get_locks(trans, linked, true); ++ } + + return false; +} @@ -17973,7 +18840,7 @@ index 000000000000..f43044e6fa37 + trans_for_each_path(trans, path) + if (path->should_be_locked && + !bch2_btree_path_relock(trans, path, _RET_IP_)) { -+ trace_trans_restart_relock(trans->ip, _RET_IP_, ++ trace_trans_restart_relock(trans->fn, _RET_IP_, + path->btree_id, &path->pos); + BUG_ON(!trans->restarted); + return false; @@ -17988,7 +18855,12 @@ index 000000000000..f43044e6fa37 + trans_for_each_path(trans, path) + __bch2_btree_path_unlock(path); + -+ BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); ++ /* ++ * bch2_gc_btree_init_recurse() doesn't use btree iterators for walking ++ * btree nodes, it implements its own walking: ++ */ ++ BUG_ON(!trans->is_initial_gc 
&& ++ lock_class_is_held(&bch2_btree_node_lock_key)); +} + +/* Btree iterator: */ @@ -18019,7 +18891,9 @@ index 000000000000..f43044e6fa37 + struct btree_node_iter tmp; + bool locked; + struct bkey_packed *p, *k; -+ char buf1[100], buf2[100], buf3[100]; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ struct printbuf buf3 = PRINTBUF; + const char *msg; + + if (!bch2_debug_check_iterators) @@ -18067,26 +18941,27 @@ index 000000000000..f43044e6fa37 + btree_node_unlock(path, level); + return; +err: -+ strcpy(buf2, "(none)"); -+ strcpy(buf3, "(none)"); -+ -+ bch2_bpos_to_text(&PBUF(buf1), path->pos); ++ bch2_bpos_to_text(&buf1, path->pos); + + if (p) { + struct bkey uk = bkey_unpack_key(l->b, p); -+ bch2_bkey_to_text(&PBUF(buf2), &uk); ++ bch2_bkey_to_text(&buf2, &uk); ++ } else { ++ pr_buf(&buf2, "(none)"); + } + + if (k) { + struct bkey uk = bkey_unpack_key(l->b, k); -+ bch2_bkey_to_text(&PBUF(buf3), &uk); ++ bch2_bkey_to_text(&buf3, &uk); ++ } else { ++ pr_buf(&buf3, "(none)"); + } + + panic("path should be %s key at level %u:\n" + "path pos %s\n" + "prev key %s\n" + "cur key %s\n", -+ msg, level, buf1, buf2, buf3); ++ msg, level, buf1.buf, buf2.buf, buf3.buf); +} + +static void bch2_btree_path_verify(struct btree_trans *trans, @@ -18126,9 +19001,6 @@ index 000000000000..f43044e6fa37 + + BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached); + -+ BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && -+ iter->pos.snapshot != iter->snapshot); -+ + BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && + (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); + @@ -18136,6 +19008,8 @@ index 000000000000..f43044e6fa37 + (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + !btree_type_has_snapshots(iter->btree_id)); + ++ if (iter->update_path) ++ bch2_btree_path_verify(trans, iter->update_path); + bch2_btree_path_verify(trans, iter->path); +} + @@ -18172,6 +19046,7 @@ index 000000000000..f43044e6fa37 + k.k->p.snapshot)); + + bch2_trans_iter_init(trans, ©, 
iter->btree_id, iter->pos, ++ BTREE_ITER_NOPRESERVE| + BTREE_ITER_ALL_SNAPSHOTS); + prev = bch2_btree_iter_prev(©); + if (!prev.k) @@ -18184,16 +19059,16 @@ index 000000000000..f43044e6fa37 + if (!bkey_cmp(prev.k->p, k.k->p) && + bch2_snapshot_is_ancestor(trans->c, iter->snapshot, + prev.k->p.snapshot) > 0) { -+ char buf1[100], buf2[200]; ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + -+ bch2_bkey_to_text(&PBUF(buf1), k.k); -+ bch2_bkey_to_text(&PBUF(buf2), prev.k); ++ bch2_bkey_to_text(&buf1, k.k); ++ bch2_bkey_to_text(&buf2, prev.k); + + panic("iter snap %u\n" + "k %s\n" + "prev %s\n", + iter->snapshot, -+ buf1, buf2); ++ buf1.buf, buf2.buf); + } +out: + bch2_trans_iter_exit(trans, ©); @@ -18205,7 +19080,7 @@ index 000000000000..f43044e6fa37 +{ + struct btree_path *path; + unsigned idx; -+ char buf[100]; ++ struct printbuf buf = PRINTBUF; + + trans_for_each_path_inorder(trans, path, idx) { + int cmp = cmp_int(path->btree_id, id) ?: @@ -18231,9 +19106,10 @@ index 000000000000..f43044e6fa37 + } + + bch2_dump_trans_paths_updates(trans); ++ bch2_bpos_to_text(&buf, pos); ++ + panic("not locked: %s %s%s\n", -+ bch2_btree_ids[id], -+ (bch2_bpos_to_text(&PBUF(buf), pos), buf), ++ bch2_btree_ids[id], buf.buf, + key_cache ? " cached" : ""); +} + @@ -18419,8 +19295,6 @@ index 000000000000..f43044e6fa37 + struct bkey *u, + struct bkey_packed *k) +{ -+ struct bkey_s_c ret; -+ + if (unlikely(!k)) { + /* + * signal to bch2_btree_iter_peek_slot() that we're currently at @@ -18430,19 +19304,7 @@ index 000000000000..f43044e6fa37 + return bkey_s_c_null; + } + -+ ret = bkey_disassemble(l->b, k, u); -+ -+ /* -+ * XXX: bch2_btree_bset_insert_key() generates invalid keys when we -+ * overwrite extents - it sets k->type = KEY_TYPE_deleted on the key -+ * being overwritten but doesn't change k->size. 
But this is ok, because -+ * those keys are never written out, we just have to avoid a spurious -+ * assertion here: -+ */ -+ if (bch2_debug_check_bkeys && !bkey_deleted(ret.k)) -+ bch2_bkey_debugcheck(c, l->b, ret); -+ -+ return ret; ++ return bkey_disassemble(l->b, k, u); +} + +static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c, @@ -18502,6 +19364,7 @@ index 000000000000..f43044e6fa37 +static void btree_path_verify_new_node(struct btree_trans *trans, + struct btree_path *path, struct btree *b) +{ ++ struct bch_fs *c = trans->c; + struct btree_path_level *l; + unsigned plevel; + bool parent_locked; @@ -18510,6 +19373,9 @@ index 000000000000..f43044e6fa37 + if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) + return; + ++ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) ++ return; ++ + plevel = b->c.level + 1; + if (!btree_path_node(path, plevel)) + return; @@ -18524,23 +19390,23 @@ index 000000000000..f43044e6fa37 + if (!k || + bkey_deleted(k) || + bkey_cmp_left_packed(l->b, k, &b->key.k.p)) { -+ char buf1[100]; -+ char buf2[100]; -+ char buf3[100]; -+ char buf4[100]; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ struct printbuf buf3 = PRINTBUF; ++ struct printbuf buf4 = PRINTBUF; + struct bkey uk = bkey_unpack_key(b, k); + -+ bch2_dump_btree_node(trans->c, l->b); -+ bch2_bpos_to_text(&PBUF(buf1), path->pos); -+ bch2_bkey_to_text(&PBUF(buf2), &uk); -+ bch2_bpos_to_text(&PBUF(buf3), b->data->min_key); -+ bch2_bpos_to_text(&PBUF(buf3), b->data->max_key); ++ bch2_dump_btree_node(c, l->b); ++ bch2_bpos_to_text(&buf1, path->pos); ++ bch2_bkey_to_text(&buf2, &uk); ++ bch2_bpos_to_text(&buf3, b->data->min_key); ++ bch2_bpos_to_text(&buf3, b->data->max_key); + panic("parent iter doesn't point to new node:\n" + "iter pos %s %s\n" + "iter key %s\n" + "new node %s-%s\n", -+ bch2_btree_ids[path->btree_id], buf1, -+ buf2, buf3, buf4); ++ bch2_btree_ids[path->btree_id], ++ buf1.buf, buf2.buf, buf3.buf, buf4.buf); + } + + if 
(!parent_locked) @@ -18598,7 +19464,7 @@ index 000000000000..f43044e6fa37 + t != BTREE_NODE_UNLOCKED) { + btree_node_unlock(path, b->c.level); + six_lock_increment(&b->c.lock, t); -+ mark_btree_node_locked(path, b->c.level, t); ++ mark_btree_node_locked(trans, path, b->c.level, t); + } + + btree_path_level_init(trans, path, b); @@ -18675,7 +19541,7 @@ index 000000000000..f43044e6fa37 + for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++) + path->l[i].b = NULL; + -+ mark_btree_node_locked(path, path->level, lock_type); ++ mark_btree_node_locked(trans, path, path->level, lock_type); + btree_path_level_init(trans, path, b); + return 0; + } @@ -18721,6 +19587,41 @@ index 000000000000..f43044e6fa37 + return ret; +} + ++static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path, ++ struct btree_and_journal_iter *jiter) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c k; ++ struct bkey_buf tmp; ++ unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) ++ ? (path->level > 1 ? 0 : 2) ++ : (path->level > 1 ? 
1 : 16); ++ bool was_locked = btree_node_locked(path, path->level); ++ int ret = 0; ++ ++ bch2_bkey_buf_init(&tmp); ++ ++ while (nr && !ret) { ++ if (!bch2_btree_node_relock(trans, path, path->level)) ++ break; ++ ++ bch2_btree_and_journal_iter_advance(jiter); ++ k = bch2_btree_and_journal_iter_peek(jiter); ++ if (!k.k) ++ break; ++ ++ bch2_bkey_buf_reassemble(&tmp, c, k); ++ ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id, ++ path->level - 1); ++ } ++ ++ if (!was_locked) ++ btree_node_unlock(path, path->level); ++ ++ bch2_bkey_buf_exit(&tmp, c); ++ return ret; ++} ++ +static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, + struct btree_path *path, + unsigned plevel, struct btree *b) @@ -18743,6 +19644,30 @@ index 000000000000..f43044e6fa37 + btree_node_unlock(path, plevel); +} + ++static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned flags, ++ struct bkey_buf *out) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_path_level *l = path_l(path); ++ struct btree_and_journal_iter jiter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos); ++ ++ k = bch2_btree_and_journal_iter_peek(&jiter); ++ ++ bch2_bkey_buf_reassemble(out, c, k); ++ ++ if (flags & BTREE_ITER_PREFETCH) ++ ret = btree_path_prefetch_j(trans, path, &jiter); ++ ++ bch2_btree_and_journal_iter_exit(&jiter); ++ return ret; ++} ++ +static __always_inline int btree_path_down(struct btree_trans *trans, + struct btree_path *path, + unsigned flags, @@ -18753,30 +19678,41 @@ index 000000000000..f43044e6fa37 + struct btree *b; + unsigned level = path->level - 1; + enum six_lock_type lock_type = __btree_lock_want(path, level); ++ bool replay_done = test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags); + struct bkey_buf tmp; + int ret; + + EBUG_ON(!btree_node_locked(path, path->level)); + + bch2_bkey_buf_init(&tmp); -+ 
bch2_bkey_buf_unpack(&tmp, c, l->b, -+ bch2_btree_node_iter_peek(&l->iter, l->b)); ++ ++ if (unlikely(!replay_done)) { ++ ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp); ++ if (ret) ++ goto err; ++ } else { ++ bch2_bkey_buf_unpack(&tmp, c, l->b, ++ bch2_btree_node_iter_peek(&l->iter, l->b)); ++ ++ if (flags & BTREE_ITER_PREFETCH) { ++ ret = btree_path_prefetch(trans, path); ++ if (ret) ++ goto err; ++ } ++ } + + b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip); + ret = PTR_ERR_OR_ZERO(b); + if (unlikely(ret)) + goto err; + -+ mark_btree_node_locked(path, level, lock_type); ++ mark_btree_node_locked(trans, path, level, lock_type); + btree_path_level_init(trans, path, b); + -+ if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 && ++ if (likely(replay_done && tmp.k->k.type == KEY_TYPE_btree_ptr_v2) && + unlikely(b != btree_node_mem_ptr(tmp.k))) + btree_node_mem_ptr_set(trans, path, level + 1, b); + -+ if (flags & BTREE_ITER_PREFETCH) -+ ret = btree_path_prefetch(trans, path); -+ + if (btree_node_read_locked(path, level + 1)) + btree_node_unlock(path, level + 1); + path->level = level; @@ -18790,12 +19726,12 @@ index 000000000000..f43044e6fa37 +static int btree_path_traverse_one(struct btree_trans *, struct btree_path *, + unsigned, unsigned long); + -+static int __btree_path_traverse_all(struct btree_trans *trans, int ret, -+ unsigned long trace_ip) ++static int bch2_btree_path_traverse_all(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct btree_path *path; -+ int i; ++ unsigned long trace_ip = _RET_IP_; ++ int i, ret = 0; + + if (trans->in_traverse_all) + return -EINTR; @@ -18803,6 +19739,7 @@ index 000000000000..f43044e6fa37 + trans->in_traverse_all = true; +retry_all: + trans->restarted = false; ++ trans->traverse_all_idx = U8_MAX; + + trans_for_each_path(trans, path) + path->should_be_locked = false; @@ -18823,7 +19760,7 @@ index 000000000000..f43044e6fa37 + bch2_trans_unlock(trans); + cond_resched(); + -+ if 
(unlikely(ret == -ENOMEM)) { ++ if (unlikely(trans->memory_allocation_failure)) { + struct closure cl; + + closure_init_stack(&cl); @@ -18834,27 +19771,25 @@ index 000000000000..f43044e6fa37 + } while (ret); + } + -+ if (unlikely(ret == -EIO)) -+ goto out; -+ -+ BUG_ON(ret && ret != -EINTR); -+ + /* Now, redo traversals in correct order: */ -+ i = 0; -+ while (i < trans->nr_sorted) { -+ path = trans->paths + trans->sorted[i]; ++ trans->traverse_all_idx = 0; ++ while (trans->traverse_all_idx < trans->nr_sorted) { ++ path = trans->paths + trans->sorted[trans->traverse_all_idx]; + -+ EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); -+ -+ ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); -+ if (ret) -+ goto retry_all; -+ -+ EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); -+ -+ if (path->nodes_locked || -+ !btree_path_node(path, path->level)) -+ i++; ++ /* ++ * Traversing a path can cause another path to be added at about ++ * the same position: ++ */ ++ if (path->uptodate) { ++ ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); ++ if (ret == -EINTR || ret == -ENOMEM) ++ goto retry_all; ++ if (ret) ++ goto err; ++ BUG_ON(path->uptodate); ++ } else { ++ trans->traverse_all_idx++; ++ } + } + + /* @@ -18864,20 +19799,15 @@ index 000000000000..f43044e6fa37 + */ + trans_for_each_path(trans, path) + BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE); -+out: ++err: + bch2_btree_cache_cannibalize_unlock(c); + + trans->in_traverse_all = false; + -+ trace_trans_traverse_all(trans->ip, trace_ip); ++ trace_trans_traverse_all(trans->fn, trace_ip); + return ret; +} + -+static int bch2_btree_path_traverse_all(struct btree_trans *trans) -+{ -+ return __btree_path_traverse_all(trans, 0, _RET_IP_); -+} -+ +static inline bool btree_path_good_node(struct btree_trans *trans, + struct btree_path *path, + unsigned l, int check_pos) @@ -19001,8 +19931,6 @@ index 000000000000..f43044e6fa37 + return ret; +} + -+static int __btree_path_traverse_all(struct 
btree_trans *, int, unsigned long); -+ +int __must_check bch2_btree_path_traverse(struct btree_trans *trans, + struct btree_path *path, unsigned flags) +{ @@ -19026,7 +19954,7 @@ index 000000000000..f43044e6fa37 + six_lock_increment(&dst->l[i].b->c.lock, + __btree_lock_want(dst, i)); + -+ btree_path_check_sort(trans, dst, 0); ++ bch2_btree_path_check_sort(trans, dst, 0); +} + +static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src, @@ -19041,25 +19969,27 @@ index 000000000000..f43044e6fa37 + +inline struct btree_path * __must_check +bch2_btree_path_make_mut(struct btree_trans *trans, -+ struct btree_path *path, bool intent) ++ struct btree_path *path, bool intent, ++ unsigned long ip) +{ + if (path->ref > 1 || path->preserve) { + __btree_path_put(path, intent); + path = btree_path_clone(trans, path, intent); + path->preserve = false; +#ifdef CONFIG_BCACHEFS_DEBUG -+ path->ip_allocated = _RET_IP_; ++ path->ip_allocated = ip; +#endif + btree_trans_verify_sorted(trans); + } + ++ path->should_be_locked = false; + return path; +} + -+static struct btree_path * __must_check -+btree_path_set_pos(struct btree_trans *trans, ++struct btree_path * __must_check ++bch2_btree_path_set_pos(struct btree_trans *trans, + struct btree_path *path, struct bpos new_pos, -+ bool intent) ++ bool intent, unsigned long ip) +{ + int cmp = bpos_cmp(new_pos, path->pos); + unsigned l = path->level; @@ -19070,12 +20000,11 @@ index 000000000000..f43044e6fa37 + if (!cmp) + return path; + -+ path = bch2_btree_path_make_mut(trans, path, intent); ++ path = bch2_btree_path_make_mut(trans, path, intent, ip); + -+ path->pos = new_pos; -+ path->should_be_locked = false; ++ path->pos = new_pos; + -+ btree_path_check_sort(trans, path, cmp); ++ bch2_btree_path_check_sort(trans, path, cmp); + + if (unlikely(path->cached)) { + btree_node_unlock(path, 0); @@ -19087,6 +20016,7 @@ index 000000000000..f43044e6fa37 + l = btree_path_up_until_good_node(trans, path, cmp); + + if 
(btree_path_node(path, l)) { ++ BUG_ON(!btree_node_locked(path, l)); + /* + * We might have to skip over many keys, or just a few: try + * advancing the node iterator, and if we have to skip over too @@ -19179,23 +20109,64 @@ index 000000000000..f43044e6fa37 + __bch2_path_free(trans, path); +} + ++void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) ++{ ++ struct btree_insert_entry *i; ++ ++ pr_buf(buf, "transaction updates for %s journal seq %llu", ++ trans->fn, trans->journal_res.seq); ++ pr_newline(buf); ++ pr_indent_push(buf, 2); ++ ++ trans_for_each_update(trans, i) { ++ struct bkey_s_c old = { &i->old_k, i->old_v }; ++ ++ pr_buf(buf, "update: btree %s %pS", ++ bch2_btree_ids[i->btree_id], ++ (void *) i->ip_allocated); ++ pr_newline(buf); ++ ++ pr_buf(buf, " old "); ++ bch2_bkey_val_to_text(buf, trans->c, old); ++ pr_newline(buf); ++ ++ pr_buf(buf, " new "); ++ bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k)); ++ pr_newline(buf); ++ } ++ ++ pr_indent_pop(buf, 2); ++} ++ ++noinline __cold ++void bch2_dump_trans_updates(struct btree_trans *trans) ++{ ++ struct printbuf buf = PRINTBUF; ++ ++ bch2_trans_updates_to_text(&buf, trans); ++ bch_err(trans->c, "%s", buf.buf); ++ printbuf_exit(&buf); ++} ++ +noinline __cold +void bch2_dump_trans_paths_updates(struct btree_trans *trans) +{ + struct btree_path *path; -+ struct btree_insert_entry *i; ++ struct printbuf buf = PRINTBUF; + unsigned idx; -+ char buf1[300], buf2[300]; + -+ btree_trans_verify_sorted(trans); ++ trans_for_each_path_inorder(trans, path, idx) { ++ printbuf_reset(&buf); + -+ trans_for_each_path_inorder(trans, path, idx) -+ printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree %s pos %s locks %u %pS\n", ++ bch2_bpos_to_text(&buf, path->pos); ++ ++ printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree=%s l=%u pos %s locks %u %pS\n", + path->idx, path->ref, path->intent_ref, + path->should_be_locked ? " S" : "", + path->preserve ? 
" P" : "", + bch2_btree_ids[path->btree_id], -+ (bch2_bpos_to_text(&PBUF(buf1), path->pos), buf1), ++ path->level, ++ buf.buf, + path->nodes_locked, +#ifdef CONFIG_BCACHEFS_DEBUG + (void *) path->ip_allocated @@ -19203,17 +20174,11 @@ index 000000000000..f43044e6fa37 + NULL +#endif + ); -+ -+ trans_for_each_update(trans, i) { -+ struct bkey u; -+ struct bkey_s_c old = bch2_btree_path_peek_slot(i->path, &u); -+ -+ printk(KERN_ERR "update: btree %s %pS\n old %s\n new %s", -+ bch2_btree_ids[i->btree_id], -+ (void *) i->ip_allocated, -+ (bch2_bkey_val_to_text(&PBUF(buf1), trans->c, old), buf1), -+ (bch2_bkey_val_to_text(&PBUF(buf2), trans->c, bkey_i_to_s_c(i->k)), buf2)); + } ++ ++ printbuf_exit(&buf); ++ ++ bch2_dump_trans_updates(trans); +} + +static struct btree_path *btree_path_alloc(struct btree_trans *trans, @@ -19243,15 +20208,19 @@ index 000000000000..f43044e6fa37 + return path; +} + -+struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached, ++struct btree_path *bch2_path_get(struct btree_trans *trans, + enum btree_id btree_id, struct bpos pos, + unsigned locks_want, unsigned level, -+ bool intent) ++ unsigned flags, unsigned long ip) +{ + struct btree_path *path, *path_pos = NULL; ++ bool cached = flags & BTREE_ITER_CACHED; ++ bool intent = flags & BTREE_ITER_INTENT; + int i; + + BUG_ON(trans->restarted); ++ btree_trans_verify_sorted(trans); ++ bch2_trans_verify_locks(trans); + + trans_for_each_path_inorder(trans, path, i) { + if (__btree_path_cmp(path, @@ -19269,8 +20238,7 @@ index 000000000000..f43044e6fa37 + path_pos->btree_id == btree_id && + path_pos->level == level) { + __btree_path_get(path_pos, intent); -+ path = btree_path_set_pos(trans, path_pos, pos, intent); -+ path->preserve = true; ++ path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip); + } else { + path = btree_path_alloc(trans, path_pos); + path_pos = NULL; @@ -19279,7 +20247,6 @@ index 000000000000..f43044e6fa37 + path->pos = pos; + path->btree_id = btree_id; + 
path->cached = cached; -+ path->preserve = true; + path->uptodate = BTREE_ITER_NEED_TRAVERSE; + path->should_be_locked = false; + path->level = level; @@ -19289,11 +20256,14 @@ index 000000000000..f43044e6fa37 + for (i = 0; i < ARRAY_SIZE(path->l); i++) + path->l[i].b = BTREE_ITER_NO_NODE_INIT; +#ifdef CONFIG_BCACHEFS_DEBUG -+ path->ip_allocated = _RET_IP_; ++ path->ip_allocated = ip; +#endif + btree_trans_verify_sorted(trans); + } + ++ if (!(flags & BTREE_ITER_NOPRESERVE)) ++ path->preserve = true; ++ + if (path->intent_ref) + locks_want = max(locks_want, level + 1); + @@ -19308,7 +20278,7 @@ index 000000000000..f43044e6fa37 + locks_want = min(locks_want, BTREE_MAX_DEPTH); + if (locks_want > path->locks_want) { + path->locks_want = locks_want; -+ btree_path_get_locks(trans, path, true, _THIS_IP_); ++ btree_path_get_locks(trans, path, true); + } + + return path; @@ -19319,13 +20289,13 @@ index 000000000000..f43044e6fa37 + + struct bkey_s_c k; + -+ BUG_ON(path->uptodate != BTREE_ITER_UPTODATE); -+ + if (!path->cached) { + struct btree_path_level *l = path_l(path); -+ struct bkey_packed *_k = -+ bch2_btree_node_iter_peek_all(&l->iter, l->b); ++ struct bkey_packed *_k; + ++ EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); ++ ++ _k = bch2_btree_node_iter_peek_all(&l->iter, l->b); + k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null; + + EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, path->pos) == 0); @@ -19335,13 +20305,17 @@ index 000000000000..f43044e6fa37 + } else { + struct bkey_cached *ck = (void *) path->l[0].b; + -+ EBUG_ON(path->btree_id != ck->key.btree_id || -+ bkey_cmp(path->pos, ck->key.pos)); ++ EBUG_ON(ck && ++ (path->btree_id != ck->key.btree_id || ++ bkey_cmp(path->pos, ck->key.pos))); + -+ /* BTREE_ITER_CACHED_NOFILL? */ -+ if (unlikely(!ck->valid)) -+ goto hole; ++ /* BTREE_ITER_CACHED_NOFILL|BTREE_ITER_CACHED_NOCREATE? 
*/ ++ if (unlikely(!ck || !ck->valid)) ++ return bkey_s_c_null; + ++ EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); ++ ++ *u = ck->k->k; + k = bkey_i_to_s_c(ck->k); + } + @@ -19365,9 +20339,10 @@ index 000000000000..f43044e6fa37 +{ + int ret; + -+ iter->path = btree_path_set_pos(iter->trans, iter->path, ++ iter->path = bch2_btree_path_set_pos(iter->trans, iter->path, + btree_iter_search_key(iter), -+ iter->flags & BTREE_ITER_INTENT); ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); + if (ret) @@ -19401,8 +20376,9 @@ index 000000000000..f43044e6fa37 + bkey_init(&iter->k); + iter->k.p = iter->pos = b->key.k.p; + -+ iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p, -+ iter->flags & BTREE_ITER_INTENT); ++ iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); + iter->path->should_be_locked = true; + BUG_ON(iter->path->uptodate); +out: @@ -19436,6 +20412,7 @@ index 000000000000..f43044e6fa37 + btree_node_unlock(path, path->level); + path->l[path->level].b = BTREE_ITER_NO_NODE_UP; + path->level++; ++ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + return NULL; + } + @@ -19443,6 +20420,9 @@ index 000000000000..f43044e6fa37 + __bch2_btree_path_unlock(path); + path->l[path->level].b = BTREE_ITER_NO_NODE_GET_LOCKS; + path->l[path->level + 1].b = BTREE_ITER_NO_NODE_GET_LOCKS; ++ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); ++ trace_trans_restart_relock_next_node(trans->fn, _THIS_IP_, ++ path->btree_id, &path->pos); + btree_trans_restart(trans); + ret = -EINTR; + goto err; @@ -19460,8 +20440,9 @@ index 000000000000..f43044e6fa37 + * the next child node + */ + path = iter->path = -+ btree_path_set_pos(trans, path, bpos_successor(iter->pos), -+ iter->flags & BTREE_ITER_INTENT); ++ bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos), ++ iter->flags & 
BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); + + path->level = iter->min_depth; + @@ -19482,8 +20463,9 @@ index 000000000000..f43044e6fa37 + bkey_init(&iter->k); + iter->k.p = iter->pos = b->key.k.p; + -+ iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p, -+ iter->flags & BTREE_ITER_INTENT); ++ iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); + iter->path->should_be_locked = true; + BUG_ON(iter->path->uptodate); +out: @@ -19524,25 +20506,90 @@ index 000000000000..f43044e6fa37 + return ret; +} + -+/** -+ * bch2_btree_iter_peek: returns first key greater than or equal to iterator's -+ * current position ++static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, ++ enum btree_id btree_id, ++ struct bpos pos) ++{ ++ struct btree_insert_entry *i; ++ ++ trans_for_each_update(trans, i) ++ if ((cmp_int(btree_id, i->btree_id) ?: ++ bpos_cmp(pos, i->k->k.p)) <= 0) { ++ if (btree_id == i->btree_id) ++ return i->k; ++ break; ++ } ++ ++ return NULL; ++} ++ ++static noinline ++struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bkey_i *next_journal = ++ bch2_journal_keys_peek(trans->c, iter->btree_id, 0, ++ iter->path->pos); ++ ++ if (next_journal && ++ bpos_cmp(next_journal->k.p, ++ k.k ? 
k.k->p : iter->path->l[0].b->key.k.p) <= 0) { ++ iter->k = next_journal->k; ++ k = bkey_i_to_s_c(next_journal); ++ } ++ ++ return k; ++} ++ ++/* ++ * Checks btree key cache for key at iter->pos and returns it if present, or ++ * bkey_s_c_null: + */ -+struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) ++static noinline ++struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct bch_fs *c = trans->c; ++ struct bkey u; ++ int ret; ++ ++ if (!bch2_btree_key_cache_find(c, iter->btree_id, pos)) ++ return bkey_s_c_null; ++ ++ if (!iter->key_cache_path) ++ iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos, ++ iter->flags & BTREE_ITER_INTENT, 0, ++ iter->flags|BTREE_ITER_CACHED, ++ _THIS_IP_); ++ ++ iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); ++ ++ ret = bch2_btree_path_traverse(trans, iter->key_cache_path, iter->flags|BTREE_ITER_CACHED); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ ++ iter->key_cache_path->should_be_locked = true; ++ ++ return bch2_btree_path_peek_slot(iter->key_cache_path, &u); ++} ++ ++static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) +{ + struct btree_trans *trans = iter->trans; -+ struct bpos search_key = btree_iter_search_key(iter); + struct bkey_i *next_update; -+ struct bkey_s_c k; -+ int ret, cmp; ++ struct bkey_s_c k, k2; ++ int ret; + + EBUG_ON(iter->path->cached || iter->path->level); + bch2_btree_iter_verify(iter); -+ bch2_btree_iter_verify_entry_exit(iter); + + while (1) { -+ iter->path = btree_path_set_pos(trans, iter->path, search_key, -+ iter->flags & BTREE_ITER_INTENT); ++ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(trans, iter->path, 
iter->flags); + if (unlikely(ret)) { @@ -19552,19 +20599,30 @@ index 000000000000..f43044e6fa37 + goto out; + } + ++ iter->path->should_be_locked = true; ++ ++ k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k); ++ ++ if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && ++ k.k && ++ (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { ++ ret = bkey_err(k2); ++ if (ret) { ++ k = k2; ++ bch2_btree_iter_set_pos(iter, iter->pos); ++ goto out; ++ } ++ ++ k = k2; ++ iter->k = *k.k; ++ } ++ ++ if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL)) ++ k = btree_trans_peek_journal(trans, iter, k); ++ + next_update = iter->flags & BTREE_ITER_WITH_UPDATES + ? btree_trans_peek_updates(trans, iter->btree_id, search_key) + : NULL; -+ k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k); -+ -+ /* * In the btree, deleted keys sort before non deleted: */ -+ if (k.k && bkey_deleted(k.k) && -+ (!next_update || -+ bpos_cmp(k.k->p, next_update->k.p) <= 0)) { -+ search_key = k.k->p; -+ continue; -+ } -+ + if (next_update && + bpos_cmp(next_update->k.p, + k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) { @@ -19572,25 +20630,21 @@ index 000000000000..f43044e6fa37 + k = bkey_i_to_s_c(next_update); + } + -+ if (likely(k.k)) { ++ if (k.k && bkey_deleted(k.k)) { + /* -+ * We can never have a key in a leaf node at POS_MAX, so -+ * we don't have to check these successor() calls: ++ * If we've got a whiteout, and it's after the search ++ * key, advance the search key to the whiteout instead ++ * of just after the whiteout - it might be a btree ++ * whiteout, with a real key at the same position, since ++ * in the btree deleted keys sort before non deleted. 
+ */ -+ if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && -+ !bch2_snapshot_is_ancestor(trans->c, -+ iter->snapshot, -+ k.k->p.snapshot)) { -+ search_key = bpos_successor(k.k->p); -+ continue; -+ } -+ -+ if (bkey_whiteout(k.k) && -+ !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { -+ search_key = bkey_successor(iter, k.k->p); -+ continue; -+ } ++ search_key = bpos_cmp(search_key, k.k->p) ++ ? k.k->p ++ : bpos_successor(k.k->p); ++ continue; ++ } + ++ if (likely(k.k)) { + break; + } else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) { + /* Advance to next leaf node: */ @@ -19602,34 +20656,137 @@ index 000000000000..f43044e6fa37 + goto out; + } + } -+ -+ /* -+ * iter->pos should be mononotically increasing, and always be equal to -+ * the key we just returned - except extents can straddle iter->pos: -+ */ -+ if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) -+ iter->pos = k.k->p; -+ else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) -+ iter->pos = bkey_start_pos(k.k); -+ -+ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) -+ iter->pos.snapshot = iter->snapshot; -+ -+ cmp = bpos_cmp(k.k->p, iter->path->pos); -+ if (cmp) { -+ iter->path = bch2_btree_path_make_mut(trans, iter->path, -+ iter->flags & BTREE_ITER_INTENT); -+ iter->path->pos = k.k->p; -+ btree_path_check_sort(trans, iter->path, cmp); -+ } +out: -+ iter->path->should_be_locked = true; ++ bch2_btree_iter_verify(iter); ++ ++ return k; ++} ++ ++/** ++ * bch2_btree_iter_peek: returns first key greater than or equal to iterator's ++ * current position ++ */ ++struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct bpos search_key = btree_iter_search_key(iter); ++ struct bkey_s_c k; ++ struct bpos iter_pos; ++ int ret; ++ ++ if (iter->update_path) { ++ bch2_path_put(trans, iter->update_path, ++ iter->flags & BTREE_ITER_INTENT); ++ iter->update_path = NULL; ++ } + + bch2_btree_iter_verify_entry_exit(iter); -+ 
bch2_btree_iter_verify(iter); ++ ++ while (1) { ++ k = __bch2_btree_iter_peek(iter, search_key); ++ if (!k.k || bkey_err(k)) ++ goto out; ++ ++ /* ++ * iter->pos should be mononotically increasing, and always be ++ * equal to the key we just returned - except extents can ++ * straddle iter->pos: ++ */ ++ if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) ++ iter_pos = k.k->p; ++ else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) ++ iter_pos = bkey_start_pos(k.k); ++ else ++ iter_pos = iter->pos; ++ ++ if (bkey_cmp(iter_pos, end) > 0) { ++ bch2_btree_iter_set_pos(iter, end); ++ k = bkey_s_c_null; ++ goto out; ++ } ++ ++ if (iter->update_path && ++ bkey_cmp(iter->update_path->pos, k.k->p)) { ++ bch2_path_put(trans, iter->update_path, ++ iter->flags & BTREE_ITER_INTENT); ++ iter->update_path = NULL; ++ } ++ ++ if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && ++ (iter->flags & BTREE_ITER_INTENT) && ++ !(iter->flags & BTREE_ITER_IS_EXTENTS) && ++ !iter->update_path) { ++ struct bpos pos = k.k->p; ++ ++ if (pos.snapshot < iter->snapshot) { ++ search_key = bpos_successor(k.k->p); ++ continue; ++ } ++ ++ pos.snapshot = iter->snapshot; ++ ++ /* ++ * advance, same as on exit for iter->path, but only up ++ * to snapshot ++ */ ++ __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT); ++ iter->update_path = iter->path; ++ ++ iter->update_path = bch2_btree_path_set_pos(trans, ++ iter->update_path, pos, ++ iter->flags & BTREE_ITER_INTENT, ++ _THIS_IP_); ++ } ++ ++ /* ++ * We can never have a key in a leaf node at POS_MAX, so ++ * we don't have to check these successor() calls: ++ */ ++ if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && ++ !bch2_snapshot_is_ancestor(trans->c, ++ iter->snapshot, ++ k.k->p.snapshot)) { ++ search_key = bpos_successor(k.k->p); ++ continue; ++ } ++ ++ if (bkey_whiteout(k.k) && ++ !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { ++ search_key = bkey_successor(iter, k.k->p); ++ continue; ++ } ++ ++ break; ++ } ++ ++ iter->pos = iter_pos; ++ ++ 
iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); ++ BUG_ON(!iter->path->nodes_locked); ++out: ++ if (iter->update_path) { ++ if (iter->update_path->uptodate && ++ !bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_)) { ++ k = bkey_s_c_err(-EINTR); ++ } else { ++ BUG_ON(!(iter->update_path->nodes_locked & 1)); ++ iter->update_path->should_be_locked = true; ++ } ++ } ++ iter->path->should_be_locked = true; ++ ++ if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) ++ iter->pos.snapshot = iter->snapshot; ++ + ret = bch2_btree_iter_verify_ret(iter, k); -+ if (unlikely(ret)) -+ return bkey_s_c_err(ret); ++ if (unlikely(ret)) { ++ bch2_btree_iter_set_pos(iter, iter->pos); ++ k = bkey_s_c_err(ret); ++ } ++ ++ bch2_btree_iter_verify_entry_exit(iter); + + return k; +} @@ -19662,6 +20819,10 @@ index 000000000000..f43044e6fa37 + + EBUG_ON(iter->path->cached || iter->path->level); + EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES); ++ ++ if (iter->flags & BTREE_ITER_WITH_JOURNAL) ++ return bkey_s_c_err(-EIO); ++ + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); + @@ -19669,8 +20830,9 @@ index 000000000000..f43044e6fa37 + search_key.snapshot = U32_MAX; + + while (1) { -+ iter->path = btree_path_set_pos(trans, iter->path, search_key, -+ iter->flags & BTREE_ITER_INTENT); ++ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (unlikely(ret)) { @@ -19689,7 +20851,7 @@ index 000000000000..f43044e6fa37 + k = btree_path_level_prev(trans->c, iter->path, + &iter->path->l[0], &iter->k); + -+ btree_path_check_sort(trans, iter->path, 0); ++ bch2_btree_path_check_sort(trans, iter->path, 0); + + if (likely(k.k)) { + if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) { @@ -19799,8 +20961,9 @@ index 000000000000..f43044e6fa37 + } + 
+ search_key = btree_iter_search_key(iter); -+ iter->path = btree_path_set_pos(trans, iter->path, search_key, -+ iter->flags & BTREE_ITER_INTENT); ++ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, ++ iter->flags & BTREE_ITER_INTENT, ++ btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (unlikely(ret)) @@ -19810,25 +20973,44 @@ index 000000000000..f43044e6fa37 + !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { + struct bkey_i *next_update; + -+ next_update = iter->flags & BTREE_ITER_WITH_UPDATES -+ ? btree_trans_peek_updates(trans, iter->btree_id, search_key) -+ : NULL; -+ -+ if (next_update && ++ if ((iter->flags & BTREE_ITER_WITH_UPDATES) && ++ (next_update = btree_trans_peek_updates(trans, ++ iter->btree_id, search_key)) && + !bpos_cmp(next_update->k.p, iter->pos)) { + iter->k = next_update->k; + k = bkey_i_to_s_c(next_update); -+ } else { -+ k = bch2_btree_path_peek_slot(iter->path, &iter->k); ++ goto out; + } ++ ++ if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) && ++ (next_update = bch2_journal_keys_peek(trans->c, iter->btree_id, ++ 0, iter->pos)) && ++ !bpos_cmp(next_update->k.p, iter->pos)) { ++ iter->k = next_update->k; ++ k = bkey_i_to_s_c(next_update); ++ goto out; ++ } ++ ++ if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && ++ (k = btree_trans_peek_key_cache(iter, iter->pos)).k) { ++ if (!bkey_err(k)) ++ iter->k = *k.k; ++ goto out; ++ } ++ ++ k = bch2_btree_path_peek_slot(iter->path, &iter->k); + } else { + struct bpos next; + + if (iter->flags & BTREE_ITER_INTENT) { + struct btree_iter iter2; ++ struct bpos end = iter->pos; ++ ++ if (iter->flags & BTREE_ITER_IS_EXTENTS) ++ end.offset = U64_MAX; + + bch2_trans_copy_iter(&iter2, iter); -+ k = bch2_btree_iter_peek(&iter2); ++ k = bch2_btree_iter_peek_upto(&iter2, end); + + if (k.k && !bkey_err(k)) { + iter->k = iter2.k; @@ -19850,18 +21032,21 @@ index 000000000000..f43044e6fa37 + if 
(bkey_cmp(iter->pos, next) < 0) { + bkey_init(&iter->k); + iter->k.p = iter->pos; -+ bch2_key_resize(&iter->k, -+ min_t(u64, KEY_SIZE_MAX, -+ (next.inode == iter->pos.inode -+ ? next.offset -+ : KEY_OFFSET_MAX) - -+ iter->pos.offset)); ++ ++ if (iter->flags & BTREE_ITER_IS_EXTENTS) { ++ bch2_key_resize(&iter->k, ++ min_t(u64, KEY_SIZE_MAX, ++ (next.inode == iter->pos.inode ++ ? next.offset ++ : KEY_OFFSET_MAX) - ++ iter->pos.offset)); ++ EBUG_ON(!iter->k.size); ++ } + + k = (struct bkey_s_c) { &iter->k, NULL }; -+ EBUG_ON(!k.k->size); + } + } -+ ++out: + iter->path->should_be_locked = true; + + bch2_btree_iter_verify_entry_exit(iter); @@ -19916,7 +21101,10 @@ index 000000000000..f43044e6fa37 + unsigned i; + + trans_for_each_path_inorder(trans, path, i) { -+ BUG_ON(prev && btree_path_cmp(prev, path) > 0); ++ if (prev && btree_path_cmp(prev, path) > 0) { ++ bch2_dump_trans_paths_updates(trans); ++ panic("trans paths out of order!\n"); ++ } + prev = path; + } +#endif @@ -19933,8 +21121,8 @@ index 000000000000..f43044e6fa37 + btree_path_verify_sorted_ref(trans, r); +} + -+static void btree_path_check_sort(struct btree_trans *trans, struct btree_path *path, -+ int cmp) ++inline void bch2_btree_path_check_sort(struct btree_trans *trans, struct btree_path *path, ++ int cmp) +{ + struct btree_path *n; + @@ -19990,6 +21178,11 @@ index 000000000000..f43044e6fa37 + + path->sorted_idx = pos ? 
pos->sorted_idx + 1 : 0; + ++ if (trans->in_traverse_all && ++ trans->traverse_all_idx != U8_MAX && ++ trans->traverse_all_idx >= path->sorted_idx) ++ trans->traverse_all_idx++; ++ + array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx); + + for (i = path->sorted_idx; i < trans->nr_sorted; i++) @@ -20003,7 +21196,15 @@ index 000000000000..f43044e6fa37 + if (iter->path) + bch2_path_put(trans, iter->path, + iter->flags & BTREE_ITER_INTENT); ++ if (iter->update_path) ++ bch2_path_put(trans, iter->update_path, ++ iter->flags & BTREE_ITER_INTENT); ++ if (iter->key_cache_path) ++ bch2_path_put(trans, iter->key_cache_path, ++ iter->flags & BTREE_ITER_INTENT); + iter->path = NULL; ++ iter->update_path = NULL; ++ iter->key_cache_path = NULL; +} + +static void __bch2_trans_iter_init(struct btree_trans *trans, @@ -20011,7 +21212,8 @@ index 000000000000..f43044e6fa37 + unsigned btree_id, struct bpos pos, + unsigned locks_want, + unsigned depth, -+ unsigned flags) ++ unsigned flags, ++ unsigned long ip) +{ + EBUG_ON(trans->restarted); + @@ -20027,8 +21229,19 @@ index 000000000000..f43044e6fa37 + btree_type_has_snapshots(btree_id)) + flags |= BTREE_ITER_FILTER_SNAPSHOTS; + ++ if (!test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags)) ++ flags |= BTREE_ITER_WITH_JOURNAL; ++ ++ if (!btree_id_cached(trans->c, btree_id)) { ++ flags &= ~BTREE_ITER_CACHED; ++ flags &= ~BTREE_ITER_WITH_KEY_CACHE; ++ } else if (!(flags & BTREE_ITER_CACHED)) ++ flags |= BTREE_ITER_WITH_KEY_CACHE; ++ + iter->trans = trans; + iter->path = NULL; ++ iter->update_path = NULL; ++ iter->key_cache_path = NULL; + iter->btree_id = btree_id; + iter->min_depth = depth; + iter->flags = flags; @@ -20037,14 +21250,12 @@ index 000000000000..f43044e6fa37 + iter->k.type = KEY_TYPE_deleted; + iter->k.p = pos; + iter->k.size = 0; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ iter->ip_allocated = ip; ++#endif + -+ iter->path = bch2_path_get(trans, -+ flags & BTREE_ITER_CACHED, -+ btree_id, -+ iter->pos, -+ 
locks_want, -+ depth, -+ flags & BTREE_ITER_INTENT); ++ iter->path = bch2_path_get(trans, btree_id, iter->pos, ++ locks_want, depth, flags, ip); +} + +void bch2_trans_iter_init(struct btree_trans *trans, @@ -20053,7 +21264,7 @@ index 000000000000..f43044e6fa37 + unsigned flags) +{ + __bch2_trans_iter_init(trans, iter, btree_id, pos, -+ 0, 0, flags); ++ 0, 0, flags, _RET_IP_); +} + +void bch2_trans_node_iter_init(struct btree_trans *trans, @@ -20068,7 +21279,7 @@ index 000000000000..f43044e6fa37 + BTREE_ITER_NOT_EXTENTS| + __BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_ALL_SNAPSHOTS| -+ flags); ++ flags, _RET_IP_); + BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH)); + BUG_ON(iter->path->level != depth); + BUG_ON(iter->min_depth != depth); @@ -20079,6 +21290,9 @@ index 000000000000..f43044e6fa37 + *dst = *src; + if (src->path) + __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT); ++ if (src->update_path) ++ __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT); ++ dst->key_cache_path = NULL; +} + +void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) @@ -20107,7 +21321,7 @@ index 000000000000..f43044e6fa37 + trans->mem_bytes = new_bytes; + + if (old_bytes) { -+ trace_trans_restart_mem_realloced(trans->ip, _RET_IP_, new_bytes); ++ trace_trans_restart_mem_realloced(trans->fn, _RET_IP_, new_bytes); + btree_trans_restart(trans); + return ERR_PTR(-EINTR); + } @@ -20141,8 +21355,7 @@ index 000000000000..f43044e6fa37 + trans->mem_top = 0; + + trans->hooks = NULL; -+ trans->extra_journal_entries = NULL; -+ trans->extra_journal_entry_u64s = 0; ++ trans->extra_journal_entries.nr = 0; + + if (trans->fs_usage_deltas) { + trans->fs_usage_deltas->used = 0; @@ -20155,13 +21368,21 @@ index 000000000000..f43044e6fa37 + path->should_be_locked = false; + + /* ++ * If the transaction wasn't restarted, we're presuming to be ++ * doing something new: dont keep iterators excpt the ones that ++ * are in use - except for the subvolumes btree: ++ */ 
++ if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes) ++ path->preserve = false; ++ ++ /* + * XXX: we probably shouldn't be doing this if the transaction + * was restarted, but currently we still overflow transaction + * iterators if we do that + */ + if (!path->ref && !path->preserve) + __bch2_path_free(trans, path); -+ else if (!path->ref) ++ else + path->preserve = false; + } + @@ -20191,14 +21412,17 @@ index 000000000000..f43044e6fa37 + trans->updates = p; p += updates_bytes; +} + -+void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, -+ unsigned expected_nr_iters, -+ size_t expected_mem_bytes) ++void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, ++ unsigned expected_nr_iters, ++ size_t expected_mem_bytes, ++ const char *fn) + __acquires(&c->btree_trans_barrier) +{ ++ BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); ++ + memset(trans, 0, sizeof(*trans)); + trans->c = c; -+ trans->ip = _RET_IP_; ++ trans->fn = fn; + + bch2_trans_alloc_paths(trans, c); + @@ -20214,12 +21438,10 @@ index 000000000000..f43044e6fa37 + + trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + -+#ifdef CONFIG_BCACHEFS_DEBUG + trans->pid = current->pid; + mutex_lock(&c->btree_trans_lock); + list_add(&trans->list, &c->btree_trans_list); + mutex_unlock(&c->btree_trans_lock); -+#endif +} + +static void check_btree_paths_leaked(struct btree_trans *trans) @@ -20233,7 +21455,7 @@ index 000000000000..f43044e6fa37 + goto leaked; + return; +leaked: -+ bch_err(c, "btree paths leaked from %pS!", (void *) trans->ip); ++ bch_err(c, "btree paths leaked from %s!", trans->fn); + trans_for_each_path(trans, path) + if (path->ref) + printk(KERN_ERR " btree %s %pS\n", @@ -20258,16 +21480,16 @@ index 000000000000..f43044e6fa37 + + check_btree_paths_leaked(trans); + -+#ifdef CONFIG_BCACHEFS_DEBUG + mutex_lock(&c->btree_trans_lock); + list_del(&trans->list); + mutex_unlock(&c->btree_trans_lock); -+#endif + + srcu_read_unlock(&c->btree_trans_barrier, 
trans->srcu_idx); + + bch2_journal_preres_put(&c->journal, &trans->journal_preres); + ++ kfree(trans->extra_journal_entries.data); ++ + if (trans->fs_usage_deltas) { + if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) == + REPLICAS_DELTA_LIST_MAX) @@ -20306,7 +21528,6 @@ index 000000000000..f43044e6fa37 + bch2_bpos_to_text(out, btree_node_pos(_b, cached)); +} + -+#ifdef CONFIG_BCACHEFS_DEBUG +static bool trans_has_locks(struct btree_trans *trans) +{ + struct btree_path *path; @@ -20316,14 +21537,13 @@ index 000000000000..f43044e6fa37 + return true; + return false; +} -+#endif + +void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) +{ -+#ifdef CONFIG_BCACHEFS_DEBUG + struct btree_trans *trans; + struct btree_path *path; + struct btree *b; ++ static char lock_types[] = { 'r', 'i', 'w' }; + unsigned l; + + mutex_lock(&c->btree_trans_lock); @@ -20331,7 +21551,7 @@ index 000000000000..f43044e6fa37 + if (!trans_has_locks(trans)) + continue; + -+ pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip); ++ pr_buf(out, "%i %s\n", trans->pid, trans->fn); + + trans_for_each_path(trans, path) { + if (!path->nodes_locked) @@ -20360,10 +21580,11 @@ index 000000000000..f43044e6fa37 + b = READ_ONCE(trans->locking); + if (b) { + path = &trans->paths[trans->locking_path_idx]; -+ pr_buf(out, " locking path %u %c l=%u %s:", ++ pr_buf(out, " locking path %u %c l=%u %c %s:", + trans->locking_path_idx, + path->cached ? 
'c' : 'b', + trans->locking_level, ++ lock_types[trans->locking_lock_type], + bch2_btree_ids[trans->locking_btree_id]); + bch2_bpos_to_text(out, trans->locking_pos); + @@ -20374,36 +21595,40 @@ index 000000000000..f43044e6fa37 + } + } + mutex_unlock(&c->btree_trans_lock); -+#endif +} + +void bch2_fs_btree_iter_exit(struct bch_fs *c) +{ ++ if (c->btree_trans_barrier_initialized) ++ cleanup_srcu_struct(&c->btree_trans_barrier); + mempool_exit(&c->btree_trans_mem_pool); + mempool_exit(&c->btree_paths_pool); -+ cleanup_srcu_struct(&c->btree_trans_barrier); +} + +int bch2_fs_btree_iter_init(struct bch_fs *c) +{ + unsigned nr = BTREE_ITER_MAX; ++ int ret; + + INIT_LIST_HEAD(&c->btree_trans_list); + mutex_init(&c->btree_trans_lock); + -+ return init_srcu_struct(&c->btree_trans_barrier) ?: -+ mempool_init_kmalloc_pool(&c->btree_paths_pool, 1, ++ ret = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1, + sizeof(struct btree_path) * nr + + sizeof(struct btree_insert_entry) * nr) ?: + mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, -+ BTREE_TRANS_MEM_MAX); ++ BTREE_TRANS_MEM_MAX) ?: ++ init_srcu_struct(&c->btree_trans_barrier); ++ if (!ret) ++ c->btree_trans_barrier_initialized = true; ++ return ret; +} diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h new file mode 100644 -index 000000000000..31d2dda7ca05 +index 000000000000..f6700295e1a7 --- /dev/null +++ b/fs/bcachefs/btree_iter.h -@@ -0,0 +1,364 @@ +@@ -0,0 +1,406 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_ITER_H +#define _BCACHEFS_BTREE_ITER_H @@ -20456,11 +21681,6 @@ index 000000000000..31d2dda7ca05 + return btree_path_node(path, b->c.level + 1); +} + -+static inline int btree_iter_err(const struct btree_iter *iter) -+{ -+ return iter->flags & BTREE_ITER_ERROR ? 
-EIO : 0; -+} -+ +/* Iterate over paths within a transaction: */ + +static inline struct btree_path * @@ -20481,6 +21701,8 @@ index 000000000000..31d2dda7ca05 + return &trans->paths[idx]; +} + ++void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int); ++ +#define trans_for_each_path(_trans, _path) \ + for (_path = __trans_next_path((_trans), 0); \ + (_path); \ @@ -20536,11 +21758,15 @@ index 000000000000..31d2dda7ca05 + (_path)->idx + 1)) + +struct btree_path * __must_check -+bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, bool); ++bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, ++ bool, unsigned long); ++struct btree_path * __must_check ++bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *, ++ struct bpos, bool, unsigned long); +int __must_check bch2_btree_path_traverse(struct btree_trans *, + struct btree_path *, unsigned); -+struct btree_path *bch2_path_get(struct btree_trans *, bool, enum btree_id, -+ struct bpos, unsigned, unsigned, bool); ++struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, ++ unsigned, unsigned, unsigned, unsigned long); +inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); + +#ifdef CONFIG_BCACHEFS_DEBUG @@ -20614,9 +21840,14 @@ index 000000000000..31d2dda7ca05 +struct btree *bch2_btree_iter_peek_node(struct btree_iter *); +struct btree *bch2_btree_iter_next_node(struct btree_iter *); + -+struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *); ++struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos); +struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); + ++static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) ++{ ++ return bch2_btree_iter_peek_upto(iter, SPOS_MAX); ++} ++ +struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); + @@ -20627,11 +21858,8 @@ index 
000000000000..31d2dda7ca05 +bool bch2_btree_iter_advance(struct btree_iter *); +bool bch2_btree_iter_rewind(struct btree_iter *); + -+static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) ++static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) +{ -+ if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) -+ new_pos.snapshot = iter->snapshot; -+ + iter->k.type = KEY_TYPE_deleted; + iter->k.p.inode = iter->pos.inode = new_pos.inode; + iter->k.p.offset = iter->pos.offset = new_pos.offset; @@ -20639,6 +21867,19 @@ index 000000000000..31d2dda7ca05 + iter->k.size = 0; +} + ++static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) ++{ ++ if (unlikely(iter->update_path)) ++ bch2_path_put(iter->trans, iter->update_path, ++ iter->flags & BTREE_ITER_INTENT); ++ iter->update_path = NULL; ++ ++ if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) ++ new_pos.snapshot = iter->snapshot; ++ ++ __bch2_btree_iter_set_pos(iter, new_pos); ++} ++ +static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter) +{ + BUG_ON(!(iter->flags & BTREE_ITER_IS_EXTENTS)); @@ -20700,14 +21941,27 @@ index 000000000000..31d2dda7ca05 + return PTR_ERR_OR_ZERO(k.k); +} + -+static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, -+ unsigned flags) ++static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, ++ unsigned flags) +{ + return flags & BTREE_ITER_SLOTS + ? 
bch2_btree_iter_peek_slot(iter) + : bch2_btree_iter_peek(iter); +} + ++static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter, ++ struct bpos end, ++ unsigned flags) ++{ ++ if (!(flags & BTREE_ITER_SLOTS)) ++ return bch2_btree_iter_peek_upto(iter, end); ++ ++ if (bkey_cmp(iter->pos, end) > 0) ++ return bkey_s_c_null; ++ ++ return bch2_btree_iter_peek_slot(iter); ++} ++ +static inline int btree_trans_too_many_iters(struct btree_trans *trans) +{ + return hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2 @@ -20721,7 +21975,7 @@ index 000000000000..31d2dda7ca05 + struct bkey_s_c k; + + while (btree_trans_too_many_iters(trans) || -+ (k = __bch2_btree_iter_peek(iter, flags), ++ (k = bch2_btree_iter_peek_type(iter, flags), + bkey_err(k) == -EINTR)) + bch2_trans_begin(trans); + @@ -20740,7 +21994,15 @@ index 000000000000..31d2dda7ca05 + _start, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ -+ (_k) = __bch2_btree_iter_peek(&(_iter), _flags), \ ++ (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \ ++ !((_ret) = bkey_err(_k)) && (_k).k; \ ++ bch2_btree_iter_advance(&(_iter))) ++ ++#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \ ++ _start, _end, _flags, _k, _ret) \ ++ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ ++ (_start), (_flags)); \ ++ (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags),\ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + @@ -20752,16 +22014,21 @@ index 000000000000..31d2dda7ca05 + +#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ + for (; \ -+ (_k) = __bch2_btree_iter_peek(&(_iter), _flags), \ ++ (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + +/* new multiple iterator interface: */ + ++void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); ++void 
bch2_dump_trans_updates(struct btree_trans *); +void bch2_dump_trans_paths_updates(struct btree_trans *); -+void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); ++void __bch2_trans_init(struct btree_trans *, struct bch_fs *, ++ unsigned, size_t, const char *); +void bch2_trans_exit(struct btree_trans *); + ++#define bch2_trans_init(...) __bch2_trans_init(__VA_ARGS__, __func__) ++ +void bch2_btree_trans_to_text(struct printbuf *, struct bch_fs *); + +void bch2_fs_btree_iter_exit(struct bch_fs *); @@ -20770,10 +22037,10 @@ index 000000000000..31d2dda7ca05 +#endif /* _BCACHEFS_BTREE_ITER_H */ diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c new file mode 100644 -index 000000000000..4f1bc1d165aa +index 000000000000..f5a942b6bbf7 --- /dev/null +++ b/fs/bcachefs/btree_key_cache.c -@@ -0,0 +1,736 @@ +@@ -0,0 +1,743 @@ + +#include "bcachefs.h" +#include "btree_cache.h" @@ -20922,28 +22189,32 @@ index 000000000000..4f1bc1d165aa +} + +static struct bkey_cached * -+btree_key_cache_create(struct btree_key_cache *c, ++btree_key_cache_create(struct bch_fs *c, + enum btree_id btree_id, + struct bpos pos) +{ ++ struct btree_key_cache *bc = &c->btree_key_cache; + struct bkey_cached *ck; + bool was_new = true; + -+ ck = bkey_cached_alloc(c); ++ ck = bkey_cached_alloc(bc); + + if (unlikely(!ck)) { -+ ck = bkey_cached_reuse(c); -+ if (unlikely(!ck)) ++ ck = bkey_cached_reuse(bc); ++ if (unlikely(!ck)) { ++ bch_err(c, "error allocating memory for key cache item, btree %s", ++ bch2_btree_ids[btree_id]); + return ERR_PTR(-ENOMEM); ++ } + + was_new = false; ++ } else { ++ if (btree_id == BTREE_ID_subvolumes) ++ six_lock_pcpu_alloc(&ck->c.lock); ++ else ++ six_lock_pcpu_free(&ck->c.lock); + } + -+ if (btree_id == BTREE_ID_subvolumes) -+ six_lock_pcpu_alloc(&ck->c.lock); -+ else -+ six_lock_pcpu_free(&ck->c.lock); -+ + ck->c.level = 0; + ck->c.btree_id = btree_id; + ck->key.btree_id = btree_id; @@ -20951,7 +22222,7 @@ index 
000000000000..4f1bc1d165aa + ck->valid = false; + ck->flags = 1U << BKEY_CACHED_ACCESSED; + -+ if (unlikely(rhashtable_lookup_insert_fast(&c->table, ++ if (unlikely(rhashtable_lookup_insert_fast(&bc->table, + &ck->hash, + bch2_btree_key_cache_params))) { + /* We raced with another fill: */ @@ -20961,15 +22232,15 @@ index 000000000000..4f1bc1d165aa + six_unlock_intent(&ck->c.lock); + kfree(ck); + } else { -+ mutex_lock(&c->lock); -+ bkey_cached_free(c, ck); -+ mutex_unlock(&c->lock); ++ mutex_lock(&bc->lock); ++ bkey_cached_free(bc, ck); ++ mutex_unlock(&bc->lock); + } + + return NULL; + } + -+ atomic_long_inc(&c->nr_keys); ++ atomic_long_inc(&bc->nr_keys); + + six_unlock_write(&ck->c.lock); + @@ -20980,21 +22251,24 @@ index 000000000000..4f1bc1d165aa + struct btree_path *ck_path, + struct bkey_cached *ck) +{ -+ struct btree_iter iter; ++ struct btree_path *path; + struct bkey_s_c k; + unsigned new_u64s = 0; + struct bkey_i *new_k = NULL; ++ struct bkey u; + int ret; + -+ bch2_trans_iter_init(trans, &iter, ck->key.btree_id, -+ ck->key.pos, BTREE_ITER_SLOTS); -+ k = bch2_btree_iter_peek_slot(&iter); -+ ret = bkey_err(k); ++ path = bch2_path_get(trans, ck->key.btree_id, ++ ck->key.pos, 0, 0, 0, _THIS_IP_); ++ ret = bch2_btree_path_traverse(trans, path, 0); + if (ret) + goto err; + ++ k = bch2_btree_path_peek_slot(path, &u); ++ + if (!bch2_btree_node_relock(trans, ck_path, 0)) { -+ trace_transaction_restart_ip(trans->ip, _THIS_IP_); ++ trace_trans_restart_relock_key_cache_fill(trans->fn, ++ _THIS_IP_, ck_path->btree_id, &ck_path->pos); + ret = btree_trans_restart(trans); + goto err; + } @@ -21009,6 +22283,8 @@ index 000000000000..4f1bc1d165aa + new_u64s = roundup_pow_of_two(new_u64s); + new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS); + if (!new_k) { ++ bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", ++ bch2_btree_ids[ck->key.btree_id], new_u64s); + ret = -ENOMEM; + goto err; + } @@ -21030,9 +22306,9 @@ index 
000000000000..4f1bc1d165aa + bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b); + + /* We're not likely to need this iterator again: */ -+ set_btree_iter_dontneed(&iter); ++ path->preserve = false; +err: -+ bch2_trans_iter_exit(trans, &iter); ++ bch2_path_put(trans, path, 0); + return ret; +} + @@ -21069,15 +22345,14 @@ index 000000000000..4f1bc1d165aa + return 0; + } + -+ ck = btree_key_cache_create(&c->btree_key_cache, -+ path->btree_id, path->pos); ++ ck = btree_key_cache_create(c, path->btree_id, path->pos); + ret = PTR_ERR_OR_ZERO(ck); + if (ret) + goto err; + if (!ck) + goto retry; + -+ mark_btree_node_locked(path, 0, SIX_LOCK_intent); ++ mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); + path->locks_want = 1; + } else { + enum six_lock_type lock_want = __btree_lock_want(path, 0); @@ -21088,7 +22363,6 @@ index 000000000000..4f1bc1d165aa + if (!trans->restarted) + goto retry; + -+ trace_transaction_restart_ip(trans->ip, _THIS_IP_); + ret = -EINTR; + goto err; + } @@ -21099,7 +22373,7 @@ index 000000000000..4f1bc1d165aa + goto retry; + } + -+ mark_btree_node_locked(path, 0, lock_want); ++ mark_btree_node_locked(trans, path, 0, lock_want); + } + + path->l[0].lock_seq = ck->c.lock.state.seq; @@ -21108,7 +22382,7 @@ index 000000000000..4f1bc1d165aa + if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) { + if (!path->locks_want && + !__bch2_btree_path_upgrade(trans, path, 1)) { -+ trace_transaction_restart_ip(trans->ip, _THIS_IP_); ++ trace_transaction_restart_ip(trans->fn, _THIS_IP_); + ret = btree_trans_restart(trans); + goto err; + } @@ -21154,21 +22428,27 @@ index 000000000000..4f1bc1d165aa + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_CACHED_NOCREATE| + BTREE_ITER_INTENT); ++ b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE; ++ + ret = bch2_btree_iter_traverse(&c_iter); + if (ret) + goto out; + + ck = (void *) c_iter.path->l[0].b; -+ if (!ck || -+ (journal_seq && ck->journal.seq != journal_seq)) ++ if (!ck) + goto out; + + if 
(!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { -+ if (!evict) -+ goto out; -+ goto evict; ++ if (evict) ++ goto evict; ++ goto out; + } + ++ BUG_ON(!ck->valid); ++ ++ if (journal_seq && ck->journal.seq != journal_seq) ++ goto out; ++ + /* + * Since journal reclaim depends on us making progress here, and the + * allocator/copygc depend on journal reclaim making progress, we need @@ -21176,6 +22456,7 @@ index 000000000000..4f1bc1d165aa + * */ + ret = bch2_btree_iter_traverse(&b_iter) ?: + bch2_trans_update(trans, &b_iter, ck->k, ++ BTREE_UPDATE_KEY_CACHE_RECLAIM| + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + BTREE_TRIGGER_NORUN) ?: + bch2_trans_commit(trans, NULL, NULL, @@ -21183,7 +22464,7 @@ index 000000000000..4f1bc1d165aa + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + (ck->journal.seq == journal_last_seq(j) -+ ? BTREE_INSERT_JOURNAL_RESERVED ++ ? JOURNAL_WATERMARK_reserved + : 0)| + commit_flags); + if (ret) { @@ -21317,14 +22598,6 @@ index 000000000000..4f1bc1d165aa + return true; +} + -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_btree_key_cache_verify_clean(struct btree_trans *trans, -+ enum btree_id id, struct bpos pos) -+{ -+ BUG_ON(bch2_btree_key_cache_find(trans->c, id, pos)); -+} -+#endif -+ +static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ @@ -21438,11 +22711,12 @@ index 000000000000..4f1bc1d165aa + + rcu_read_lock(); + tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); -+ for (i = 0; i < tbl->size; i++) -+ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { -+ bkey_cached_evict(bc, ck); -+ list_add(&ck->list, &bc->freed); -+ } ++ if (tbl) ++ for (i = 0; i < tbl->size; i++) ++ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { ++ bkey_cached_evict(bc, ck); ++ list_add(&ck->list, &bc->freed); ++ } + rcu_read_unlock(); + + list_for_each_entry_safe(ck, n, &bc->freed, list) { @@ -21512,10 +22786,10 @@ index 000000000000..4f1bc1d165aa +} diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h 
new file mode 100644 -index 000000000000..0768ef3ca776 +index 000000000000..fd29c14c5626 --- /dev/null +++ b/fs/bcachefs/btree_key_cache.h -@@ -0,0 +1,54 @@ +@@ -0,0 +1,45 @@ +#ifndef _BCACHEFS_BTREE_KEY_CACHE_H +#define _BCACHEFS_BTREE_KEY_CACHE_H + @@ -21534,8 +22808,7 @@ index 000000000000..0768ef3ca776 + size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); + size_t max_dirty = 4096 + (nr_keys * 3) / 4; + -+ return nr_dirty > max_dirty && -+ test_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); ++ return nr_dirty > max_dirty; +} + +int bch2_btree_key_cache_journal_flush(struct journal *, @@ -21551,14 +22824,6 @@ index 000000000000..0768ef3ca776 + struct btree_path *, struct bkey_i *); +int bch2_btree_key_cache_flush(struct btree_trans *, + enum btree_id, struct bpos); -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_btree_key_cache_verify_clean(struct btree_trans *, -+ enum btree_id, struct bpos); -+#else -+static inline void -+bch2_btree_key_cache_verify_clean(struct btree_trans *trans, -+ enum btree_id id, struct bpos pos) {} -+#endif + +void bch2_fs_btree_key_cache_exit(struct btree_key_cache *); +void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *); @@ -21572,10 +22837,10 @@ index 000000000000..0768ef3ca776 +#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h new file mode 100644 -index 000000000000..d599008c5fc1 +index 000000000000..67c970d727ac --- /dev/null +++ b/fs/bcachefs/btree_locking.h -@@ -0,0 +1,243 @@ +@@ -0,0 +1,259 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_LOCKING_H +#define _BCACHEFS_BTREE_LOCKING_H @@ -21636,7 +22901,8 @@ index 000000000000..d599008c5fc1 + path->nodes_intent_locked &= ~(1 << level); +} + -+static inline void mark_btree_node_locked(struct btree_path *path, ++static inline void mark_btree_node_locked(struct btree_trans *trans, ++ struct btree_path *path, + unsigned level, + enum six_lock_type type) +{ @@ -21644,14 +22910,17 @@ 
index 000000000000..d599008c5fc1 + BUILD_BUG_ON(SIX_LOCK_read != 0); + BUILD_BUG_ON(SIX_LOCK_intent != 1); + ++ BUG_ON(trans->in_traverse_all && path->sorted_idx > trans->traverse_all_idx); ++ + path->nodes_locked |= 1 << level; + path->nodes_intent_locked |= type << level; +} + -+static inline void mark_btree_node_intent_locked(struct btree_path *path, ++static inline void mark_btree_node_intent_locked(struct btree_trans *trans, ++ struct btree_path *path, + unsigned level) +{ -+ mark_btree_node_locked(path, level, SIX_LOCK_intent); ++ mark_btree_node_locked(trans, path, level, SIX_LOCK_intent); +} + +static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level) @@ -21706,23 +22975,35 @@ index 000000000000..d599008c5fc1 + } +} + -+/* -+ * wrapper around six locks that just traces lock contended time -+ */ -+static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b, -+ enum six_lock_type type) ++static inline bool btree_node_lock_type(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b, ++ struct bpos pos, unsigned level, ++ enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p) +{ -+ u64 start_time = local_clock(); ++ struct bch_fs *c = trans->c; ++ u64 start_time; ++ bool ret; + -+ six_lock_type(&b->c.lock, type, NULL, NULL); -+ bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); -+} ++ if (six_trylock_type(&b->c.lock, type)) ++ return true; + -+static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b, -+ enum six_lock_type type) -+{ -+ if (!six_trylock_type(&b->c.lock, type)) -+ __btree_node_lock_type(c, b, type); ++ start_time = local_clock(); ++ ++ trans->locking_path_idx = path->idx; ++ trans->locking_pos = pos; ++ trans->locking_btree_id = path->btree_id; ++ trans->locking_level = level; ++ trans->locking_lock_type = type; ++ trans->locking = b; ++ ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0; ++ trans->locking = 
NULL; ++ ++ if (ret) ++ bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); ++ ++ return ret; +} + +/* @@ -21821,10 +23102,10 @@ index 000000000000..d599008c5fc1 + diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h new file mode 100644 -index 000000000000..0d0a719f738f +index 000000000000..3438e089dba0 --- /dev/null +++ b/fs/bcachefs/btree_types.h -@@ -0,0 +1,700 @@ +@@ -0,0 +1,713 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_TYPES_H +#define _BCACHEFS_BTREE_TYPES_H @@ -21835,6 +23116,7 @@ index 000000000000..0d0a719f738f + +#include "bkey_methods.h" +#include "buckets_types.h" ++#include "darray.h" +#include "journal_types.h" + +struct open_bucket; @@ -21979,7 +23261,8 @@ index 000000000000..0d0a719f738f + struct mutex lock; + struct list_head live; + struct list_head freeable; -+ struct list_head freed; ++ struct list_head freed_pcpu; ++ struct list_head freed_nonpcpu; + + /* Number of elements in live + freeable lists */ + unsigned used; @@ -22029,14 +23312,16 @@ index 000000000000..0d0a719f738f + */ +#define BTREE_ITER_IS_EXTENTS (1 << 4) +#define BTREE_ITER_NOT_EXTENTS (1 << 5) -+#define BTREE_ITER_ERROR (1 << 6) -+#define BTREE_ITER_CACHED (1 << 7) -+#define BTREE_ITER_CACHED_NOFILL (1 << 8) -+#define BTREE_ITER_CACHED_NOCREATE (1 << 9) ++#define BTREE_ITER_CACHED (1 << 6) ++#define BTREE_ITER_CACHED_NOFILL (1 << 7) ++#define BTREE_ITER_CACHED_NOCREATE (1 << 8) ++#define BTREE_ITER_WITH_KEY_CACHE (1 << 9) +#define BTREE_ITER_WITH_UPDATES (1 << 10) -+#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 11) -+#define BTREE_ITER_ALL_SNAPSHOTS (1 << 12) -+#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 13) ++#define BTREE_ITER_WITH_JOURNAL (1 << 11) ++#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 12) ++#define BTREE_ITER_ALL_SNAPSHOTS (1 << 13) ++#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 14) ++#define BTREE_ITER_NOPRESERVE (1 << 15) + +enum btree_path_uptodate { + BTREE_ITER_UPTODATE = 0, @@ -22101,6 +23386,8 @@ index 
000000000000..0d0a719f738f +struct btree_iter { + struct btree_trans *trans; + struct btree_path *path; ++ struct btree_path *update_path; ++ struct btree_path *key_cache_path; + + enum btree_id btree_id:4; + unsigned min_depth:4; @@ -22118,6 +23405,9 @@ index 000000000000..0d0a719f738f + * bch2_btree_iter_next_slot() can correctly advance pos. + */ + struct bkey k; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ unsigned long ip_allocated; ++#endif +}; + +struct btree_key_cache { @@ -22145,7 +23435,7 @@ index 000000000000..0d0a719f738f + struct btree_bkey_cached_common c; + + unsigned long flags; -+ u8 u64s; ++ u16 u64s; + bool valid; + u32 btree_trans_barrier_seq; + struct bkey_cached_key key; @@ -22163,12 +23453,20 @@ index 000000000000..0d0a719f738f + unsigned flags; + u8 bkey_type; + enum btree_id btree_id:8; -+ u8 level; ++ u8 level:4; + bool cached:1; + bool insert_trigger_run:1; + bool overwrite_trigger_run:1; ++ /* ++ * @old_k may be a key from the journal; @old_btree_u64s always refers ++ * to the size of the key being overwritten in the btree: ++ */ ++ u8 old_btree_u64s; + struct bkey_i *k; + struct btree_path *path; ++ /* key being overwritten: */ ++ struct bkey old_k; ++ const struct bch_val *old_v; + unsigned long ip_allocated; +}; + @@ -22190,23 +23488,26 @@ index 000000000000..0d0a719f738f + +struct btree_trans { + struct bch_fs *c; -+#ifdef CONFIG_BCACHEFS_DEBUG ++ const char *fn; + struct list_head list; + struct btree *locking; + unsigned locking_path_idx; + struct bpos locking_pos; + u8 locking_btree_id; + u8 locking_level; ++ u8 locking_lock_type; + pid_t pid; -+#endif -+ unsigned long ip; + int srcu_idx; + + u8 nr_sorted; + u8 nr_updates; ++ u8 traverse_all_idx; + bool used_mempool:1; + bool in_traverse_all:1; + bool restarted:1; ++ bool memory_allocation_failure:1; ++ bool journal_transaction_names:1; ++ bool is_initial_gc:1; + /* + * For when bch2_trans_update notices we'll be splitting a compressed + * extent: @@ -22225,8 +23526,7 @@ index 
000000000000..0d0a719f738f + + /* update path: */ + struct btree_trans_commit_hook *hooks; -+ struct jset_entry *extra_journal_entries; -+ unsigned extra_journal_entry_u64s; ++ DARRAY(u64) extra_journal_entries; + struct journal_entry_pin *journal_pin; + + struct journal_res journal_res; @@ -22239,7 +23539,31 @@ index 000000000000..0d0a719f738f + struct replicas_delta_list *fs_usage_deltas; +}; + -+#define BTREE_FLAG(flag) \ ++#define BTREE_FLAGS() \ ++ x(read_in_flight) \ ++ x(read_error) \ ++ x(dirty) \ ++ x(need_write) \ ++ x(write_blocked) \ ++ x(will_make_reachable) \ ++ x(noevict) \ ++ x(write_idx) \ ++ x(accessed) \ ++ x(write_in_flight) \ ++ x(write_in_flight_inner) \ ++ x(just_written) \ ++ x(dying) \ ++ x(fake) \ ++ x(need_rewrite) \ ++ x(never_write) ++ ++enum btree_flags { ++#define x(flag) BTREE_NODE_##flag, ++ BTREE_FLAGS() ++#undef x ++}; ++ ++#define x(flag) \ +static inline bool btree_node_ ## flag(struct btree *b) \ +{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ + \ @@ -22249,36 +23573,8 @@ index 000000000000..0d0a719f738f +static inline void clear_btree_node_ ## flag(struct btree *b) \ +{ clear_bit(BTREE_NODE_ ## flag, &b->flags); } + -+enum btree_flags { -+ BTREE_NODE_read_in_flight, -+ BTREE_NODE_read_error, -+ BTREE_NODE_dirty, -+ BTREE_NODE_need_write, -+ BTREE_NODE_noevict, -+ BTREE_NODE_write_idx, -+ BTREE_NODE_accessed, -+ BTREE_NODE_write_in_flight, -+ BTREE_NODE_write_in_flight_inner, -+ BTREE_NODE_just_written, -+ BTREE_NODE_dying, -+ BTREE_NODE_fake, -+ BTREE_NODE_need_rewrite, -+ BTREE_NODE_never_write, -+}; -+ -+BTREE_FLAG(read_in_flight); -+BTREE_FLAG(read_error); -+BTREE_FLAG(need_write); -+BTREE_FLAG(noevict); -+BTREE_FLAG(write_idx); -+BTREE_FLAG(accessed); -+BTREE_FLAG(write_in_flight); -+BTREE_FLAG(write_in_flight_inner); -+BTREE_FLAG(just_written); -+BTREE_FLAG(dying); -+BTREE_FLAG(fake); -+BTREE_FLAG(need_rewrite); -+BTREE_FLAG(never_write); ++BTREE_FLAGS() ++#undef x + +static inline struct btree_write 
*btree_current_write(struct btree *b) +{ @@ -22408,24 +23704,9 @@ index 000000000000..0d0a719f738f + return __btree_node_type(b->c.level, b->c.btree_id); +} + -+static inline bool btree_node_type_is_extents(enum btree_node_type type) -+{ -+ switch (type) { -+ case BKEY_TYPE_extents: -+ case BKEY_TYPE_reflink: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+static inline bool btree_node_is_extents(struct btree *b) -+{ -+ return btree_node_type_is_extents(btree_node_type(b)); -+} -+ +#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ + ((1U << BKEY_TYPE_extents)| \ ++ (1U << BKEY_TYPE_alloc)| \ + (1U << BKEY_TYPE_inodes)| \ + (1U << BKEY_TYPE_stripes)| \ + (1U << BKEY_TYPE_reflink)| \ @@ -22441,6 +23722,16 @@ index 000000000000..0d0a719f738f + (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ + BTREE_NODE_TYPE_HAS_MEM_TRIGGERS) + ++#define BTREE_ID_IS_EXTENTS \ ++ ((1U << BTREE_ID_extents)| \ ++ (1U << BTREE_ID_reflink)| \ ++ (1U << BTREE_ID_freespace)) ++ ++static inline bool btree_node_type_is_extents(enum btree_node_type type) ++{ ++ return (1U << type) & BTREE_ID_IS_EXTENTS; ++} ++ +#define BTREE_ID_HAS_SNAPSHOTS \ + ((1U << BTREE_ID_extents)| \ + (1U << BTREE_ID_inodes)| \ @@ -22458,6 +23749,7 @@ index 000000000000..0d0a719f738f + +enum btree_update_flags { + __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, ++ __BTREE_UPDATE_KEY_CACHE_RECLAIM, + + __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ + @@ -22470,6 +23762,7 @@ index 000000000000..0d0a719f738f +}; + +#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ++#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM) + +#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) + @@ -22484,6 +23777,7 @@ index 000000000000..0d0a719f738f + ((1U << KEY_TYPE_alloc)| \ + (1U << KEY_TYPE_alloc_v2)| \ + (1U << KEY_TYPE_alloc_v3)| \ ++ (1U << KEY_TYPE_alloc_v4)| \ + (1U << KEY_TYPE_stripe)| \ + (1U << KEY_TYPE_inode)| \ + (1U << KEY_TYPE_inode_v2)| \ @@ -22527,10 
+23821,10 @@ index 000000000000..0d0a719f738f +#endif /* _BCACHEFS_BTREE_TYPES_H */ diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h new file mode 100644 -index 000000000000..0268dd74f0ab +index 000000000000..ad13b0739a68 --- /dev/null +++ b/fs/bcachefs/btree_update.h -@@ -0,0 +1,155 @@ +@@ -0,0 +1,141 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_UPDATE_H +#define _BCACHEFS_BTREE_UPDATE_H @@ -22549,12 +23843,12 @@ index 000000000000..0268dd74f0ab +void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); + +enum btree_insert_flags { -+ __BTREE_INSERT_NOFAIL, ++ /* First two bits for journal watermark: */ ++ __BTREE_INSERT_NOFAIL = 2, + __BTREE_INSERT_NOCHECK_RW, + __BTREE_INSERT_LAZY_RW, + __BTREE_INSERT_USE_RESERVE, + __BTREE_INSERT_JOURNAL_REPLAY, -+ __BTREE_INSERT_JOURNAL_RESERVED, + __BTREE_INSERT_JOURNAL_RECLAIM, + __BTREE_INSERT_NOWAIT, + __BTREE_INSERT_GC_LOCK_HELD, @@ -22574,9 +23868,6 @@ index 000000000000..0268dd74f0ab +/* Insert is for journal replay - don't get journal reservations: */ +#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY) + -+/* Indicates that we have pre-reserved space in the journal: */ -+#define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED) -+ +/* Insert is being called from journal reclaim path: */ +#define BTREE_INSERT_JOURNAL_RECLAIM (1 << __BTREE_INSERT_JOURNAL_RECLAIM) + @@ -22596,7 +23887,7 @@ index 000000000000..0268dd74f0ab +int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, + struct bpos, struct bpos, unsigned, u64 *); +int bch2_btree_delete_range(struct bch_fs *, enum btree_id, -+ struct bpos, struct bpos, u64 *); ++ struct bpos, struct bpos, unsigned, u64 *); + +int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, + struct btree *, unsigned); @@ -22606,12 +23897,18 @@ index 000000000000..0268dd74f0ab +int bch2_btree_node_update_key_get_iter(struct btree_trans *, + struct btree *, struct 
bkey_i *, bool); + -+int bch2_trans_update(struct btree_trans *, struct btree_iter *, -+ struct bkey_i *, enum btree_update_flags); ++int bch2_trans_update_extent(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *, enum btree_update_flags); ++ ++int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *, enum btree_update_flags); ++ +void bch2_trans_commit_hook(struct btree_trans *, + struct btree_trans_commit_hook *); +int __bch2_trans_commit(struct btree_trans *); + ++int bch2_trans_log_msg(struct btree_trans *, const char *); ++ +/** + * bch2_trans_commit - insert keys at given iterator positions + * @@ -22668,30 +23965,13 @@ index 000000000000..0268dd74f0ab + (_i) < (_trans)->updates + (_trans)->nr_updates; \ + (_i)++) + -+static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, -+ enum btree_id btree_id, -+ struct bpos pos) -+{ -+ struct btree_insert_entry *i; -+ -+ trans_for_each_update(trans, i) -+ if ((cmp_int(btree_id, i->btree_id) ?: -+ bpos_cmp(pos, i->k->k.p)) <= 0) { -+ if (btree_id == i->btree_id) -+ return i->k; -+ break; -+ } -+ -+ return NULL; -+} -+ +#endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c new file mode 100644 -index 000000000000..61c7757bd3ca +index 000000000000..42ae3b0c5839 --- /dev/null +++ b/fs/bcachefs/btree_update_interior.c -@@ -0,0 +1,2187 @@ +@@ -0,0 +1,2238 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -22710,6 +23990,7 @@ index 000000000000..61c7757bd3ca +#include "journal.h" +#include "journal_reclaim.h" +#include "keylist.h" ++#include "recovery.h" +#include "replicas.h" +#include "super-io.h" + @@ -22734,11 +24015,11 @@ index 000000000000..61c7757bd3ca + struct bkey_s_c k; + struct bkey_s_c_btree_ptr_v2 bp; + struct bkey unpacked; -+ char buf1[100], buf2[100]; ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + + BUG_ON(!b->c.level); + -+ if 
(!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)) ++ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) + return; + + bch2_btree_node_iter_init_from_start(&iter, b); @@ -22751,9 +24032,9 @@ index 000000000000..61c7757bd3ca + + if (bpos_cmp(next_node, bp.v->min_key)) { + bch2_dump_btree_node(c, b); -+ panic("expected next min_key %s got %s\n", -+ (bch2_bpos_to_text(&PBUF(buf1), next_node), buf1), -+ (bch2_bpos_to_text(&PBUF(buf2), bp.v->min_key), buf2)); ++ bch2_bpos_to_text(&buf1, next_node); ++ bch2_bpos_to_text(&buf2, bp.v->min_key); ++ panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf); + } + + bch2_btree_node_iter_advance(&iter, b); @@ -22761,9 +24042,9 @@ index 000000000000..61c7757bd3ca + if (bch2_btree_node_iter_end(&iter)) { + if (bpos_cmp(k.k->p, b->key.k.p)) { + bch2_dump_btree_node(c, b); -+ panic("expected end %s got %s\n", -+ (bch2_bpos_to_text(&PBUF(buf1), b->key.k.p), buf1), -+ (bch2_bpos_to_text(&PBUF(buf2), k.k->p), buf2)); ++ bch2_bpos_to_text(&buf1, b->key.k.p); ++ bch2_bpos_to_text(&buf2, k.k->p); ++ panic("expected end %s got %s\n", buf1.buf, buf2.buf); + } + break; + } @@ -22874,6 +24155,7 @@ index 000000000000..61c7757bd3ca +static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, + struct disk_reservation *res, + struct closure *cl, ++ bool interior_node, + unsigned flags) +{ + struct write_point *wp; @@ -22886,10 +24168,10 @@ index 000000000000..61c7757bd3ca + + if (flags & BTREE_INSERT_USE_RESERVE) { + nr_reserve = 0; -+ alloc_reserve = RESERVE_BTREE_MOVINGGC; ++ alloc_reserve = RESERVE_btree_movinggc; + } else { + nr_reserve = BTREE_NODE_RESERVE; -+ alloc_reserve = RESERVE_BTREE; ++ alloc_reserve = RESERVE_btree; + } + + mutex_lock(&c->btree_reserve_cache_lock); @@ -22917,12 +24199,12 @@ index 000000000000..61c7757bd3ca + if (IS_ERR(wp)) + return ERR_CAST(wp); + -+ if (wp->sectors_free < c->opts.btree_node_size) { ++ if (wp->sectors_free < btree_sectors(c)) { + struct open_bucket *ob; + unsigned i; + + 
open_bucket_for_each(c, &wp->ptrs, ob, i) -+ if (ob->sectors_free < c->opts.btree_node_size) ++ if (ob->sectors_free < btree_sectors(c)) + ob->sectors_free = 0; + + bch2_alloc_sectors_done(c, wp); @@ -22930,12 +24212,14 @@ index 000000000000..61c7757bd3ca + } + + bkey_btree_ptr_v2_init(&tmp.k); -+ bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size); ++ bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false); + + bch2_open_bucket_get(c, wp, &ob); + bch2_alloc_sectors_done(c, wp); +mem_alloc: -+ b = bch2_btree_node_mem_alloc(c); ++ b = bch2_btree_node_mem_alloc(c, interior_node); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); + + /* we hold cannibalize_lock: */ + BUG_ON(IS_ERR(b)); @@ -22951,15 +24235,19 @@ index 000000000000..61c7757bd3ca +{ + struct bch_fs *c = as->c; + struct btree *b; ++ struct prealloc_nodes *p = &as->prealloc_nodes[!!level]; + int ret; + + BUG_ON(level >= BTREE_MAX_DEPTH); -+ BUG_ON(!as->nr_prealloc_nodes); ++ BUG_ON(!p->nr); + -+ b = as->prealloc_nodes[--as->nr_prealloc_nodes]; ++ b = p->b[--p->nr]; ++ ++ six_lock_intent(&b->c.lock, NULL, NULL); ++ six_lock_write(&b->c.lock, NULL, NULL); + + set_btree_node_accessed(b); -+ set_btree_node_dirty(c, b); ++ set_btree_node_dirty_acct(c, b); + set_btree_node_need_write(b); + + bch2_bset_init_first(b, &b->data->keys); @@ -23065,70 +24353,94 @@ index 000000000000..61c7757bd3ca +static void bch2_btree_reserve_put(struct btree_update *as) +{ + struct bch_fs *c = as->c; ++ struct prealloc_nodes *p; + + mutex_lock(&c->btree_reserve_cache_lock); + -+ while (as->nr_prealloc_nodes) { -+ struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes]; ++ for (p = as->prealloc_nodes; ++ p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes); ++ p++) { ++ while (p->nr) { ++ struct btree *b = p->b[--p->nr]; + -+ six_unlock_write(&b->c.lock); ++ six_lock_intent(&b->c.lock, NULL, NULL); ++ six_lock_write(&b->c.lock, NULL, NULL); + -+ if (c->btree_reserve_cache_nr 
< -+ ARRAY_SIZE(c->btree_reserve_cache)) { -+ struct btree_alloc *a = -+ &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; ++ if (c->btree_reserve_cache_nr < ++ ARRAY_SIZE(c->btree_reserve_cache)) { ++ struct btree_alloc *a = ++ &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; + -+ a->ob = b->ob; -+ b->ob.nr = 0; -+ bkey_copy(&a->k, &b->key); -+ } else { -+ bch2_open_buckets_put(c, &b->ob); ++ a->ob = b->ob; ++ b->ob.nr = 0; ++ bkey_copy(&a->k, &b->key); ++ } else { ++ bch2_open_buckets_put(c, &b->ob); ++ } ++ ++ __btree_node_free(c, b); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); + } -+ -+ btree_node_lock_type(c, b, SIX_LOCK_write); -+ __btree_node_free(c, b); -+ six_unlock_write(&b->c.lock); -+ -+ six_unlock_intent(&b->c.lock); + } + + mutex_unlock(&c->btree_reserve_cache_lock); +} + -+static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes, -+ unsigned flags, struct closure *cl) ++static int bch2_btree_reserve_get(struct btree_update *as, ++ unsigned nr_nodes[2], ++ unsigned flags) +{ + struct bch_fs *c = as->c; ++ struct closure cl; + struct btree *b; ++ unsigned interior; + int ret; + -+ BUG_ON(nr_nodes > BTREE_RESERVE_MAX); ++ closure_init_stack(&cl); ++retry: ++ ++ BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX); + + /* + * Protects reaping from the btree node cache and using the btree node + * open bucket reserve: ++ * ++ * BTREE_INSERT_NOWAIT only applies to btree node allocation, not ++ * blocking on this lock: + */ -+ ret = bch2_btree_cache_cannibalize_lock(c, cl); ++ ret = bch2_btree_cache_cannibalize_lock(c, &cl); + if (ret) -+ return ret; ++ goto err; + -+ while (as->nr_prealloc_nodes < nr_nodes) { -+ b = __bch2_btree_node_alloc(c, &as->disk_res, -+ flags & BTREE_INSERT_NOWAIT -+ ? 
NULL : cl, flags); -+ if (IS_ERR(b)) { -+ ret = PTR_ERR(b); -+ goto err_free; ++ for (interior = 0; interior < 2; interior++) { ++ struct prealloc_nodes *p = as->prealloc_nodes + interior; ++ ++ while (p->nr < nr_nodes[interior]) { ++ b = __bch2_btree_node_alloc(c, &as->disk_res, ++ flags & BTREE_INSERT_NOWAIT ++ ? NULL : &cl, ++ interior, flags); ++ if (IS_ERR(b)) { ++ ret = PTR_ERR(b); ++ goto err; ++ } ++ ++ p->b[p->nr++] = b; + } -+ -+ as->prealloc_nodes[as->nr_prealloc_nodes++] = b; + } + + bch2_btree_cache_cannibalize_unlock(c); ++ closure_sync(&cl); + return 0; -+err_free: ++err: + bch2_btree_cache_cannibalize_unlock(c); -+ trace_btree_reserve_get_fail(c, nr_nodes, cl); ++ closure_sync(&cl); ++ ++ if (ret == -EAGAIN) ++ goto retry; ++ ++ trace_btree_reserve_get_fail(c, nr_nodes[0] + nr_nodes[1], &cl); + return ret; +} + @@ -23149,15 +24461,23 @@ index 000000000000..61c7757bd3ca + bch2_disk_reservation_put(c, &as->disk_res); + bch2_btree_reserve_put(as); + ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total], ++ as->start_time); ++ + mutex_lock(&c->btree_interior_update_lock); + list_del(&as->unwritten_list); + list_del(&as->list); -+ mutex_unlock(&c->btree_interior_update_lock); + + closure_debug_destroy(&as->cl); + mempool_free(as, &c->btree_interior_update_pool); + ++ /* ++ * Have to do the wakeup with btree_interior_update_lock still held, ++ * since being on btree_interior_update_list is our ref on @c: ++ */ + closure_wake_up(&c->btree_interior_update_wait); ++ ++ mutex_unlock(&c->btree_interior_update_lock); +} + +static void btree_update_will_delete_key(struct btree_update *as, @@ -23186,24 +24506,25 @@ index 000000000000..61c7757bd3ca + struct bkey_i *k; + int ret; + -+ trans->extra_journal_entries = (void *) &as->journal_entries[0]; -+ trans->extra_journal_entry_u64s = as->journal_u64s; ++ ret = darray_make_room(trans->extra_journal_entries, as->journal_u64s); ++ if (ret) ++ return ret; ++ ++ 
memcpy(&darray_top(trans->extra_journal_entries), ++ as->journal_entries, ++ as->journal_u64s * sizeof(u64)); ++ trans->extra_journal_entries.nr += as->journal_u64s; ++ + trans->journal_pin = &as->journal; + + for_each_keylist_key(&as->new_keys, k) { -+ ret = bch2_trans_mark_key(trans, -+ bkey_s_c_null, -+ bkey_i_to_s_c(k), -+ BTREE_TRIGGER_INSERT); ++ ret = bch2_trans_mark_new(trans, k, 0); + if (ret) + return ret; + } + + for_each_keylist_key(&as->old_keys, k) { -+ ret = bch2_trans_mark_key(trans, -+ bkey_i_to_s_c(k), -+ bkey_s_c_null, -+ BTREE_TRIGGER_OVERWRITE); ++ ret = bch2_trans_mark_old(trans, bkey_i_to_s_c(k), 0); + if (ret) + return ret; + } @@ -23231,8 +24552,6 @@ index 000000000000..61c7757bd3ca + if (ret) + goto err; + -+ BUG_ON(!journal_pin_active(&as->journal)); -+ + /* + * Wait for any in flight writes to finish before we free the old nodes + * on disk: @@ -23268,7 +24587,7 @@ index 000000000000..61c7757bd3ca + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_JOURNAL_RECLAIM| -+ BTREE_INSERT_JOURNAL_RESERVED, ++ JOURNAL_WATERMARK_reserved, + btree_update_nodes_written_trans(&trans, as)); + bch2_trans_exit(&trans); + @@ -23288,11 +24607,13 @@ index 000000000000..61c7757bd3ca + * we're in journal error state: + */ + -+ btree_node_lock_type(c, b, SIX_LOCK_intent); -+ btree_node_lock_type(c, b, SIX_LOCK_write); ++ six_lock_intent(&b->c.lock, NULL, NULL); ++ six_lock_write(&b->c.lock, NULL, NULL); + mutex_lock(&c->btree_interior_update_lock); + + list_del(&as->write_blocked_list); ++ if (list_empty(&b->write_blocked)) ++ clear_btree_node_write_blocked(b); + + /* + * Node might have been freed, recheck under @@ -23337,13 +24658,14 @@ index 000000000000..61c7757bd3ca + + BUG_ON(b->will_make_reachable != (unsigned long) as); + b->will_make_reachable = 0; ++ clear_btree_node_will_make_reachable(b); + } + mutex_unlock(&c->btree_interior_update_lock); + + for (i = 0; i < as->nr_new_nodes; i++) { + b = as->new_nodes[i]; + -+ 
btree_node_lock_type(c, b, SIX_LOCK_read); ++ six_lock_read(&b->c.lock, NULL, NULL); + btree_node_write_if_need(c, b, SIX_LOCK_read); + six_unlock_read(&b->c.lock); + } @@ -23403,6 +24725,8 @@ index 000000000000..61c7757bd3ca + + as->mode = BTREE_INTERIOR_UPDATING_NODE; + as->b = b; ++ ++ set_btree_node_write_blocked(b); + list_add(&as->write_blocked_list, &b->write_blocked); + + mutex_unlock(&c->btree_interior_update_lock); @@ -23468,6 +24792,7 @@ index 000000000000..61c7757bd3ca + + as->new_nodes[as->nr_new_nodes++] = b; + b->will_make_reachable = 1UL|(unsigned long) as; ++ set_btree_node_will_make_reachable(b); + + mutex_unlock(&c->btree_interior_update_lock); + @@ -23490,6 +24815,7 @@ index 000000000000..61c7757bd3ca + * xchg() is for synchronization with bch2_btree_complete_write: + */ + v = xchg(&b->will_make_reachable, 0); ++ clear_btree_node_will_make_reachable(b); + as = (struct btree_update *) (v & ~1UL); + + if (!as) { @@ -23555,7 +24881,7 @@ index 000000000000..61c7757bd3ca + closure_wake_up(&c->btree_interior_update_wait); + } + -+ clear_btree_node_dirty(c, b); ++ clear_btree_node_dirty_acct(c, b); + clear_btree_node_need_write(b); + + /* @@ -23596,6 +24922,9 @@ index 000000000000..61c7757bd3ca + +static void bch2_btree_update_done(struct btree_update *as) +{ ++ struct bch_fs *c = as->c; ++ u64 start_time = as->start_time; ++ + BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); + + if (as->took_gc_lock) @@ -23606,34 +24935,50 @@ index 000000000000..61c7757bd3ca + + continue_at(&as->cl, btree_update_set_nodes_written, + as->c->btree_interior_update_worker); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground], ++ start_time); +} + +static struct btree_update * +bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, -+ unsigned level, unsigned nr_nodes, unsigned flags) ++ unsigned level, bool split, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree_update *as; -+ struct closure cl; ++ u64 
start_time = local_clock(); + int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) + ? BCH_DISK_RESERVATION_NOFAIL : 0; -+ int journal_flags = 0; ++ unsigned nr_nodes[2] = { 0, 0 }; ++ unsigned update_level = level; ++ int journal_flags = flags & JOURNAL_WATERMARK_MASK; + int ret = 0; + + BUG_ON(!path->should_be_locked); + -+ if (flags & BTREE_INSERT_JOURNAL_RESERVED) -+ journal_flags |= JOURNAL_RES_GET_RESERVED; ++ if (flags & BTREE_INSERT_JOURNAL_RECLAIM) ++ journal_flags |= JOURNAL_RES_GET_NONBLOCK; + -+ closure_init_stack(&cl); -+retry: ++ while (1) { ++ nr_nodes[!!update_level] += 1 + split; ++ update_level++; ++ ++ if (!btree_path_node(path, update_level)) ++ break; ++ ++ /* ++ * XXX: figure out how far we might need to split, ++ * instead of locking/reserving all the way to the root: ++ */ ++ split = update_level + 1 < BTREE_MAX_DEPTH; ++ } ++ ++ /* Might have to allocate a new root: */ ++ if (update_level < BTREE_MAX_DEPTH) ++ nr_nodes[1] += 1; + -+ /* -+ * XXX: figure out how far we might need to split, -+ * instead of locking/reserving all the way to the root: -+ */ + if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) { -+ trace_trans_restart_iter_upgrade(trans->ip, _RET_IP_, ++ trace_trans_restart_iter_upgrade(trans->fn, _RET_IP_, + path->btree_id, &path->pos); + ret = btree_trans_restart(trans); + return ERR_PTR(ret); @@ -23654,6 +24999,7 @@ index 000000000000..61c7757bd3ca + memset(as, 0, sizeof(*as)); + closure_init(&as->cl, NULL); + as->c = c; ++ as->start_time = start_time; + as->mode = BTREE_INTERIOR_NO_UPDATE; + as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD); + as->btree_id = path->btree_id; @@ -23680,60 +25026,37 @@ index 000000000000..61c7757bd3ca + if (ret) + goto err; + ++ bch2_trans_unlock(trans); ++ + ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, + BTREE_UPDATE_JOURNAL_RES, -+ journal_flags|JOURNAL_RES_GET_NONBLOCK); -+ if (ret == -EAGAIN) { -+ bch2_trans_unlock(trans); -+ -+ if (flags & BTREE_INSERT_JOURNAL_RECLAIM) 
{ -+ bch2_btree_update_free(as); -+ btree_trans_restart(trans); -+ return ERR_PTR(ret); -+ } -+ -+ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, -+ BTREE_UPDATE_JOURNAL_RES, -+ journal_flags); -+ if (ret) { -+ trace_trans_restart_journal_preres_get(trans->ip, _RET_IP_); -+ goto err; -+ } -+ -+ if (!bch2_trans_relock(trans)) { -+ ret = -EINTR; -+ goto err; -+ } ++ journal_flags); ++ if (ret) { ++ bch2_btree_update_free(as); ++ trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_); ++ btree_trans_restart(trans); ++ return ERR_PTR(ret); + } + + ret = bch2_disk_reservation_get(c, &as->disk_res, -+ nr_nodes * c->opts.btree_node_size, ++ (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c), + c->opts.metadata_replicas, + disk_res_flags); + if (ret) + goto err; + -+ ret = bch2_btree_reserve_get(as, nr_nodes, flags, &cl); ++ ret = bch2_btree_reserve_get(as, nr_nodes, flags); + if (ret) + goto err; + -+ bch2_journal_pin_add(&c->journal, -+ atomic64_read(&c->journal.seq), -+ &as->journal, NULL); ++ if (!bch2_trans_relock(trans)) { ++ ret = -EINTR; ++ goto err; ++ } + + return as; +err: + bch2_btree_update_free(as); -+ -+ if (ret == -EAGAIN) { -+ bch2_trans_unlock(trans); -+ closure_sync(&cl); -+ ret = -EINTR; -+ } -+ -+ if (ret == -EINTR && bch2_trans_relock(trans)) -+ goto retry; -+ + return ERR_PTR(ret); +} + @@ -23783,8 +25106,7 @@ index 000000000000..61c7757bd3ca + struct btree *old; + + trace_btree_set_root(c, b); -+ BUG_ON(!b->written && -+ !test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)); ++ BUG_ON(!b->written); + + old = btree_node_root(c, b); + @@ -23824,13 +25146,17 @@ index 000000000000..61c7757bd3ca + BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && + !btree_ptr_sectors_written(insert)); + ++ if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) ++ bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); ++ + invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?: + bch2_bkey_in_btree_node(b, 
bkey_i_to_s_c(insert)); + if (invalid) { -+ char buf[160]; ++ struct printbuf buf = PRINTBUF; + -+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert)); -+ bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf, invalid); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); ++ bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf.buf, invalid); ++ printbuf_exit(&buf); + dump_stack(); + } + @@ -23848,7 +25174,7 @@ index 000000000000..61c7757bd3ca + bch2_btree_node_iter_advance(node_iter, b); + + bch2_btree_bset_insert_key(trans, path, b, node_iter, insert); -+ set_btree_node_dirty(c, b); ++ set_btree_node_dirty_acct(c, b); + set_btree_node_need_write(b); +} + @@ -24069,8 +25395,8 @@ index 000000000000..61c7757bd3ca + six_unlock_write(&n2->c.lock); + six_unlock_write(&n1->c.lock); + -+ bch2_btree_node_write(c, n1, SIX_LOCK_intent); -+ bch2_btree_node_write(c, n2, SIX_LOCK_intent); ++ bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); ++ bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0); + + /* + * Note that on recursive parent_keys == keys, so we @@ -24089,7 +25415,7 @@ index 000000000000..61c7757bd3ca + + btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); + -+ bch2_btree_node_write(c, n3, SIX_LOCK_intent); ++ bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); + } + } else { + trace_btree_compact(c, b); @@ -24097,7 +25423,7 @@ index 000000000000..61c7757bd3ca + bch2_btree_build_aux_trees(n1); + six_unlock_write(&n1->c.lock); + -+ bch2_btree_node_write(c, n1, SIX_LOCK_intent); ++ bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); + + if (parent) + bch2_keylist_add(&as->parent_keys, &n1->key); @@ -24146,7 +25472,9 @@ index 000000000000..61c7757bd3ca + + bch2_trans_verify_locks(trans); + -+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split], ++ bch2_time_stats_update(&c->times[n2 ++ ? 
BCH_TIME_btree_node_split ++ : BCH_TIME_btree_node_compact], + start_time); +} + @@ -24232,14 +25560,13 @@ index 000000000000..61c7757bd3ca + struct btree_path *path, + unsigned flags) +{ -+ struct bch_fs *c = trans->c; + struct btree *b = path_l(path)->b; + struct btree_update *as; + unsigned l; + int ret = 0; + + as = bch2_btree_update_start(trans, path, path->level, -+ btree_update_reserve_required(c, b), flags); ++ true, flags); + if (IS_ERR(as)) + return PTR_ERR(as); + @@ -24267,6 +25594,7 @@ index 000000000000..61c7757bd3ca + struct btree *b, *m, *n, *prev, *next, *parent; + struct bpos sib_pos; + size_t sib_u64s; ++ u64 start_time = local_clock(); + int ret = 0; + + BUG_ON(!path->should_be_locked); @@ -24284,8 +25612,8 @@ index 000000000000..61c7757bd3ca + ? bpos_predecessor(b->data->min_key) + : bpos_successor(b->data->max_key); + -+ sib_path = bch2_path_get(trans, false, path->btree_id, -+ sib_pos, U8_MAX, level, true); ++ sib_path = bch2_path_get(trans, path->btree_id, sib_pos, ++ U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_); + ret = bch2_btree_path_traverse(trans, sib_path, false); + if (ret) + goto err; @@ -24309,15 +25637,17 @@ index 000000000000..61c7757bd3ca + } + + if (bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key)) { -+ char buf1[100], buf2[100]; ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + -+ bch2_bpos_to_text(&PBUF(buf1), prev->data->max_key); -+ bch2_bpos_to_text(&PBUF(buf2), next->data->min_key); ++ bch2_bpos_to_text(&buf1, prev->data->max_key); ++ bch2_bpos_to_text(&buf2, next->data->min_key); + bch_err(c, + "btree topology error in btree merge:\n" + " prev ends at %s\n" + " next starts at %s", -+ buf1, buf2); ++ buf1.buf, buf2.buf); ++ printbuf_exit(&buf1); ++ printbuf_exit(&buf2); + bch2_topology_error(c); + ret = -EIO; + goto err; @@ -24347,11 +25677,10 @@ index 000000000000..61c7757bd3ca + goto out; + + parent = btree_node_parent(path, b); -+ as = bch2_btree_update_start(trans, path, level, -+ 
btree_update_reserve_required(c, parent) + 1, -+ flags| ++ as = bch2_btree_update_start(trans, path, level, false, + BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE); ++ BTREE_INSERT_USE_RESERVE| ++ flags); + ret = PTR_ERR_OR_ZERO(as); + if (ret) + goto err; @@ -24364,6 +25693,10 @@ index 000000000000..61c7757bd3ca + n = bch2_btree_node_alloc(as, b->c.level); + bch2_btree_update_add_new_node(as, n); + ++ SET_BTREE_NODE_SEQ(n->data, ++ max(BTREE_NODE_SEQ(b->data), ++ BTREE_NODE_SEQ(m->data)) + 1); ++ + btree_set_min(n, prev->data->min_key); + btree_set_max(n, next->data->max_key); + n->data->format = new_f; @@ -24376,7 +25709,7 @@ index 000000000000..61c7757bd3ca + bch2_btree_build_aux_trees(n); + six_unlock_write(&n->c.lock); + -+ bch2_btree_node_write(c, n, SIX_LOCK_intent); ++ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); + + bkey_init(&delete.k); + delete.k.p = prev->key.k.p; @@ -24404,6 +25737,8 @@ index 000000000000..61c7757bd3ca + six_unlock_intent(&n->c.lock); + + bch2_btree_update_done(as); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time); +out: +err: + bch2_path_put(trans, sib_path, true); @@ -24428,10 +25763,7 @@ index 000000000000..61c7757bd3ca + + parent = btree_node_parent(iter->path, b); + as = bch2_btree_update_start(trans, iter->path, b->c.level, -+ (parent -+ ? 
btree_update_reserve_required(c, parent) -+ : 0) + 1, -+ flags); ++ false, flags); + ret = PTR_ERR_OR_ZERO(as); + if (ret) { + trace_btree_gc_rewrite_node_fail(c, b); @@ -24448,7 +25780,7 @@ index 000000000000..61c7757bd3ca + + trace_btree_gc_rewrite_node(c, b); + -+ bch2_btree_node_write(c, n, SIX_LOCK_intent); ++ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); + + if (parent) { + bch2_keylist_add(&as->parent_keys, &n->key); @@ -24520,9 +25852,6 @@ index 000000000000..61c7757bd3ca +{ + struct async_btree_rewrite *a; + -+ if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)) -+ return; -+ + if (!percpu_ref_tryget(&c->writes)) + return; + @@ -24551,21 +25880,14 @@ index 000000000000..61c7757bd3ca + struct bch_fs *c = trans->c; + struct btree_iter iter2 = { NULL }; + struct btree *parent; -+ u64 journal_entries[BKEY_BTREE_PTR_U64s_MAX]; + int ret; + + if (!skip_triggers) { -+ ret = bch2_trans_mark_key(trans, -+ bkey_s_c_null, -+ bkey_i_to_s_c(new_key), -+ BTREE_TRIGGER_INSERT); ++ ret = bch2_trans_mark_new(trans, new_key, 0); + if (ret) + return ret; + -+ ret = bch2_trans_mark_key(trans, -+ bkey_i_to_s_c(&b->key), -+ bkey_s_c_null, -+ BTREE_TRIGGER_OVERWRITE); ++ ret = bch2_trans_mark_old(trans, bkey_i_to_s_c(&b->key), 0); + if (ret) + return ret; + } @@ -24582,7 +25904,8 @@ index 000000000000..61c7757bd3ca + bch2_trans_copy_iter(&iter2, iter); + + iter2.path = bch2_btree_path_make_mut(trans, iter2.path, -+ iter2.flags & BTREE_ITER_INTENT); ++ iter2.flags & BTREE_ITER_INTENT, ++ _THIS_IP_); + + BUG_ON(iter2.path->level != b->c.level); + BUG_ON(bpos_cmp(iter2.path->pos, new_key->k.p)); @@ -24590,6 +25913,9 @@ index 000000000000..61c7757bd3ca + btree_node_unlock(iter2.path, iter2.path->level); + path_l(iter2.path)->b = BTREE_ITER_NO_NODE_UP; + iter2.path->level++; ++ btree_path_set_dirty(iter2.path, BTREE_ITER_NEED_TRAVERSE); ++ ++ bch2_btree_path_check_sort(trans, iter2.path, 0); + + ret = bch2_btree_iter_traverse(&iter2) ?: + bch2_trans_update(trans, &iter2, 
new_key, BTREE_TRIGGER_NORUN); @@ -24598,19 +25924,24 @@ index 000000000000..61c7757bd3ca + } else { + BUG_ON(btree_node_root(c, b) != b); + -+ trans->extra_journal_entries = (void *) &journal_entries[0]; -+ trans->extra_journal_entry_u64s = -+ journal_entry_set((void *) &journal_entries[0], -+ BCH_JSET_ENTRY_btree_root, -+ b->c.btree_id, b->c.level, -+ new_key, new_key->k.u64s); ++ ret = darray_make_room(trans->extra_journal_entries, ++ jset_u64s(new_key->k.u64s)); ++ if (ret) ++ return ret; ++ ++ journal_entry_set((void *) &darray_top(trans->extra_journal_entries), ++ BCH_JSET_ENTRY_btree_root, ++ b->c.btree_id, b->c.level, ++ new_key, new_key->k.u64s); ++ trans->extra_journal_entries.nr += jset_u64s(new_key->k.u64s); + } + + ret = bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_JOURNAL_RECLAIM| -+ BTREE_INSERT_JOURNAL_RESERVED); ++ JOURNAL_WATERMARK_reserved); + if (ret) + goto err; + @@ -24673,7 +26004,7 @@ index 000000000000..61c7757bd3ca + return -EINTR; + } + -+ new_hash = bch2_btree_node_mem_alloc(c); ++ new_hash = bch2_btree_node_mem_alloc(c, false); + } + + path->intent_ref++; @@ -24749,7 +26080,7 @@ index 000000000000..61c7757bd3ca + closure_sync(&cl); + } while (ret); + -+ b = bch2_btree_node_mem_alloc(c); ++ b = bch2_btree_node_mem_alloc(c, false); + bch2_btree_cache_cannibalize_unlock(c); + + set_btree_node_fake(b); @@ -24881,10 +26212,10 @@ index 000000000000..61c7757bd3ca +} diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h new file mode 100644 -index 000000000000..8e03bd987d6d +index 000000000000..e72eb8795616 --- /dev/null +++ b/fs/bcachefs/btree_update_interior.h -@@ -0,0 +1,318 @@ +@@ -0,0 +1,321 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H +#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H @@ -24922,6 +26253,7 @@ index 000000000000..8e03bd987d6d +struct btree_update { + struct closure cl; + 
struct bch_fs *c; ++ u64 start_time; + + struct list_head list; + struct list_head unwritten_list; @@ -24962,18 +26294,20 @@ index 000000000000..8e03bd987d6d + struct journal_entry_pin journal; + + /* Preallocated nodes we reserve when we start the update: */ -+ struct btree *prealloc_nodes[BTREE_UPDATE_NODES_MAX]; -+ unsigned nr_prealloc_nodes; ++ struct prealloc_nodes { ++ struct btree *b[BTREE_UPDATE_NODES_MAX]; ++ unsigned nr; ++ } prealloc_nodes[2]; + + /* Nodes being freed: */ + struct keylist old_keys; + u64 _old_keys[BTREE_UPDATE_NODES_MAX * -+ BKEY_BTREE_PTR_VAL_U64s_MAX]; ++ BKEY_BTREE_PTR_U64s_MAX]; + + /* Nodes being added: */ + struct keylist new_keys; + u64 _new_keys[BTREE_UPDATE_NODES_MAX * -+ BKEY_BTREE_PTR_VAL_U64s_MAX]; ++ BKEY_BTREE_PTR_U64s_MAX]; + + /* New nodes, that will be made reachable by this update: */ + struct btree *new_nodes[BTREE_UPDATE_NODES_MAX]; @@ -25104,7 +26438,7 @@ index 000000000000..8e03bd987d6d +{ + ssize_t used = bset_byte_offset(b, end) / sizeof(u64) + + b->whiteout_u64s; -+ ssize_t total = c->opts.btree_node_size << 6; ++ ssize_t total = c->opts.btree_node_size >> 3; + + /* Always leave one extra u64 for bch2_varint_decode: */ + used++; @@ -25205,10 +26539,10 @@ index 000000000000..8e03bd987d6d +#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c new file mode 100644 -index 000000000000..112ac7caf579 +index 000000000000..a0480c63dd81 --- /dev/null +++ b/fs/bcachefs/btree_update_leaf.c -@@ -0,0 +1,1518 @@ +@@ -0,0 +1,1756 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -25226,6 +26560,7 @@ index 000000000000..112ac7caf579 +#include "journal.h" +#include "journal_reclaim.h" +#include "keylist.h" ++#include "recovery.h" +#include "subvolume.h" +#include "replicas.h" + @@ -25233,6 +26568,10 @@ index 000000000000..112ac7caf579 +#include +#include + ++static int __must_check ++bch2_trans_update_by_path(struct btree_trans *, struct 
btree_path *, ++ struct bkey_i *, enum btree_update_flags); ++ +static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, + const struct btree_insert_entry *r) +{ @@ -25373,10 +26712,24 @@ index 000000000000..112ac7caf579 + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct btree_write *w = container_of(pin, struct btree_write, journal); + struct btree *b = container_of(w, struct btree, writes[i]); ++ unsigned long old, new, v; ++ unsigned idx = w - b->writes; + -+ btree_node_lock_type(c, b, SIX_LOCK_read); -+ bch2_btree_node_write_cond(c, b, -+ (btree_current_write(b) == w && w->journal.seq == seq)); ++ six_lock_read(&b->c.lock, NULL, NULL); ++ v = READ_ONCE(b->flags); ++ ++ do { ++ old = new = v; ++ ++ if (!(old & (1 << BTREE_NODE_dirty)) || ++ !!(old & (1 << BTREE_NODE_write_idx)) != idx || ++ w->journal.seq != seq) ++ break; ++ ++ new |= 1 << BTREE_NODE_need_write; ++ } while ((v = cmpxchg(&b->flags, old, new)) != old); ++ ++ btree_node_write_if_need(c, b, SIX_LOCK_read); + six_unlock_read(&b->c.lock); + return 0; +} @@ -25405,7 +26758,7 @@ index 000000000000..112ac7caf579 +/** + * btree_insert_key - insert a key one key into a leaf node + */ -+static bool btree_insert_key_leaf(struct btree_trans *trans, ++static void btree_insert_key_leaf(struct btree_trans *trans, + struct btree_insert_entry *insert) +{ + struct bch_fs *c = trans->c; @@ -25416,12 +26769,9 @@ index 000000000000..112ac7caf579 + int old_live_u64s = b->nr.live_u64s; + int live_u64s_added, u64s_added; + -+ EBUG_ON(!insert->level && -+ !test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)); -+ + if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b, + &insert_l(insert)->iter, insert->k))) -+ return false; ++ return; + + i->journal_seq = cpu_to_le64(max(trans->journal_res.seq, + le64_to_cpu(i->journal_seq))); @@ -25429,7 +26779,7 @@ index 000000000000..112ac7caf579 + bch2_btree_add_journal_pin(c, b, trans->journal_res.seq); + + if 
(unlikely(!btree_node_dirty(b))) -+ set_btree_node_dirty(c, b); ++ set_btree_node_dirty_acct(c, b); + + live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; + u64s_added = (int) bset_u64s(t) - old_u64s; @@ -25442,8 +26792,6 @@ index 000000000000..112ac7caf579 + if (u64s_added > live_u64s_added && + bch2_maybe_compact_whiteouts(c, b)) + bch2_trans_node_reinit_iter(trans, b); -+ -+ return true; +} + +/* Cached btree updates: */ @@ -25479,7 +26827,7 @@ index 000000000000..112ac7caf579 + return ret; + + if (!bch2_trans_relock(trans)) { -+ trace_trans_restart_journal_preres_get(trans->ip, trace_ip); ++ trace_trans_restart_journal_preres_get(trans->fn, trace_ip); + return -EINTR; + } + @@ -25492,15 +26840,40 @@ index 000000000000..112ac7caf579 + struct bch_fs *c = trans->c; + int ret; + -+ if (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) -+ flags |= JOURNAL_RES_GET_RESERVED; -+ + ret = bch2_journal_res_get(&c->journal, &trans->journal_res, -+ trans->journal_u64s, flags); ++ trans->journal_u64s, ++ flags| ++ (trans->flags & JOURNAL_WATERMARK_MASK)); + + return ret == -EAGAIN ? 
BTREE_INSERT_NEED_JOURNAL_RES : ret; +} + ++#define JSET_ENTRY_LOG_U64s 4 ++ ++static noinline void journal_transaction_name(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ struct jset_entry *entry = journal_res_entry(&c->journal, &trans->journal_res); ++ struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); ++ unsigned u64s = JSET_ENTRY_LOG_U64s - 1; ++ unsigned b, buflen = u64s * sizeof(u64); ++ ++ l->entry.u64s = cpu_to_le16(u64s); ++ l->entry.btree_id = 0; ++ l->entry.level = 0; ++ l->entry.type = BCH_JSET_ENTRY_log; ++ l->entry.pad[0] = 0; ++ l->entry.pad[1] = 0; ++ l->entry.pad[2] = 0; ++ b = min_t(unsigned, strlen(trans->fn), buflen); ++ memcpy(l->d, trans->fn, b); ++ while (b < buflen) ++ l->d[b++] = '\0'; ++ ++ trans->journal_res.offset += JSET_ENTRY_LOG_U64s; ++ trans->journal_res.u64s -= JSET_ENTRY_LOG_U64s; ++} ++ +static inline enum btree_insert_ret +btree_key_can_insert(struct btree_trans *trans, + struct btree *b, @@ -25519,14 +26892,15 @@ index 000000000000..112ac7caf579 + struct btree_path *path, + unsigned u64s) +{ ++ struct bch_fs *c = trans->c; + struct bkey_cached *ck = (void *) path->l[0].b; -+ unsigned new_u64s; ++ unsigned old_u64s = ck->u64s, new_u64s; + struct bkey_i *new_k; + + EBUG_ON(path->level); + + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && -+ bch2_btree_key_cache_must_wait(trans->c) && ++ bch2_btree_key_cache_must_wait(c) && + !(trans->flags & BTREE_INSERT_JOURNAL_RECLAIM)) + return BTREE_INSERT_NEED_JOURNAL_RECLAIM; + @@ -25541,12 +26915,27 @@ index 000000000000..112ac7caf579 + + new_u64s = roundup_pow_of_two(u64s); + new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); -+ if (!new_k) ++ if (!new_k) { ++ bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", ++ bch2_btree_ids[path->btree_id], new_u64s); + return -ENOMEM; ++ } + + ck->u64s = new_u64s; + ck->k = new_k; -+ return BTREE_INSERT_OK; ++ /* ++ * Keys returned by peek() are no longer valid pointers, so 
we need a ++ * transaction restart: ++ */ ++ trace_trans_restart_key_cache_key_realloced(trans->fn, _RET_IP_, ++ path->btree_id, &path->pos, ++ old_u64s, new_u64s); ++ /* ++ * Not using btree_trans_restart() because we can't unlock here, we have ++ * write locks held: ++ */ ++ trans->restarted = true; ++ return -EINTR; +} + +static inline void do_btree_insert_one(struct btree_trans *trans, @@ -25554,18 +26943,16 @@ index 000000000000..112ac7caf579 +{ + struct bch_fs *c = trans->c; + struct journal *j = &c->journal; -+ bool did_work; + + EBUG_ON(trans->journal_res.ref != + !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); + + i->k->k.needs_whiteout = false; + -+ did_work = !i->cached -+ ? btree_insert_key_leaf(trans, i) -+ : bch2_btree_insert_key_cached(trans, i->path, i->k); -+ if (!did_work) -+ return; ++ if (!i->cached) ++ btree_insert_key_leaf(trans, i); ++ else ++ bch2_btree_insert_key_cached(trans, i->path, i->k); + + if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { + bch2_journal_add_keys(j, &trans->journal_res, @@ -25578,10 +26965,163 @@ index 000000000000..112ac7caf579 + } +} + -+static noinline void bch2_trans_mark_gc(struct btree_trans *trans) ++/* Triggers: */ ++ ++static int run_one_mem_trigger(struct btree_trans *trans, ++ struct btree_insert_entry *i, ++ unsigned flags) ++{ ++ struct bkey_s_c old = { &i->old_k, i->old_v }; ++ struct bkey_i *new = i->k; ++ int ret; ++ ++ if (unlikely(flags & BTREE_TRIGGER_NORUN)) ++ return 0; ++ ++ if (!btree_node_type_needs_gc(i->btree_id)) ++ return 0; ++ ++ if (bch2_bkey_ops[old.k->type].atomic_trigger == ++ bch2_bkey_ops[i->k->k.type].atomic_trigger && ++ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { ++ ret = bch2_mark_key(trans, old, bkey_i_to_s_c(new), ++ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); ++ } else { ++ struct bkey _deleted = KEY(0, 0, 0); ++ struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; ++ ++ _deleted.p = i->path->pos; ++ ++ ret = bch2_mark_key(trans, 
deleted, bkey_i_to_s_c(new), ++ BTREE_TRIGGER_INSERT|flags) ?: ++ bch2_mark_key(trans, old, deleted, ++ BTREE_TRIGGER_OVERWRITE|flags); ++ } ++ ++ return ret; ++} ++ ++static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i, ++ bool overwrite) ++{ ++ /* ++ * Transactional triggers create new btree_insert_entries, so we can't ++ * pass them a pointer to a btree_insert_entry, that memory is going to ++ * move: ++ */ ++ struct bkey old_k = i->old_k; ++ struct bkey_s_c old = { &old_k, i->old_v }; ++ ++ if ((i->flags & BTREE_TRIGGER_NORUN) || ++ !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) ++ return 0; ++ ++ if (!i->insert_trigger_run && ++ !i->overwrite_trigger_run && ++ bch2_bkey_ops[old.k->type].trans_trigger == ++ bch2_bkey_ops[i->k->k.type].trans_trigger && ++ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { ++ i->overwrite_trigger_run = true; ++ i->insert_trigger_run = true; ++ return bch2_trans_mark_key(trans, old, i->k, ++ BTREE_TRIGGER_INSERT| ++ BTREE_TRIGGER_OVERWRITE| ++ i->flags) ?: 1; ++ } else if (overwrite && !i->overwrite_trigger_run) { ++ i->overwrite_trigger_run = true; ++ return bch2_trans_mark_old(trans, old, i->flags) ?: 1; ++ } else if (!i->insert_trigger_run) { ++ i->insert_trigger_run = true; ++ return bch2_trans_mark_new(trans, i->k, i->flags) ?: 1; ++ } else { ++ return 0; ++ } ++} ++ ++static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, ++ struct btree_insert_entry *btree_id_start) ++{ ++ struct btree_insert_entry *i; ++ bool trans_trigger_run; ++ int ret, overwrite; ++ ++ for (overwrite = 1; overwrite >= 0; --overwrite) { ++ ++ /* ++ * Running triggers will append more updates to the list of updates as ++ * we're walking it: ++ */ ++ do { ++ trans_trigger_run = false; ++ ++ for (i = btree_id_start; ++ i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; ++ i++) { ++ if (i->btree_id != btree_id) ++ continue; ++ ++ ret = 
run_one_trans_trigger(trans, i, overwrite); ++ if (ret < 0) ++ return ret; ++ if (ret) ++ trans_trigger_run = true; ++ } ++ } while (trans_trigger_run); ++ } ++ ++ return 0; ++} ++ ++static int bch2_trans_commit_run_triggers(struct btree_trans *trans) ++{ ++ struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; ++ unsigned btree_id = 0; ++ int ret = 0; ++ ++ /* ++ * ++ * For a given btree, this algorithm runs insert triggers before ++ * overwrite triggers: this is so that when extents are being moved ++ * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before ++ * they are re-added. ++ */ ++ for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { ++ if (btree_id == BTREE_ID_alloc) ++ continue; ++ ++ while (btree_id_start < trans->updates + trans->nr_updates && ++ btree_id_start->btree_id < btree_id) ++ btree_id_start++; ++ ++ ret = run_btree_triggers(trans, btree_id, btree_id_start); ++ if (ret) ++ return ret; ++ } ++ ++ trans_for_each_update(trans, i) { ++ if (i->btree_id > BTREE_ID_alloc) ++ break; ++ if (i->btree_id == BTREE_ID_alloc) { ++ ret = run_btree_triggers(trans, BTREE_ID_alloc, i); ++ if (ret) ++ return ret; ++ break; ++ } ++ } ++ ++ trans_for_each_update(trans, i) ++ BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && ++ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && ++ (!i->insert_trigger_run || !i->overwrite_trigger_run)); ++ ++ return 0; ++} ++ ++static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; ++ int ret = 0; + + trans_for_each_update(trans, i) { + /* @@ -25590,10 +27130,14 @@ index 000000000000..112ac7caf579 + */ + BUG_ON(i->cached || i->level); + -+ if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) -+ bch2_mark_update(trans, i->path, i->k, -+ i->flags|BTREE_TRIGGER_GC); ++ if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) { ++ ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); ++ if (ret) 
++ break; ++ } + } ++ ++ return ret; +} + +static inline int @@ -25609,7 +27153,7 @@ index 000000000000..112ac7caf579 + int ret; + + if (race_fault()) { -+ trace_trans_restart_fault_inject(trans->ip, trace_ip); ++ trace_trans_restart_fault_inject(trans->fn, trace_ip); + trans->restarted = true; + return -EINTR; + } @@ -25646,17 +27190,32 @@ index 000000000000..112ac7caf579 + + if (btree_node_type_needs_gc(i->bkey_type)) + marking = true; -+ } + -+ if (marking) { -+ percpu_down_read(&c->mark_lock); -+ } ++ /* ++ * Revalidate before calling mem triggers - XXX, ugly: ++ * ++ * - successful btree node splits don't cause transaction ++ * restarts and will have invalidated the pointer to the bkey ++ * value ++ * - btree_node_lock_for_insert() -> btree_node_prep_for_write() ++ * when it has to resort ++ * - btree_key_can_insert_cached() when it has to reallocate ++ * ++ * Ugly because we currently have no way to tell if the ++ * pointer's been invalidated, which means it's debatabale ++ * whether we should be stashing the old key at all. 
++ */ ++ i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v; + -+ /* Must be called under mark_lock: */ -+ if (marking && trans->fs_usage_deltas && -+ !bch2_replicas_delta_list_marked(c, trans->fs_usage_deltas)) { -+ ret = BTREE_INSERT_NEED_MARK_REPLICAS; -+ goto err; ++ if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) { ++ struct bkey_i *j_k = ++ bch2_journal_keys_peek(c, i->btree_id, i->level, i->k->k.p); ++ ++ if (j_k && !bpos_cmp(j_k->k.p, i->k->k.p)) { ++ i->old_k = j_k->k; ++ i->old_v = &j_k->v; ++ } ++ } + } + + /* @@ -25667,18 +27226,21 @@ index 000000000000..112ac7caf579 + ret = bch2_trans_journal_res_get(trans, + JOURNAL_RES_GET_NONBLOCK); + if (ret) -+ goto err; ++ return ret; ++ ++ if (unlikely(trans->journal_transaction_names)) ++ journal_transaction_name(trans); + } else { + trans->journal_res.seq = c->journal.replay_journal_seq; + } + -+ if (unlikely(trans->extra_journal_entry_u64s)) { ++ if (unlikely(trans->extra_journal_entries.nr)) { + memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), -+ trans->extra_journal_entries, -+ trans->extra_journal_entry_u64s); ++ trans->extra_journal_entries.data, ++ trans->extra_journal_entries.nr); + -+ trans->journal_res.offset += trans->extra_journal_entry_u64s; -+ trans->journal_res.u64s -= trans->extra_journal_entry_u64s; ++ trans->journal_res.offset += trans->extra_journal_entries.nr; ++ trans->journal_res.u64s -= trans->extra_journal_entries.nr; + } + + /* @@ -25695,22 +27257,25 @@ index 000000000000..112ac7caf579 + i->k->k.version = MAX_VERSION; + } + ++ if (trans->fs_usage_deltas && ++ bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) ++ return BTREE_INSERT_NEED_MARK_REPLICAS; ++ + trans_for_each_update(trans, i) -+ if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) -+ bch2_mark_update(trans, i->path, i->k, i->flags); ++ if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { ++ ret = run_one_mem_trigger(trans, i, i->flags); ++ if (ret) ++ 
return ret; ++ } + -+ if (marking && trans->fs_usage_deltas) -+ bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas); -+ -+ if (unlikely(c->gc_pos.phase)) -+ bch2_trans_mark_gc(trans); ++ if (unlikely(c->gc_pos.phase)) { ++ ret = bch2_trans_commit_run_gc_triggers(trans); ++ if (ret) ++ return ret; ++ } + + trans_for_each_update(trans, i) + do_btree_insert_one(trans, i); -+err: -+ if (marking) { -+ percpu_up_read(&c->mark_lock); -+ } + + return ret; +} @@ -25797,8 +27362,10 @@ index 000000000000..112ac7caf579 + if (have_conflicting_read_lock(trans, i->path)) + goto fail; + -+ __btree_node_lock_type(trans->c, insert_l(i)->b, -+ SIX_LOCK_write); ++ btree_node_lock_type(trans, i->path, ++ insert_l(i)->b, ++ i->path->pos, i->level, ++ SIX_LOCK_write, NULL, NULL); + } + + bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); @@ -25813,10 +27380,18 @@ index 000000000000..112ac7caf579 + bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b); + } + -+ trace_trans_restart_would_deadlock_write(trans->ip); ++ trace_trans_restart_would_deadlock_write(trans->fn); + return btree_trans_restart(trans); +} + ++static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) ++{ ++ struct btree_insert_entry *i; ++ ++ trans_for_each_update(trans, i) ++ bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); ++} ++ +/* + * Get journal reservation, take write locks, and attempt to do btree update(s): + */ @@ -25826,42 +27401,29 @@ index 000000000000..112ac7caf579 +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; -+ struct bkey_s_c old; + int ret, u64s_delta = 0; + + trans_for_each_update(trans, i) { + const char *invalid = bch2_bkey_invalid(c, + bkey_i_to_s_c(i->k), i->bkey_type); + if (invalid) { -+ char buf[200]; ++ struct printbuf buf = PRINTBUF; + -+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); -+ bch_err(c, "invalid bkey %s on insert from %ps -> %ps: %s\n", -+ buf, (void *) trans->ip, -+ 
(void *) i->ip_allocated, invalid); -+ bch2_fatal_error(c); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); ++ bch2_fs_fatal_error(c, "invalid bkey %s on insert from %s -> %ps: %s\n", ++ buf.buf, trans->fn, (void *) i->ip_allocated, invalid); ++ printbuf_exit(&buf); + return -EINVAL; + } + btree_insert_entry_checks(trans, i); + } + + trans_for_each_update(trans, i) { -+ struct bkey u; -+ -+ /* -+ * peek_slot() doesn't yet work on iterators that point to -+ * interior nodes: -+ */ -+ if (i->cached || i->level) ++ if (i->cached) + continue; + -+ old = bch2_btree_path_peek_slot(i->path, &u); -+ ret = bkey_err(old); -+ if (unlikely(ret)) -+ return ret; -+ + u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0; -+ u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0; ++ u64s_delta -= i->old_btree_u64s; + + if (!same_leaf_as_next(trans, i)) { + if (u64s_delta <= 0) { @@ -25878,8 +27440,7 @@ index 000000000000..112ac7caf579 + ret = bch2_journal_preres_get(&c->journal, + &trans->journal_preres, trans->journal_preres_u64s, + JOURNAL_RES_GET_NONBLOCK| -+ ((trans->flags & BTREE_INSERT_JOURNAL_RESERVED) -+ ? 
JOURNAL_RES_GET_RESERVED : 0)); ++ (trans->flags & JOURNAL_WATERMARK_MASK)); + if (unlikely(ret == -EAGAIN)) + ret = bch2_trans_journal_preres_get_cold(trans, + trans->journal_preres_u64s, trace_ip); @@ -25894,6 +27455,9 @@ index 000000000000..112ac7caf579 + + ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip); + ++ if (!ret && unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) ++ bch2_drop_overwrites_from_journal(trans); ++ + trans_for_each_update(trans, i) + if (!same_leaf_as_prev(trans, i)) + bch2_btree_node_unlock_write_inlined(trans, i->path, @@ -25941,7 +27505,7 @@ index 000000000000..112ac7caf579 + return 0; + + if (ret == -EINTR) -+ trace_trans_restart_btree_node_split(trans->ip, trace_ip, ++ trace_trans_restart_btree_node_split(trans->fn, trace_ip, + i->btree_id, &i->path->pos); + break; + case BTREE_INSERT_NEED_MARK_REPLICAS: @@ -25954,14 +27518,14 @@ index 000000000000..112ac7caf579 + if (bch2_trans_relock(trans)) + return 0; + -+ trace_trans_restart_mark_replicas(trans->ip, trace_ip); ++ trace_trans_restart_mark_replicas(trans->fn, trace_ip); + ret = -EINTR; + break; + case BTREE_INSERT_NEED_JOURNAL_RES: + bch2_trans_unlock(trans); + + if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) && -+ !(trans->flags & BTREE_INSERT_JOURNAL_RESERVED)) { ++ !(trans->flags & JOURNAL_WATERMARK_reserved)) { + trans->restarted = true; + ret = -EAGAIN; + break; @@ -25974,13 +27538,13 @@ index 000000000000..112ac7caf579 + if (bch2_trans_relock(trans)) + return 0; + -+ trace_trans_restart_journal_res_get(trans->ip, trace_ip); ++ trace_trans_restart_journal_res_get(trans->fn, trace_ip); + ret = -EINTR; + break; + case BTREE_INSERT_NEED_JOURNAL_RECLAIM: + bch2_trans_unlock(trans); + -+ trace_trans_blocked_journal_reclaim(trans->ip, trace_ip); ++ trace_trans_blocked_journal_reclaim(trans->fn, trace_ip); + + wait_event_freezable(c->journal.reclaim_wait, + (ret = journal_reclaim_wait_done(c))); @@ -25990,7 +27554,7 @@ index 000000000000..112ac7caf579 + 
if (bch2_trans_relock(trans)) + return 0; + -+ trace_trans_restart_journal_reclaim(trans->ip, trace_ip); ++ trace_trans_restart_journal_reclaim(trans->fn, trace_ip); + ret = -EINTR; + break; + default: @@ -25999,7 +27563,9 @@ index 000000000000..112ac7caf579 + } + + BUG_ON((ret == EINTR || ret == -EAGAIN) && !trans->restarted); -+ BUG_ON(ret == -ENOSPC && (trans->flags & BTREE_INSERT_NOFAIL)); ++ BUG_ON(ret == -ENOSPC && ++ !(trans->flags & BTREE_INSERT_NOWAIT) && ++ (trans->flags & BTREE_INSERT_NOFAIL)); + + return ret; +} @@ -26010,7 +27576,8 @@ index 000000000000..112ac7caf579 + struct bch_fs *c = trans->c; + int ret; + -+ if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW))) ++ if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)) || ++ test_bit(BCH_FS_STARTED, &c->flags)) + return -EROFS; + + bch2_trans_unlock(trans); @@ -26026,155 +27593,72 @@ index 000000000000..112ac7caf579 + return 0; +} + -+static int bch2_trans_commit_run_triggers(struct btree_trans *trans) ++/* ++ * This is for updates done in the early part of fsck - btree_gc - before we've ++ * gone RW. we only add the new key to the list of keys for journal replay to ++ * do. ++ */ ++static noinline int ++do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) +{ -+ struct bkey _deleted = KEY(0, 0, 0); -+ struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; -+ struct bkey_s_c old; -+ struct bkey unpacked; -+ struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; -+ bool trans_trigger_run; -+ unsigned btree_id = 0; ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i; + int ret = 0; + -+ /* -+ * -+ * For a given btree, this algorithm runs insert triggers before -+ * overwrite triggers: this is so that when extents are being moved -+ * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before -+ * they are re-added. 
-+ */ -+ for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { -+ while (btree_id_start < trans->updates + trans->nr_updates && -+ btree_id_start->btree_id < btree_id) -+ btree_id_start++; -+ -+ /* -+ * Running triggers will append more updates to the list of updates as -+ * we're walking it: -+ */ -+ do { -+ trans_trigger_run = false; -+ -+ for (i = btree_id_start; -+ i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; -+ i++) { -+ if (i->insert_trigger_run || -+ (i->flags & BTREE_TRIGGER_NORUN) || -+ !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) -+ continue; -+ -+ BUG_ON(i->overwrite_trigger_run); -+ -+ i->insert_trigger_run = true; -+ trans_trigger_run = true; -+ -+ old = bch2_btree_path_peek_slot(i->path, &unpacked); -+ _deleted.p = i->path->pos; -+ -+ if (old.k->type == i->k->k.type && -+ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { -+ i->overwrite_trigger_run = true; -+ ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(i->k), -+ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags); -+ } else { -+ ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(i->k), -+ BTREE_TRIGGER_INSERT|i->flags); -+ } -+ -+ if (ret == -EINTR) -+ trace_trans_restart_mark(trans->ip, _RET_IP_, -+ i->btree_id, &i->path->pos); -+ if (ret) -+ return ret; -+ } -+ } while (trans_trigger_run); -+ -+ do { -+ trans_trigger_run = false; -+ -+ for (i = btree_id_start; -+ i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; -+ i++) { -+ if (i->overwrite_trigger_run || -+ (i->flags & BTREE_TRIGGER_NORUN) || -+ !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) -+ continue; -+ -+ BUG_ON(!i->insert_trigger_run); -+ -+ i->overwrite_trigger_run = true; -+ trans_trigger_run = true; -+ -+ old = bch2_btree_path_peek_slot(i->path, &unpacked); -+ _deleted.p = i->path->pos; -+ -+ ret = bch2_trans_mark_key(trans, old, deleted, -+ BTREE_TRIGGER_OVERWRITE|i->flags); -+ -+ if (ret == -EINTR) -+ trace_trans_restart_mark(trans->ip, 
_RET_IP_, -+ i->btree_id, &i->path->pos); -+ if (ret) -+ return ret; -+ } -+ } while (trans_trigger_run); ++ trans_for_each_update(trans, i) { ++ ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); ++ if (ret) ++ break; + } + -+ trans_for_each_update(trans, i) -+ BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && -+ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && -+ (!i->insert_trigger_run || !i->overwrite_trigger_run)); -+ -+ return 0; ++ return ret; +} + +int __bch2_trans_commit(struct btree_trans *trans) +{ ++ struct bch_fs *c = trans->c; + struct btree_insert_entry *i = NULL; + unsigned u64s; + int ret = 0; + + if (!trans->nr_updates && -+ !trans->extra_journal_entry_u64s) ++ !trans->extra_journal_entries.nr) + goto out_reset; + + if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) -+ lockdep_assert_held(&trans->c->gc_lock); ++ lockdep_assert_held(&c->gc_lock); + -+ memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); ++ ret = bch2_trans_commit_run_triggers(trans); ++ if (ret) ++ goto out_reset; + -+ trans->journal_u64s = trans->extra_journal_entry_u64s; -+ trans->journal_preres_u64s = 0; ++ if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { ++ ret = do_bch2_trans_commit_to_journal_replay(trans); ++ goto out_reset; ++ } + + if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && -+ unlikely(!percpu_ref_tryget(&trans->c->writes))) { ++ unlikely(!percpu_ref_tryget(&c->writes))) { + ret = bch2_trans_commit_get_rw_cold(trans); + if (ret) + goto out_reset; + } + -+#ifdef CONFIG_BCACHEFS_DEBUG -+ /* -+ * if BTREE_TRIGGER_NORUN is set, it means we're probably being called -+ * from the key cache flush code: -+ */ -+ trans_for_each_update(trans, i) -+ if (!i->cached && -+ !(i->flags & BTREE_TRIGGER_NORUN)) -+ bch2_btree_key_cache_verify_clean(trans, -+ i->btree_id, i->k->k.p); -+#endif ++ memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); + -+ ret = bch2_trans_commit_run_triggers(trans); -+ if (ret) -+ goto out; ++ 
trans->journal_u64s = trans->extra_journal_entries.nr; ++ trans->journal_preres_u64s = 0; ++ ++ trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); ++ ++ if (trans->journal_transaction_names) ++ trans->journal_u64s += JSET_ENTRY_LOG_U64s; + + trans_for_each_update(trans, i) { + BUG_ON(!i->path->should_be_locked); + + if (unlikely(!bch2_btree_path_upgrade(trans, i->path, i->level + 1))) { -+ trace_trans_restart_upgrade(trans->ip, _RET_IP_, ++ trace_trans_restart_upgrade(trans->fn, _RET_IP_, + i->btree_id, &i->path->pos); + ret = btree_trans_restart(trans); + goto out; @@ -26190,7 +27674,7 @@ index 000000000000..112ac7caf579 + } + + if (trans->extra_journal_res) { -+ ret = bch2_disk_reservation_add(trans->c, trans->disk_res, ++ ret = bch2_disk_reservation_add(c, trans->disk_res, + trans->extra_journal_res, + (trans->flags & BTREE_INSERT_NOFAIL) + ? BCH_DISK_RESERVATION_NOFAIL : 0); @@ -26209,10 +27693,10 @@ index 000000000000..112ac7caf579 + if (ret) + goto err; +out: -+ bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); ++ bch2_journal_preres_put(&c->journal, &trans->journal_preres); + + if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) -+ percpu_ref_put(&trans->c->writes); ++ percpu_ref_put(&c->writes); +out_reset: + trans_for_each_update(trans, i) + bch2_path_put(trans, i->path, true); @@ -26220,8 +27704,7 @@ index 000000000000..112ac7caf579 + trans->extra_journal_res = 0; + trans->nr_updates = 0; + trans->hooks = NULL; -+ trans->extra_journal_entries = NULL; -+ trans->extra_journal_entry_u64s = 0; ++ trans->extra_journal_entries.nr = 0; + + if (trans->fs_usage_deltas) { + trans->fs_usage_deltas->used = 0; @@ -26248,6 +27731,9 @@ index 000000000000..112ac7caf579 + struct bkey_s_c k; + int ret; + ++ if (!btree_type_has_snapshots(id)) ++ return 0; ++ + if (!snapshot_t(c, pos.snapshot)->children[0]) + return 0; + @@ -26276,10 +27762,10 @@ index 000000000000..112ac7caf579 + return ret; +} + -+static int 
bch2_trans_update_extent(struct btree_trans *trans, -+ struct btree_iter *orig_iter, -+ struct bkey_i *insert, -+ enum btree_update_flags flags) ++int bch2_trans_update_extent(struct btree_trans *trans, ++ struct btree_iter *orig_iter, ++ struct bkey_i *insert, ++ enum btree_update_flags flags) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter, update_iter; @@ -26293,7 +27779,7 @@ index 000000000000..112ac7caf579 + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES| + BTREE_ITER_NOT_EXTENTS); -+ k = bch2_btree_iter_peek(&iter); ++ k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); + if ((ret = bkey_err(k))) + goto err; + if (!k.k) @@ -26437,19 +27923,16 @@ index 000000000000..112ac7caf579 + bkey_reassemble(update, k); + bch2_cut_front(insert->k.p, update); + -+ bch2_trans_copy_iter(&update_iter, &iter); -+ update_iter.pos = update->k.p; -+ ret = bch2_trans_update(trans, &update_iter, update, ++ ret = bch2_trans_update_by_path(trans, iter.path, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + flags); -+ bch2_trans_iter_exit(trans, &update_iter); -+ + if (ret) + goto err; + goto out; + } +next: -+ k = bch2_btree_iter_next(&iter); ++ bch2_btree_iter_advance(&iter); ++ k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); + if ((ret = bkey_err(k))) + goto err; + if (!k.k) @@ -26510,7 +27993,8 @@ index 000000000000..112ac7caf579 + pos.snapshot++; + + for_each_btree_key_norestart(trans, iter, btree_id, pos, -+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { ++ BTREE_ITER_ALL_SNAPSHOTS| ++ BTREE_ITER_NOPRESERVE, k, ret) { + if (bkey_cmp(k.k->p, pos)) + break; + @@ -26525,48 +28009,35 @@ index 000000000000..112ac7caf579 + return ret; +} + -+int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, -+ struct bkey_i *k, enum btree_update_flags flags) ++static int __must_check ++bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, ++ struct bkey_i *k, enum btree_update_flags flags) +{ ++ struct bch_fs *c = 
trans->c; + struct btree_insert_entry *i, n; + -+ BUG_ON(!iter->path->should_be_locked); -+ -+ if (iter->flags & BTREE_ITER_IS_EXTENTS) -+ return bch2_trans_update_extent(trans, iter, k, flags); ++ BUG_ON(!path->should_be_locked); + + BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); -+ BUG_ON(bpos_cmp(k->k.p, iter->path->pos)); ++ BUG_ON(bpos_cmp(k->k.p, path->pos)); + + n = (struct btree_insert_entry) { + .flags = flags, -+ .bkey_type = __btree_node_type(iter->path->level, iter->btree_id), -+ .btree_id = iter->btree_id, -+ .level = iter->path->level, -+ .cached = iter->flags & BTREE_ITER_CACHED, -+ .path = iter->path, ++ .bkey_type = __btree_node_type(path->level, path->btree_id), ++ .btree_id = path->btree_id, ++ .level = path->level, ++ .cached = path->cached, ++ .path = path, + .k = k, + .ip_allocated = _RET_IP_, + }; + -+ __btree_path_get(n.path, true); -+ +#ifdef CONFIG_BCACHEFS_DEBUG + trans_for_each_update(trans, i) + BUG_ON(i != trans->updates && + btree_insert_entry_cmp(i - 1, i) >= 0); +#endif + -+ if (bkey_deleted(&n.k->k) && -+ (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { -+ int ret = need_whiteout_for_snapshot(trans, n.btree_id, n.k->k.p); -+ if (unlikely(ret < 0)) -+ return ret; -+ -+ if (ret) -+ n.k->k.type = KEY_TYPE_whiteout; -+ } -+ + /* + * Pending updates are kept sorted: first, find position of new update, + * then delete/trim any updates the new update overwrites: @@ -26579,27 +28050,95 @@ index 000000000000..112ac7caf579 + !btree_insert_entry_cmp(&n, i)) { + BUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); + -+ /* -+ * This is a hack to ensure that inode creates update the btree, -+ * not the key cache, which helps with cache coherency issues in -+ * other areas: -+ */ -+ if (n.cached && !i->cached) { -+ i->k = n.k; -+ i->flags = n.flags; -+ -+ __btree_path_get(n.path, false); -+ } else { -+ bch2_path_put(trans, i->path, true); -+ *i = n; -+ } -+ } else ++ bch2_path_put(trans, i->path, true); ++ i->flags = n.flags; ++ i->cached = 
n.cached; ++ i->k = n.k; ++ i->path = n.path; ++ i->ip_allocated = n.ip_allocated; ++ } else { + array_insert_item(trans->updates, trans->nr_updates, + i - trans->updates, n); + ++ i->old_v = bch2_btree_path_peek_slot(path, &i->old_k).v; ++ i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0; ++ ++ if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) { ++ struct bkey_i *j_k = ++ bch2_journal_keys_peek(c, n.btree_id, n.level, k->k.p); ++ ++ if (j_k && !bpos_cmp(j_k->k.p, i->k->k.p)) { ++ i->old_k = j_k->k; ++ i->old_v = &j_k->v; ++ } ++ } ++ } ++ ++ __btree_path_get(n.path, true); + return 0; +} + ++int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_i *k, enum btree_update_flags flags) ++{ ++ struct btree_path *path = iter->update_path ?: iter->path; ++ struct bkey_cached *ck; ++ int ret; ++ ++ if (iter->flags & BTREE_ITER_IS_EXTENTS) ++ return bch2_trans_update_extent(trans, iter, k, flags); ++ ++ if (bkey_deleted(&k->k) && ++ !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && ++ (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { ++ ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ if (ret) ++ k->k.type = KEY_TYPE_whiteout; ++ } ++ ++ if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && ++ !path->cached && ++ !path->level && ++ btree_id_cached(trans->c, path->btree_id)) { ++ if (!iter->key_cache_path || ++ !iter->key_cache_path->should_be_locked || ++ bpos_cmp(iter->key_cache_path->pos, k->k.p)) { ++ if (!iter->key_cache_path) ++ iter->key_cache_path = ++ bch2_path_get(trans, path->btree_id, path->pos, 1, 0, ++ BTREE_ITER_INTENT| ++ BTREE_ITER_CACHED, _THIS_IP_); ++ ++ iter->key_cache_path = ++ bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, ++ iter->flags & BTREE_ITER_INTENT, ++ _THIS_IP_); ++ ++ ret = bch2_btree_path_traverse(trans, iter->key_cache_path, ++ BTREE_ITER_CACHED); ++ if (unlikely(ret)) ++ return ret; ++ ++ ck = 
(void *) iter->key_cache_path->l[0].b; ++ ++ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ trace_trans_restart_key_cache_raced(trans->fn, _RET_IP_); ++ btree_trans_restart(trans); ++ return -EINTR; ++ } ++ ++ iter->key_cache_path->should_be_locked = true; ++ } ++ ++ path = iter->key_cache_path; ++ } ++ ++ return bch2_trans_update_by_path(trans, path, k, flags); ++} ++ +void bch2_trans_commit_hook(struct btree_trans *trans, + struct btree_trans_commit_hook *h) +{ @@ -26653,19 +28192,21 @@ index 000000000000..112ac7caf579 + +int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, + struct bpos start, struct bpos end, -+ unsigned iter_flags, ++ unsigned update_flags, + u64 *journal_seq) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + -+ bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT|iter_flags); ++ bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); +retry: + while ((bch2_trans_begin(trans), + (k = bch2_btree_iter_peek(&iter)).k) && + !(ret = bkey_err(k)) && + bkey_cmp(iter.pos, end) < 0) { ++ struct disk_reservation disk_res = ++ bch2_disk_reservation_init(trans->c, 0); + struct bkey_i delete; + + bkey_init(&delete.k); @@ -26686,7 +28227,7 @@ index 000000000000..112ac7caf579 + */ + delete.k.p = iter.pos; + -+ if (btree_node_type_is_extents(id)) { ++ if (iter.flags & BTREE_ITER_IS_EXTENTS) { + unsigned max_sectors = + KEY_SIZE_MAX & (~0 << trans->c->block_bits); + @@ -26700,8 +28241,10 @@ index 000000000000..112ac7caf579 + } + + ret = bch2_trans_update(trans, &iter, &delete, 0) ?: -+ bch2_trans_commit(trans, NULL, journal_seq, -+ BTREE_INSERT_NOFAIL); ++ bch2_trans_commit(trans, &disk_res, journal_seq, ++ BTREE_INSERT_NOFAIL| ++ update_flags); ++ bch2_disk_reservation_put(trans->c, &disk_res); + if (ret) + break; + } @@ -26722,17 +28265,46 @@ index 000000000000..112ac7caf579 + */ +int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, + struct bpos start, struct bpos end, ++ unsigned 
update_flags, + u64 *journal_seq) +{ + return bch2_trans_do(c, NULL, journal_seq, 0, -+ bch2_btree_delete_range_trans(&trans, id, start, end, 0, journal_seq)); ++ bch2_btree_delete_range_trans(&trans, id, start, end, ++ update_flags, journal_seq)); ++} ++ ++int bch2_trans_log_msg(struct btree_trans *trans, const char *msg) ++{ ++ unsigned len = strlen(msg); ++ unsigned u64s = DIV_ROUND_UP(len, sizeof(u64)); ++ struct jset_entry_log *l; ++ int ret; ++ ++ ret = darray_make_room(trans->extra_journal_entries, jset_u64s(u64s)); ++ if (ret) ++ return ret; ++ ++ l = (void *) &darray_top(trans->extra_journal_entries); ++ l->entry.u64s = cpu_to_le16(u64s); ++ l->entry.btree_id = 0; ++ l->entry.level = 1; ++ l->entry.type = BCH_JSET_ENTRY_log; ++ l->entry.pad[0] = 0; ++ l->entry.pad[1] = 0; ++ l->entry.pad[2] = 0; ++ memcpy(l->d, msg, len); ++ while (len & 7) ++ l->d[len++] = '\0'; ++ ++ trans->extra_journal_entries.nr += jset_u64s(u64s); ++ return 0; +} diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c new file mode 100644 -index 000000000000..6fc93b56bcb2 +index 000000000000..7654ab24a909 --- /dev/null +++ b/fs/bcachefs/buckets.c -@@ -0,0 +1,2227 @@ +@@ -0,0 +1,2122 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Code for manipulating bucket marks for garbage collection. 
@@ -26746,6 +28318,7 @@ index 000000000000..6fc93b56bcb2 +#include "btree_gc.h" +#include "btree_update.h" +#include "buckets.h" ++#include "buckets_waiting_for_journal.h" +#include "ec.h" +#include "error.h" +#include "inode.h" @@ -26778,43 +28351,6 @@ index 000000000000..6fc93b56bcb2 + } +} + -+/* -+ * Clear journal_seq_valid for buckets for which it's not needed, to prevent -+ * wraparound: -+ */ -+void bch2_bucket_seq_cleanup(struct bch_fs *c) -+{ -+ u64 journal_seq = atomic64_read(&c->journal.seq); -+ u16 last_seq_ondisk = c->journal.last_seq_ondisk; -+ struct bch_dev *ca; -+ struct bucket_array *buckets; -+ struct bucket *g; -+ struct bucket_mark m; -+ unsigned i; -+ -+ if (journal_seq - c->last_bucket_seq_cleanup < -+ (1U << (BUCKET_JOURNAL_SEQ_BITS - 2))) -+ return; -+ -+ c->last_bucket_seq_cleanup = journal_seq; -+ -+ for_each_member_device(ca, c, i) { -+ down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); -+ -+ for_each_bucket(g, buckets) { -+ bucket_cmpxchg(g, m, ({ -+ if (!m.journal_seq_valid || -+ bucket_needs_journal_commit(m, last_seq_ondisk)) -+ break; -+ -+ m.journal_seq_valid = 0; -+ })); -+ } -+ up_read(&ca->bucket_lock); -+ } -+} -+ +void bch2_fs_usage_initialize(struct bch_fs *c) +{ + struct bch_fs_usage *usage; @@ -26879,6 +28415,7 @@ index 000000000000..6fc93b56bcb2 + unsigned journal_seq, + bool gc) +{ ++ percpu_rwsem_assert_held(&c->mark_lock); + BUG_ON(!gc && !journal_seq); + + return this_cpu_ptr(gc @@ -27049,36 +28586,24 @@ index 000000000000..6fc93b56bcb2 + return ret; +} + -+static inline int is_unavailable_bucket(struct bucket_mark m) ++static inline int is_unavailable_bucket(struct bch_alloc_v4 a) +{ -+ return !is_available_bucket(m); ++ return a.dirty_sectors || a.stripe; +} + +static inline int bucket_sectors_fragmented(struct bch_dev *ca, -+ struct bucket_mark m) ++ struct bch_alloc_v4 a) +{ -+ return bucket_sectors_used(m) -+ ? max(0, (int) ca->mi.bucket_size - (int) bucket_sectors_used(m)) ++ return a.dirty_sectors ++ ? 
max(0, (int) ca->mi.bucket_size - (int) a.dirty_sectors) + : 0; +} + -+static inline int is_stripe_data_bucket(struct bucket_mark m) ++static inline enum bch_data_type bucket_type(struct bch_alloc_v4 a) +{ -+ return m.stripe && m.data_type != BCH_DATA_parity; -+} -+ -+static inline enum bch_data_type bucket_type(struct bucket_mark m) -+{ -+ return m.cached_sectors && !m.dirty_sectors ++ return a.cached_sectors && !a.dirty_sectors + ? BCH_DATA_cached -+ : m.data_type; -+} -+ -+static bool bucket_became_unavailable(struct bucket_mark old, -+ struct bucket_mark new) -+{ -+ return is_available_bucket(old) && -+ !is_available_bucket(new); ++ : a.data_type; +} + +static inline void account_bucket(struct bch_fs_usage *fs_usage, @@ -27093,21 +28618,13 @@ index 000000000000..6fc93b56bcb2 +} + +static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, -+ struct bucket_mark old, struct bucket_mark new, ++ struct bch_alloc_v4 old, ++ struct bch_alloc_v4 new, + u64 journal_seq, bool gc) +{ + struct bch_fs_usage *fs_usage; + struct bch_dev_usage *u; + -+ /* -+ * Hack for bch2_fs_initialize path, where we're first marking sb and -+ * journal non-transactionally: -+ */ -+ if (!journal_seq && !test_bit(BCH_FS_INITIALIZED, &c->flags)) -+ journal_seq = 1; -+ -+ percpu_rwsem_assert_held(&c->mark_lock); -+ + preempt_disable(); + fs_usage = fs_usage_ptr(c, journal_seq, gc); + u = dev_usage_ptr(ca, journal_seq, gc); @@ -27133,9 +28650,28 @@ index 000000000000..6fc93b56bcb2 + u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new); + + preempt_enable(); ++} + -+ if (!is_available_bucket(old) && is_available_bucket(new)) -+ bch2_wake_allocator(ca); ++static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca, ++ struct bucket old, struct bucket new, ++ u64 journal_seq, bool gc) ++{ ++ struct bch_alloc_v4 old_a = { ++ .gen = old.gen, ++ .data_type = old.data_type, ++ .dirty_sectors = old.dirty_sectors, ++ .cached_sectors = old.cached_sectors, ++ 
.stripe = old.stripe, ++ }; ++ struct bch_alloc_v4 new_a = { ++ .gen = new.gen, ++ .data_type = new.data_type, ++ .dirty_sectors = new.dirty_sectors, ++ .cached_sectors = new.cached_sectors, ++ .stripe = new.stripe, ++ }; ++ ++ bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc); +} + +static inline int __update_replicas(struct bch_fs *c, @@ -27153,25 +28689,50 @@ index 000000000000..6fc93b56bcb2 + return 0; +} + -+static inline int update_replicas(struct bch_fs *c, ++static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, + struct bch_replicas_entry *r, s64 sectors, + unsigned journal_seq, bool gc) +{ + struct bch_fs_usage __percpu *fs_usage; -+ int idx = bch2_replicas_entry_idx(c, r); ++ int idx, ret = 0; ++ struct printbuf buf = PRINTBUF; + -+ if (idx < 0) -+ return -1; ++ percpu_down_read(&c->mark_lock); ++ buf.atomic++; ++ ++ idx = bch2_replicas_entry_idx(c, r); ++ if (idx < 0 && ++ (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || ++ fsck_err(c, "no replicas entry\n" ++ " while marking %s", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { ++ percpu_up_read(&c->mark_lock); ++ ret = bch2_mark_replicas(c, r); ++ percpu_down_read(&c->mark_lock); ++ ++ if (ret) ++ goto err; ++ idx = bch2_replicas_entry_idx(c, r); ++ } ++ if (idx < 0) { ++ ret = -1; ++ goto err; ++ } + + preempt_disable(); + fs_usage = fs_usage_ptr(c, journal_seq, gc); + fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); + fs_usage->replicas[idx] += sectors; + preempt_enable(); -+ return 0; ++err: ++fsck_err: ++ percpu_up_read(&c->mark_lock); ++ printbuf_exit(&buf); ++ return ret; +} + +static inline int update_cached_sectors(struct bch_fs *c, ++ struct bkey_s_c k, + unsigned dev, s64 sectors, + unsigned journal_seq, bool gc) +{ @@ -27179,7 +28740,7 @@ index 000000000000..6fc93b56bcb2 + + bch2_replicas_entry_cached(&r.e, dev); + -+ return update_replicas(c, &r.e, sectors, journal_seq, gc); ++ return update_replicas(c, k, &r.e, sectors, journal_seq, gc); +} + 
+static struct replicas_delta_list * @@ -27245,47 +28806,21 @@ index 000000000000..6fc93b56bcb2 + update_replicas_list(trans, &r.e, sectors); +} + -+#define do_mark_fn(fn, c, pos, flags, ...) \ -+({ \ -+ int gc, ret = 0; \ -+ \ -+ percpu_rwsem_assert_held(&c->mark_lock); \ -+ \ -+ for (gc = 0; gc < 2 && !ret; gc++) \ -+ if (!gc == !(flags & BTREE_TRIGGER_GC) || \ -+ (gc && gc_visited(c, pos))) \ -+ ret = fn(c, __VA_ARGS__, gc); \ -+ ret; \ -+}) -+ -+void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, bool owned_by_allocator) -+{ -+ struct bucket *g = bucket(ca, b); -+ struct bucket_mark old, new; -+ -+ old = bucket_cmpxchg(g, new, ({ -+ new.owned_by_allocator = owned_by_allocator; -+ })); -+ -+ BUG_ON(owned_by_allocator == old.owned_by_allocator); -+} -+ -+static int bch2_mark_alloc(struct btree_trans *trans, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned flags) ++int bch2_mark_alloc(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) +{ + bool gc = flags & BTREE_TRIGGER_GC; + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; -+ struct bkey_alloc_unpacked u; -+ struct bch_dev *ca; -+ struct bucket *g; -+ struct bucket_mark old_m, m; ++ struct bch_alloc_v4 old_a, new_a; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, new.k->p.inode); ++ int ret = 0; + -+ /* We don't do anything for deletions - do we?: */ -+ if (!bkey_is_alloc(new.k)) -+ return 0; ++ if (bch2_trans_inconsistent_on(new.k->p.offset < ca->mi.first_bucket || ++ new.k->p.offset >= ca->mi.nbuckets, trans, ++ "alloc key outside range of device's buckets")) ++ return -EIO; + + /* + * alloc btree is read in by bch2_alloc_read, not gc: @@ -27294,44 +28829,81 @@ index 000000000000..6fc93b56bcb2 + !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) + return 0; + -+ if (flags & BTREE_TRIGGER_INSERT) { -+ struct bch_alloc_v3 *v = (struct bch_alloc_v3 *) new.v; ++ bch2_alloc_to_v4(old, &old_a); ++ bch2_alloc_to_v4(new, 
&new_a); ++ ++ if ((flags & BTREE_TRIGGER_INSERT) && ++ !old_a.data_type != !new_a.data_type && ++ new.k->type == KEY_TYPE_alloc_v4) { ++ struct bch_alloc_v4 *v = (struct bch_alloc_v4 *) new.v; + + BUG_ON(!journal_seq); -+ BUG_ON(new.k->type != KEY_TYPE_alloc_v3); + -+ v->journal_seq = cpu_to_le64(journal_seq); ++ /* ++ * If the btree updates referring to a bucket weren't flushed ++ * before the bucket became empty again, then the we don't have ++ * to wait on a journal flush before we can reuse the bucket: ++ */ ++ new_a.journal_seq = !new_a.data_type && ++ (journal_seq == v->journal_seq || ++ bch2_journal_noflush_seq(&c->journal, v->journal_seq)) ++ ? 0 : journal_seq; ++ v->journal_seq = new_a.journal_seq; + } + -+ ca = bch_dev_bkey_exists(c, new.k->p.inode); -+ -+ if (new.k->p.offset >= ca->mi.nbuckets) -+ return 0; -+ -+ g = __bucket(ca, new.k->p.offset, gc); -+ u = bch2_alloc_unpack(new); -+ -+ old_m = bucket_cmpxchg(g, m, ({ -+ m.gen = u.gen; -+ m.data_type = u.data_type; -+ m.dirty_sectors = u.dirty_sectors; -+ m.cached_sectors = u.cached_sectors; -+ m.stripe = u.stripe != 0; -+ -+ if (journal_seq) { -+ m.journal_seq_valid = 1; -+ m.journal_seq = journal_seq; ++ if (old_a.data_type && !new_a.data_type && new_a.journal_seq) { ++ ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, ++ c->journal.flushed_seq_ondisk, ++ new.k->p.inode, new.k->p.offset, ++ new_a.journal_seq); ++ if (ret) { ++ bch2_fs_fatal_error(c, ++ "error setting bucket_needs_journal_commit: %i", ret); ++ return ret; + } -+ })); ++ } + -+ bch2_dev_usage_update(c, ca, old_m, m, journal_seq, gc); ++ if (!new_a.data_type && ++ (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk)) ++ closure_wake_up(&c->freelist_wait); + -+ g->io_time[READ] = u.read_time; -+ g->io_time[WRITE] = u.write_time; -+ g->oldest_gen = u.oldest_gen; -+ g->gen_valid = 1; -+ g->stripe = u.stripe; -+ g->stripe_redundancy = u.stripe_redundancy; ++ if ((flags & BTREE_TRIGGER_INSERT) 
&& ++ BCH_ALLOC_V4_NEED_DISCARD(&new_a) && ++ !new_a.journal_seq) ++ bch2_do_discards(c); ++ ++ if (!old_a.data_type && ++ new_a.data_type && ++ should_invalidate_buckets(ca)) ++ bch2_do_invalidates(c); ++ ++ if (bucket_state(new_a) == BUCKET_need_gc_gens) { ++ atomic_inc(&c->kick_gc); ++ wake_up_process(c->gc_thread); ++ } ++ ++ percpu_down_read(&c->mark_lock); ++ if (!gc && new_a.gen != old_a.gen) ++ *bucket_gen(ca, new.k->p.offset) = new_a.gen; ++ ++ bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc); ++ ++ if (gc) { ++ struct bucket *g = gc_bucket(ca, new.k->p.offset); ++ ++ bucket_lock(g); ++ ++ g->gen_valid = 1; ++ g->gen = new_a.gen; ++ g->data_type = new_a.data_type; ++ g->stripe = new_a.stripe; ++ g->stripe_redundancy = new_a.stripe_redundancy; ++ g->dirty_sectors = new_a.dirty_sectors; ++ g->cached_sectors = new_a.cached_sectors; ++ ++ bucket_unlock(g); ++ } ++ percpu_up_read(&c->mark_lock); + + /* + * need to know if we're getting called from the invalidate path or @@ -27339,45 +28911,52 @@ index 000000000000..6fc93b56bcb2 + */ + + if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && -+ old_m.cached_sectors) { -+ if (update_cached_sectors(c, ca->dev_idx, -old_m.cached_sectors, -+ journal_seq, gc)) { ++ old_a.cached_sectors) { ++ ret = update_cached_sectors(c, new, ca->dev_idx, ++ -old_a.cached_sectors, ++ journal_seq, gc); ++ if (ret) { + bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors"); -+ return -1; ++ return ret; + } + + trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset), -+ old_m.cached_sectors); ++ old_a.cached_sectors); + } + + return 0; +} + -+#define checked_add(a, b) \ -+({ \ -+ unsigned _res = (unsigned) (a) + (b); \ -+ bool overflow = _res > U16_MAX; \ -+ if (overflow) \ -+ _res = U16_MAX; \ -+ (a) = _res; \ -+ overflow; \ -+}) -+ -+static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, enum bch_data_type data_type, -+ unsigned sectors, bool gc) ++void 
bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, enum bch_data_type data_type, ++ unsigned sectors, struct gc_pos pos, ++ unsigned flags) +{ -+ struct bucket *g = __bucket(ca, b, gc); -+ struct bucket_mark old, new; ++ struct bucket old, new, *g; + bool overflow; + ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); + BUG_ON(data_type != BCH_DATA_sb && + data_type != BCH_DATA_journal); + -+ old = bucket_cmpxchg(g, new, ({ -+ new.data_type = data_type; -+ overflow = checked_add(new.dirty_sectors, sectors); -+ })); ++ /* ++ * Backup superblock might be past the end of our normal usable space: ++ */ ++ if (b >= ca->mi.nbuckets) ++ return; ++ ++ percpu_down_read(&c->mark_lock); ++ g = gc_bucket(ca, b); ++ ++ bucket_lock(g); ++ old = *g; ++ ++ g->data_type = data_type; ++ g->dirty_sectors += sectors; ++ overflow = g->dirty_sectors < sectors; ++ ++ new = *g; ++ bucket_unlock(g); + + bch2_fs_inconsistent_on(old.data_type && + old.data_type != data_type, c, @@ -27391,32 +28970,8 @@ index 000000000000..6fc93b56bcb2 + bch2_data_types[old.data_type ?: data_type], + old.dirty_sectors, sectors); + -+ if (c) -+ bch2_dev_usage_update(c, ca, old, new, 0, gc); -+ -+ return 0; -+} -+ -+void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, enum bch_data_type type, -+ unsigned sectors, struct gc_pos pos, -+ unsigned flags) -+{ -+ BUG_ON(type != BCH_DATA_sb && -+ type != BCH_DATA_journal); -+ -+ /* -+ * Backup superblock might be past the end of our normal usable space: -+ */ -+ if (b >= ca->mi.nbuckets) -+ return; -+ -+ if (likely(c)) { -+ do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags, -+ ca, b, type, sectors); -+ } else { -+ __bch2_mark_metadata_bucket(c, ca, b, type, sectors, 0); -+ } ++ bch2_dev_usage_update_m(c, ca, old, new, 0, true); ++ percpu_up_read(&c->mark_lock); +} + +static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) @@ -27434,124 +28989,154 @@ index 000000000000..6fc93b56bcb2 + struct bkey_s_c k, + 
const struct bch_extent_ptr *ptr, + s64 sectors, enum bch_data_type ptr_data_type, -+ u8 bucket_gen, u8 bucket_data_type, -+ u16 dirty_sectors, u16 cached_sectors) ++ u8 b_gen, u8 bucket_data_type, ++ u32 dirty_sectors, u32 cached_sectors) +{ -+ size_t bucket_nr = PTR_BUCKET_NR(bch_dev_bkey_exists(c, ptr->dev), ptr); ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); + u16 bucket_sectors = !ptr->cached + ? dirty_sectors + : cached_sectors; -+ char buf[200]; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; + -+ if (gen_after(ptr->gen, bucket_gen)) { ++ if (gen_after(ptr->gen, b_gen)) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" + "while marking %s", -+ ptr->dev, bucket_nr, bucket_gen, ++ ptr->dev, bucket_nr, b_gen, + bch2_data_types[bucket_data_type ?: ptr_data_type], + ptr->gen, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -+ return -EIO; ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ ret = -EIO; ++ goto err; + } + -+ if (gen_cmp(bucket_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { ++ if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", -+ ptr->dev, bucket_nr, bucket_gen, ++ ptr->dev, bucket_nr, b_gen, + bch2_data_types[bucket_data_type ?: ptr_data_type], + ptr->gen, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -+ return -EIO; ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ ret = -EIO; ++ goto err; + } + -+ if (bucket_gen != ptr->gen && !ptr->cached) { ++ if (b_gen != ptr->gen && !ptr->cached) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -+ "bucket %u:%zu gen %u data type %s: stale dirty ptr (gen %u)\n" ++ "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n" + "while marking %s", -+ ptr->dev, bucket_nr, bucket_gen, ++ 
ptr->dev, bucket_nr, b_gen, ++ *bucket_gen(ca, bucket_nr), + bch2_data_types[bucket_data_type ?: ptr_data_type], + ptr->gen, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -+ return -EIO; ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ ret = -EIO; ++ goto err; + } + -+ if (bucket_gen != ptr->gen) -+ return 1; ++ if (b_gen != ptr->gen) { ++ ret = 1; ++ goto err; ++ } + + if (bucket_data_type && ptr_data_type && + bucket_data_type != ptr_data_type) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" + "while marking %s", -+ ptr->dev, bucket_nr, bucket_gen, ++ ptr->dev, bucket_nr, b_gen, + bch2_data_types[bucket_data_type], + bch2_data_types[ptr_data_type], -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -+ return -EIO; ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ ret = -EIO; ++ goto err; + } + -+ if ((unsigned) (bucket_sectors + sectors) > U16_MAX) { ++ if ((unsigned) (bucket_sectors + sectors) > U32_MAX) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n" + "while marking %s", -+ ptr->dev, bucket_nr, bucket_gen, ++ ptr->dev, bucket_nr, b_gen, + bch2_data_types[bucket_data_type ?: ptr_data_type], + bucket_sectors, sectors, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -+ return -EIO; ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ ret = -EIO; ++ goto err; + } -+ -+ return 0; ++err: ++ printbuf_exit(&buf); ++ return ret; +} + +static int mark_stripe_bucket(struct btree_trans *trans, + struct bkey_s_c k, + unsigned ptr_idx, -+ u64 journal_seq, unsigned flags) ++ unsigned flags) +{ + struct bch_fs *c = trans->c; ++ u64 journal_seq = trans->journal_res.seq; + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + unsigned nr_data = s->nr_blocks - s->nr_redundant; + bool parity = ptr_idx >= nr_data; ++ enum 
bch_data_type data_type = parity ? BCH_DATA_parity : 0; ++ s64 sectors = parity ? le16_to_cpu(s->sectors) : 0; + const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; -+ bool gc = flags & BTREE_TRIGGER_GC; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct bucket *g = PTR_BUCKET(ca, ptr, gc); -+ struct bucket_mark new, old; -+ char buf[200]; -+ int ret; ++ struct bucket old, new, *g; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; + -+ if (g->stripe && g->stripe != k.k->p.offset) { ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); ++ ++ /* * XXX doesn't handle deletion */ ++ ++ percpu_down_read(&c->mark_lock); ++ buf.atomic++; ++ g = PTR_GC_BUCKET(ca, ptr); ++ ++ if (g->dirty_sectors || ++ (g->stripe && g->stripe != k.k->p.offset)) { + bch2_fs_inconsistent(c, + "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", -+ ptr->dev, PTR_BUCKET_NR(ca, ptr), g->mark.gen, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -+ return -EINVAL; ++ ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen, ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ ret = -EINVAL; ++ goto err; + } + -+ old = bucket_cmpxchg(g, new, ({ -+ ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type, -+ new.dirty_sectors, new.cached_sectors); -+ if (ret) -+ return ret; ++ bucket_lock(g); ++ old = *g; + -+ if (parity) { -+ new.data_type = BCH_DATA_parity; -+ new.dirty_sectors = le16_to_cpu(s->sectors); -+ } ++ ret = check_bucket_ref(c, k, ptr, sectors, data_type, ++ new.gen, new.data_type, ++ new.dirty_sectors, new.cached_sectors); ++ if (ret) { ++ bucket_unlock(g); ++ goto err; ++ } + -+ if (journal_seq) { -+ new.journal_seq_valid = 1; -+ new.journal_seq = journal_seq; -+ } -+ })); ++ new.dirty_sectors += sectors; ++ if (data_type) ++ new.data_type = data_type; + + g->stripe = k.k->p.offset; + g->stripe_redundancy = s->nr_redundant; + -+ bch2_dev_usage_update(c, ca, old, new, journal_seq, gc); -+ return 0; ++ new = *g; ++ bucket_unlock(g); ++ ++ bch2_dev_usage_update_m(c, ca, old, 
new, journal_seq, true); ++err: ++ percpu_up_read(&c->mark_lock); ++ printbuf_exit(&buf); ++ return ret; +} + +static int __mark_pointer(struct btree_trans *trans, @@ -27559,9 +29144,9 @@ index 000000000000..6fc93b56bcb2 + const struct bch_extent_ptr *ptr, + s64 sectors, enum bch_data_type ptr_data_type, + u8 bucket_gen, u8 *bucket_data_type, -+ u16 *dirty_sectors, u16 *cached_sectors) ++ u32 *dirty_sectors, u32 *cached_sectors) +{ -+ u16 *dst_sectors = !ptr->cached ++ u32 *dst_sectors = !ptr->cached + ? dirty_sectors + : cached_sectors; + int ret = check_bucket_ref(trans->c, k, ptr, sectors, ptr_data_type, @@ -27583,64 +29168,64 @@ index 000000000000..6fc93b56bcb2 + s64 sectors, enum bch_data_type data_type, + unsigned flags) +{ -+ bool gc = flags & BTREE_TRIGGER_GC; + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; -+ struct bucket_mark old, new; + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); -+ struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc); ++ struct bucket old, new, *g; + u8 bucket_data_type; -+ u64 v; -+ int ret; ++ int ret = 0; + -+ v = atomic64_read(&g->_mark.v); -+ do { -+ new.v.counter = old.v.counter = v; -+ bucket_data_type = new.data_type; ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); + -+ ret = __mark_pointer(trans, k, &p.ptr, sectors, -+ data_type, new.gen, -+ &bucket_data_type, -+ &new.dirty_sectors, -+ &new.cached_sectors); -+ if (ret) -+ return ret; ++ percpu_down_read(&c->mark_lock); ++ g = PTR_GC_BUCKET(ca, &p.ptr); + -+ new.data_type = bucket_data_type; ++ bucket_lock(g); ++ old = *g; + -+ if (journal_seq) { -+ new.journal_seq_valid = 1; -+ new.journal_seq = journal_seq; -+ } ++ bucket_data_type = g->data_type; + -+ if (flags & BTREE_TRIGGER_NOATOMIC) { -+ g->_mark = new; -+ break; -+ } -+ } while ((v = atomic64_cmpxchg(&g->_mark.v, -+ old.v.counter, -+ new.v.counter)) != old.v.counter); ++ ret = __mark_pointer(trans, k, &p.ptr, sectors, ++ data_type, g->gen, ++ &bucket_data_type, ++ &g->dirty_sectors, ++ 
&g->cached_sectors); ++ if (ret) { ++ bucket_unlock(g); ++ goto err; ++ } + -+ bch2_dev_usage_update(c, ca, old, new, journal_seq, gc); ++ g->data_type = bucket_data_type; + -+ BUG_ON(!gc && bucket_became_unavailable(old, new)); ++ new = *g; ++ bucket_unlock(g); + -+ return 0; ++ bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); ++err: ++ percpu_up_read(&c->mark_lock); ++ ++ return ret; +} + +static int bch2_mark_stripe_ptr(struct btree_trans *trans, ++ struct bkey_s_c k, + struct bch_extent_stripe_ptr p, + enum bch_data_type data_type, + s64 sectors, + unsigned flags) +{ -+ bool gc = flags & BTREE_TRIGGER_GC; + struct bch_fs *c = trans->c; + struct bch_replicas_padded r; -+ struct stripe *m; -+ unsigned i, blocks_nonempty = 0; ++ struct gc_stripe *m; + -+ m = genradix_ptr(&c->stripes[gc], p.idx); ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); ++ ++ m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL); ++ if (!m) { ++ bch_err(c, "error allocating memory for gc_stripes, idx %llu", ++ (u64) p.idx); ++ return -ENOMEM; ++ } + + spin_lock(&c->ec_stripes_heap_lock); + @@ -27655,29 +29240,18 @@ index 000000000000..6fc93b56bcb2 + m->block_sectors[p.block] += sectors; + + r = m->r; -+ -+ for (i = 0; i < m->nr_blocks; i++) -+ blocks_nonempty += m->block_sectors[i] != 0; -+ -+ if (m->blocks_nonempty != blocks_nonempty) { -+ m->blocks_nonempty = blocks_nonempty; -+ if (!gc) -+ bch2_stripes_heap_update(c, m, p.idx); -+ } -+ + spin_unlock(&c->ec_stripes_heap_lock); + + r.e.data_type = data_type; -+ update_replicas(c, &r.e, sectors, trans->journal_res.seq, gc); ++ update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true); + + return 0; +} + -+static int bch2_mark_extent(struct btree_trans *trans, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned flags) ++int bch2_mark_extent(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) +{ -+ bool gc = flags & BTREE_TRIGGER_GC; + u64 journal_seq = trans->journal_res.seq; + 
struct bch_fs *c = trans->c; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; @@ -27689,12 +29263,14 @@ index 000000000000..6fc93b56bcb2 + ? BCH_DATA_btree + : BCH_DATA_user; + s64 sectors = bkey_is_btree_ptr(k.k) -+ ? c->opts.btree_node_size ++ ? btree_sectors(c) + : k.k->size; + s64 dirty_sectors = 0; + bool stale; + int ret; + ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); ++ + r.e.data_type = data_type; + r.e.nr_devs = 0; + r.e.nr_required = 1; @@ -27713,18 +29289,19 @@ index 000000000000..6fc93b56bcb2 + stale = ret > 0; + + if (p.ptr.cached) { -+ if (!stale) -+ if (update_cached_sectors(c, p.ptr.dev, disk_sectors, -+ journal_seq, gc)) { ++ if (!stale) { ++ ret = update_cached_sectors(c, k, p.ptr.dev, ++ disk_sectors, journal_seq, true); ++ if (ret) { + bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors"); -+ return -1; -+ ++ return ret; + } ++ } + } else if (!p.has_ec) { + dirty_sectors += disk_sectors; + r.e.devs[r.e.nr_devs++] = p.ptr.dev; + } else { -+ ret = bch2_mark_stripe_ptr(trans, p.ec, data_type, ++ ret = bch2_mark_stripe_ptr(trans, k, p.ec, data_type, + disk_sectors, flags); + if (ret) + return ret; @@ -27739,110 +29316,130 @@ index 000000000000..6fc93b56bcb2 + } + + if (r.e.nr_devs) { -+ if (update_replicas(c, &r.e, dirty_sectors, journal_seq, gc)) { -+ char buf[200]; ++ ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true); ++ if (ret) { ++ struct printbuf buf = PRINTBUF; + -+ bch2_bkey_val_to_text(&PBUF(buf), c, k); -+ bch2_fs_fatal_error(c, "no replicas entry for %s", buf); -+ return -1; ++ bch2_bkey_val_to_text(&buf, c, k); ++ bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); ++ printbuf_exit(&buf); ++ return ret; + } + } + + return 0; +} + -+static int bch2_mark_stripe(struct btree_trans *trans, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned flags) ++int bch2_mark_stripe(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) 
+{ + bool gc = flags & BTREE_TRIGGER_GC; + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; -+ size_t idx = new.k->p.offset; ++ u64 idx = new.k->p.offset; + const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe + ? bkey_s_c_to_stripe(old).v : NULL; + const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe + ? bkey_s_c_to_stripe(new).v : NULL; -+ struct stripe *m = genradix_ptr(&c->stripes[gc], idx); + unsigned i; + int ret; + + BUG_ON(gc && old_s); + -+ if (!m || (old_s && !m->alive)) { -+ char buf1[200], buf2[200]; ++ if (!gc) { ++ struct stripe *m = genradix_ptr(&c->stripes, idx); + -+ bch2_bkey_val_to_text(&PBUF(buf1), c, old); -+ bch2_bkey_val_to_text(&PBUF(buf2), c, new); -+ bch_err_ratelimited(c, "error marking nonexistent stripe %zu while marking\n" -+ "old %s\n" -+ "new %s", idx, buf1, buf2); -+ bch2_inconsistent_error(c); -+ return -1; -+ } ++ if (!m || (old_s && !m->alive)) { ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; + -+ if (!new_s) { -+ spin_lock(&c->ec_stripes_heap_lock); -+ bch2_stripes_heap_del(c, m, idx); -+ spin_unlock(&c->ec_stripes_heap_lock); -+ -+ memset(m, 0, sizeof(*m)); -+ } else { -+ m->alive = true; -+ m->sectors = le16_to_cpu(new_s->sectors); -+ m->algorithm = new_s->algorithm; -+ m->nr_blocks = new_s->nr_blocks; -+ m->nr_redundant = new_s->nr_redundant; -+ m->blocks_nonempty = 0; -+ -+ for (i = 0; i < new_s->nr_blocks; i++) { -+ m->block_sectors[i] = -+ stripe_blockcount_get(new_s, i); -+ m->blocks_nonempty += !!m->block_sectors[i]; -+ -+ m->ptrs[i] = new_s->ptrs[i]; ++ bch2_bkey_val_to_text(&buf1, c, old); ++ bch2_bkey_val_to_text(&buf2, c, new); ++ bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n" ++ "old %s\n" ++ "new %s", idx, buf1.buf, buf2.buf); ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); ++ bch2_inconsistent_error(c); ++ return -1; + } + -+ bch2_bkey_to_replicas(&m->r.e, new); ++ if (!new_s) { ++ 
spin_lock(&c->ec_stripes_heap_lock); ++ bch2_stripes_heap_del(c, m, idx); ++ spin_unlock(&c->ec_stripes_heap_lock); ++ ++ memset(m, 0, sizeof(*m)); ++ } else { ++ m->alive = true; ++ m->sectors = le16_to_cpu(new_s->sectors); ++ m->algorithm = new_s->algorithm; ++ m->nr_blocks = new_s->nr_blocks; ++ m->nr_redundant = new_s->nr_redundant; ++ m->blocks_nonempty = 0; ++ ++ for (i = 0; i < new_s->nr_blocks; i++) ++ m->blocks_nonempty += !!stripe_blockcount_get(new_s, i); + -+ if (!gc) { + spin_lock(&c->ec_stripes_heap_lock); + bch2_stripes_heap_update(c, m, idx); + spin_unlock(&c->ec_stripes_heap_lock); + } -+ } ++ } else { ++ struct gc_stripe *m = ++ genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); ++ ++ if (!m) { ++ bch_err(c, "error allocating memory for gc_stripes, idx %llu", ++ idx); ++ return -ENOMEM; ++ } ++ /* ++ * This will be wrong when we bring back runtime gc: we should ++ * be unmarking the old key and then marking the new key ++ */ ++ m->alive = true; ++ m->sectors = le16_to_cpu(new_s->sectors); ++ m->nr_blocks = new_s->nr_blocks; ++ m->nr_redundant = new_s->nr_redundant; ++ ++ for (i = 0; i < new_s->nr_blocks; i++) ++ m->ptrs[i] = new_s->ptrs[i]; ++ ++ bch2_bkey_to_replicas(&m->r.e, new); + -+ if (gc) { + /* + * gc recalculates this field from stripe ptr + * references: + */ + memset(m->block_sectors, 0, sizeof(m->block_sectors)); -+ m->blocks_nonempty = 0; + + for (i = 0; i < new_s->nr_blocks; i++) { -+ ret = mark_stripe_bucket(trans, new, i, journal_seq, flags); ++ ret = mark_stripe_bucket(trans, new, i, flags); + if (ret) + return ret; + } + -+ if (update_replicas(c, &m->r.e, -+ ((s64) m->sectors * m->nr_redundant), -+ journal_seq, gc)) { -+ char buf[200]; ++ ret = update_replicas(c, new, &m->r.e, ++ ((s64) m->sectors * m->nr_redundant), ++ journal_seq, gc); ++ if (ret) { ++ struct printbuf buf = PRINTBUF; + -+ bch2_bkey_val_to_text(&PBUF(buf), c, new); -+ bch2_fs_fatal_error(c, "no replicas entry for %s", buf); -+ return -1; ++ 
bch2_bkey_val_to_text(&buf, c, new); ++ bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); ++ printbuf_exit(&buf); ++ return ret; + } + } + + return 0; +} + -+static int bch2_mark_inode(struct btree_trans *trans, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned flags) ++int bch2_mark_inode(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bch_fs_usage __percpu *fs_usage; @@ -27858,18 +29455,22 @@ index 000000000000..6fc93b56bcb2 + } + + if (flags & BTREE_TRIGGER_GC) { ++ percpu_down_read(&c->mark_lock); + preempt_disable(); ++ + fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC); + fs_usage->nr_inodes += bkey_is_inode(new.k); + fs_usage->nr_inodes -= bkey_is_inode(old.k); ++ + preempt_enable(); ++ percpu_up_read(&c->mark_lock); + } + return 0; +} + -+static int bch2_mark_reservation(struct btree_trans *trans, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned flags) ++int bch2_mark_reservation(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? 
old: new; @@ -27877,34 +29478,46 @@ index 000000000000..6fc93b56bcb2 + unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; + s64 sectors = (s64) k.k->size; + ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); ++ + if (flags & BTREE_TRIGGER_OVERWRITE) + sectors = -sectors; + sectors *= replicas; + ++ percpu_down_read(&c->mark_lock); + preempt_disable(); ++ + fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC); + replicas = clamp_t(unsigned, replicas, 1, + ARRAY_SIZE(fs_usage->persistent_reserved)); + + fs_usage->reserved += sectors; + fs_usage->persistent_reserved[replicas - 1] += sectors; ++ + preempt_enable(); ++ percpu_up_read(&c->mark_lock); + + return 0; +} + -+static s64 __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p, ++static s64 __bch2_mark_reflink_p(struct btree_trans *trans, ++ struct bkey_s_c_reflink_p p, ++ u64 start, u64 end, + u64 *idx, unsigned flags, size_t r_idx) +{ ++ struct bch_fs *c = trans->c; + struct reflink_gc *r; + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; ++ u64 next_idx = end; + s64 ret = 0; ++ struct printbuf buf = PRINTBUF; + + if (r_idx >= c->reflink_gc_nr) + goto not_found; + + r = genradix_ptr(&c->reflink_gc_table, r_idx); -+ if (*idx < r->offset - r->size) ++ next_idx = min(next_idx, r->offset - r->size); ++ if (*idx < next_idx) + goto not_found; + + BUG_ON((s64) r->refcount + add < 0); @@ -27913,46 +29526,42 @@ index 000000000000..6fc93b56bcb2 + *idx = r->offset; + return 0; +not_found: -+ *idx = U64_MAX; -+ ret = -EIO; ++ if (fsck_err(c, "pointer to missing indirect extent\n" ++ " %s\n" ++ " missing range %llu-%llu", ++ (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), ++ *idx, next_idx)) { ++ struct bkey_i_error new; + -+ /* -+ * XXX: we're replacing the entire reflink pointer with an error -+ * key, we should just be replacing the part that was missing: -+ */ -+ if (fsck_err(c, "%llu:%llu len %u points to nonexistent indirect extent %llu", -+ p.k->p.inode, p.k->p.offset, p.k->size, *idx)) { -+ struct bkey_i_error *new; -+ -+ new = kmalloc(sizeof(*new), GFP_KERNEL); -+ if (!new) { -+ bch_err(c, "%s: error allocating new key", __func__); -+ return -ENOMEM; -+ } -+ -+ bkey_init(&new->k); -+ new->k.type = KEY_TYPE_error; -+ new->k.p = p.k->p; -+ new->k.size = p.k->size; -+ ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new->k_i); ++ bkey_init(&new.k); ++ new.k.type = KEY_TYPE_error; ++ new.k.p = bkey_start_pos(p.k); ++ new.k.p.offset += *idx - start; ++ bch2_key_resize(&new.k, next_idx - *idx); ++ ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new.k_i); + } ++ ++ *idx = next_idx; +fsck_err: ++ printbuf_exit(&buf); + return ret; +} + -+static int bch2_mark_reflink_p(struct btree_trans *trans, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned flags) ++int bch2_mark_reflink_p(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? 
old: new; + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + struct reflink_gc *ref; + size_t l, r, m; -+ u64 idx = le64_to_cpu(p.v->idx); ++ u64 idx = le64_to_cpu(p.v->idx), start = idx; + u64 end = le64_to_cpu(p.v->idx) + p.k->size; + int ret = 0; + ++ BUG_ON(!(flags & BTREE_TRIGGER_GC)); ++ + if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) { + idx -= le32_to_cpu(p.v->front_pad); + end += le32_to_cpu(p.v->back_pad); @@ -27971,89 +29580,8 @@ index 000000000000..6fc93b56bcb2 + } + + while (idx < end && !ret) -+ ret = __bch2_mark_reflink_p(c, p, &idx, flags, l++); -+ -+ return ret; -+} -+ -+static int bch2_mark_key_locked(struct btree_trans *trans, -+ struct bkey_s_c old, -+ struct bkey_s_c new, -+ unsigned flags) -+{ -+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; -+ -+ switch (k.k->type) { -+ case KEY_TYPE_alloc: -+ case KEY_TYPE_alloc_v2: -+ case KEY_TYPE_alloc_v3: -+ return bch2_mark_alloc(trans, old, new, flags); -+ case KEY_TYPE_btree_ptr: -+ case KEY_TYPE_btree_ptr_v2: -+ case KEY_TYPE_extent: -+ case KEY_TYPE_reflink_v: -+ return bch2_mark_extent(trans, old, new, flags); -+ case KEY_TYPE_stripe: -+ return bch2_mark_stripe(trans, old, new, flags); -+ case KEY_TYPE_inode: -+ case KEY_TYPE_inode_v2: -+ return bch2_mark_inode(trans, old, new, flags); -+ case KEY_TYPE_reservation: -+ return bch2_mark_reservation(trans, old, new, flags); -+ case KEY_TYPE_reflink_p: -+ return bch2_mark_reflink_p(trans, old, new, flags); -+ case KEY_TYPE_snapshot: -+ return bch2_mark_snapshot(trans, old, new, flags); -+ default: -+ return 0; -+ } -+} -+ -+int bch2_mark_key(struct btree_trans *trans, struct bkey_s_c new, unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey deleted = KEY(0, 0, 0); -+ struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; -+ int ret; -+ -+ deleted.p = new.k->p; -+ -+ percpu_down_read(&c->mark_lock); -+ ret = bch2_mark_key_locked(trans, old, new, flags); -+ percpu_up_read(&c->mark_lock); -+ -+ 
return ret; -+} -+ -+int bch2_mark_update(struct btree_trans *trans, struct btree_path *path, -+ struct bkey_i *new, unsigned flags) -+{ -+ struct bkey _deleted = KEY(0, 0, 0); -+ struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; -+ struct bkey_s_c old; -+ struct bkey unpacked; -+ int ret; -+ -+ _deleted.p = path->pos; -+ -+ if (unlikely(flags & BTREE_TRIGGER_NORUN)) -+ return 0; -+ -+ if (!btree_node_type_needs_gc(path->btree_id)) -+ return 0; -+ -+ old = bch2_btree_path_peek_slot(path, &unpacked); -+ -+ if (old.k->type == new->k.type && -+ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { -+ ret = bch2_mark_key_locked(trans, old, bkey_i_to_s_c(new), -+ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); -+ } else { -+ ret = bch2_mark_key_locked(trans, deleted, bkey_i_to_s_c(new), -+ BTREE_TRIGGER_INSERT|flags) ?: -+ bch2_mark_key_locked(trans, old, deleted, -+ BTREE_TRIGGER_OVERWRITE|flags); -+ } ++ ret = __bch2_mark_reflink_p(trans, p, start, end, ++ &idx, flags, l++); + + return ret; +} @@ -28065,50 +29593,42 @@ index 000000000000..6fc93b56bcb2 +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; -+ char buf[200]; ++ struct printbuf buf = PRINTBUF; + + bch_err(c, "disk usage increased %lli more than %u sectors reserved", + should_not_have_added, disk_res_sectors); + + trans_for_each_update(trans, i) { ++ struct bkey_s_c old = { &i->old_k, i->old_v }; ++ + pr_err("while inserting"); -+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); -+ pr_err("%s", buf); ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); ++ pr_err(" %s", buf.buf); + pr_err("overlapping with"); -+ -+ if (!i->cached) { -+ struct bkey u; -+ struct bkey_s_c k = bch2_btree_path_peek_slot(i->path, &u); -+ -+ bch2_bkey_val_to_text(&PBUF(buf), c, k); -+ pr_err("%s", buf); -+ } else { -+ struct bkey_cached *ck = (void *) i->path->l[0].b; -+ -+ if (ck->valid) { -+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k)); -+ 
pr_err("%s", buf); -+ } -+ } ++ printbuf_reset(&buf); ++ bch2_bkey_val_to_text(&buf, c, old); ++ pr_err(" %s", buf.buf); + } ++ + __WARN(); ++ printbuf_exit(&buf); +} + -+void bch2_trans_fs_usage_apply(struct btree_trans *trans, -+ struct replicas_delta_list *deltas) ++int bch2_trans_fs_usage_apply(struct btree_trans *trans, ++ struct replicas_delta_list *deltas) +{ + struct bch_fs *c = trans->c; + static int warned_disk_usage = 0; + bool warn = false; + unsigned disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; -+ struct replicas_delta *d = deltas->d; ++ struct replicas_delta *d = deltas->d, *d2; + struct replicas_delta *top = (void *) deltas->d + deltas->used; + struct bch_fs_usage *dst; + s64 added = 0, should_not_have_added; + unsigned i; + -+ percpu_rwsem_assert_held(&c->mark_lock); -+ ++ percpu_down_read(&c->mark_lock); + preempt_disable(); + dst = fs_usage_ptr(c, trans->journal_res.seq, false); + @@ -28120,7 +29640,8 @@ index 000000000000..6fc93b56bcb2 + added += d->delta; + } + -+ BUG_ON(__update_replicas(c, dst, &d->r, d->delta)); ++ if (__update_replicas(c, dst, &d->r, d->delta)) ++ goto need_mark; + } + + dst->nr_inodes += deltas->nr_inodes; @@ -28155,74 +29676,44 @@ index 000000000000..6fc93b56bcb2 + } + + preempt_enable(); ++ percpu_up_read(&c->mark_lock); + + if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) + fs_usage_apply_warn(trans, disk_res_sectors, should_not_have_added); ++ return 0; ++need_mark: ++ /* revert changes: */ ++ for (d2 = deltas->d; d2 != d; d2 = replicas_delta_next(d2)) ++ BUG_ON(__update_replicas(c, dst, &d2->r, -d2->delta)); ++ ++ preempt_enable(); ++ percpu_up_read(&c->mark_lock); ++ return -1; +} + +/* trans_mark: */ + -+static struct bkey_alloc_buf * -+bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, -+ const struct bch_extent_ptr *ptr, -+ struct bkey_alloc_unpacked *u) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct 
bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); -+ struct bucket *g; -+ struct bkey_alloc_buf *a; -+ struct bkey_i *update = btree_trans_peek_updates(trans, BTREE_ID_alloc, pos); -+ int ret; -+ -+ a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); -+ if (IS_ERR(a)) -+ return a; -+ -+ bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos, -+ BTREE_ITER_CACHED| -+ BTREE_ITER_CACHED_NOFILL| -+ BTREE_ITER_INTENT); -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) { -+ bch2_trans_iter_exit(trans, iter); -+ return ERR_PTR(ret); -+ } -+ -+ if (update && !bpos_cmp(update->k.p, pos)) { -+ *u = bch2_alloc_unpack(bkey_i_to_s_c(update)); -+ } else { -+ percpu_down_read(&c->mark_lock); -+ g = bucket(ca, pos.offset); -+ *u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark)); -+ percpu_up_read(&c->mark_lock); -+ } -+ -+ return a; -+} -+ +static int bch2_trans_mark_pointer(struct btree_trans *trans, + struct bkey_s_c k, struct extent_ptr_decoded p, + s64 sectors, enum bch_data_type data_type) +{ -+ struct bch_fs *c = trans->c; + struct btree_iter iter; -+ struct bkey_alloc_unpacked u; -+ struct bkey_alloc_buf *a; ++ struct bkey_i_alloc_v4 *a; + int ret; + -+ a = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u); ++ a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(trans->c, &p.ptr)); + if (IS_ERR(a)) + return PTR_ERR(a); + + ret = __mark_pointer(trans, k, &p.ptr, sectors, data_type, -+ u.gen, &u.data_type, -+ &u.dirty_sectors, &u.cached_sectors); ++ a->v.gen, &a->v.data_type, ++ &a->v.dirty_sectors, &a->v.cached_sectors); + if (ret) + goto out; + -+ bch2_alloc_pack(c, a, u); -+ bch2_trans_update(trans, &iter, &a->k, 0); ++ ret = bch2_trans_update(trans, &iter, &a->k_i, 0); ++ if (ret) ++ goto out; +out: + bch2_trans_iter_exit(trans, &iter); + return ret; @@ -28232,7 +29723,6 @@ index 000000000000..6fc93b56bcb2 + struct extent_ptr_decoded p, + s64 sectors, enum bch_data_type data_type) +{ -+ struct bch_fs *c = trans->c; + struct btree_iter iter; + 
struct bkey_s_c k; + struct bkey_i_stripe *s; @@ -28248,16 +29738,15 @@ index 000000000000..6fc93b56bcb2 + goto err; + + if (k.k->type != KEY_TYPE_stripe) { -+ bch2_fs_inconsistent(c, ++ bch2_trans_inconsistent(trans, + "pointer to nonexistent stripe %llu", + (u64) p.ec.idx); -+ bch2_inconsistent_error(c); + ret = -EIO; + goto err; + } + + if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) { -+ bch2_fs_inconsistent(c, ++ bch2_trans_inconsistent(trans, + "stripe pointer doesn't match stripe %llu", + (u64) p.ec.idx); + ret = -EIO; @@ -28273,7 +29762,10 @@ index 000000000000..6fc93b56bcb2 + stripe_blockcount_set(&s->v, p.ec.block, + stripe_blockcount_get(&s->v, p.ec.block) + + sectors); -+ bch2_trans_update(trans, &iter, &s->k_i, 0); ++ ++ ret = bch2_trans_update(trans, &iter, &s->k_i, 0); ++ if (ret) ++ goto err; + + bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); + r.e.data_type = data_type; @@ -28283,10 +29775,14 @@ index 000000000000..6fc93b56bcb2 + return ret; +} + -+static int bch2_trans_mark_extent(struct btree_trans *trans, -+ struct bkey_s_c k, unsigned flags) ++int bch2_trans_mark_extent(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_i *new, ++ unsigned flags) +{ + struct bch_fs *c = trans->c; ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ++ ? old ++ : bkey_i_to_s_c(new); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; @@ -28295,7 +29791,7 @@ index 000000000000..6fc93b56bcb2 + ? BCH_DATA_btree + : BCH_DATA_user; + s64 sectors = bkey_is_btree_ptr(k.k) -+ ? c->opts.btree_node_size ++ ? 
btree_sectors(c) + : k.k->size; + s64 dirty_sectors = 0; + bool stale; @@ -28341,119 +29837,158 @@ index 000000000000..6fc93b56bcb2 + return 0; +} + -+static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans, -+ struct bkey_s_c_stripe s, -+ unsigned idx, bool deleting) ++static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, ++ struct bkey_s_c_stripe s, ++ unsigned idx, bool deleting) +{ + struct bch_fs *c = trans->c; + const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; -+ struct bkey_alloc_buf *a; + struct btree_iter iter; -+ struct bkey_alloc_unpacked u; -+ bool parity = idx >= s.v->nr_blocks - s.v->nr_redundant; ++ struct bkey_i_alloc_v4 *a; ++ enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant ++ ? BCH_DATA_parity : 0; ++ s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0; + int ret = 0; + -+ a = bch2_trans_start_alloc_update(trans, &iter, ptr, &u); ++ if (deleting) ++ sectors = -sectors; ++ ++ a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr)); + if (IS_ERR(a)) + return PTR_ERR(a); + -+ if (parity) { -+ s64 sectors = le16_to_cpu(s.v->sectors); -+ -+ if (deleting) -+ sectors = -sectors; -+ -+ u.dirty_sectors += sectors; -+ u.data_type = u.dirty_sectors -+ ? 
BCH_DATA_parity -+ : 0; -+ } ++ ret = check_bucket_ref(c, s.s_c, ptr, sectors, data_type, ++ a->v.gen, a->v.data_type, ++ a->v.dirty_sectors, a->v.cached_sectors); ++ if (ret) ++ goto err; + + if (!deleting) { -+ if (bch2_fs_inconsistent_on(u.stripe && u.stripe != s.k->p.offset, c, -+ "bucket %llu:%llu gen %u: multiple stripes using same bucket (%u, %llu)", -+ iter.pos.inode, iter.pos.offset, u.gen, -+ u.stripe, s.k->p.offset)) { ++ if (bch2_trans_inconsistent_on(a->v.stripe || ++ a->v.stripe_redundancy, trans, ++ "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", ++ iter.pos.inode, iter.pos.offset, a->v.gen, ++ bch2_data_types[a->v.data_type], ++ a->v.dirty_sectors, ++ a->v.stripe, s.k->p.offset)) { + ret = -EIO; + goto err; + } + -+ u.stripe = s.k->p.offset; -+ u.stripe_redundancy = s.v->nr_redundant; ++ if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans, ++ "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", ++ iter.pos.inode, iter.pos.offset, a->v.gen, ++ bch2_data_types[a->v.data_type], ++ a->v.dirty_sectors, ++ s.k->p.offset)) { ++ ret = -EIO; ++ goto err; ++ } ++ ++ a->v.stripe = s.k->p.offset; ++ a->v.stripe_redundancy = s.v->nr_redundant; + } else { -+ u.stripe = 0; -+ u.stripe_redundancy = 0; ++ if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset || ++ a->v.stripe_redundancy != s.v->nr_redundant, trans, ++ "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)", ++ iter.pos.inode, iter.pos.offset, a->v.gen, ++ s.k->p.offset, a->v.stripe)) { ++ ret = -EIO; ++ goto err; ++ } ++ ++ a->v.stripe = 0; ++ a->v.stripe_redundancy = 0; + } + -+ bch2_alloc_pack(c, a, u); -+ bch2_trans_update(trans, &iter, &a->k, 0); ++ a->v.dirty_sectors += sectors; ++ if (data_type) ++ a->v.data_type = !deleting ? 
data_type : 0; ++ ++ ret = bch2_trans_update(trans, &iter, &a->k_i, 0); ++ if (ret) ++ goto err; +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + -+static int bch2_trans_mark_stripe(struct btree_trans *trans, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned flags) ++int bch2_trans_mark_stripe(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_i *new, ++ unsigned flags) +{ -+ struct bkey_s_c_stripe old_s = { .k = NULL }; -+ struct bkey_s_c_stripe new_s = { .k = NULL }; ++ const struct bch_stripe *old_s = NULL; ++ struct bch_stripe *new_s = NULL; + struct bch_replicas_padded r; -+ unsigned i; ++ unsigned i, nr_blocks; + int ret = 0; + + if (old.k->type == KEY_TYPE_stripe) -+ old_s = bkey_s_c_to_stripe(old); -+ if (new.k->type == KEY_TYPE_stripe) -+ new_s = bkey_s_c_to_stripe(new); ++ old_s = bkey_s_c_to_stripe(old).v; ++ if (new->k.type == KEY_TYPE_stripe) ++ new_s = &bkey_i_to_stripe(new)->v; + + /* + * If the pointers aren't changing, we don't need to do anything: + */ -+ if (new_s.k && old_s.k && -+ new_s.v->nr_blocks == old_s.v->nr_blocks && -+ new_s.v->nr_redundant == old_s.v->nr_redundant && -+ !memcmp(old_s.v->ptrs, new_s.v->ptrs, -+ new_s.v->nr_blocks * sizeof(struct bch_extent_ptr))) ++ if (new_s && old_s && ++ new_s->nr_blocks == old_s->nr_blocks && ++ new_s->nr_redundant == old_s->nr_redundant && ++ !memcmp(old_s->ptrs, new_s->ptrs, ++ new_s->nr_blocks * sizeof(struct bch_extent_ptr))) + return 0; + -+ if (new_s.k) { -+ s64 sectors = le16_to_cpu(new_s.v->sectors); ++ BUG_ON(new_s && old_s && ++ (new_s->nr_blocks != old_s->nr_blocks || ++ new_s->nr_redundant != old_s->nr_redundant)); + -+ bch2_bkey_to_replicas(&r.e, new); -+ update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant); ++ nr_blocks = new_s ? 
new_s->nr_blocks : old_s->nr_blocks; + -+ for (i = 0; i < new_s.v->nr_blocks; i++) { -+ ret = bch2_trans_mark_stripe_alloc_ref(trans, new_s, -+ i, false); -+ if (ret) -+ return ret; -+ } ++ if (new_s) { ++ s64 sectors = le16_to_cpu(new_s->sectors); ++ ++ bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new)); ++ update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); + } + -+ if (old_s.k) { -+ s64 sectors = -((s64) le16_to_cpu(old_s.v->sectors)); ++ if (old_s) { ++ s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); + + bch2_bkey_to_replicas(&r.e, old); -+ update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant); ++ update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); ++ } + -+ for (i = 0; i < old_s.v->nr_blocks; i++) { -+ ret = bch2_trans_mark_stripe_alloc_ref(trans, old_s, -+ i, true); ++ for (i = 0; i < nr_blocks; i++) { ++ if (new_s && old_s && ++ !memcmp(&new_s->ptrs[i], ++ &old_s->ptrs[i], ++ sizeof(new_s->ptrs[i]))) ++ continue; ++ ++ if (new_s) { ++ ret = bch2_trans_mark_stripe_bucket(trans, ++ bkey_i_to_s_c_stripe(new), i, false); + if (ret) -+ return ret; ++ break; ++ } ++ ++ if (old_s) { ++ ret = bch2_trans_mark_stripe_bucket(trans, ++ bkey_s_c_to_stripe(old), i, true); ++ if (ret) ++ break; + } + } + + return ret; +} + -+static int bch2_trans_mark_inode(struct btree_trans *trans, -+ struct bkey_s_c old, -+ struct bkey_s_c new, -+ unsigned flags) ++int bch2_trans_mark_inode(struct btree_trans *trans, ++ struct bkey_s_c old, ++ struct bkey_i *new, ++ unsigned flags) +{ -+ int nr = bkey_is_inode(new.k) - bkey_is_inode(old.k); ++ int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k); + + if (nr) { + struct replicas_delta_list *d = @@ -28464,9 +29999,14 @@ index 000000000000..6fc93b56bcb2 + return 0; +} + -+static int bch2_trans_mark_reservation(struct btree_trans *trans, -+ struct bkey_s_c k, unsigned flags) ++int bch2_trans_mark_reservation(struct btree_trans *trans, ++ struct bkey_s_c old, ++ struct bkey_i *new, ++ unsigned 
flags) +{ ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ++ ? old ++ : bkey_i_to_s_c(new); + unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; + s64 sectors = (s64) k.k->size; + struct replicas_delta_list *d; @@ -28494,7 +30034,7 @@ index 000000000000..6fc93b56bcb2 + struct bkey_i *n; + __le64 *refcount; + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; -+ char buf[200]; ++ struct printbuf buf = PRINTBUF; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, POS(0, *idx), @@ -28514,19 +30054,19 @@ index 000000000000..6fc93b56bcb2 + + refcount = bkey_refcount(n); + if (!refcount) { -+ bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c); -+ bch2_fs_inconsistent(c, ++ bch2_bkey_val_to_text(&buf, c, p.s_c); ++ bch2_trans_inconsistent(trans, + "nonexistent indirect extent at %llu while marking\n %s", -+ *idx, buf); ++ *idx, buf.buf); + ret = -EIO; + goto err; + } + + if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { -+ bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c); -+ bch2_fs_inconsistent(c, ++ bch2_bkey_val_to_text(&buf, c, p.s_c); ++ bch2_trans_inconsistent(trans, + "indirect extent refcount underflow at %llu while marking\n %s", -+ *idx, buf); ++ *idx, buf.buf); + ret = -EIO; + goto err; + } @@ -28548,11 +30088,6 @@ index 000000000000..6fc93b56bcb2 + + le64_add_cpu(refcount, add); + -+ if (!*refcount) { -+ n->k.type = KEY_TYPE_deleted; -+ set_bkey_val_u64s(&n->k, 0); -+ } -+ + bch2_btree_iter_set_pos_to_extent_start(&iter); + ret = bch2_trans_update(trans, &iter, n, 0); + if (ret) @@ -28561,12 +30096,18 @@ index 000000000000..6fc93b56bcb2 + *idx = k.k->p.offset; +err: + bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf); + return ret; +} + -+static int bch2_trans_mark_reflink_p(struct btree_trans *trans, -+ struct bkey_s_c k, unsigned flags) ++int bch2_trans_mark_reflink_p(struct btree_trans *trans, ++ struct bkey_s_c old, ++ struct bkey_i *new, ++ unsigned flags) +{ ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE 
++ ? old ++ : bkey_i_to_s_c(new); + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + u64 idx, end_idx; + int ret = 0; @@ -28587,31 +30128,6 @@ index 000000000000..6fc93b56bcb2 + return ret; +} + -+int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, -+ struct bkey_s_c new, unsigned flags) -+{ -+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; -+ -+ switch (k.k->type) { -+ case KEY_TYPE_btree_ptr: -+ case KEY_TYPE_btree_ptr_v2: -+ case KEY_TYPE_extent: -+ case KEY_TYPE_reflink_v: -+ return bch2_trans_mark_extent(trans, k, flags); -+ case KEY_TYPE_stripe: -+ return bch2_trans_mark_stripe(trans, old, new, flags); -+ case KEY_TYPE_inode: -+ case KEY_TYPE_inode_v2: -+ return bch2_trans_mark_inode(trans, old, new, flags); -+ case KEY_TYPE_reservation: -+ return bch2_trans_mark_reservation(trans, k, flags); -+ case KEY_TYPE_reflink_p: -+ return bch2_trans_mark_reflink_p(trans, k, flags); -+ default: -+ return 0; -+ } -+} -+ +static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + struct bch_dev *ca, size_t b, + enum bch_data_type type, @@ -28619,12 +30135,7 @@ index 000000000000..6fc93b56bcb2 +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; -+ struct bkey_alloc_unpacked u; -+ struct bkey_alloc_buf *a; -+ struct bch_extent_ptr ptr = { -+ .dev = ca->dev_idx, -+ .offset = bucket_to_sector(ca, b), -+ }; ++ struct bkey_i_alloc_v4 *a; + int ret = 0; + + /* @@ -28633,27 +30144,28 @@ index 000000000000..6fc93b56bcb2 + if (b >= ca->mi.nbuckets) + return 0; + -+ a = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u); ++ a = bch2_trans_start_alloc_update(trans, &iter, POS(ca->dev_idx, b)); + if (IS_ERR(a)) + return PTR_ERR(a); + -+ if (u.data_type && u.data_type != type) { ++ if (a->v.data_type && a->v.data_type != type) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" + "while marking %s", -+ iter.pos.inode, iter.pos.offset, 
u.gen, -+ bch2_data_types[u.data_type], ++ iter.pos.inode, iter.pos.offset, a->v.gen, ++ bch2_data_types[a->v.data_type], + bch2_data_types[type], + bch2_data_types[type]); + ret = -EIO; + goto out; + } + -+ u.data_type = type; -+ u.dirty_sectors = sectors; ++ a->v.data_type = type; ++ a->v.dirty_sectors = sectors; + -+ bch2_alloc_pack(c, a, u); -+ bch2_trans_update(trans, &iter, &a->k, 0); ++ ret = bch2_trans_update(trans, &iter, &a->k_i, 0); ++ if (ret) ++ goto out; +out: + bch2_trans_iter_exit(trans, &iter); + return ret; @@ -28814,54 +30326,31 @@ index 000000000000..6fc93b56bcb2 + +/* Startup/shutdown: */ + -+static void buckets_free_rcu(struct rcu_head *rcu) ++static void bucket_gens_free_rcu(struct rcu_head *rcu) +{ -+ struct bucket_array *buckets = -+ container_of(rcu, struct bucket_array, rcu); ++ struct bucket_gens *buckets = ++ container_of(rcu, struct bucket_gens, rcu); + -+ kvpfree(buckets, -+ sizeof(struct bucket_array) + -+ buckets->nbuckets * sizeof(struct bucket)); ++ kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets); +} + +int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) +{ -+ struct bucket_array *buckets = NULL, *old_buckets = NULL; ++ struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL; + unsigned long *buckets_nouse = NULL; -+ alloc_fifo free[RESERVE_NR]; -+ alloc_fifo free_inc; -+ alloc_heap alloc_heap; -+ -+ size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, -+ ca->mi.bucket_size / c->opts.btree_node_size); -+ /* XXX: these should be tunable */ -+ size_t reserve_none = max_t(size_t, 1, nbuckets >> 9); -+ size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 6); -+ size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12), -+ btree_reserve * 2); -+ bool resize = ca->buckets[0] != NULL; ++ bool resize = ca->bucket_gens != NULL; + int ret = -ENOMEM; -+ unsigned i; + -+ memset(&free, 0, sizeof(free)); -+ memset(&free_inc, 0, sizeof(free_inc)); -+ memset(&alloc_heap, 0, sizeof(alloc_heap)); -+ -+ 
if (!(buckets = kvpmalloc(sizeof(struct bucket_array) + -+ nbuckets * sizeof(struct bucket), ++ if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, + GFP_KERNEL|__GFP_ZERO)) || -+ !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * ++ (c->opts.buckets_nouse && ++ !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * + sizeof(unsigned long), -+ GFP_KERNEL|__GFP_ZERO)) || -+ !init_fifo(&free[RESERVE_MOVINGGC], -+ copygc_reserve, GFP_KERNEL) || -+ !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) || -+ !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) || -+ !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL)) ++ GFP_KERNEL|__GFP_ZERO)))) + goto err; + -+ buckets->first_bucket = ca->mi.first_bucket; -+ buckets->nbuckets = nbuckets; ++ bucket_gens->first_bucket = ca->mi.first_bucket; ++ bucket_gens->nbuckets = nbuckets; + + bch2_copygc_stop(c); + @@ -28871,56 +30360,39 @@ index 000000000000..6fc93b56bcb2 + percpu_down_write(&c->mark_lock); + } + -+ old_buckets = bucket_array(ca); ++ old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); + + if (resize) { -+ size_t n = min(buckets->nbuckets, old_buckets->nbuckets); ++ size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets); + -+ memcpy(buckets->b, -+ old_buckets->b, -+ n * sizeof(struct bucket)); -+ memcpy(buckets_nouse, -+ ca->buckets_nouse, -+ BITS_TO_LONGS(n) * sizeof(unsigned long)); ++ memcpy(bucket_gens->b, ++ old_bucket_gens->b, ++ n); ++ if (buckets_nouse) ++ memcpy(buckets_nouse, ++ ca->buckets_nouse, ++ BITS_TO_LONGS(n) * sizeof(unsigned long)); + } + -+ rcu_assign_pointer(ca->buckets[0], buckets); -+ buckets = old_buckets; ++ rcu_assign_pointer(ca->bucket_gens, bucket_gens); ++ bucket_gens = old_bucket_gens; + + swap(ca->buckets_nouse, buckets_nouse); + ++ nbuckets = ca->mi.nbuckets; ++ + if (resize) { + percpu_up_write(&c->mark_lock); ++ up_write(&ca->bucket_lock); + up_write(&c->gc_lock); + } + -+ spin_lock(&c->freelist_lock); -+ for (i = 0; i < 
RESERVE_NR; i++) { -+ fifo_move(&free[i], &ca->free[i]); -+ swap(ca->free[i], free[i]); -+ } -+ fifo_move(&free_inc, &ca->free_inc); -+ swap(ca->free_inc, free_inc); -+ spin_unlock(&c->freelist_lock); -+ -+ /* with gc lock held, alloc_heap can't be in use: */ -+ swap(ca->alloc_heap, alloc_heap); -+ -+ nbuckets = ca->mi.nbuckets; -+ -+ if (resize) -+ up_write(&ca->bucket_lock); -+ + ret = 0; +err: -+ free_heap(&alloc_heap); -+ free_fifo(&free_inc); -+ for (i = 0; i < RESERVE_NR; i++) -+ free_fifo(&free[i]); + kvpfree(buckets_nouse, + BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); -+ if (buckets) -+ call_rcu(&old_buckets->rcu, buckets_free_rcu); ++ if (bucket_gens) ++ call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu); + + return ret; +} @@ -28929,15 +30401,10 @@ index 000000000000..6fc93b56bcb2 +{ + unsigned i; + -+ free_heap(&ca->alloc_heap); -+ free_fifo(&ca->free_inc); -+ for (i = 0; i < RESERVE_NR; i++) -+ free_fifo(&ca->free[i]); + kvpfree(ca->buckets_nouse, + BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); -+ kvpfree(rcu_dereference_protected(ca->buckets[0], 1), -+ sizeof(struct bucket_array) + -+ ca->mi.nbuckets * sizeof(struct bucket)); ++ kvpfree(rcu_dereference_protected(ca->bucket_gens, 1), ++ sizeof(struct bucket_gens) + ca->mi.nbuckets); + + for (i = 0; i < ARRAY_SIZE(ca->usage); i++) + free_percpu(ca->usage[i]); @@ -28962,10 +30429,10 @@ index 000000000000..6fc93b56bcb2 +} diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h new file mode 100644 -index 000000000000..5ed9441cb115 +index 000000000000..853bc9dd1294 --- /dev/null +++ b/fs/bcachefs/buckets.h -@@ -0,0 +1,291 @@ +@@ -0,0 +1,298 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Code for manipulating bucket marks for garbage collection. 
@@ -28983,57 +30450,49 @@ index 000000000000..5ed9441cb115 + for (_b = (_buckets)->b + (_buckets)->first_bucket; \ + _b < (_buckets)->b + (_buckets)->nbuckets; _b++) + -+#define bucket_cmpxchg(g, new, expr) \ -+({ \ -+ struct bucket *_g = g; \ -+ u64 _v = atomic64_read(&(g)->_mark.v); \ -+ struct bucket_mark _old; \ -+ \ -+ do { \ -+ (new).v.counter = _old.v.counter = _v; \ -+ expr; \ -+ } while ((_v = atomic64_cmpxchg(&(_g)->_mark.v, \ -+ _old.v.counter, \ -+ (new).v.counter)) != _old.v.counter);\ -+ _old; \ -+}) -+ -+static inline struct bucket_array *__bucket_array(struct bch_dev *ca, -+ bool gc) ++static inline void bucket_unlock(struct bucket *b) +{ -+ return rcu_dereference_check(ca->buckets[gc], ++ smp_store_release(&b->lock, 0); ++} ++ ++static inline void bucket_lock(struct bucket *b) ++{ ++ while (xchg(&b->lock, 1)) ++ cpu_relax(); ++} ++ ++static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca) ++{ ++ return rcu_dereference_check(ca->buckets_gc, + !ca->fs || + percpu_rwsem_is_held(&ca->fs->mark_lock) || + lockdep_is_held(&ca->fs->gc_lock) || + lockdep_is_held(&ca->bucket_lock)); +} + -+static inline struct bucket_array *bucket_array(struct bch_dev *ca) ++static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) +{ -+ return __bucket_array(ca, false); -+} -+ -+static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc) -+{ -+ struct bucket_array *buckets = __bucket_array(ca, gc); ++ struct bucket_array *buckets = gc_bucket_array(ca); + + BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets); + return buckets->b + b; +} + -+static inline struct bucket *bucket(struct bch_dev *ca, size_t b) ++static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) +{ -+ return __bucket(ca, b, false); ++ return rcu_dereference_check(ca->bucket_gens, ++ !ca->fs || ++ percpu_rwsem_is_held(&ca->fs->mark_lock) || ++ lockdep_is_held(&ca->fs->gc_lock) || ++ lockdep_is_held(&ca->bucket_lock)); +} + -+/* -+ * bucket_gc_gen() 
returns the difference between the bucket's current gen and -+ * the oldest gen of any pointer into that bucket in the btree. -+ */ -+ -+static inline u8 bucket_gc_gen(struct bucket *g) ++static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) +{ -+ return g->mark.gen - g->oldest_gen; ++ struct bucket_gens *gens = bucket_gens(ca); ++ ++ BUG_ON(b < gens->first_bucket || b >= gens->nbuckets); ++ return gens->b + b; +} + +static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, @@ -29042,11 +30501,18 @@ index 000000000000..5ed9441cb115 + return sector_to_bucket(ca, ptr->offset); +} + -+static inline struct bucket *PTR_BUCKET(struct bch_dev *ca, -+ const struct bch_extent_ptr *ptr, -+ bool gc) ++static inline struct bpos PTR_BUCKET_POS(const struct bch_fs *c, ++ const struct bch_extent_ptr *ptr) +{ -+ return __bucket(ca, PTR_BUCKET_NR(ca, ptr), gc); ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); ++} ++ ++static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca, ++ const struct bch_extent_ptr *ptr) ++{ ++ return gc_bucket(ca, PTR_BUCKET_NR(ca, ptr)); +} + +static inline enum bch_data_type ptr_data_type(const struct bkey *k, @@ -29059,18 +30525,6 @@ index 000000000000..5ed9441cb115 + return ptr->cached ? 
BCH_DATA_cached : BCH_DATA_user; +} + -+static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca, -+ const struct bch_extent_ptr *ptr) -+{ -+ struct bucket_mark m; -+ -+ rcu_read_lock(); -+ m = READ_ONCE(PTR_BUCKET(ca, ptr, 0)->mark); -+ rcu_read_unlock(); -+ -+ return m; -+} -+ +static inline int gen_cmp(u8 a, u8 b) +{ + return (s8) (a - b); @@ -29090,26 +30544,13 @@ index 000000000000..5ed9441cb115 +static inline u8 ptr_stale(struct bch_dev *ca, + const struct bch_extent_ptr *ptr) +{ -+ return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen); -+} ++ u8 ret; + -+/* bucket gc marks */ ++ rcu_read_lock(); ++ ret = gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen); ++ rcu_read_unlock(); + -+static inline unsigned bucket_sectors_used(struct bucket_mark mark) -+{ -+ return mark.dirty_sectors + mark.cached_sectors; -+} -+ -+static inline bool is_available_bucket(struct bucket_mark mark) -+{ -+ return !mark.dirty_sectors && !mark.stripe; -+} -+ -+static inline bool bucket_needs_journal_commit(struct bucket_mark m, -+ u16 last_seq_ondisk) -+{ -+ return m.journal_seq_valid && -+ ((s16) m.journal_seq - (s16) last_seq_ondisk > 0); ++ return ret; +} + +/* Device usage: */ @@ -29117,50 +30558,50 @@ index 000000000000..5ed9441cb115 +struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); + +static inline u64 __dev_buckets_available(struct bch_dev *ca, -+ struct bch_dev_usage stats) ++ struct bch_dev_usage stats, ++ enum alloc_reserve reserve) +{ -+ u64 total = ca->mi.nbuckets - ca->mi.first_bucket; ++ s64 total = ca->mi.nbuckets - ca->mi.first_bucket; ++ s64 reserved = 0; ++ ++ switch (reserve) { ++ case RESERVE_none: ++ reserved += ca->mi.nbuckets >> 6; ++ fallthrough; ++ case RESERVE_movinggc: ++ reserved += ca->nr_btree_reserve; ++ fallthrough; ++ case RESERVE_btree: ++ reserved += ca->nr_btree_reserve; ++ fallthrough; ++ case RESERVE_btree_movinggc: ++ break; ++ default: ++ BUG(); ++ } + + if (WARN_ONCE(stats.buckets_unavailable > total, + 
"buckets_unavailable overflow (%llu > %llu)\n", + stats.buckets_unavailable, total)) + return 0; + -+ return total - stats.buckets_unavailable; ++ return max_t(s64, 0, ++ total - ++ stats.buckets_unavailable - ++ ca->nr_open_buckets - ++ reserved); +} + -+static inline u64 dev_buckets_available(struct bch_dev *ca) ++static inline u64 dev_buckets_available(struct bch_dev *ca, ++ enum alloc_reserve reserve) +{ -+ return __dev_buckets_available(ca, bch2_dev_usage_read(ca)); -+} -+ -+static inline u64 __dev_buckets_reclaimable(struct bch_dev *ca, -+ struct bch_dev_usage stats) -+{ -+ struct bch_fs *c = ca->fs; -+ s64 available = __dev_buckets_available(ca, stats); -+ unsigned i; -+ -+ spin_lock(&c->freelist_lock); -+ for (i = 0; i < RESERVE_NR; i++) -+ available -= fifo_used(&ca->free[i]); -+ available -= fifo_used(&ca->free_inc); -+ available -= ca->nr_open_buckets; -+ spin_unlock(&c->freelist_lock); -+ -+ return max(available, 0LL); -+} -+ -+static inline u64 dev_buckets_reclaimable(struct bch_dev *ca) -+{ -+ return __dev_buckets_reclaimable(ca, bch2_dev_usage_read(ca)); ++ return __dev_buckets_available(ca, bch2_dev_usage_read(ca), reserve); +} + +/* Filesystem usage: */ + +static inline unsigned fs_usage_u64s(struct bch_fs *c) +{ -+ + return sizeof(struct bch_fs_usage) / sizeof(u64) + + READ_ONCE(c->replicas.nr); +} @@ -29186,22 +30627,55 @@ index 000000000000..5ed9441cb115 + +/* key/bucket marking: */ + -+void bch2_bucket_seq_cleanup(struct bch_fs *); +void bch2_fs_usage_initialize(struct bch_fs *); + -+void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool); +void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, + size_t, enum bch_data_type, unsigned, + struct gc_pos, unsigned); + -+int bch2_mark_key(struct btree_trans *, struct bkey_s_c, unsigned); ++int bch2_mark_alloc(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); ++int bch2_mark_extent(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); ++int 
bch2_mark_stripe(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); ++int bch2_mark_inode(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); ++int bch2_mark_reservation(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); ++int bch2_mark_reflink_p(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); + -+int bch2_mark_update(struct btree_trans *, struct btree_path *, -+ struct bkey_i *, unsigned); ++int bch2_trans_mark_extent(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); ++int bch2_trans_mark_stripe(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); ++int bch2_trans_mark_inode(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); ++int bch2_trans_mark_reservation(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); ++int bch2_trans_mark_reflink_p(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); ++ ++int bch2_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); + +int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, -+ struct bkey_s_c, unsigned); -+void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); ++ struct bkey_i *, unsigned); ++ ++static inline int bch2_trans_mark_old(struct btree_trans *trans, ++ struct bkey_s_c old, unsigned flags) ++{ ++ struct bkey_i deleted; ++ ++ bkey_init(&deleted.k); ++ deleted.k.p = old.k->p; ++ ++ return bch2_trans_mark_key(trans, old, &deleted, ++ BTREE_TRIGGER_OVERWRITE|flags); ++} ++ ++static inline int bch2_trans_mark_new(struct btree_trans *trans, ++ struct bkey_i *new, unsigned flags) ++{ ++ struct bkey_i deleted; ++ ++ bkey_init(&deleted.k); ++ deleted.k.p = new->k.p; ++ ++ return bch2_trans_mark_key(trans, bkey_i_to_s_c(&deleted), new, ++ BTREE_TRIGGER_INSERT|flags); ++} ++ ++int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); + +int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct 
bch_dev *, + size_t, enum bch_data_type, unsigned); @@ -29259,10 +30733,10 @@ index 000000000000..5ed9441cb115 +#endif /* _BUCKETS_H */ diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h new file mode 100644 -index 000000000000..b2de2995c5e7 +index 000000000000..e79a33795bf9 --- /dev/null +++ b/fs/bcachefs/buckets_types.h -@@ -0,0 +1,124 @@ +@@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BUCKETS_TYPES_H +#define _BUCKETS_TYPES_H @@ -29272,42 +30746,15 @@ index 000000000000..b2de2995c5e7 + +#define BUCKET_JOURNAL_SEQ_BITS 16 + -+struct bucket_mark { -+ union { -+ atomic64_t v; -+ -+ struct { -+ u8 gen; -+ u8 data_type:3, -+ owned_by_allocator:1, -+ journal_seq_valid:1, -+ stripe:1; -+ u16 dirty_sectors; -+ u16 cached_sectors; -+ -+ /* -+ * low bits of journal sequence number when this bucket was most -+ * recently modified: if journal_seq_valid is set, this bucket can't be -+ * reused until the journal sequence number written to disk is >= the -+ * bucket's journal sequence number: -+ */ -+ u16 journal_seq; -+ }; -+ }; -+}; -+ +struct bucket { -+ union { -+ struct bucket_mark _mark; -+ const struct bucket_mark mark; -+ }; -+ -+ u64 io_time[2]; -+ u8 oldest_gen; -+ u8 gc_gen; -+ unsigned gen_valid:1; -+ u8 stripe_redundancy; -+ u32 stripe; ++ u8 lock; ++ u8 gen_valid:1; ++ u8 data_type:7; ++ u8 gen; ++ u8 stripe_redundancy; ++ u32 stripe; ++ u32 dirty_sectors; ++ u32 cached_sectors; +}; + +struct bucket_array { @@ -29317,6 +30764,13 @@ index 000000000000..b2de2995c5e7 + struct bucket b[]; +}; + ++struct bucket_gens { ++ struct rcu_head rcu; ++ u16 first_bucket; ++ size_t nbuckets; ++ u8 b[]; ++}; ++ +struct bch_dev_usage { + u64 buckets_ec; + u64 buckets_unavailable; @@ -29379,7 +30833,7 @@ index 000000000000..b2de2995c5e7 + u8 dev; + u8 gen; + u8 replicas; -+ u16 fragmentation; ++ u32 fragmentation; + u32 sectors; + u64 offset; +}; @@ -29387,12 +30841,235 @@ index 000000000000..b2de2995c5e7 +typedef HEAP(struct 
copygc_heap_entry) copygc_heap; + +#endif /* _BUCKETS_TYPES_H */ +diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c +new file mode 100644 +index 000000000000..2e5b955080de +--- /dev/null ++++ b/fs/bcachefs/buckets_waiting_for_journal.c +@@ -0,0 +1,167 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "buckets_waiting_for_journal.h" ++#include ++ ++static inline struct bucket_hashed * ++bucket_hash(struct buckets_waiting_for_journal_table *t, ++ unsigned hash_seed_idx, u64 dev_bucket) ++{ ++ unsigned h = siphash_1u64(dev_bucket, &t->hash_seeds[hash_seed_idx]); ++ ++ BUG_ON(!is_power_of_2(t->size)); ++ ++ return t->d + (h & (t->size - 1)); ++} ++ ++static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t size) ++{ ++ unsigned i; ++ ++ t->size = size; ++ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) ++ get_random_bytes(&t->hash_seeds[i], sizeof(t->hash_seeds[i])); ++ memset(t->d, 0, sizeof(t->d[0]) * size); ++} ++ ++bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, ++ u64 flushed_seq, ++ unsigned dev, u64 bucket) ++{ ++ struct buckets_waiting_for_journal_table *t; ++ u64 dev_bucket = (u64) dev << 56 | bucket; ++ bool ret = false; ++ unsigned i; ++ ++ mutex_lock(&b->lock); ++ t = b->t; ++ ++ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { ++ struct bucket_hashed *h = bucket_hash(t, i, dev_bucket); ++ ++ if (h->dev_bucket == dev_bucket) { ++ ret = h->journal_seq > flushed_seq; ++ break; ++ } ++ } ++ ++ mutex_unlock(&b->lock); ++ ++ return ret; ++} ++ ++static bool bucket_table_insert(struct buckets_waiting_for_journal_table *t, ++ struct bucket_hashed *new, ++ u64 flushed_seq) ++{ ++ struct bucket_hashed *last_evicted = NULL; ++ unsigned tries, i; ++ ++ for (tries = 0; tries < 10; tries++) { ++ struct bucket_hashed *old, *victim = NULL; ++ ++ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { ++ old = bucket_hash(t, i, new->dev_bucket); ++ 
++ if (old->dev_bucket == new->dev_bucket || ++ old->journal_seq <= flushed_seq) { ++ *old = *new; ++ return true; ++ } ++ ++ if (last_evicted != old) ++ victim = old; ++ } ++ ++ /* hashed to same slot 3 times: */ ++ if (!victim) ++ break; ++ ++ /* Failed to find an empty slot: */ ++ swap(*new, *victim); ++ last_evicted = victim; ++ } ++ ++ return false; ++} ++ ++int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, ++ u64 flushed_seq, ++ unsigned dev, u64 bucket, ++ u64 journal_seq) ++{ ++ struct buckets_waiting_for_journal_table *t, *n; ++ struct bucket_hashed tmp, new = { ++ .dev_bucket = (u64) dev << 56 | bucket, ++ .journal_seq = journal_seq, ++ }; ++ size_t i, new_size, nr_elements = 1, nr_rehashes = 0; ++ int ret = 0; ++ ++ mutex_lock(&b->lock); ++ ++ if (likely(bucket_table_insert(b->t, &new, flushed_seq))) ++ goto out; ++ ++ t = b->t; ++ for (i = 0; i < t->size; i++) ++ nr_elements += t->d[i].journal_seq > flushed_seq; ++ ++ new_size = nr_elements < t->size / 3 ? 
t->size : t->size * 2; ++ ++ n = kvmalloc(sizeof(*n) + sizeof(n->d[0]) * new_size, GFP_KERNEL); ++ if (!n) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++retry_rehash: ++ nr_rehashes++; ++ bucket_table_init(n, new_size); ++ ++ tmp = new; ++ BUG_ON(!bucket_table_insert(n, &tmp, flushed_seq)); ++ ++ for (i = 0; i < t->size; i++) { ++ if (t->d[i].journal_seq <= flushed_seq) ++ continue; ++ ++ tmp = t->d[i]; ++ if (!bucket_table_insert(n, &tmp, flushed_seq)) ++ goto retry_rehash; ++ } ++ ++ b->t = n; ++ kvfree(t); ++ ++ pr_debug("took %zu rehashes, table at %zu/%zu elements", ++ nr_rehashes, nr_elements, b->t->size); ++out: ++ mutex_unlock(&b->lock); ++ ++ return ret; ++} ++ ++void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c) ++{ ++ struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal; ++ ++ kvfree(b->t); ++} ++ ++#define INITIAL_TABLE_SIZE 8 ++ ++int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c) ++{ ++ struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal; ++ ++ mutex_init(&b->lock); ++ ++ b->t = kvmalloc(sizeof(*b->t) + sizeof(b->t->d[0]) * INITIAL_TABLE_SIZE, GFP_KERNEL); ++ if (!b->t) ++ return -ENOMEM; ++ ++ bucket_table_init(b->t, INITIAL_TABLE_SIZE); ++ return 0; ++} +diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h +new file mode 100644 +index 000000000000..d2ae19cbe18c +--- /dev/null ++++ b/fs/bcachefs/buckets_waiting_for_journal.h +@@ -0,0 +1,15 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BUCKETS_WAITING_FOR_JOURNAL_H ++#define _BUCKETS_WAITING_FOR_JOURNAL_H ++ ++#include "buckets_waiting_for_journal_types.h" ++ ++bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, ++ u64, unsigned, u64); ++int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, ++ u64, unsigned, u64, u64); ++ ++void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *); ++int bch2_fs_buckets_waiting_for_journal_init(struct 
bch_fs *); ++ ++#endif /* _BUCKETS_WAITING_FOR_JOURNAL_H */ +diff --git a/fs/bcachefs/buckets_waiting_for_journal_types.h b/fs/bcachefs/buckets_waiting_for_journal_types.h +new file mode 100644 +index 000000000000..fea7f944d0ed +--- /dev/null ++++ b/fs/bcachefs/buckets_waiting_for_journal_types.h +@@ -0,0 +1,23 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H ++#define _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H ++ ++#include ++ ++struct bucket_hashed { ++ u64 dev_bucket; ++ u64 journal_seq; ++}; ++ ++struct buckets_waiting_for_journal_table { ++ size_t size; ++ siphash_key_t hash_seeds[3]; ++ struct bucket_hashed d[]; ++}; ++ ++struct buckets_waiting_for_journal { ++ struct mutex lock; ++ struct buckets_waiting_for_journal_table *t; ++}; ++ ++#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */ diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c new file mode 100644 -index 000000000000..db68a78276cf +index 000000000000..aa26588ed5ed --- /dev/null +++ b/fs/bcachefs/chardev.c -@@ -0,0 +1,758 @@ +@@ -0,0 +1,761 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_CHARDEV + @@ -29963,8 +31640,11 @@ index 000000000000..db68a78276cf + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + ++ if (!dev) ++ return -EINVAL; ++ + for_each_online_member(ca, c, i) -+ if (ca->disk_sb.bdev->bd_dev == dev) { ++ if (ca->dev == dev) { + percpu_ref_put(&ca->io_ref); + return i; + } @@ -30190,10 +31870,10 @@ index 000000000000..3a4890d39ff9 +#endif /* _BCACHEFS_CHARDEV_H */ diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c new file mode 100644 -index 000000000000..fbe8603cfb30 +index 000000000000..425582f60d7a --- /dev/null +++ b/fs/bcachefs/checksum.c -@@ -0,0 +1,653 @@ +@@ -0,0 +1,665 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "checksum.h" @@ -30289,9 +31969,9 @@ index 000000000000..fbe8603cfb30 + } +} + -+static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm, -+ struct nonce nonce, -+ 
struct scatterlist *sg, size_t len) ++static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm, ++ struct nonce nonce, ++ struct scatterlist *sg, size_t len) +{ + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); + int ret; @@ -30300,17 +31980,20 @@ index 000000000000..fbe8603cfb30 + skcipher_request_set_crypt(req, sg, sg, len, nonce.d); + + ret = crypto_skcipher_encrypt(req); -+ BUG_ON(ret); ++ if (ret) ++ pr_err("got error %i from crypto_skcipher_encrypt()", ret); ++ ++ return ret; +} + -+static inline void do_encrypt(struct crypto_sync_skcipher *tfm, ++static inline int do_encrypt(struct crypto_sync_skcipher *tfm, + struct nonce nonce, + void *buf, size_t len) +{ + struct scatterlist sg; + + sg_init_one(&sg, buf, len); -+ do_encrypt_sg(tfm, nonce, &sg, len); ++ return do_encrypt_sg(tfm, nonce, &sg, len); +} + +int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, @@ -30332,25 +32015,29 @@ index 000000000000..fbe8603cfb30 + goto err; + } + -+ do_encrypt(chacha20, nonce, buf, len); ++ ret = do_encrypt(chacha20, nonce, buf, len); +err: + crypto_free_sync_skcipher(chacha20); + return ret; +} + -+static void gen_poly_key(struct bch_fs *c, struct shash_desc *desc, -+ struct nonce nonce) ++static int gen_poly_key(struct bch_fs *c, struct shash_desc *desc, ++ struct nonce nonce) +{ + u8 key[POLY1305_KEY_SIZE]; ++ int ret; + + nonce.d[3] ^= BCH_NONCE_POLY; + + memset(key, 0, sizeof(key)); -+ do_encrypt(c->chacha20, nonce, key, sizeof(key)); ++ ret = do_encrypt(c->chacha20, nonce, key, sizeof(key)); ++ if (ret) ++ return ret; + + desc->tfm = c->poly1305; + crypto_shash_init(desc); + crypto_shash_update(desc, key, sizeof(key)); ++ return 0; +} + +struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, @@ -30392,13 +32079,13 @@ index 000000000000..fbe8603cfb30 + } +} + -+void bch2_encrypt(struct bch_fs *c, unsigned type, ++int bch2_encrypt(struct bch_fs *c, unsigned type, + struct nonce nonce, void *data, size_t len) +{ + if 
(!bch2_csum_type_is_encryption(type)) -+ return; ++ return 0; + -+ do_encrypt(c->chacha20, nonce, data, len); ++ return do_encrypt(c->chacha20, nonce, data, len); +} + +static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, @@ -30473,23 +32160,27 @@ index 000000000000..fbe8603cfb30 + return __bch2_checksum_bio(c, type, nonce, bio, &iter); +} + -+void bch2_encrypt_bio(struct bch_fs *c, unsigned type, -+ struct nonce nonce, struct bio *bio) ++int bch2_encrypt_bio(struct bch_fs *c, unsigned type, ++ struct nonce nonce, struct bio *bio) +{ + struct bio_vec bv; + struct bvec_iter iter; + struct scatterlist sgl[16], *sg = sgl; + size_t bytes = 0; ++ int ret = 0; + + if (!bch2_csum_type_is_encryption(type)) -+ return; ++ return 0; + + sg_init_table(sgl, ARRAY_SIZE(sgl)); + + bio_for_each_segment(bv, bio, iter) { + if (sg == sgl + ARRAY_SIZE(sgl)) { + sg_mark_end(sg - 1); -+ do_encrypt_sg(c->chacha20, nonce, sgl, bytes); ++ ++ ret = do_encrypt_sg(c->chacha20, nonce, sgl, bytes); ++ if (ret) ++ return ret; + + nonce = nonce_add(nonce, bytes); + bytes = 0; @@ -30503,7 +32194,7 @@ index 000000000000..fbe8603cfb30 + } + + sg_mark_end(sg - 1); -+ do_encrypt_sg(c->chacha20, nonce, sgl, bytes); ++ return do_encrypt_sg(c->chacha20, nonce, sgl, bytes); +} + +struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, @@ -30603,16 +32294,12 @@ index 000000000000..fbe8603cfb30 +} + +#ifdef __KERNEL__ -+int bch2_request_key(struct bch_sb *sb, struct bch_key *key) ++static int __bch2_request_key(char *key_description, struct bch_key *key) +{ -+ char key_description[60]; + struct key *keyring_key; + const struct user_key_payload *ukp; + int ret; + -+ snprintf(key_description, sizeof(key_description), -+ "bcachefs:%pUb", &sb->user_uuid); -+ + keyring_key = request_key(&key_type_logon, key_description, NULL); + if (IS_ERR(keyring_key)) + return PTR_ERR(keyring_key); @@ -30632,16 +32319,10 @@ index 000000000000..fbe8603cfb30 +} +#else +#include -+#include + 
-+int bch2_request_key(struct bch_sb *sb, struct bch_key *key) ++static int __bch2_request_key(char *key_description, struct bch_key *key) +{ + key_serial_t key_id; -+ char key_description[60]; -+ char uuid[40]; -+ -+ uuid_unparse_lower(sb->user_uuid.b, uuid); -+ sprintf(key_description, "bcachefs:%s", uuid); + + key_id = request_key("user", key_description, NULL, + KEY_SPEC_USER_KEYRING); @@ -30655,6 +32336,17 @@ index 000000000000..fbe8603cfb30 +} +#endif + ++int bch2_request_key(struct bch_sb *sb, struct bch_key *key) ++{ ++ char key_description[60]; ++ char uuid[40]; ++ ++ uuid_unparse_lower(sb->user_uuid.b, uuid); ++ sprintf(key_description, "bcachefs:%s", uuid); ++ ++ return __bch2_request_key(key_description, key); ++} ++ +int bch2_decrypt_sb_key(struct bch_fs *c, + struct bch_sb_field_crypt *crypt, + struct bch_key *key) @@ -30849,7 +32541,7 @@ index 000000000000..fbe8603cfb30 +} diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h new file mode 100644 -index 000000000000..f5c1a609c5c4 +index 000000000000..c86c3c05d620 --- /dev/null +++ b/fs/bcachefs/checksum.h @@ -0,0 +1,204 @@ @@ -30904,7 +32596,7 @@ index 000000000000..f5c1a609c5c4 +int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); +int bch2_request_key(struct bch_sb *, struct bch_key *); + -+void bch2_encrypt(struct bch_fs *, unsigned, struct nonce, ++int bch2_encrypt(struct bch_fs *, unsigned, struct nonce, + void *data, size_t); + +struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned, @@ -30916,8 +32608,8 @@ index 000000000000..f5c1a609c5c4 + struct bch_extent_crc_unpacked *, + unsigned, unsigned, unsigned); + -+void bch2_encrypt_bio(struct bch_fs *, unsigned, -+ struct nonce, struct bio *); ++int bch2_encrypt_bio(struct bch_fs *, unsigned, ++ struct nonce, struct bio *); + +int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, + struct bch_key *); @@ -31343,10 +33035,10 @@ index 000000000000..5fae0012d808 +#endif /* _BCACHEFS_CLOCK_TYPES_H */ 
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c new file mode 100644 -index 000000000000..f63651d291e5 +index 000000000000..482fcff93b62 --- /dev/null +++ b/fs/bcachefs/compress.c -@@ -0,0 +1,640 @@ +@@ -0,0 +1,641 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "checksum.h" @@ -31375,7 +33067,7 @@ index 000000000000..f63651d291e5 +{ + void *b; + -+ BUG_ON(size > c->sb.encoded_extent_max << 9); ++ BUG_ON(size > c->opts.encoded_extent_max); + + b = kmalloc(size, GFP_NOIO|__GFP_NOWARN); + if (b) @@ -31417,7 +33109,7 @@ index 000000000000..f63651d291e5 + struct page **pages = NULL; + void *data; + -+ BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max); ++ BUG_ON(start.bi_size > c->opts.encoded_extent_max); + + if (!PageHighMem(bio_iter_page(bio, start)) && + bio_phys_contig(bio, start)) @@ -31548,6 +33240,8 @@ index 000000000000..f63651d291e5 + workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO); + ctx = ZSTD_initDCtx(workspace, ZSTD_DCtxWorkspaceBound()); + ++ src_len = le32_to_cpup(src_data.b); ++ + ret = ZSTD_decompressDCtx(ctx, + dst_data, dst_len, + src_data.b + 4, real_src_len); @@ -31580,8 +33274,8 @@ index 000000000000..f63651d291e5 + BUG_ON(!bio->bi_vcnt); + BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs); + -+ if (crc->uncompressed_size > c->sb.encoded_extent_max || -+ crc->compressed_size > c->sb.encoded_extent_max) { ++ if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max || ++ crc->compressed_size << 9 > c->opts.encoded_extent_max) { + bch_err(c, "error rewriting existing data: extent too big"); + return -EIO; + } @@ -31621,8 +33315,8 @@ index 000000000000..f63651d291e5 + size_t dst_len = crc.uncompressed_size << 9; + int ret = -ENOMEM; + -+ if (crc.uncompressed_size > c->sb.encoded_extent_max || -+ crc.compressed_size > c->sb.encoded_extent_max) ++ if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max || ++ crc.compressed_size << 9 > c->opts.encoded_extent_max) + 
return -EIO; + + dst_data = dst_len == dst_iter.bi_size @@ -31725,7 +33419,7 @@ index 000000000000..f63651d291e5 + BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type])); + + /* If it's only one block, don't bother trying to compress: */ -+ if (bio_sectors(src) <= c->opts.block_size) ++ if (src->bi_iter.bi_size <= c->opts.block_size) + return 0; + + dst_data = bio_map_or_bounce(c, dst, WRITE); @@ -31815,7 +33509,7 @@ index 000000000000..f63651d291e5 + + /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */ + src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size, -+ c->sb.encoded_extent_max << 9); ++ c->opts.encoded_extent_max); + /* Don't generate a bigger output than input: */ + dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); + @@ -31893,10 +33587,9 @@ index 000000000000..f63651d291e5 + +static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) +{ -+ size_t max_extent = c->sb.encoded_extent_max << 9; + size_t decompress_workspace_size = 0; + bool decompress_workspace_needed; -+ ZSTD_parameters params = ZSTD_getParams(0, max_extent, 0); ++ ZSTD_parameters params = ZSTD_getParams(0, c->opts.encoded_extent_max, 0); + struct { + unsigned feature; + unsigned type; @@ -31928,14 +33621,14 @@ index 000000000000..f63651d291e5 + + if (!mempool_initialized(&c->compression_bounce[READ])) { + ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], -+ 1, max_extent); ++ 1, c->opts.encoded_extent_max); + if (ret) + goto out; + } + + if (!mempool_initialized(&c->compression_bounce[WRITE])) { + ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], -+ 1, max_extent); ++ 1, c->opts.encoded_extent_max); + if (ret) + goto out; + } @@ -32011,12 +33704,94 @@ index 000000000000..4bab1f61b3b5 +int bch2_fs_compress_init(struct bch_fs *); + +#endif /* _BCACHEFS_COMPRESS_H */ +diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h +new file mode 100644 +index 000000000000..745b1cdb0d17 +--- /dev/null ++++ 
b/fs/bcachefs/darray.h +@@ -0,0 +1,76 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_DARRAY_H ++#define _BCACHEFS_DARRAY_H ++ ++/* ++ * Dynamic arrays: ++ * ++ * Inspired by CCAN's darray ++ */ ++ ++#include "util.h" ++#include ++ ++#define DARRAY(type) \ ++struct { \ ++ size_t nr, size; \ ++ type *data; \ ++} ++ ++typedef DARRAY(void) darray_void; ++ ++static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more) ++{ ++ if (d->nr + more > d->size) { ++ size_t new_size = roundup_pow_of_two(d->nr + more); ++ void *data = krealloc_array(d->data, new_size, t_size, GFP_KERNEL); ++ ++ if (!data) ++ return -ENOMEM; ++ ++ d->data = data; ++ d->size = new_size; ++ } ++ ++ return 0; ++} ++ ++#define darray_make_room(_d, _more) \ ++ __darray_make_room((darray_void *) &(_d), sizeof((_d).data[0]), (_more)) ++ ++#define darray_top(_d) ((_d).data[(_d).nr]) ++ ++#define darray_push(_d, _item) \ ++({ \ ++ int _ret = darray_make_room((_d), 1); \ ++ \ ++ if (!_ret) \ ++ (_d).data[(_d).nr++] = (_item); \ ++ _ret; \ ++}) ++ ++#define darray_insert_item(_d, _pos, _item) \ ++({ \ ++ int _ret = darray_make_room((_d), 1); \ ++ \ ++ if (!_ret) \ ++ array_insert_item((_d).data, (_d).nr, (_pos), (_item)); \ ++ _ret; \ ++}) ++ ++#define darray_for_each(_d, _i) \ ++ for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++) ++ ++#define darray_init(_d) \ ++do { \ ++ (_d).data = NULL; \ ++ (_d).nr = (_d).size = 0; \ ++} while (0) ++ ++#define darray_exit(_d) \ ++do { \ ++ kfree((_d).data); \ ++ darray_init(_d); \ ++} while (0) ++ ++#endif /* _BCACHEFS_DARRAY_H */ diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c new file mode 100644 -index 000000000000..294e4baf4deb +index 000000000000..2d65ae370931 --- /dev/null +++ b/fs/bcachefs/debug.c -@@ -0,0 +1,476 @@ +@@ -0,0 +1,628 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Assorted bcachefs debug code @@ -32188,10 +33963,11 @@ index 000000000000..294e4baf4deb + failed |= bch2_btree_verify_replica(c, b, p); + + if 
(failed) { -+ char buf[200]; ++ struct printbuf buf = PRINTBUF; + -+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key)); -+ bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); ++ bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf.buf); ++ printbuf_exit(&buf); + } +out: + mutex_unlock(&c->verify_lock); @@ -32203,12 +33979,12 @@ index 000000000000..294e4baf4deb +/* XXX: bch_fs refcounting */ + +struct dump_iter { -+ struct bpos from; -+ struct bch_fs *c; ++ struct bch_fs *c; + enum btree_id id; ++ struct bpos from; ++ u64 iter; + -+ char buf[1 << 12]; -+ size_t bytes; /* what's currently in buf */ ++ struct printbuf buf; + + char __user *ubuf; /* destination user buffer */ + size_t size; /* size of requested read */ @@ -32217,9 +33993,9 @@ index 000000000000..294e4baf4deb + +static int flush_buf(struct dump_iter *i) +{ -+ if (i->bytes) { -+ size_t bytes = min(i->bytes, i->size); -+ int err = copy_to_user(i->ubuf, i->buf, bytes); ++ if (i->buf.pos) { ++ size_t bytes = min_t(size_t, i->buf.pos, i->size); ++ int err = copy_to_user(i->ubuf, i->buf.buf, bytes); + + if (err) + return err; @@ -32227,8 +34003,8 @@ index 000000000000..294e4baf4deb + i->ret += bytes; + i->ubuf += bytes; + i->size -= bytes; -+ i->bytes -= bytes; -+ memmove(i->buf, i->buf + bytes, i->bytes); ++ i->buf.pos -= bytes; ++ memmove(i->buf.buf, i->buf.buf + bytes, i->buf.pos); + } + + return 0; @@ -32245,15 +34021,20 @@ index 000000000000..294e4baf4deb + + file->private_data = i; + i->from = POS_MIN; ++ i->iter = 0; + i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]); + i->id = bd->id; ++ i->buf = PRINTBUF; + + return 0; +} + +static int bch2_dump_release(struct inode *inode, struct file *file) +{ -+ kfree(file->private_data); ++ struct dump_iter *i = file->private_data; ++ ++ printbuf_exit(&i->buf); ++ kfree(i); + return 0; +} + @@ -32285,11 +34066,8 @@ index 000000000000..294e4baf4deb + k = 
bch2_btree_iter_peek(&iter); + + while (k.k && !(err = bkey_err(k))) { -+ bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k); -+ i->bytes = strlen(i->buf); -+ BUG_ON(i->bytes >= sizeof(i->buf)); -+ i->buf[i->bytes] = '\n'; -+ i->bytes++; ++ bch2_bkey_val_to_text(&i->buf, i->c, k); ++ pr_char(&i->buf, '\n'); + + k = bch2_btree_iter_next(&iter); + i->from = iter.pos; @@ -32338,8 +34116,7 @@ index 000000000000..294e4baf4deb + bch2_trans_init(&trans, i->c, 0, 0); + + for_each_btree_node(&trans, iter, i->id, i->from, 0, b, err) { -+ bch2_btree_node_to_text(&PBUF(i->buf), i->c, b); -+ i->bytes = strlen(i->buf); ++ bch2_btree_node_to_text(&i->buf, i->c, b); + err = flush_buf(i); + if (err) + break; @@ -32392,7 +34169,9 @@ index 000000000000..294e4baf4deb + + bch2_trans_init(&trans, i->c, 0, 0); + -+ bch2_trans_iter_init(&trans, &iter, i->id, i->from, BTREE_ITER_PREFETCH); ++ bch2_trans_iter_init(&trans, &iter, i->id, i->from, ++ BTREE_ITER_PREFETCH| ++ BTREE_ITER_ALL_SNAPSHOTS); + + while ((k = bch2_btree_iter_peek(&iter)).k && + !(err = bkey_err(k))) { @@ -32401,16 +34180,14 @@ index 000000000000..294e4baf4deb + bch2_btree_node_iter_peek(&l->iter, l->b); + + if (l->b != prev_node) { -+ bch2_btree_node_to_text(&PBUF(i->buf), i->c, l->b); -+ i->bytes = strlen(i->buf); ++ bch2_btree_node_to_text(&i->buf, i->c, l->b); + err = flush_buf(i); + if (err) + break; + } + prev_node = l->b; + -+ bch2_bfloat_to_text(&PBUF(i->buf), l->b, _k); -+ i->bytes = strlen(i->buf); ++ bch2_bfloat_to_text(&i->buf, l->b, _k); + err = flush_buf(i); + if (err) + break; @@ -32425,6 +34202,8 @@ index 000000000000..294e4baf4deb + if (!i->size) + break; + } ++ bch2_trans_iter_exit(&trans, &iter); ++ + bch2_trans_exit(&trans); + + return err < 0 ? 
err : i->ret; @@ -32437,10 +34216,148 @@ index 000000000000..294e4baf4deb + .read = bch2_read_bfloat_failed, +}; + ++static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *c, ++ struct btree *b) ++{ ++ out->tabstops[0] = 32; ++ ++ pr_buf(out, "%px btree=%s l=%u ", ++ b, ++ bch2_btree_ids[b->c.btree_id], ++ b->c.level); ++ pr_newline(out); ++ ++ pr_indent_push(out, 2); ++ ++ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); ++ pr_newline(out); ++ ++ pr_buf(out, "flags: "); ++ pr_tab(out); ++ bch2_flags_to_text(out, bch2_btree_node_flags, b->flags); ++ pr_newline(out); ++ ++ pr_buf(out, "written:"); ++ pr_tab(out); ++ pr_buf(out, "%u", b->written); ++ pr_newline(out); ++ ++ pr_buf(out, "writes blocked:"); ++ pr_tab(out); ++ pr_buf(out, "%u", !list_empty_careful(&b->write_blocked)); ++ pr_newline(out); ++ ++ pr_buf(out, "will make reachable:"); ++ pr_tab(out); ++ pr_buf(out, "%lx", b->will_make_reachable); ++ pr_newline(out); ++ ++ pr_buf(out, "journal pin %px:", &b->writes[0].journal); ++ pr_tab(out); ++ pr_buf(out, "%llu", b->writes[0].journal.seq); ++ pr_newline(out); ++ ++ pr_buf(out, "journal pin %px:", &b->writes[1].journal); ++ pr_tab(out); ++ pr_buf(out, "%llu", b->writes[1].journal.seq); ++ pr_newline(out); ++ ++ pr_indent_pop(out, 2); ++} ++ ++static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct bch_fs *c = i->c; ++ bool done = false; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ do { ++ struct bucket_table *tbl; ++ struct rhash_head *pos; ++ struct btree *b; ++ ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size) ++ break; ++ ++ rcu_read_lock(); ++ i->buf.atomic++; ++ tbl = rht_dereference_rcu(c->btree_cache.table.tbl, ++ &c->btree_cache.table); ++ if (i->iter < tbl->size) { ++ rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash) ++ bch2_cached_btree_node_to_text(&i->buf, 
c, b); ++ i->iter++;; ++ } else { ++ done = true; ++ } ++ --i->buf.atomic; ++ rcu_read_unlock(); ++ } while (!done); ++ ++ if (i->buf.allocation_failure) ++ return -ENOMEM; ++ ++ return i->ret; ++} ++ ++static const struct file_operations cached_btree_nodes_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_dump_open, ++ .release = bch2_dump_release, ++ .read = bch2_cached_btree_nodes_read, ++}; ++ ++static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct bch_fs *c = i->c; ++ bool done = false; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ do { ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size) ++ break; ++ ++ done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter); ++ i->iter++; ++ } while (!done); ++ ++ if (i->buf.allocation_failure) ++ return -ENOMEM; ++ ++ return i->ret; ++} ++ ++static const struct file_operations journal_pins_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_dump_open, ++ .release = bch2_dump_release, ++ .read = bch2_journal_pins_read, ++}; ++ +void bch2_fs_debug_exit(struct bch_fs *c) +{ -+ if (!IS_ERR_OR_NULL(c->debug)) -+ debugfs_remove_recursive(c->debug); ++ if (!IS_ERR_OR_NULL(c->fs_debug_dir)) ++ debugfs_remove_recursive(c->fs_debug_dir); +} + +void bch2_fs_debug_init(struct bch_fs *c) @@ -32452,29 +34369,39 @@ index 000000000000..294e4baf4deb + return; + + snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); -+ c->debug = debugfs_create_dir(name, bch_debug); -+ if (IS_ERR_OR_NULL(c->debug)) ++ c->fs_debug_dir = debugfs_create_dir(name, bch_debug); ++ if (IS_ERR_OR_NULL(c->fs_debug_dir)) ++ return; ++ ++ debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir, ++ c->btree_debug, &cached_btree_nodes_ops); ++ ++ debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, ++ c->btree_debug, &journal_pins_ops); ++ ++ c->btree_debug_dir = debugfs_create_dir("btrees", 
c->fs_debug_dir); ++ if (IS_ERR_OR_NULL(c->btree_debug_dir)) + return; + + for (bd = c->btree_debug; + bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); + bd++) { + bd->id = bd - c->btree_debug; -+ bd->btree = debugfs_create_file(bch2_btree_ids[bd->id], -+ 0400, c->debug, bd, -+ &btree_debug_ops); ++ debugfs_create_file(bch2_btree_ids[bd->id], ++ 0400, c->btree_debug_dir, bd, ++ &btree_debug_ops); + + snprintf(name, sizeof(name), "%s-formats", + bch2_btree_ids[bd->id]); + -+ bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd, -+ &btree_format_debug_ops); ++ debugfs_create_file(name, 0400, c->btree_debug_dir, bd, ++ &btree_format_debug_ops); + + snprintf(name, sizeof(name), "%s-bfloat-failed", + bch2_btree_ids[bd->id]); + -+ bd->failed = debugfs_create_file(name, 0400, c->debug, bd, -+ &bfloat_failed_debug_ops); ++ debugfs_create_file(name, 0400, c->btree_debug_dir, bd, ++ &bfloat_failed_debug_ops); + } +} + @@ -32531,10 +34458,10 @@ index 000000000000..0b86736e5e1b +#endif /* _BCACHEFS_DEBUG_H */ diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c new file mode 100644 -index 000000000000..fe4a85a6a8cb +index 000000000000..760e4f74715f --- /dev/null +++ b/fs/bcachefs/dirent.c -@@ -0,0 +1,546 @@ +@@ -0,0 +1,545 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -32659,9 +34586,9 @@ index 000000000000..fe4a85a6a8cb +{ + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + -+ bch_scnmemcpy(out, d.v->d_name, -+ bch2_dirent_name_bytes(d)); -+ pr_buf(out, " -> %llu type %s", ++ pr_buf(out, "%.*s -> %llu type %s", ++ bch2_dirent_name_bytes(d), ++ d.v->d_name, + d.v->d_type != DT_SUBVOL + ? 
le64_to_cpu(d.v->d_inum) + : le32_to_cpu(d.v->d_child_subvol), @@ -32767,7 +34694,7 @@ index 000000000000..fe4a85a6a8cb +{ + struct btree_iter src_iter = { NULL }; + struct btree_iter dst_iter = { NULL }; -+ struct bkey_s_c old_src, old_dst; ++ struct bkey_s_c old_src, old_dst = bkey_s_c_null; + struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; + struct bpos dst_pos = + POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name)); @@ -32904,7 +34831,9 @@ index 000000000000..fe4a85a6a8cb + } + } + -+ bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); ++ ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); ++ if (ret) ++ goto out; +out_set_src: + + /* @@ -32921,7 +34850,9 @@ index 000000000000..fe4a85a6a8cb + src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE; + } + -+ bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); ++ ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); ++ if (ret) ++ goto out; + + if (mode == BCH_RENAME_EXCHANGE) + *src_offset = new_src->k.p.offset; @@ -33003,16 +34934,13 @@ index 000000000000..fe4a85a6a8cb + if (ret) + return ret; + -+ for_each_btree_key_norestart(trans, iter, BTREE_ID_dirents, -+ SPOS(dir.inum, 0, snapshot), 0, k, ret) { -+ if (k.k->p.inode > dir.inum) -+ break; -+ ++ for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents, ++ SPOS(dir.inum, 0, snapshot), ++ POS(dir.inum, U64_MAX), 0, k, ret) + if (k.k->type == KEY_TYPE_dirent) { + ret = -ENOTEMPTY; + break; + } -+ } + bch2_trans_iter_exit(trans, &iter); + + return ret; @@ -33036,11 +34964,9 @@ index 000000000000..fe4a85a6a8cb + if (ret) + goto err; + -+ for_each_btree_key_norestart(&trans, iter, BTREE_ID_dirents, -+ SPOS(inum.inum, ctx->pos, snapshot), 0, k, ret) { -+ if (k.k->p.inode > inum.inum) -+ break; -+ ++ for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_dirents, ++ SPOS(inum.inum, ctx->pos, snapshot), ++ POS(inum.inum, U64_MAX), 0, k, ret) { + if (k.k->type != KEY_TYPE_dirent) + continue; + @@ 
-33156,10 +35082,10 @@ index 000000000000..1bb4d802bc1d +#endif /* _BCACHEFS_DIRENT_H */ diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c new file mode 100644 -index 000000000000..c52b6faac9b4 +index 000000000000..81b41b07c24b --- /dev/null +++ b/fs/bcachefs/disk_groups.c -@@ -0,0 +1,486 @@ +@@ -0,0 +1,506 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "disk_groups.h" @@ -33179,24 +35105,20 @@ index 000000000000..c52b6faac9b4 + strncmp(l->label, r->label, sizeof(l->label)); +} + -+static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb, -+ struct bch_sb_field *f) ++static int bch2_sb_disk_groups_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) +{ + struct bch_sb_field_disk_groups *groups = + field_to_type(f, disk_groups); + struct bch_disk_group *g, *sorted = NULL; -+ struct bch_sb_field_members *mi; -+ struct bch_member *m; -+ unsigned i, nr_groups, len; -+ const char *err = NULL; ++ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); ++ unsigned nr_groups = disk_groups_nr(groups); ++ unsigned i, len; ++ int ret = -EINVAL; + -+ mi = bch2_sb_get_members(sb); -+ groups = bch2_sb_get_disk_groups(sb); -+ nr_groups = disk_groups_nr(groups); -+ -+ for (m = mi->members; -+ m < mi->members + sb->nr_devices; -+ m++) { ++ for (i = 0; i < sb->nr_devices; i++) { ++ struct bch_member *m = mi->members + i; + unsigned g; + + if (!BCH_MEMBER_GROUP(m)) @@ -33204,45 +35126,54 @@ index 000000000000..c52b6faac9b4 + + g = BCH_MEMBER_GROUP(m) - 1; + -+ if (g >= nr_groups || -+ BCH_GROUP_DELETED(&groups->entries[g])) -+ return "disk has invalid group"; ++ if (g >= nr_groups) { ++ pr_buf(err, "disk %u has invalid label %u (have %u)", ++ i, g, nr_groups); ++ return -EINVAL; ++ } ++ ++ if (BCH_GROUP_DELETED(&groups->entries[g])) { ++ pr_buf(err, "disk %u has deleted label %u", i, g); ++ return -EINVAL; ++ } + } + + if (!nr_groups) -+ return NULL; ++ return 0; ++ ++ for (i = 0; i < nr_groups; i++) { 
++ g = groups->entries + i; + -+ for (g = groups->entries; -+ g < groups->entries + nr_groups; -+ g++) { + if (BCH_GROUP_DELETED(g)) + continue; + + len = strnlen(g->label, sizeof(g->label)); + if (!len) { -+ err = "group with empty label"; -+ goto err; ++ pr_buf(err, "label %u empty", i); ++ return -EINVAL; + } + } + + sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL); + if (!sorted) -+ return "cannot allocate memory"; ++ return -ENOMEM; + + memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted)); + sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL); + -+ for (i = 0; i + 1 < nr_groups; i++) -+ if (!BCH_GROUP_DELETED(sorted + i) && -+ !group_cmp(sorted + i, sorted + i + 1)) { -+ err = "duplicate groups"; ++ for (g = sorted; g + 1 < sorted + nr_groups; g++) ++ if (!BCH_GROUP_DELETED(g) && ++ !group_cmp(&g[0], &g[1])) { ++ pr_buf(err, "duplicate label %llu.%.*s", ++ BCH_GROUP_PARENT(g), ++ (int) sizeof(g->label), g->label); + goto err; + } + -+ err = NULL; ++ ret = 0; +err: + kfree(sorted); -+ return err; ++ return 0; +} + +static void bch2_sb_disk_groups_to_text(struct printbuf *out, @@ -33500,12 +35431,10 @@ index 000000000000..c52b6faac9b4 + return v; +} + -+void bch2_disk_path_to_text(struct printbuf *out, -+ struct bch_sb_handle *sb, -+ unsigned v) ++void bch2_disk_path_to_text(struct printbuf *out, struct bch_sb *sb, unsigned v) +{ + struct bch_sb_field_disk_groups *groups = -+ bch2_sb_get_disk_groups(sb->sb); ++ bch2_sb_get_disk_groups(sb); + struct bch_disk_group *g; + unsigned nr = 0; + u16 path[32]; @@ -33534,15 +35463,13 @@ index 000000000000..c52b6faac9b4 + v = path[--nr]; + g = groups->entries + v; + -+ bch_scnmemcpy(out, g->label, -+ strnlen(g->label, sizeof(g->label))); -+ ++ pr_buf(out, "%.*s", (int) sizeof(g->label), g->label); + if (nr) + pr_buf(out, "."); + } + return; +inval: -+ pr_buf(out, "invalid group %u", v); ++ pr_buf(out, "invalid label %u", v); +} + +int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, 
const char *name) @@ -33606,7 +35533,10 @@ index 000000000000..c52b6faac9b4 + return -EINVAL; +} + -+void bch2_opt_target_to_text(struct printbuf *out, struct bch_fs *c, u64 v) ++void bch2_opt_target_to_text(struct printbuf *out, ++ struct bch_fs *c, ++ struct bch_sb *sb, ++ u64 v) +{ + struct target t = target_decode(v); + @@ -33614,33 +35544,49 @@ index 000000000000..c52b6faac9b4 + case TARGET_NULL: + pr_buf(out, "none"); + break; -+ case TARGET_DEV: { -+ struct bch_dev *ca; ++ case TARGET_DEV: ++ if (c) { ++ struct bch_dev *ca; + -+ rcu_read_lock(); -+ ca = t.dev < c->sb.nr_devices -+ ? rcu_dereference(c->devs[t.dev]) -+ : NULL; ++ rcu_read_lock(); ++ ca = t.dev < c->sb.nr_devices ++ ? rcu_dereference(c->devs[t.dev]) ++ : NULL; + -+ if (ca && percpu_ref_tryget(&ca->io_ref)) { -+ char b[BDEVNAME_SIZE]; ++ if (ca && percpu_ref_tryget(&ca->io_ref)) { ++ char b[BDEVNAME_SIZE]; + -+ pr_buf(out, "/dev/%s", -+ bdevname(ca->disk_sb.bdev, b)); -+ percpu_ref_put(&ca->io_ref); -+ } else if (ca) { -+ pr_buf(out, "offline device %u", t.dev); ++ pr_buf(out, "/dev/%s", ++ bdevname(ca->disk_sb.bdev, b)); ++ percpu_ref_put(&ca->io_ref); ++ } else if (ca) { ++ pr_buf(out, "offline device %u", t.dev); ++ } else { ++ pr_buf(out, "invalid device %u", t.dev); ++ } ++ ++ rcu_read_unlock(); + } else { -+ pr_buf(out, "invalid device %u", t.dev); -+ } ++ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); ++ struct bch_member *m = mi->members + t.dev; + -+ rcu_read_unlock(); ++ if (bch2_dev_exists(sb, mi, t.dev)) { ++ pr_buf(out, "Device "); ++ pr_uuid(out, m->uuid.b); ++ pr_buf(out, " (%u)", t.dev); ++ } else { ++ pr_buf(out, "Bad device %u", t.dev); ++ } ++ } + break; -+ } + case TARGET_GROUP: -+ mutex_lock(&c->sb_lock); -+ bch2_disk_path_to_text(out, &c->disk_sb, t.group); -+ mutex_unlock(&c->sb_lock); ++ if (c) { ++ mutex_lock(&c->sb_lock); ++ bch2_disk_path_to_text(out, c->disk_sb.sb, t.group); ++ mutex_unlock(&c->sb_lock); ++ } else { ++ bch2_disk_path_to_text(out, sb, 
t.group); ++ } + break; + default: + BUG(); @@ -33648,10 +35594,10 @@ index 000000000000..c52b6faac9b4 +} diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h new file mode 100644 -index 000000000000..3d84f23c34ed +index 000000000000..de915480514b --- /dev/null +++ b/fs/bcachefs/disk_groups.h -@@ -0,0 +1,91 @@ +@@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DISK_GROUPS_H +#define _BCACHEFS_DISK_GROUPS_H @@ -33729,11 +35675,10 @@ index 000000000000..3d84f23c34ed +/* Exported for userspace bcachefs-tools: */ +int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); + -+void bch2_disk_path_to_text(struct printbuf *, struct bch_sb_handle *, -+ unsigned); ++void bch2_disk_path_to_text(struct printbuf *, struct bch_sb *, unsigned); + +int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *); -+void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, u64); ++void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); + +int bch2_sb_disk_groups_to_cpu(struct bch_fs *); + @@ -33745,10 +35690,10 @@ index 000000000000..3d84f23c34ed +#endif /* _BCACHEFS_DISK_GROUPS_H */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c new file mode 100644 -index 000000000000..bca1b8a7b673 +index 000000000000..616a551265e0 --- /dev/null +++ b/fs/bcachefs/ec.c -@@ -0,0 +1,1780 @@ +@@ -0,0 +1,1682 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* erasure coding */ @@ -33766,6 +35711,7 @@ index 000000000000..bca1b8a7b673 +#include "io.h" +#include "keylist.h" +#include "recovery.h" ++#include "replicas.h" +#include "super-io.h" +#include "util.h" + @@ -33893,8 +35839,8 @@ index 000000000000..bca1b8a7b673 +} + +/* returns blocknr in stripe that we matched: */ -+static int bkey_matches_stripe(struct bch_stripe *s, -+ struct bkey_s_c k) ++static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s, ++ struct bkey_s_c k, unsigned *block) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 
+ const struct bch_extent_ptr *ptr; @@ -33903,10 +35849,12 @@ index 000000000000..bca1b8a7b673 + bkey_for_each_ptr(ptrs, ptr) + for (i = 0; i < nr_data; i++) + if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr, -+ le16_to_cpu(s->sectors))) -+ return i; ++ le16_to_cpu(s->sectors))) { ++ *block = i; ++ return ptr; ++ } + -+ return -1; ++ return NULL; +} + +static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) @@ -34034,14 +35982,15 @@ index 000000000000..bca1b8a7b673 + struct bch_csum got = ec_block_checksum(buf, i, offset); + + if (bch2_crc_cmp(want, got)) { -+ char buf2[200]; ++ struct printbuf buf2 = PRINTBUF; + -+ bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&buf->key.k_i)); ++ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key.k_i)); + + bch_err_ratelimited(c, + "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s", + (void *) _RET_IP_, i, j, v->csum_type, -+ want.lo, got.lo, buf2); ++ want.lo, got.lo, buf2.buf); ++ printbuf_exit(&buf2); + clear_bit(i, buf->valid); + break; + } @@ -34295,11 +36244,11 @@ index 000000000000..bca1b8a7b673 + free_heap(&n); + } + -+ if (!genradix_ptr_alloc(&c->stripes[0], idx, gfp)) ++ if (!genradix_ptr_alloc(&c->stripes, idx, gfp)) + return -ENOMEM; + + if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING && -+ !genradix_ptr_alloc(&c->stripes[1], idx, gfp)) ++ !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) + return -ENOMEM; + + return 0; @@ -34344,13 +36293,13 @@ index 000000000000..bca1b8a7b673 +{ + struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); + -+ genradix_ptr(&c->stripes[0], h->data[i].idx)->heap_idx = i; ++ genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i; +} + +static void heap_verify_backpointer(struct bch_fs *c, size_t idx) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; -+ struct stripe *m = genradix_ptr(&c->stripes[0], idx); ++ struct stripe *m = genradix_ptr(&c->stripes, idx); + + BUG_ON(!m->alive); + BUG_ON(m->heap_idx >= h->used); @@ -34425,7 +36374,7 @@ 
index 000000000000..bca1b8a7b673 + return bch2_btree_delete_range(c, BTREE_ID_stripes, + POS(0, idx), + POS(0, idx + 1), -+ NULL); ++ 0, NULL); +} + +static void ec_stripe_delete_work(struct work_struct *work) @@ -34442,7 +36391,7 @@ index 000000000000..bca1b8a7b673 + break; + } + -+ bch2_stripes_heap_del(c, genradix_ptr(&c->stripes[0], idx), idx); ++ bch2_stripes_heap_del(c, genradix_ptr(&c->stripes, idx), idx); + spin_unlock(&c->ec_stripes_heap_lock); + + if (ec_stripe_delete(c, idx)) @@ -34452,22 +36401,18 @@ index 000000000000..bca1b8a7b673 + +/* stripe creation: */ + -+static int ec_stripe_bkey_insert(struct bch_fs *c, ++static int ec_stripe_bkey_insert(struct btree_trans *trans, + struct bkey_i_stripe *stripe, + struct disk_reservation *res) +{ -+ struct btree_trans trans; ++ struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bpos min_pos = POS(0, 1); + struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); + int ret; + -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_stripes, start_pos, ++ for_each_btree_key(trans, iter, BTREE_ID_stripes, start_pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { + if (start_pos.offset) { @@ -34488,29 +36433,24 @@ index 000000000000..bca1b8a7b673 +found_slot: + start_pos = iter.pos; + -+ ret = ec_stripe_mem_alloc(&trans, &iter); ++ ret = ec_stripe_mem_alloc(trans, &iter); + if (ret) + goto err; + + stripe->k.p = iter.pos; + -+ ret = bch2_trans_update(&trans, &iter, &stripe->k_i, 0) ?: -+ bch2_trans_commit(&trans, res, NULL, -+ BTREE_INSERT_NOFAIL); ++ ret = bch2_trans_update(trans, &iter, &stripe->k_i, 0); ++ ++ c->ec_stripe_hint = start_pos.offset; +err: -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ if (ret == -EINTR) -+ goto retry; -+ -+ c->ec_stripe_hint = ret ? 
start_pos.offset : start_pos.offset + 1; -+ bch2_trans_exit(&trans); ++ bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +static int ec_stripe_bkey_update(struct btree_trans *trans, -+ struct bkey_i_stripe *new) ++ struct bkey_i_stripe *new, ++ struct disk_reservation *res) +{ + struct btree_iter iter; + struct bkey_s_c k; @@ -34593,6 +36533,7 @@ index 000000000000..bca1b8a7b673 + (k = bch2_btree_iter_peek(&iter)).k && + !(ret = bkey_err(k)) && + bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { ++ const struct bch_extent_ptr *ptr_c; + struct bch_extent_ptr *ptr, *ec_ptr = NULL; + + if (extent_has_stripe_ptr(k, s->key.k.p.offset)) { @@ -34600,8 +36541,12 @@ index 000000000000..bca1b8a7b673 + continue; + } + -+ block = bkey_matches_stripe(&s->key.v, k); -+ if (block < 0) { ++ ptr_c = bkey_matches_stripe(&s->key.v, k, &block); ++ /* ++ * It doesn't generally make sense to erasure code cached ptrs: ++ * XXX: should we be incrementing a counter? ++ */ ++ if (!ptr_c || ptr_c->cached) { + bch2_btree_iter_advance(&iter); + continue; + } @@ -34697,10 +36642,10 @@ index 000000000000..bca1b8a7b673 + goto err_put_writes; + } + -+ ret = s->have_existing_stripe -+ ? bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL, -+ ec_stripe_bkey_update(&trans, &s->new_stripe.key)) -+ : ec_stripe_bkey_insert(c, &s->new_stripe.key, &s->res); ++ ret = bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL, ++ s->have_existing_stripe ++ ? 
ec_stripe_bkey_update(&trans, &s->new_stripe.key, &s->res) ++ : ec_stripe_bkey_insert(&trans, &s->new_stripe.key, &s->res)); + if (ret) { + bch_err(c, "error creating stripe: error creating stripe key"); + goto err_put_writes; @@ -34715,7 +36660,7 @@ index 000000000000..bca1b8a7b673 + } + + spin_lock(&c->ec_stripes_heap_lock); -+ m = genradix_ptr(&c->stripes[0], s->new_stripe.key.k.p.offset); ++ m = genradix_ptr(&c->stripes, s->new_stripe.key.k.p.offset); + + BUG_ON(m->on_heap); + bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset); @@ -34815,7 +36760,7 @@ index 000000000000..bca1b8a7b673 + if (!ob) + return NULL; + -+ ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ ca = bch_dev_bkey_exists(c, ob->dev); + offset = ca->mi.bucket_size - ob->sectors_free; + + return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); @@ -34904,7 +36849,7 @@ index 000000000000..bca1b8a7b673 + s->v.algorithm = 0; + s->v.nr_blocks = nr_data + nr_parity; + s->v.nr_redundant = nr_parity; -+ s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max); ++ s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9); + s->v.csum_type = BCH_CSUM_crc32c; + s->v.pad = 0; + @@ -35023,16 +36968,15 @@ index 000000000000..bca1b8a7b673 + return h; +} + -+static enum bucket_alloc_ret -+new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, -+ struct closure *cl) ++static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, ++ struct closure *cl) +{ + struct bch_devs_mask devs = h->devs; + struct open_bucket *ob; + struct open_buckets buckets; + unsigned i, j, nr_have_parity = 0, nr_have_data = 0; + bool have_cache = true; -+ enum bucket_alloc_ret ret = ALLOC_SUCCESS; ++ int ret = 0; + + for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) { + if (test_bit(i, h->s->blocks_gotten)) { @@ -35047,9 +36991,6 @@ index 000000000000..bca1b8a7b673 + BUG_ON(nr_have_data > h->s->nr_data); + BUG_ON(nr_have_parity > h->s->nr_parity); + -+ 
percpu_down_read(&c->mark_lock); -+ rcu_read_lock(); -+ + buckets.nr = 0; + if (nr_have_parity < h->s->nr_parity) { + ret = bch2_bucket_alloc_set(c, &buckets, @@ -35059,8 +37000,8 @@ index 000000000000..bca1b8a7b673 + &nr_have_parity, + &have_cache, + h->copygc -+ ? RESERVE_MOVINGGC -+ : RESERVE_NONE, ++ ? RESERVE_movinggc ++ : RESERVE_none, + 0, + cl); + @@ -35071,12 +37012,12 @@ index 000000000000..bca1b8a7b673 + BUG_ON(j >= h->s->nr_data + h->s->nr_parity); + + h->s->blocks[j] = buckets.v[i]; -+ h->s->new_stripe.key.v.ptrs[j] = ob->ptr; ++ h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob); + __set_bit(j, h->s->blocks_gotten); + } + + if (ret) -+ goto err; ++ return ret; + } + + buckets.nr = 0; @@ -35088,8 +37029,8 @@ index 000000000000..bca1b8a7b673 + &nr_have_data, + &have_cache, + h->copygc -+ ? RESERVE_MOVINGGC -+ : RESERVE_NONE, ++ ? RESERVE_movinggc ++ : RESERVE_none, + 0, + cl); + @@ -35099,17 +37040,15 @@ index 000000000000..bca1b8a7b673 + BUG_ON(j >= h->s->nr_data); + + h->s->blocks[j] = buckets.v[i]; -+ h->s->new_stripe.key.v.ptrs[j] = ob->ptr; ++ h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob); + __set_bit(j, h->s->blocks_gotten); + } + + if (ret) -+ goto err; ++ return ret; + } -+err: -+ rcu_read_unlock(); -+ percpu_up_read(&c->mark_lock); -+ return ret; ++ ++ return 0; +} + +/* XXX: doesn't obey target: */ @@ -35132,7 +37071,7 @@ index 000000000000..bca1b8a7b673 + continue; + + stripe_idx = h->data[heap_idx].idx; -+ m = genradix_ptr(&c->stripes[0], stripe_idx); ++ m = genradix_ptr(&c->stripes, stripe_idx); + + if (m->algorithm == head->algo && + m->nr_redundant == head->redundancy && @@ -35267,7 +37206,7 @@ index 000000000000..bca1b8a7b673 + +err: + bch2_ec_stripe_head_put(c, h); -+ return ERR_PTR(-ret); ++ return ERR_PTR(ret); +} + +void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) @@ -35288,7 +37227,7 @@ index 000000000000..bca1b8a7b673 + continue; + + ob = c->open_buckets + h->s->blocks[i]; -+ if (ob->ptr.dev == ca->dev_idx) ++ if 
(ob->dev == ca->dev_idx) + goto found; + } + goto unlock; @@ -35306,151 +37245,59 @@ index 000000000000..bca1b8a7b673 + struct genradix_iter iter; + struct stripe *m; + -+ genradix_for_each(&c->stripes[0], iter, m) ++ genradix_for_each(&c->stripes, iter, m) + if (m->alive) + bch2_stripes_heap_insert(c, m, iter.pos); +} + -+static int __bch2_stripe_write_key(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct stripe *m, -+ size_t idx, -+ struct bkey_i_stripe *new_key) -+{ -+ const struct bch_stripe *v; -+ struct bkey_s_c k; -+ unsigned i; -+ int ret; -+ -+ bch2_btree_iter_set_pos(iter, POS(0, idx)); -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ return ret; -+ -+ if (k.k->type != KEY_TYPE_stripe) -+ return -EIO; -+ -+ v = bkey_s_c_to_stripe(k).v; -+ for (i = 0; i < v->nr_blocks; i++) -+ if (m->block_sectors[i] != stripe_blockcount_get(v, i)) -+ goto write; -+ return 0; -+write: -+ bkey_reassemble(&new_key->k_i, k); -+ -+ for (i = 0; i < new_key->v.nr_blocks; i++) -+ stripe_blockcount_set(&new_key->v, i, -+ m->block_sectors[i]); -+ -+ return bch2_trans_update(trans, iter, &new_key->k_i, 0); -+} -+ -+int bch2_stripes_write(struct bch_fs *c, unsigned flags) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct genradix_iter giter; -+ struct bkey_i_stripe *new_key; -+ struct stripe *m; -+ int ret = 0; -+ -+ new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL); -+ BUG_ON(!new_key); -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes, POS_MIN, -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ -+ genradix_for_each(&c->stripes[0], giter, m) { -+ if (!m->alive) -+ continue; -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL|flags, -+ __bch2_stripe_write_key(&trans, &iter, m, -+ giter.pos, new_key)); -+ -+ if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ bch2_trans_exit(&trans); -+ -+ kfree(new_key); -+ -+ return ret; -+} -+ -+static int 
bch2_stripes_read_fn(struct btree_trans *trans, struct bkey_s_c k) -+{ -+ struct bch_fs *c = trans->c; -+ int ret = 0; -+ -+ if (k.k->type == KEY_TYPE_stripe) -+ ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: -+ bch2_mark_key(trans, k, -+ BTREE_TRIGGER_NOATOMIC); -+ -+ return ret; -+} -+ +int bch2_stripes_read(struct bch_fs *c) +{ + struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ const struct bch_stripe *s; ++ struct stripe *m; ++ unsigned i; + int ret; + + bch2_trans_init(&trans, c, 0, 0); -+ ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_stripes, -+ bch2_stripes_read_fn); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ if (k.k->type != KEY_TYPE_stripe) ++ continue; ++ ++ ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); ++ if (ret) ++ break; ++ ++ s = bkey_s_c_to_stripe(k).v; ++ ++ m = genradix_ptr(&c->stripes, k.k->p.offset); ++ m->alive = true; ++ m->sectors = le16_to_cpu(s->sectors); ++ m->algorithm = s->algorithm; ++ m->nr_blocks = s->nr_blocks; ++ m->nr_redundant = s->nr_redundant; ++ m->blocks_nonempty = 0; ++ ++ for (i = 0; i < s->nr_blocks; i++) ++ m->blocks_nonempty += !!stripe_blockcount_get(s, i); ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ bch2_stripes_heap_update(c, m, k.k->p.offset); ++ spin_unlock(&c->ec_stripes_heap_lock); ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ + bch2_trans_exit(&trans); ++ + if (ret) + bch_err(c, "error reading stripes: %i", ret); + + return ret; +} + -+int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ size_t i, idx = 0; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes, POS(0, U64_MAX), 0); -+ -+ k = bch2_btree_iter_prev(&iter); -+ ret = bkey_err(k); -+ if (!ret && k.k) -+ idx = k.k->p.offset + 1; -+ -+ bch2_trans_iter_exit(&trans, &iter); -+ 
bch2_trans_exit(&trans); -+ if (ret) -+ return ret; -+ -+ if (!idx) -+ return 0; -+ -+ if (!gc && -+ !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx), -+ GFP_KERNEL)) -+ return -ENOMEM; -+#if 0 -+ ret = genradix_prealloc(&c->stripes[gc], idx, GFP_KERNEL); -+#else -+ for (i = 0; i < idx; i++) -+ if (!genradix_ptr_alloc(&c->stripes[gc], i, GFP_KERNEL)) -+ return -ENOMEM; -+#endif -+ return 0; -+} -+ +void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; @@ -35459,7 +37306,7 @@ index 000000000000..bca1b8a7b673 + + spin_lock(&c->ec_stripes_heap_lock); + for (i = 0; i < min_t(size_t, h->used, 20); i++) { -+ m = genradix_ptr(&c->stripes[0], h->data[i].idx); ++ m = genradix_ptr(&c->stripes, h->data[i].idx); + + pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx, + h->data[i].blocks_nonempty, @@ -35517,7 +37364,7 @@ index 000000000000..bca1b8a7b673 + BUG_ON(!list_empty(&c->ec_stripe_new_list)); + + free_heap(&c->ec_stripes_heap); -+ genradix_free(&c->stripes[0]); ++ genradix_free(&c->stripes); + bioset_exit(&c->ec_bioset); +} + @@ -35531,10 +37378,10 @@ index 000000000000..bca1b8a7b673 +} diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h new file mode 100644 -index 000000000000..eb16e140e2c8 +index 000000000000..9d508a2f3bbc --- /dev/null +++ b/fs/bcachefs/ec.h -@@ -0,0 +1,229 @@ +@@ -0,0 +1,228 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EC_H +#define _BCACHEFS_EC_H @@ -35551,6 +37398,8 @@ index 000000000000..eb16e140e2c8 + .key_invalid = bch2_stripe_invalid, \ + .val_to_text = bch2_stripe_to_text, \ + .swab = bch2_ptr_swab, \ ++ .trans_trigger = bch2_trans_mark_stripe, \ ++ .atomic_trigger = bch2_mark_stripe, \ +} + +static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) @@ -35645,7 +37494,7 @@ index 000000000000..eb16e140e2c8 + le16_to_cpu(s->sectors)); +} + -+static inline bool bch2_ptr_matches_stripe_m(const struct stripe *m, ++static inline bool 
bch2_ptr_matches_stripe_m(const struct gc_stripe *m, + struct extent_ptr_decoded p) +{ + unsigned nr_data = m->nr_blocks - m->nr_redundant; @@ -35753,9 +37602,6 @@ index 000000000000..eb16e140e2c8 +void bch2_stripes_heap_start(struct bch_fs *); + +int bch2_stripes_read(struct bch_fs *); -+int bch2_stripes_write(struct bch_fs *, unsigned); -+ -+int bch2_ec_mem_alloc(struct bch_fs *, bool); + +void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); +void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *); @@ -35766,10 +37612,10 @@ index 000000000000..eb16e140e2c8 +#endif /* _BCACHEFS_EC_H */ diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h new file mode 100644 -index 000000000000..3fc31222459a +index 000000000000..edd93da663c1 --- /dev/null +++ b/fs/bcachefs/ec_types.h -@@ -0,0 +1,37 @@ +@@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EC_TYPES_H +#define _BCACHEFS_EC_TYPES_H @@ -35793,6 +37639,15 @@ index 000000000000..3fc31222459a + unsigned alive:1; /* does a corresponding key exist in stripes btree? */ + unsigned on_heap:1; + u8 blocks_nonempty; ++}; ++ ++struct gc_stripe { ++ u16 sectors; ++ ++ u8 nr_blocks; ++ u8 nr_redundant; ++ ++ unsigned alive:1; /* does a corresponding key exist in stripes btree? 
*/ + u16 block_sectors[BCH_BKEY_PTRS_MAX]; + struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX]; + @@ -35807,9 +37662,27 @@ index 000000000000..3fc31222459a +typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap; + +#endif /* _BCACHEFS_EC_TYPES_H */ +diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h +new file mode 100644 +index 000000000000..f7d12915c1cc +--- /dev/null ++++ b/fs/bcachefs/errcode.h +@@ -0,0 +1,12 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ERRCODE_H ++#define _BCACHEFS_ERRCODE_H ++ ++enum { ++ /* Bucket allocator: */ ++ OPEN_BUCKETS_EMPTY = 2048, ++ FREELIST_EMPTY, /* Allocator thread not keeping up */ ++ INSUFFICIENT_DEVICES, ++}; ++ ++#endif /* _BCACHFES_ERRCODE_H */ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c new file mode 100644 -index 000000000000..2cea694575e9 +index 000000000000..8279a9ba76a5 --- /dev/null +++ b/fs/bcachefs/error.c @@ -0,0 +1,185 @@ @@ -35830,7 +37703,7 @@ index 000000000000..2cea694575e9 + return false; + case BCH_ON_ERROR_ro: + if (bch2_fs_emergency_read_only(c)) -+ bch_err(c, "emergency read only"); ++ bch_err(c, "inconsistency detected - emergency read only"); + return true; + case BCH_ON_ERROR_panic: + panic(bch2_fmt(c, "panic after error")); @@ -35850,7 +37723,7 @@ index 000000000000..2cea694575e9 +void bch2_fatal_error(struct bch_fs *c) +{ + if (bch2_fs_emergency_read_only(c)) -+ bch_err(c, "emergency read only"); ++ bch_err(c, "fatal error - emergency read only"); +} + +void bch2_io_error_work(struct work_struct *work) @@ -36000,10 +37873,10 @@ index 000000000000..2cea694575e9 +} diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h new file mode 100644 -index 000000000000..986938298adc +index 000000000000..6e63c38186f3 --- /dev/null +++ b/fs/bcachefs/error.h -@@ -0,0 +1,218 @@ +@@ -0,0 +1,238 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ERROR_H +#define _BCACHEFS_ERROR_H @@ -36045,7 +37918,7 @@ index 000000000000..986938298adc + +#define 
bch2_fs_inconsistent_on(cond, c, ...) \ +({ \ -+ int _ret = !!(cond); \ ++ bool _ret = unlikely(!!(cond)); \ + \ + if (_ret) \ + bch2_fs_inconsistent(c, __VA_ARGS__); \ @@ -36065,7 +37938,7 @@ index 000000000000..986938298adc + +#define bch2_dev_inconsistent_on(cond, ca, ...) \ +({ \ -+ int _ret = !!(cond); \ ++ bool _ret = unlikely(!!(cond)); \ + \ + if (_ret) \ + bch2_dev_inconsistent(ca, __VA_ARGS__); \ @@ -36073,6 +37946,26 @@ index 000000000000..986938298adc +}) + +/* ++ * When a transaction update discovers or is causing a fs inconsistency, it's ++ * helpful to also dump the pending updates: ++ */ ++#define bch2_trans_inconsistent(trans, ...) \ ++({ \ ++ bch_err(trans->c, __VA_ARGS__); \ ++ bch2_inconsistent_error(trans->c); \ ++ bch2_dump_trans_updates(trans); \ ++}) ++ ++#define bch2_trans_inconsistent_on(cond, trans, ...) \ ++({ \ ++ bool _ret = unlikely(!!(cond)); \ ++ \ ++ if (_ret) \ ++ bch2_trans_inconsistent(trans, __VA_ARGS__); \ ++ _ret; \ ++}) ++ ++/* + * Fsck errors: inconsistency errors we detect at mount time, and should ideally + * be able to repair: + */ @@ -36135,7 +38028,7 @@ index 000000000000..986938298adc +/* XXX: mark in superblock that filesystem contains errors, if we ignore: */ + +#define __fsck_err_on(cond, c, _flags, ...) \ -+ ((cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false) ++ (unlikely(cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false) + +#define need_fsck_err_on(cond, c, ...) \ + __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) @@ -36170,7 +38063,7 @@ index 000000000000..986938298adc + +#define bch2_fs_fatal_err_on(cond, c, ...) 
\ +({ \ -+ int _ret = !!(cond); \ ++ bool _ret = unlikely(!!(cond)); \ + \ + if (_ret) \ + bch2_fs_fatal_error(c, __VA_ARGS__); \ @@ -36224,10 +38117,10 @@ index 000000000000..986938298adc +#endif /* _BCACHEFS_ERROR_H */ diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c new file mode 100644 -index 000000000000..58b2c96f450c +index 000000000000..2fd5d9672a44 --- /dev/null +++ b/fs/bcachefs/extent_update.c -@@ -0,0 +1,169 @@ +@@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "btree_update.h" @@ -36245,17 +38138,26 @@ index 000000000000..58b2c96f450c +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; -+ unsigned ret = 0; ++ unsigned ret = 0, lru = 0; + + bkey_extent_entry_for_each(ptrs, entry) { + switch (__extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: ++ /* Might also be updating LRU btree */ ++ if (entry->ptr.cached) ++ lru++; ++ ++ fallthrough; + case BCH_EXTENT_ENTRY_stripe_ptr: + ret++; + } + } + -+ return ret; ++ /* ++ * Updating keys in the alloc btree may also update keys in the ++ * freespace or discard btrees: ++ */ ++ return lru + ret * 2; +} + +static int count_iters_for_insert(struct btree_trans *trans, @@ -36417,10 +38319,10 @@ index 000000000000..6f5cf449361a +#endif /* _BCACHEFS_EXTENT_UPDATE_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c new file mode 100644 -index 000000000000..89b5be907eea +index 000000000000..77a0d49a2372 --- /dev/null +++ b/fs/bcachefs/extents.c -@@ -0,0 +1,1249 @@ +@@ -0,0 +1,1259 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Kent Overstreet @@ -36726,7 +38628,7 @@ index 000000000000..89b5be907eea + + if (lp.crc.csum_type && + lp.crc.uncompressed_size + -+ rp.crc.uncompressed_size > c->sb.encoded_extent_max) ++ rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9)) + return false; + + if (lp.crc.uncompressed_size + rp.crc.uncompressed_size > @@ -37377,15 +39279,25 @@ index 
000000000000..89b5be907eea + switch (__extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + ptr = entry_to_ptr(entry); -+ ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] ++ ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] + ? bch_dev_bkey_exists(c, ptr->dev) + : NULL; + -+ pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev, -+ (u64) ptr->offset, ptr->gen, -+ ptr->cached ? " cached" : "", -+ ca && ptr_stale(ca, ptr) -+ ? " stale" : ""); ++ if (!ca) { ++ pr_buf(out, "ptr: %u:%llu gen %u%s", ptr->dev, ++ (u64) ptr->offset, ptr->gen, ++ ptr->cached ? " cached" : ""); ++ } else { ++ u32 offset; ++ u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); ++ ++ pr_buf(out, "ptr: %u:%llu:%u gen %u%s", ptr->dev, ++ b, offset, ptr->gen, ++ ptr->cached ? " cached" : ""); ++ ++ if (ca && ptr_stale(ca, ptr)) ++ pr_buf(out, " stale"); ++ } + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: @@ -37461,7 +39373,7 @@ index 000000000000..89b5be907eea + + if (k.k->type == KEY_TYPE_btree_ptr || + k.k->type == KEY_TYPE_btree_ptr_v2) -+ size_ondisk = c->opts.btree_node_size; ++ size_ondisk = btree_sectors(c); + + bkey_extent_entry_for_each(ptrs, entry) { + if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) @@ -37672,10 +39584,10 @@ index 000000000000..89b5be907eea +} diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h new file mode 100644 -index 000000000000..9c2567274a2b +index 000000000000..ae650849d98a --- /dev/null +++ b/fs/bcachefs/extents.h -@@ -0,0 +1,680 @@ +@@ -0,0 +1,688 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EXTENTS_H +#define _BCACHEFS_EXTENTS_H @@ -38059,6 +39971,8 @@ index 000000000000..9c2567274a2b + .key_invalid = bch2_btree_ptr_invalid, \ + .val_to_text = bch2_btree_ptr_to_text, \ + .swab = bch2_ptr_swab, \ ++ .trans_trigger = bch2_trans_mark_extent, \ ++ .atomic_trigger = bch2_mark_extent, \ +} + +#define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \ @@ -38066,6 +39980,8 @@ index 
000000000000..9c2567274a2b + .val_to_text = bch2_btree_ptr_v2_to_text, \ + .swab = bch2_ptr_swab, \ + .compat = bch2_btree_ptr_v2_compat, \ ++ .trans_trigger = bch2_trans_mark_extent, \ ++ .atomic_trigger = bch2_mark_extent, \ +} + +/* KEY_TYPE_extent: */ @@ -38080,6 +39996,8 @@ index 000000000000..9c2567274a2b + .swab = bch2_ptr_swab, \ + .key_normalize = bch2_extent_normalize, \ + .key_merge = bch2_extent_merge, \ ++ .trans_trigger = bch2_trans_mark_extent, \ ++ .atomic_trigger = bch2_mark_extent, \ +} + +/* KEY_TYPE_reservation: */ @@ -38092,6 +40010,8 @@ index 000000000000..9c2567274a2b + .key_invalid = bch2_reservation_invalid, \ + .val_to_text = bch2_reservation_to_text, \ + .key_merge = bch2_reservation_merge, \ ++ .trans_trigger = bch2_trans_mark_reservation, \ ++ .atomic_trigger = bch2_mark_reservation, \ +} + +/* Extent checksum entries: */ @@ -38404,10 +40324,10 @@ index 000000000000..43d6c341ecca +#endif /* _BCACHEFS_EXTENTS_TYPES_H */ diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h new file mode 100644 -index 000000000000..26d5cad7e6a5 +index 000000000000..05429c9631cd --- /dev/null +++ b/fs/bcachefs/eytzinger.h -@@ -0,0 +1,285 @@ +@@ -0,0 +1,281 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _EYTZINGER_H +#define _EYTZINGER_H @@ -38427,10 +40347,6 @@ index 000000000000..26d5cad7e6a5 + * + * With one based indexing each level of the tree starts at a power of two - + * good for cacheline alignment: -+ * -+ * Size parameter is treated as if we were using 0 based indexing, however: -+ * valid nodes, and inorder indices, are in the range [1..size) - that is, there -+ * are actually size - 1 elements + */ + +static inline unsigned eytzinger1_child(unsigned i, unsigned child) @@ -38452,12 +40368,12 @@ index 000000000000..26d5cad7e6a5 + +static inline unsigned eytzinger1_first(unsigned size) +{ -+ return rounddown_pow_of_two(size - 1); ++ return rounddown_pow_of_two(size); +} + +static inline unsigned eytzinger1_last(unsigned size) +{ -+ 
return rounddown_pow_of_two(size) - 1; ++ return rounddown_pow_of_two(size + 1) - 1; +} + +/* @@ -38472,13 +40388,13 @@ index 000000000000..26d5cad7e6a5 + +static inline unsigned eytzinger1_next(unsigned i, unsigned size) +{ -+ EBUG_ON(i >= size); ++ EBUG_ON(i > size); + -+ if (eytzinger1_right_child(i) < size) { ++ if (eytzinger1_right_child(i) <= size) { + i = eytzinger1_right_child(i); + -+ i <<= __fls(size) - __fls(i); -+ i >>= i >= size; ++ i <<= __fls(size + 1) - __fls(i); ++ i >>= i > size; + } else { + i >>= ffz(i) + 1; + } @@ -38488,14 +40404,14 @@ index 000000000000..26d5cad7e6a5 + +static inline unsigned eytzinger1_prev(unsigned i, unsigned size) +{ -+ EBUG_ON(i >= size); ++ EBUG_ON(i > size); + -+ if (eytzinger1_left_child(i) < size) { ++ if (eytzinger1_left_child(i) <= size) { + i = eytzinger1_left_child(i) + 1; + -+ i <<= __fls(size) - __fls(i); ++ i <<= __fls(size + 1) - __fls(i); + i -= 1; -+ i >>= i >= size; ++ i >>= i > size; + } else { + i >>= __ffs(i) + 1; + } @@ -38505,17 +40421,17 @@ index 000000000000..26d5cad7e6a5 + +static inline unsigned eytzinger1_extra(unsigned size) +{ -+ return (size - rounddown_pow_of_two(size - 1)) << 1; ++ return (size + 1 - rounddown_pow_of_two(size)) << 1; +} + +static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, + unsigned extra) +{ + unsigned b = __fls(i); -+ unsigned shift = __fls(size - 1) - b; ++ unsigned shift = __fls(size) - b; + int s; + -+ EBUG_ON(!i || i >= size); ++ EBUG_ON(!i || i > size); + + i ^= 1U << b; + i <<= 1; @@ -38540,7 +40456,7 @@ index 000000000000..26d5cad7e6a5 + unsigned shift; + int s; + -+ EBUG_ON(!i || i >= size); ++ EBUG_ON(!i || i > size); + + /* + * sign bit trick: @@ -38554,7 +40470,7 @@ index 000000000000..26d5cad7e6a5 + shift = __ffs(i); + + i >>= shift + 1; -+ i |= 1U << (__fls(size - 1) - shift); ++ i |= 1U << (__fls(size) - shift); + + return i; +} @@ -38595,39 +40511,39 @@ index 000000000000..26d5cad7e6a5 + +static inline unsigned 
eytzinger0_first(unsigned size) +{ -+ return eytzinger1_first(size + 1) - 1; ++ return eytzinger1_first(size) - 1; +} + +static inline unsigned eytzinger0_last(unsigned size) +{ -+ return eytzinger1_last(size + 1) - 1; ++ return eytzinger1_last(size) - 1; +} + +static inline unsigned eytzinger0_next(unsigned i, unsigned size) +{ -+ return eytzinger1_next(i + 1, size + 1) - 1; ++ return eytzinger1_next(i + 1, size) - 1; +} + +static inline unsigned eytzinger0_prev(unsigned i, unsigned size) +{ -+ return eytzinger1_prev(i + 1, size + 1) - 1; ++ return eytzinger1_prev(i + 1, size) - 1; +} + +static inline unsigned eytzinger0_extra(unsigned size) +{ -+ return eytzinger1_extra(size + 1); ++ return eytzinger1_extra(size); +} + +static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size, + unsigned extra) +{ -+ return __eytzinger1_to_inorder(i + 1, size + 1, extra) - 1; ++ return __eytzinger1_to_inorder(i + 1, size, extra) - 1; +} + +static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size, + unsigned extra) +{ -+ return __inorder_to_eytzinger1(i + 1, size + 1, extra) - 1; ++ return __inorder_to_eytzinger1(i + 1, size, extra) - 1; +} + +static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size) @@ -38828,10 +40744,10 @@ index 000000000000..cdb272708a4b +#endif /* _BCACHEFS_FIFO_H */ diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c new file mode 100644 -index 000000000000..5f3429e99115 +index 000000000000..d543480be111 --- /dev/null +++ b/fs/bcachefs/fs-common.c -@@ -0,0 +1,493 @@ +@@ -0,0 +1,494 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -39163,6 +41079,7 @@ index 000000000000..5f3429e99115 + bool ret = false; + + for (id = 0; id < Inode_opt_nr; id++) { ++ /* Skip attributes that were explicitly set on this inode */ + if (dst_u->bi_fields_set & (1 << id)) + continue; + @@ -39376,10 +41293,10 @@ index 000000000000..dde237859514 +#endif /* _BCACHEFS_FS_COMMON_H */ diff --git 
a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c new file mode 100644 -index 000000000000..25643c71ec91 +index 000000000000..4004fa5c1cc9 --- /dev/null +++ b/fs/bcachefs/fs-io.c -@@ -0,0 +1,3427 @@ +@@ -0,0 +1,3495 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + @@ -39799,6 +41716,110 @@ index 000000000000..25643c71ec91 + bv.bv_len >> 9, nr_ptrs, state); +} + ++static void mark_pagecache_unallocated(struct bch_inode_info *inode, ++ u64 start, u64 end) ++{ ++ pgoff_t index = start >> PAGE_SECTORS_SHIFT; ++ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; ++ struct pagevec pvec; ++ ++ if (end <= start) ++ return; ++ ++ pagevec_init(&pvec); ++ ++ do { ++ unsigned nr_pages, i, j; ++ ++ nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping, ++ &index, end_index); ++ for (i = 0; i < nr_pages; i++) { ++ struct page *page = pvec.pages[i]; ++ u64 pg_start = page->index << PAGE_SECTORS_SHIFT; ++ u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT; ++ unsigned pg_offset = max(start, pg_start) - pg_start; ++ unsigned pg_len = min(end, pg_end) - pg_offset - pg_start; ++ struct bch_page_state *s; ++ ++ BUG_ON(end <= pg_start); ++ BUG_ON(pg_offset >= PAGE_SECTORS); ++ BUG_ON(pg_offset + pg_len > PAGE_SECTORS); ++ ++ lock_page(page); ++ s = bch2_page_state(page); ++ ++ if (s) { ++ spin_lock(&s->lock); ++ for (j = pg_offset; j < pg_offset + pg_len; j++) ++ s->s[j].nr_replicas = 0; ++ spin_unlock(&s->lock); ++ } ++ ++ unlock_page(page); ++ } ++ pagevec_release(&pvec); ++ } while (index <= end_index); ++} ++ ++static void mark_pagecache_reserved(struct bch_inode_info *inode, ++ u64 start, u64 end) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ pgoff_t index = start >> PAGE_SECTORS_SHIFT; ++ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; ++ struct pagevec pvec; ++ s64 i_sectors_delta = 0; ++ ++ if (end <= start) ++ return; ++ ++ pagevec_init(&pvec); ++ ++ do { ++ unsigned nr_pages, i, j; ++ ++ nr_pages = pagevec_lookup_range(&pvec, 
inode->v.i_mapping, ++ &index, end_index); ++ for (i = 0; i < nr_pages; i++) { ++ struct page *page = pvec.pages[i]; ++ u64 pg_start = page->index << PAGE_SECTORS_SHIFT; ++ u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT; ++ unsigned pg_offset = max(start, pg_start) - pg_start; ++ unsigned pg_len = min(end, pg_end) - pg_offset - pg_start; ++ struct bch_page_state *s; ++ ++ BUG_ON(end <= pg_start); ++ BUG_ON(pg_offset >= PAGE_SECTORS); ++ BUG_ON(pg_offset + pg_len > PAGE_SECTORS); ++ ++ lock_page(page); ++ s = bch2_page_state(page); ++ ++ if (s) { ++ spin_lock(&s->lock); ++ for (j = pg_offset; j < pg_offset + pg_len; j++) ++ switch (s->s[j].state) { ++ case SECTOR_UNALLOCATED: ++ s->s[j].state = SECTOR_RESERVED; ++ break; ++ case SECTOR_DIRTY: ++ s->s[j].state = SECTOR_DIRTY_RESERVED; ++ i_sectors_delta--; ++ break; ++ default: ++ break; ++ } ++ spin_unlock(&s->lock); ++ } ++ ++ unlock_page(page); ++ } ++ pagevec_release(&pvec); ++ } while (index <= end_index); ++ ++ i_sectors_acct(c, inode, NULL, i_sectors_delta); ++} ++ +static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) +{ + /* XXX: this should not be open coded */ @@ -39954,8 +41975,7 @@ index 000000000000..25643c71ec91 + + bch2_disk_reservation_put(c, &disk_res); + -+ if (dirty_sectors) -+ i_sectors_acct(c, inode, NULL, dirty_sectors); ++ i_sectors_acct(c, inode, NULL, dirty_sectors); + + bch2_page_state_release(page); +} @@ -40003,8 +42023,7 @@ index 000000000000..25643c71ec91 + + spin_unlock(&s->lock); + -+ if (dirty_sectors) -+ i_sectors_acct(c, inode, &res->quota, dirty_sectors); ++ i_sectors_acct(c, inode, &res->quota, dirty_sectors); + + if (!PageDirty(page)) + __set_page_dirty_nobuffers(page); @@ -40304,7 +42323,7 @@ index 000000000000..25643c71ec91 + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), -+ BTREE_ITER_SLOTS|BTREE_ITER_FILTER_SNAPSHOTS); ++ BTREE_ITER_SLOTS); + while (1) { + struct 
bkey_s_c k; + unsigned bytes, sectors, offset_into_extent; @@ -40342,8 +42361,6 @@ index 000000000000..25643c71ec91 + + sectors = min(sectors, k.k->size - offset_into_extent); + -+ bch2_trans_unlock(trans); -+ + if (readpages_iter) + readpage_bio_extend(readpages_iter, &rbio->bio, sectors, + extent_partial_reads_expensive(k)); @@ -40560,7 +42577,7 @@ index 000000000000..25643c71ec91 + * racing with fallocate can cause us to add fewer sectors than + * expected - but we shouldn't add more sectors than expected: + */ -+ BUG_ON(io->op.i_sectors_delta > 0); ++ WARN_ON_ONCE(io->op.i_sectors_delta > 0); + + /* + * (error (due to going RO) halfway through a page can screw that up @@ -40746,8 +42763,8 @@ index 000000000000..25643c71ec91 + sectors << 9, offset << 9)); + + /* Check for writing past i_size: */ -+ WARN_ON((bio_end_sector(&w->io->op.wbio.bio) << 9) > -+ round_up(i_size, block_bytes(c))); ++ WARN_ON_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > ++ round_up(i_size, block_bytes(c))); + + w->io->op.res.sectors += reserved_sectors; + w->io->op.i_sectors_delta -= dirty_sectors; @@ -41201,7 +43218,7 @@ index 000000000000..25643c71ec91 + iter->count -= shorten; + + bio = bio_alloc_bioset(GFP_KERNEL, -+ iov_iter_npages(iter, BIO_MAX_VECS), ++ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), + &c->dio_read_bioset); + + bio->bi_end_io = bch2_direct_IO_read_endio; @@ -41236,7 +43253,7 @@ index 000000000000..25643c71ec91 + goto start; + while (iter->count) { + bio = bio_alloc_bioset(GFP_KERNEL, -+ iov_iter_npages(iter, BIO_MAX_VECS), ++ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), + &c->bio_read); + bio->bi_end_io = bch2_direct_IO_read_split_endio; +start: @@ -41383,7 +43400,7 @@ index 000000000000..25643c71ec91 + while (1) { + iter_count = dio->iter.count; + -+ if (kthread) ++ if (kthread && dio->mm) + kthread_use_mm(dio->mm); + BUG_ON(current->faults_disabled_mapping); + current->faults_disabled_mapping = mapping; @@ -41393,7 +43410,7 @@ index 000000000000..25643c71ec91 + 
dropped_locks = fdm_dropped_locks(); + + current->faults_disabled_mapping = NULL; -+ if (kthread) ++ if (kthread && dio->mm) + kthread_unuse_mm(dio->mm); + + /* @@ -41586,9 +43603,7 @@ index 000000000000..25643c71ec91 + } + + bio = bio_alloc_bioset(GFP_KERNEL, -+ iov_iter_is_bvec(iter) -+ ? 0 -+ : iov_iter_npages(iter, BIO_MAX_VECS), ++ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), + &c->dio_write_bioset); + dio = container_of(bio, struct dio_write, op.wbio.bio); + init_completion(&dio->done); @@ -41985,6 +44000,9 @@ index 000000000000..25643c71ec91 + U64_MAX, &i_sectors_delta); + i_sectors_acct(c, inode, NULL, i_sectors_delta); + ++ WARN_ON(!inode->v.i_size && inode->v.i_blocks && ++ !bch2_journal_error(&c->journal)); ++ + if (unlikely(ret)) + goto err; + @@ -42325,6 +44343,9 @@ index 000000000000..25643c71ec91 + ret = 0; + } + ++ bch2_trans_unlock(&trans); /* lock ordering, before taking pagecache locks: */ ++ mark_pagecache_reserved(inode, start_sector, iter.pos.offset); ++ + if (ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE)) { + struct quota_res quota_res = { 0 }; + s64 i_sectors_delta = 0; @@ -42430,43 +44451,6 @@ index 000000000000..25643c71ec91 + return ret; +} + -+static void mark_range_unallocated(struct bch_inode_info *inode, -+ loff_t start, loff_t end) -+{ -+ pgoff_t index = start >> PAGE_SHIFT; -+ pgoff_t end_index = (end - 1) >> PAGE_SHIFT; -+ struct pagevec pvec; -+ -+ pagevec_init(&pvec); -+ -+ do { -+ unsigned nr_pages, i, j; -+ -+ nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping, -+ &index, end_index); -+ if (nr_pages == 0) -+ break; -+ -+ for (i = 0; i < nr_pages; i++) { -+ struct page *page = pvec.pages[i]; -+ struct bch_page_state *s; -+ -+ lock_page(page); -+ s = bch2_page_state(page); -+ -+ if (s) { -+ spin_lock(&s->lock); -+ for (j = 0; j < PAGE_SECTORS; j++) -+ s->s[j].nr_replicas = 0; -+ spin_unlock(&s->lock); -+ } -+ -+ unlock_page(page); -+ } -+ pagevec_release(&pvec); -+ } while (index <= end_index); -+} -+ +loff_t 
bch2_remap_file_range(struct file *file_src, loff_t pos_src, + struct file *file_dst, loff_t pos_dst, + loff_t len, unsigned remap_flags) @@ -42512,7 +44496,8 @@ index 000000000000..25643c71ec91 + if (ret) + goto err; + -+ mark_range_unallocated(src, pos_src, pos_src + aligned_len); ++ mark_pagecache_unallocated(src, pos_src >> 9, ++ (pos_src + aligned_len) >> 9); + + ret = bch2_remap_range(c, + inode_inum(dst), pos_dst >> 9, @@ -43488,10 +45473,10 @@ index 000000000000..f201980ef2c3 +#endif /* _BCACHEFS_FS_IOCTL_H */ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c new file mode 100644 -index 000000000000..7eb33da9c253 +index 000000000000..d462c06899d6 --- /dev/null +++ b/fs/bcachefs/fs.c -@@ -0,0 +1,1935 @@ +@@ -0,0 +1,1940 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + @@ -43524,6 +45509,7 @@ index 000000000000..7eb33da9c253 +#include +#include +#include ++#include +#include +#include +#include @@ -43598,7 +45584,7 @@ index 000000000000..7eb33da9c253 + + bch2_assert_pos_locked(trans, BTREE_ID_inodes, + POS(0, bi->bi_inum), -+ 0 && c->opts.inodes_use_key_cache); ++ c->opts.inodes_use_key_cache); + + set_nlink(&inode->v, bch2_inode_nlink_get(bi)); + i_uid_write(&inode->v, bi->bi_uid); @@ -43628,7 +45614,6 @@ index 000000000000..7eb33da9c253 + int ret; + + bch2_trans_init(&trans, c, 0, 512); -+ trans.ip = _RET_IP_; +retry: + bch2_trans_begin(&trans); + @@ -44360,8 +46345,8 @@ index 000000000000..7eb33da9c253 + else + offset += p.crc.offset; + -+ if ((offset & (c->opts.block_size - 1)) || -+ (k.k->size & (c->opts.block_size - 1))) ++ if ((offset & (block_sectors(c) - 1)) || ++ (k.k->size & (block_sectors(c) - 1))) + flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; + + ret = fiemap_fill_next_extent(info, @@ -44428,9 +46413,9 @@ index 000000000000..7eb33da9c253 + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(ei->v.i_ino, start, snapshot), 0); + -+ while ((k = bch2_btree_iter_peek(&iter)).k && -+ !(ret = bkey_err(k)) && -+ bkey_cmp(iter.pos, end) 
< 0) { ++ while (!(ret = btree_trans_too_many_iters(&trans)) && ++ (k = bch2_btree_iter_peek_upto(&iter, end)).k && ++ !(ret = bkey_err(k))) { + enum btree_id data_btree = BTREE_ID_extents; + + if (!bkey_extent_is_data(k.k) && @@ -44966,12 +46951,12 @@ index 000000000000..7eb33da9c253 + KEY_TYPE_QUOTA_WARN); + bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, + KEY_TYPE_QUOTA_WARN); -+ bch2_inode_rm(c, inode_inum(inode), true); ++ bch2_inode_rm(c, inode_inum(inode)); + } +} + +void bch2_evict_subvolume_inodes(struct bch_fs *c, -+ struct snapshot_id_list *s) ++ snapshot_id_list *s) +{ + struct super_block *sb = c->vfs_sb; + struct inode *inode; @@ -45169,25 +47154,30 @@ index 000000000000..7eb33da9c253 +{ + struct bch_fs *c = root->d_sb->s_fs_info; + enum bch_opt_id i; -+ char buf[512]; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; + + for (i = 0; i < bch2_opts_nr; i++) { + const struct bch_option *opt = &bch2_opt_table[i]; + u64 v = bch2_opt_get_by_id(&c->opts, i); + -+ if (!(opt->mode & OPT_MOUNT)) ++ if (!(opt->flags & OPT_MOUNT)) + continue; + + if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) + continue; + -+ bch2_opt_to_text(&PBUF(buf), c, opt, v, ++ printbuf_reset(&buf); ++ bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v, + OPT_SHOW_MOUNT_STYLE); + seq_putc(seq, ','); -+ seq_puts(seq, buf); ++ seq_puts(seq, buf.buf); + } + -+ return 0; ++ if (buf.allocation_failure) ++ ret = -ENOMEM; ++ printbuf_exit(&buf); ++ return ret; +} + +static void bch2_put_super(struct super_block *sb) @@ -45429,7 +47419,7 @@ index 000000000000..7eb33da9c253 +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h new file mode 100644 -index 000000000000..b2211ec7f302 +index 000000000000..9f4b57e30e2a --- /dev/null +++ b/fs/bcachefs/fs.h @@ -0,0 +1,208 @@ @@ -45626,7 +47616,7 @@ index 000000000000..b2211ec7f302 + struct iattr *); +int __bch2_unlink(struct inode *, struct dentry *, bool); + -+void bch2_evict_subvolume_inodes(struct bch_fs *, struct 
snapshot_id_list *); ++void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *); + +void bch2_vfs_exit(void); +int bch2_vfs_init(void); @@ -45634,7 +47624,7 @@ index 000000000000..b2211ec7f302 +#else + +static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, -+ struct snapshot_id_list *s) {} ++ snapshot_id_list *s) {} +static inline void bch2_vfs_exit(void) {} +static inline int bch2_vfs_init(void) { return 0; } + @@ -45643,15 +47633,16 @@ index 000000000000..b2211ec7f302 +#endif /* _BCACHEFS_FS_H */ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c new file mode 100644 -index 000000000000..361dbf338023 +index 000000000000..2582ddf14803 --- /dev/null +++ b/fs/bcachefs/fsck.c -@@ -0,0 +1,2345 @@ +@@ -0,0 +1,2356 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey_buf.h" +#include "btree_update.h" ++#include "darray.h" +#include "dirent.h" +#include "error.h" +#include "fs-common.h" @@ -46120,11 +48111,11 @@ index 000000000000..361dbf338023 + pos.snapshot = snapshot_t(c, pos.snapshot)->equiv; + + if (bkey_cmp(s->pos, pos)) -+ s->nr = 0; ++ s->ids.nr = 0; + s->pos = pos; + + /* Might get called multiple times due to lock restarts */ -+ if (s->nr && s->d[s->nr - 1] == pos.snapshot) ++ if (s->ids.nr && s->ids.data[s->ids.nr - 1] == pos.snapshot) + return 0; + + return snapshots_seen_add(c, s, pos.snapshot); @@ -46147,7 +48138,7 @@ index 000000000000..361dbf338023 + ancestor = snapshot_t(c, ancestor)->equiv; + + /* @ancestor should be the snapshot most recently added to @seen */ -+ BUG_ON(!seen->nr || seen->d[seen->nr - 1] != ancestor); ++ BUG_ON(!seen->ids.nr || seen->ids.data[seen->ids.nr - 1] != ancestor); + BUG_ON(seen->pos.snapshot != ancestor); + + if (id == ancestor) @@ -46156,11 +48147,11 @@ index 000000000000..361dbf338023 + if (!bch2_snapshot_is_ancestor(c, id, ancestor)) + return false; + -+ for (i = seen->nr - 2; -+ i >= 0 && seen->d[i] >= id; ++ for (i = seen->ids.nr - 2; ++ i >= 0 && seen->ids.data[i] >= 
id; + --i) -+ if (bch2_snapshot_is_ancestor(c, id, seen->d[i]) && -+ bch2_snapshot_is_ancestor(c, seen->d[i], ancestor)) ++ if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]) && ++ bch2_snapshot_is_ancestor(c, seen->ids.data[i], ancestor)) + return false; + + return true; @@ -46186,26 +48177,25 @@ index 000000000000..361dbf338023 +} + +#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ -+ for (_i = (_w)->d; _i < (_w)->d + (_w)->nr && (_i)->snapshot <= (_snapshot); _i++)\ ++ for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && (_i)->snapshot <= (_snapshot); _i++)\ + if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot)) + ++struct inode_walker_entry { ++ struct bch_inode_unpacked inode; ++ u32 snapshot; ++ u64 count; ++}; ++ +struct inode_walker { + bool first_this_inode; + u64 cur_inum; + -+ size_t nr; -+ size_t size; -+ struct inode_walker_entry { -+ struct bch_inode_unpacked inode; -+ u32 snapshot; -+ u64 count; -+ } *d; ++ DARRAY(struct inode_walker_entry) inodes; +}; + +static void inode_walker_exit(struct inode_walker *w) +{ -+ kfree(w->d); -+ w->d = NULL; ++ darray_exit(w->inodes); +} + +static struct inode_walker inode_walker_init(void) @@ -46213,40 +48203,17 @@ index 000000000000..361dbf338023 + return (struct inode_walker) { 0, }; +} + -+static int inode_walker_realloc(struct inode_walker *w) -+{ -+ if (w->nr == w->size) { -+ size_t new_size = max_t(size_t, 8UL, w->size * 2); -+ void *d = krealloc(w->d, new_size * sizeof(w->d[0]), -+ GFP_KERNEL); -+ if (!d) -+ return -ENOMEM; -+ -+ w->d = d; -+ w->size = new_size; -+ } -+ -+ return 0; -+} -+ +static int add_inode(struct bch_fs *c, struct inode_walker *w, + struct bkey_s_c inode) +{ + struct bch_inode_unpacked u; -+ int ret; -+ -+ ret = inode_walker_realloc(w); -+ if (ret) -+ return ret; + + BUG_ON(bch2_inode_unpack(inode, &u)); + -+ w->d[w->nr++] = (struct inode_walker_entry) { ++ return darray_push(w->inodes, ((struct inode_walker_entry) { + .inode = u, + .snapshot 
= snapshot_t(c, inode.k->p.snapshot)->equiv, -+ }; -+ -+ return 0; ++ })); +} + +static int __walk_inode(struct btree_trans *trans, @@ -46265,7 +48232,7 @@ index 000000000000..361dbf338023 + goto lookup_snapshot; + } + -+ w->nr = 0; ++ w->inodes.nr = 0; + + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode), + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { @@ -46283,26 +48250,25 @@ index 000000000000..361dbf338023 + w->cur_inum = pos.inode; + w->first_this_inode = true; +lookup_snapshot: -+ for (i = 0; i < w->nr; i++) -+ if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->d[i].snapshot)) ++ for (i = 0; i < w->inodes.nr; i++) ++ if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->inodes.data[i].snapshot)) + goto found; + return INT_MAX; +found: -+ BUG_ON(pos.snapshot > w->d[i].snapshot); ++ BUG_ON(pos.snapshot > w->inodes.data[i].snapshot); + -+ if (pos.snapshot != w->d[i].snapshot) { ++ if (pos.snapshot != w->inodes.data[i].snapshot) { + ancestor_pos = i; + -+ while (i && w->d[i - 1].snapshot > pos.snapshot) ++ while (i && w->inodes.data[i - 1].snapshot > pos.snapshot) + --i; + -+ ret = inode_walker_realloc(w); ++ ret = darray_insert_item(w->inodes, i, w->inodes.data[ancestor_pos]); + if (ret) + return ret; + -+ array_insert_item(w->d, w->nr, i, w->d[ancestor_pos]); -+ w->d[i].snapshot = pos.snapshot; -+ w->d[i].count = 0; ++ w->inodes.data[i].snapshot = pos.snapshot; ++ w->inodes.data[i].count = 0; + } + + return i; @@ -46318,7 +48284,7 @@ index 000000000000..361dbf338023 + struct bkey_s_c k; + int ret; + -+ w->nr = 0; ++ w->inodes.nr = 0; + + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum), + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { @@ -46344,15 +48310,16 @@ index 000000000000..361dbf338023 + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; -+ char buf[200]; ++ struct printbuf buf = PRINTBUF; + int ret = 0; + + if (mustfix_fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c, + "key in missing snapshot: %s", -+ (bch2_bkey_val_to_text(&PBUF(buf), c, 
k), buf))) -+ return bch2_btree_delete_at(trans, iter, ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ++ ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1; +fsck_err: ++ printbuf_exit(&buf); + return ret; +} + @@ -46392,7 +48359,7 @@ index 000000000000..361dbf338023 +{ + struct bch_fs *c = trans->c; + struct btree_iter iter = { NULL }; -+ char buf[200]; ++ struct printbuf buf = PRINTBUF; + struct bkey_s_c k; + u64 hash; + int ret = 0; @@ -46416,8 +48383,9 @@ index 000000000000..361dbf338023 + if (fsck_err_on(k.k->type == desc.key_type && + !desc.cmp_bkey(k, hash_k), c, + "duplicate hash table keys:\n%s", -+ (bch2_bkey_val_to_text(&PBUF(buf), c, -+ hash_k), buf))) { ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, hash_k), ++ buf.buf))) { + ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1; + break; + } @@ -46428,13 +48396,16 @@ index 000000000000..361dbf338023 + } + + } ++out: + bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf); + return ret; +bad_hash: + if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, " + "hashed to %llu\n%s", + desc.btree_id, hash_k.k->p.inode, hash_k.k->p.offset, hash, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf)) == FSCK_ERR_IGNORE) ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf)) == FSCK_ERR_IGNORE) + return 0; + + ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); @@ -46442,9 +48413,9 @@ index 000000000000..361dbf338023 + bch_err(c, "hash_redo_key err %i", ret); + return ret; + } -+ return -EINTR; ++ ret = -EINTR; +fsck_err: -+ return ret; ++ goto out; +} + +static int check_inode(struct btree_trans *trans, @@ -46774,7 +48745,7 @@ index 000000000000..361dbf338023 + int ret = 0, ret2 = 0; + s64 count2; + -+ for (i = w->d; i < w->d + w->nr; i++) { ++ darray_for_each(w->inodes, i) { + if (i->inode.bi_sectors == i->count) + continue; + @@ -46812,32 +48783,34 @@ index 
000000000000..361dbf338023 + struct bch_fs *c = trans->c; + struct bkey_s_c k; + struct inode_walker_entry *i; -+ char buf[200]; ++ struct printbuf buf = PRINTBUF; + int ret = 0; + + k = bch2_btree_iter_peek(iter); + if (!k.k) -+ return 0; ++ goto out; + + ret = bkey_err(k); + if (ret) -+ return ret; ++ goto err; + + ret = check_key_has_snapshot(trans, iter, k); -+ if (ret) -+ return ret < 0 ? ret : 0; ++ if (ret) { ++ ret = ret < 0 ? ret : 0; ++ goto out; ++ } + + ret = snapshots_seen_update(c, s, k.k->p); + if (ret) -+ return ret; ++ goto err; + + if (k.k->type == KEY_TYPE_whiteout) -+ return 0; ++ goto out; + + if (inode->cur_inum != k.k->p.inode) { + ret = check_i_sectors(trans, inode); + if (ret) -+ return ret; ++ goto err; + } +#if 0 + if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { @@ -46847,33 +48820,43 @@ index 000000000000..361dbf338023 + bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); + bch2_bkey_val_to_text(&PBUF(buf2), c, k); + -+ if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) -+ return fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR; ++ if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) { ++ ret = fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR; ++ goto out; ++ } + } +#endif + ret = __walk_inode(trans, inode, k.k->p); + if (ret < 0) -+ return ret; ++ goto err; + + if (fsck_err_on(ret == INT_MAX, c, + "extent in missing inode:\n %s", -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) -+ return bch2_btree_delete_at(trans, iter, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ goto out; ++ } + -+ if (ret == INT_MAX) -+ return 0; ++ if (ret == INT_MAX) { ++ ret = 0; ++ goto out; ++ } + -+ i = inode->d + ret; ++ i = inode->inodes.data + ret; + ret = 0; + + if (fsck_err_on(!S_ISREG(i->inode.bi_mode) && + !S_ISLNK(i->inode.bi_mode), c, + "extent in non regular inode mode %o:\n %s", + 
i->inode.bi_mode, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) -+ return bch2_btree_delete_at(trans, iter, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ goto out; ++ } + + if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) { + for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) { @@ -46883,11 +48866,12 @@ index 000000000000..361dbf338023 + "extent type %u offset %llu past end of inode %llu, i_size %llu", + k.k->type, k.k->p.offset, k.k->p.inode, i->inode.bi_size)) { + bch2_fs_lazy_rw(c); -+ return bch2_btree_delete_range_trans(trans, BTREE_ID_extents, ++ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, + SPOS(k.k->p.inode, round_up(i->inode.bi_size, block_bytes(c)) >> 9, + k.k->p.snapshot), + POS(k.k->p.inode, U64_MAX), + 0, NULL) ?: -EINTR; ++ goto out; + } + } + } @@ -46899,7 +48883,10 @@ index 000000000000..361dbf338023 + bch2_bkey_buf_reassemble(&prev, c, k); +#endif + ++out: ++err: +fsck_err: ++ printbuf_exit(&buf); + return ret; +} + @@ -46958,12 +48945,13 @@ index 000000000000..361dbf338023 + int ret = 0, ret2 = 0; + s64 count2; + -+ for (i = w->d; i < w->d + w->nr; i++) { ++ darray_for_each(w->inodes, i) { + if (i->inode.bi_nlink == i->count) + continue; + -+ count2 = lockrestart_do(trans, -+ bch2_count_subdirs(trans, w->cur_inum, i->snapshot)); ++ count2 = bch2_count_subdirs(trans, w->cur_inum, i->snapshot); ++ if (count2 < 0) ++ return count2; + + if (i->count != count2) { + bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu", @@ -46996,7 +48984,7 @@ index 000000000000..361dbf338023 + struct bch_fs *c = trans->c; + struct bkey_i_dirent *n; + bool backpointer_exists = true; -+ char buf[200]; ++ struct printbuf buf = PRINTBUF; + int ret = 0; + + if (!target->bi_dir && @@ -47022,9 +49010,7 @@ index 000000000000..361dbf338023 + "directory %llu with multiple links", + target->bi_inum)) { + ret = 
__remove_dirent(trans, d.k->p); -+ if (ret) -+ goto err; -+ return 0; ++ goto out; + } + + if (fsck_err_on(backpointer_exists && @@ -47061,18 +49047,19 @@ index 000000000000..361dbf338023 + "incorrect d_type: got %s, should be %s:\n%s", + bch2_d_type_str(d.v->d_type), + bch2_d_type_str(inode_d_type(target)), -+ (bch2_bkey_val_to_text(&PBUF(buf), c, d.s_c), buf))) { ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { + n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) -+ return ret; ++ goto err; + + bkey_reassemble(&n->k_i, d.s_c); + n->v.d_type = inode_d_type(target); + + ret = bch2_trans_update(trans, iter, &n->k_i, 0); + if (ret) -+ return ret; ++ goto err; + + d = dirent_i_to_s_c(n); + } @@ -47086,19 +49073,21 @@ index 000000000000..361dbf338023 + n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) -+ return ret; ++ goto err; + + bkey_reassemble(&n->k_i, d.s_c); + n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); + + ret = bch2_trans_update(trans, iter, &n->k_i, 0); + if (ret) -+ return ret; ++ goto err; + + d = dirent_i_to_s_c(n); + } ++out: +err: +fsck_err: ++ printbuf_exit(&buf); + return ret; +} + @@ -47112,68 +49101,81 @@ index 000000000000..361dbf338023 + struct bkey_s_c k; + struct bkey_s_c_dirent d; + struct inode_walker_entry *i; -+ char buf[200]; -+ int ret; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; + + k = bch2_btree_iter_peek(iter); + if (!k.k) -+ return 0; ++ goto out; + + ret = bkey_err(k); + if (ret) -+ return ret; ++ goto err; + + ret = check_key_has_snapshot(trans, iter, k); -+ if (ret) -+ return ret < 0 ? ret : 0; ++ if (ret) { ++ ret = ret < 0 ? 
ret : 0; ++ goto out; ++ } + + ret = snapshots_seen_update(c, s, k.k->p); + if (ret) -+ return ret; ++ goto err; + + if (k.k->type == KEY_TYPE_whiteout) -+ return 0; ++ goto out; + + if (dir->cur_inum != k.k->p.inode) { + ret = check_subdir_count(trans, dir); + if (ret) -+ return ret; ++ goto err; + } + + ret = __walk_inode(trans, dir, k.k->p); + if (ret < 0) -+ return ret; ++ goto err; + + if (fsck_err_on(ret == INT_MAX, c, + "dirent in nonexisting directory:\n%s", -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) -+ return bch2_btree_delete_at(trans, iter, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ goto out; ++ } + -+ if (ret == INT_MAX) -+ return 0; ++ if (ret == INT_MAX) { ++ ret = 0; ++ goto out; ++ } + -+ i = dir->d + ret; ++ i = dir->inodes.data + ret; + ret = 0; + + if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c, + "dirent in non directory inode type %s:\n%s", + bch2_d_type_str(inode_d_type(&i->inode)), -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) -+ return bch2_btree_delete_at(trans, iter, 0); ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = bch2_btree_delete_at(trans, iter, 0); ++ goto out; ++ } + + if (dir->first_this_inode) -+ *hash_info = bch2_hash_info_init(c, &dir->d[0].inode); ++ *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode); + + ret = hash_check_key(trans, bch2_dirent_hash_desc, + hash_info, iter, k); + if (ret < 0) -+ return ret; -+ if (ret) /* dirent has been deleted */ -+ return 0; ++ goto err; ++ if (ret) { ++ /* dirent has been deleted */ ++ ret = 0; ++ goto out; ++ } + + if (k.k->type != KEY_TYPE_dirent) -+ return 0; ++ goto out; + + d = bkey_s_c_to_dirent(k); + @@ -47186,24 +49188,27 @@ index 000000000000..361dbf338023 + ret = __subvol_lookup(trans, target_subvol, + &target_snapshot, &target_inum); + if (ret && ret != -ENOENT) -+ return ret; ++ goto err; + + if 
(fsck_err_on(ret, c, + "dirent points to missing subvolume %llu", -+ le64_to_cpu(d.v->d_child_subvol))) -+ return __remove_dirent(trans, d.k->p); ++ le64_to_cpu(d.v->d_child_subvol))) { ++ ret = __remove_dirent(trans, d.k->p); ++ goto err; ++ } + + ret = __lookup_inode(trans, target_inum, + &subvol_root, &target_snapshot); + if (ret && ret != -ENOENT) -+ return ret; ++ goto err; + + if (fsck_err_on(ret, c, + "subvolume %u points to missing subvolume root %llu", + target_subvol, + target_inum)) { + bch_err(c, "repair not implemented yet"); -+ return -EINVAL; ++ ret = -EINVAL; ++ goto err; + } + + if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c, @@ -47213,32 +49218,33 @@ index 000000000000..361dbf338023 + subvol_root.bi_subvol = target_subvol; + ret = __write_inode(trans, &subvol_root, target_snapshot); + if (ret) -+ return ret; ++ goto err; + } + + ret = check_dirent_target(trans, iter, d, &subvol_root, + target_snapshot); + if (ret) -+ return ret; ++ goto err; + } else { + ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); + if (ret) -+ return ret; ++ goto err; + -+ if (fsck_err_on(!target->nr, c, ++ if (fsck_err_on(!target->inodes.nr, c, + "dirent points to missing inode:\n%s", -+ (bch2_bkey_val_to_text(&PBUF(buf), c, -+ k), buf))) { ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), ++ buf.buf))) { + ret = __remove_dirent(trans, d.k->p); + if (ret) -+ return ret; ++ goto err; + } + -+ for (i = target->d; i < target->d + target->nr; i++) { ++ darray_for_each(target->inodes, i) { + ret = check_dirent_target(trans, iter, d, + &i->inode, i->snapshot); + if (ret) -+ return ret; ++ goto err; + } + } + @@ -47246,7 +49252,10 @@ index 000000000000..361dbf338023 + for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) + i->count++; + ++out: ++err: +fsck_err: ++ printbuf_exit(&buf); + return ret; +} + @@ -47329,7 +49338,7 @@ index 000000000000..361dbf338023 + ret = 0; + + if (inode->first_this_inode) -+ *hash_info = 
bch2_hash_info_init(c, &inode->d[0].inode); ++ *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode); + + ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); +fsck_err: @@ -47439,21 +49448,18 @@ index 000000000000..361dbf338023 + check_root_trans(&trans)); +} + -+struct pathbuf { -+ size_t nr; -+ size_t size; -+ -+ struct pathbuf_entry { -+ u64 inum; -+ u32 snapshot; -+ } *entries; ++struct pathbuf_entry { ++ u64 inum; ++ u32 snapshot; +}; + -+static bool path_is_dup(struct pathbuf *p, u64 inum, u32 snapshot) ++typedef DARRAY(struct pathbuf_entry) pathbuf; ++ ++static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) +{ + struct pathbuf_entry *i; + -+ for (i = p->entries; i < p->entries + p->nr; i++) ++ darray_for_each(*p, i) + if (i->inum == inum && + i->snapshot == snapshot) + return true; @@ -47461,26 +49467,18 @@ index 000000000000..361dbf338023 + return false; +} + -+static int path_down(struct pathbuf *p, u64 inum, u32 snapshot) ++static int path_down(struct bch_fs *c, pathbuf *p, ++ u64 inum, u32 snapshot) +{ -+ if (p->nr == p->size) { -+ size_t new_size = max_t(size_t, 256UL, p->size * 2); -+ void *n = krealloc(p->entries, -+ new_size * sizeof(p->entries[0]), -+ GFP_KERNEL); -+ if (!n) { -+ return -ENOMEM; -+ } -+ -+ p->entries = n; -+ p->size = new_size; -+ }; -+ -+ p->entries[p->nr++] = (struct pathbuf_entry) { ++ int ret = darray_push(*p, ((struct pathbuf_entry) { + .inum = inum, + .snapshot = snapshot, -+ }; -+ return 0; ++ })); ++ ++ if (ret) ++ bch_err(c, "fsck: error allocating memory for pathbuf, size %zu", ++ p->size); ++ return ret; +} + +/* @@ -47489,7 +49487,7 @@ index 000000000000..361dbf338023 + * XXX: we should also be verifying that inodes are in the right subvolumes + */ +static int check_path(struct btree_trans *trans, -+ struct pathbuf *p, ++ pathbuf *p, + struct bch_inode_unpacked *inode, + u32 snapshot) +{ @@ -47542,7 +49540,7 @@ index 000000000000..361dbf338023 + if (!S_ISDIR(inode->bi_mode)) + break; 
+ -+ ret = path_down(p, inode->bi_inum, snapshot); ++ ret = path_down(c, p, inode->bi_inum, snapshot); + if (ret) { + bch_err(c, "memory allocation failure"); + return ret; @@ -47563,7 +49561,7 @@ index 000000000000..361dbf338023 + /* XXX print path */ + bch_err(c, "directory structure loop"); + -+ for (i = p->entries; i < p->entries + p->nr; i++) ++ darray_for_each(*p, i) + pr_err("%llu:%u", i->inum, i->snapshot); + pr_err("%llu:%u", inode->bi_inum, snapshot); + @@ -47600,7 +49598,7 @@ index 000000000000..361dbf338023 + struct btree_iter iter; + struct bkey_s_c k; + struct bch_inode_unpacked u; -+ struct pathbuf path = { 0, 0, NULL }; ++ pathbuf path = { 0, }; + int ret; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); @@ -47630,7 +49628,7 @@ index 000000000000..361dbf338023 + + BUG_ON(ret == -EINTR); + -+ kfree(path.entries); ++ darray_exit(path); + + bch2_trans_exit(&trans); + return ret; @@ -47647,12 +49645,15 @@ index 000000000000..361dbf338023 + } *d; +}; + -+static int add_nlink(struct nlink_table *t, u64 inum, u32 snapshot) ++static int add_nlink(struct bch_fs *c, struct nlink_table *t, ++ u64 inum, u32 snapshot) +{ + if (t->nr == t->size) { + size_t new_size = max_t(size_t, 128UL, t->size * 2); + void *d = kvmalloc(new_size * sizeof(t->d[0]), GFP_KERNEL); + if (!d) { ++ bch_err(c, "fsck: error allocating memory for nlink_table, size %zu", ++ new_size); + return -ENOMEM; + } + @@ -47742,7 +49743,7 @@ index 000000000000..361dbf338023 + if (!u.bi_nlink) + continue; + -+ ret = add_nlink(t, k.k->p.offset, k.k->p.snapshot); ++ ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot); + if (ret) { + *end = k.k->p.offset; + ret = 0; @@ -48008,16 +50009,17 @@ index 000000000000..264f2706b12d +#endif /* _BCACHEFS_FSCK_H */ diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c new file mode 100644 -index 000000000000..ffce68a80490 +index 000000000000..14b0b595202d --- /dev/null +++ b/fs/bcachefs/inode.c -@@ -0,0 +1,744 @@ +@@ -0,0 +1,720 @@ +// 
SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_key_cache.h" +#include "bkey_methods.h" +#include "btree_update.h" ++#include "buckets.h" +#include "error.h" +#include "extents.h" +#include "extent_update.h" @@ -48038,16 +50040,6 @@ index 000000000000..ffce68a80490 +}; + +static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; -+static const u8 bits_table[8] = { -+ 1 * 8 - 1, -+ 2 * 8 - 2, -+ 3 * 8 - 3, -+ 4 * 8 - 4, -+ 6 * 8 - 5, -+ 8 * 8 - 6, -+ 10 * 8 - 7, -+ 13 * 8 - 8, -+}; + +static int inode_decode_field(const u8 *in, const u8 *end, + u64 out[2], unsigned *out_bits) @@ -48275,15 +50267,13 @@ index 000000000000..ffce68a80490 + u32 snapshot; + int ret; + -+ if (0 && trans->c->opts.inodes_use_key_cache) -+ flags |= BTREE_ITER_CACHED; -+ + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + + bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, -+ SPOS(0, inum.inum, snapshot), flags); ++ SPOS(0, inum.inum, snapshot), ++ flags|BTREE_ITER_CACHED); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) @@ -48608,76 +50598,62 @@ index 000000000000..ffce68a80490 +static int bch2_inode_delete_keys(struct btree_trans *trans, + subvol_inum inum, enum btree_id id) +{ -+ u64 offset = 0; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_i delete; ++ u32 snapshot; + int ret = 0; + -+ while (!ret || ret == -EINTR) { -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bkey_i delete; -+ u32 snapshot; ++ /* ++ * We're never going to be deleting extents, no need to use an extent ++ * iterator: ++ */ ++ bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0), ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_INTENT); + ++ while (1) { + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) -+ continue; ++ goto err; + -+ bch2_trans_iter_init(trans, &iter, id, -+ SPOS(inum.inum, offset, snapshot), -+ BTREE_ITER_INTENT); -+ k = 
bch2_btree_iter_peek(&iter); -+ -+ if (!k.k || iter.pos.inode != inum.inum) { -+ bch2_trans_iter_exit(trans, &iter); -+ break; -+ } ++ bch2_btree_iter_set_snapshot(&iter, snapshot); + ++ k = bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX)); + ret = bkey_err(k); + if (ret) + goto err; + ++ if (!k.k) ++ break; ++ + bkey_init(&delete.k); + delete.k.p = iter.pos; + -+ if (btree_node_type_is_extents(iter.btree_id)) { -+ unsigned max_sectors = -+ min_t(u64, U64_MAX - iter.pos.offset, -+ KEY_SIZE_MAX & (~0 << trans->c->block_bits)); -+ -+ /* create the biggest key we can */ -+ bch2_key_resize(&delete.k, max_sectors); -+ -+ ret = bch2_extent_trim_atomic(trans, &iter, &delete); -+ if (ret) -+ goto err; -+ } -+ + ret = bch2_trans_update(trans, &iter, &delete, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL); +err: -+ offset = iter.pos.offset; -+ bch2_trans_iter_exit(trans, &iter); ++ if (ret && ret != -EINTR) ++ break; + } + ++ bch2_trans_iter_exit(trans, &iter); + return ret; +} + -+int bch2_inode_rm(struct bch_fs *c, subvol_inum inum, bool cached) ++int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) +{ + struct btree_trans trans; + struct btree_iter iter = { NULL }; + struct bkey_i_inode_generation delete; + struct bch_inode_unpacked inode_u; + struct bkey_s_c k; -+ unsigned iter_flags = BTREE_ITER_INTENT; + u32 snapshot; + int ret; + -+ if (0 && cached && c->opts.inodes_use_key_cache) -+ iter_flags |= BTREE_ITER_CACHED; -+ + bch2_trans_init(&trans, c, 0, 1024); + + /* @@ -48701,7 +50677,8 @@ index 000000000000..ffce68a80490 + goto err; + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, -+ SPOS(0, inum.inum, snapshot), iter_flags); ++ SPOS(0, inum.inum, snapshot), ++ BTREE_ITER_INTENT|BTREE_ITER_CACHED); + k = bch2_btree_iter_peek_slot(&iter); + + ret = bkey_err(k); @@ -48758,10 +50735,10 @@ index 000000000000..ffce68a80490 +} diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h new file mode 100644 -index 000000000000..723186d8afb6 
+index 000000000000..2337ecfc600e --- /dev/null +++ b/fs/bcachefs/inode.h -@@ -0,0 +1,200 @@ +@@ -0,0 +1,204 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_INODE_H +#define _BCACHEFS_INODE_H @@ -48777,11 +50754,15 @@ index 000000000000..723186d8afb6 +#define bch2_bkey_ops_inode (struct bkey_ops) { \ + .key_invalid = bch2_inode_invalid, \ + .val_to_text = bch2_inode_to_text, \ ++ .trans_trigger = bch2_trans_mark_inode, \ ++ .atomic_trigger = bch2_mark_inode, \ +} + +#define bch2_bkey_ops_inode_v2 (struct bkey_ops) { \ + .key_invalid = bch2_inode_v2_invalid, \ + .val_to_text = bch2_inode_to_text, \ ++ .trans_trigger = bch2_trans_mark_inode, \ ++ .atomic_trigger = bch2_mark_inode, \ +} + +static inline bool bkey_is_inode(const struct bkey *k) @@ -48851,7 +50832,7 @@ index 000000000000..723186d8afb6 +int bch2_inode_create(struct btree_trans *, struct btree_iter *, + struct bch_inode_unpacked *, u32, u64); + -+int bch2_inode_rm(struct bch_fs *, subvol_inum, bool); ++int bch2_inode_rm(struct bch_fs *, subvol_inum); + +int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *); @@ -48964,10 +50945,10 @@ index 000000000000..723186d8afb6 +#endif /* _BCACHEFS_INODE_H */ diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c new file mode 100644 -index 000000000000..5a3c9eff1b50 +index 000000000000..36929451af2c --- /dev/null +++ b/fs/bcachefs/io.c -@@ -0,0 +1,2375 @@ +@@ -0,0 +1,2416 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Some low level IO code, and hacks for various block layer limitations @@ -49635,11 +51616,7 @@ index 000000000000..5a3c9eff1b50 +{ + struct bch_fs *c = op->c; + struct bkey_i_extent *e; -+ struct open_bucket *ob; -+ unsigned i; + -+ BUG_ON(crc.compressed_size > wp->sectors_free); -+ wp->sectors_free -= crc.compressed_size; + op->pos.offset += crc.uncompressed_size; + + e = bkey_extent_init(op->insert_keys.top); @@ -49652,22 +51629,8 @@ index 000000000000..5a3c9eff1b50 + crc.nonce) + 
bch2_extent_crc_append(&e->k_i, crc); + -+ open_bucket_for_each(c, &wp->ptrs, ob, i) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ union bch_extent_entry *end = -+ bkey_val_end(bkey_i_to_s(&e->k_i)); -+ -+ end->ptr = ob->ptr; -+ end->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; -+ end->ptr.cached = !ca->mi.durability || -+ (op->flags & BCH_WRITE_CACHED) != 0; -+ end->ptr.offset += ca->mi.bucket_size - ob->sectors_free; -+ -+ e->k.u64s++; -+ -+ BUG_ON(crc.compressed_size > ob->sectors_free); -+ ob->sectors_free -= crc.compressed_size; -+ } ++ bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, crc.compressed_size, ++ op->flags & BCH_WRITE_CACHED); + + bch2_keylist_push(&op->insert_keys); +} @@ -49708,7 +51671,7 @@ index 000000000000..5a3c9eff1b50 + */ + bch2_bio_alloc_pages_pool(c, bio, + min_t(unsigned, output_available, -+ c->sb.encoded_extent_max << 9)); ++ c->opts.encoded_extent_max)); + + if (bio->bi_iter.bi_size < output_available) + *page_alloc_failed = @@ -49752,6 +51715,7 @@ index 000000000000..5a3c9eff1b50 + struct bch_fs *c = op->c; + struct nonce nonce = extent_nonce(op->version, op->crc); + struct bch_csum csum; ++ int ret; + + if (!bch2_csum_type_is_encryption(op->crc.csum_type)) + return 0; @@ -49766,10 +51730,10 @@ index 000000000000..5a3c9eff1b50 + if (bch2_crc_cmp(op->crc.csum, csum)) + return -EIO; + -+ bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); ++ ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); + op->crc.csum_type = 0; + op->crc.csum = (struct bch_csum) { 0, 0 }; -+ return 0; ++ return ret; +} + +static enum prep_encoded_ret { @@ -49905,8 +51869,8 @@ index 000000000000..5a3c9eff1b50 + size_t dst_len, src_len; + + if (page_alloc_failed && -+ bio_sectors(dst) < wp->sectors_free && -+ bio_sectors(dst) < c->sb.encoded_extent_max) ++ dst->bi_iter.bi_size < (wp->sectors_free << 9) && ++ dst->bi_iter.bi_size < c->opts.encoded_extent_max) + break; + + BUG_ON(op->compression_type && @@ -49926,7 +51890,7 @@ 
index 000000000000..5a3c9eff1b50 + + if (op->csum_type) + dst_len = min_t(unsigned, dst_len, -+ c->sb.encoded_extent_max << 9); ++ c->opts.encoded_extent_max); + + if (bounce) { + swap(dst->bi_iter.bi_size, dst_len); @@ -49984,8 +51948,11 @@ index 000000000000..5a3c9eff1b50 + crc.live_size = src_len >> 9; + + swap(dst->bi_iter.bi_size, dst_len); -+ bch2_encrypt_bio(c, op->csum_type, -+ extent_nonce(version, crc), dst); ++ ret = bch2_encrypt_bio(c, op->csum_type, ++ extent_nonce(version, crc), dst); ++ if (ret) ++ goto err; ++ + crc.csum = bch2_checksum_bio(c, op->csum_type, + extent_nonce(version, crc), dst); + crc.csum_type = op->csum_type; @@ -50043,7 +52010,7 @@ index 000000000000..5a3c9eff1b50 + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_fs *c = op->c; + struct write_point *wp; -+ struct bio *bio; ++ struct bio *bio = NULL; + bool skip_put = true; + unsigned nofs_flags; + int ret; @@ -50081,7 +52048,7 @@ index 000000000000..5a3c9eff1b50 + */ + wp = bch2_alloc_sectors_start(c, + op->target, -+ op->opts.erasure_code, ++ op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), + op->write_point, + &op->devs_have, + op->nr_replicas, @@ -50259,7 +52226,7 @@ index 000000000000..5a3c9eff1b50 + bch2_keylist_init(&op->insert_keys, op->inline_keys); + wbio_init(bio)->put_bio = false; + -+ if (bio_sectors(bio) & (c->opts.block_size - 1)) { ++ if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) { + bch_err_inum_ratelimited(c, op->pos.inode, + "misaligned write"); + op->error = -EIO; @@ -50760,6 +52727,7 @@ index 000000000000..5a3c9eff1b50 + struct nonce nonce = extent_nonce(rbio->version, crc); + unsigned nofs_flags; + struct bch_csum csum; ++ int ret; + + nofs_flags = memalloc_nofs_save(); + @@ -50794,7 +52762,10 @@ index 000000000000..5a3c9eff1b50 + crc.live_size = bvec_iter_sectors(rbio->bvec_iter); + + if (crc_is_compressed(crc)) { -+ bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ ret = bch2_encrypt_bio(c, crc.csum_type, 
nonce, src); ++ if (ret) ++ goto decrypt_err; ++ + if (bch2_bio_uncompress(c, src, dst, dst_iter, crc)) + goto decompression_err; + } else { @@ -50805,7 +52776,9 @@ index 000000000000..5a3c9eff1b50 + BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); + src->bi_iter.bi_size = dst_iter.bi_size; + -+ bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ if (ret) ++ goto decrypt_err; + + if (rbio->bounce) { + struct bvec_iter src_iter = src->bi_iter; @@ -50818,7 +52791,10 @@ index 000000000000..5a3c9eff1b50 + * Re encrypt data we decrypted, so it's consistent with + * rbio->crc: + */ -+ bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ if (ret) ++ goto decrypt_err; ++ + promote_start(rbio->promote, rbio); + rbio->promote = NULL; + } @@ -50853,6 +52829,11 @@ index 000000000000..5a3c9eff1b50 + "decompression error"); + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + goto out; ++decrypt_err: ++ bch_err_inum_ratelimited(c, rbio->read_pos.inode, ++ "decrypt error"); ++ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); ++ goto out; +} + +static void bch2_read_endio(struct bio *bio) @@ -50881,9 +52862,8 @@ index 000000000000..5a3c9eff1b50 + return; + } + -+ if (rbio->pick.ptr.cached && -+ (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || -+ ptr_stale(ca, &rbio->pick.ptr))) { ++ if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || ++ ptr_stale(ca, &rbio->pick.ptr)) { + atomic_long_inc(&c->read_realloc_races); + + if (rbio->flags & BCH_READ_RETRY_IF_STALE) @@ -50942,6 +52922,35 @@ index 000000000000..5a3c9eff1b50 + return ret; +} + ++static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, ++ struct bkey_s_c k, ++ struct bch_extent_ptr ptr) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev); ++ struct btree_iter iter; ++ struct printbuf buf = PRINTBUF; ++ int ret; ++ ++ 
bch2_bkey_val_to_text(&buf, c, k); ++ bch2_fs_inconsistent(c, "Attempting to read from stale dirty pointer: %s", buf.buf); ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, ++ POS(ptr.dev, PTR_BUCKET_NR(ca, &ptr)), ++ BTREE_ITER_CACHED); ++ ++ ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); ++ if (ret) ++ goto out; ++ ++ bch2_bkey_val_to_text(&buf, c, k); ++ bch_err(c, "%s", buf.buf); ++ bch_err(c, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); ++ bch2_trans_iter_exit(trans, &iter); ++out: ++ printbuf_exit(&buf); ++} ++ +int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + struct bvec_iter iter, struct bpos read_pos, + enum btree_id data_btree, struct bkey_s_c k, @@ -50951,7 +52960,7 @@ index 000000000000..5a3c9eff1b50 + struct bch_fs *c = trans->c; + struct extent_ptr_decoded pick; + struct bch_read_bio *rbio = NULL; -+ struct bch_dev *ca; ++ struct bch_dev *ca = NULL; + struct promote_op *promote = NULL; + bool bounce = false, read_full = false, narrow_crcs = false; + struct bpos data_pos = bkey_start_pos(k.k); @@ -50968,7 +52977,7 @@ index 000000000000..5a3c9eff1b50 + zero_fill_bio_iter(&orig->bio, iter); + goto out_read_done; + } -+ ++retry_pick: + pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); + + /* hole or reservation - just zero fill: */ @@ -50981,8 +52990,27 @@ index 000000000000..5a3c9eff1b50 + goto err; + } + -+ if (pick_ret > 0) -+ ca = bch_dev_bkey_exists(c, pick.ptr.dev); ++ ca = bch_dev_bkey_exists(c, pick.ptr.dev); ++ ++ /* ++ * Stale dirty pointers are treated as IO errors, but @failed isn't ++ * allocated unless we're in the retry path - so if we're not in the ++ * retry path, don't check here, it'll be caught in bch2_read_endio() ++ * and we'll end up in the retry path: ++ */ ++ if ((flags & BCH_READ_IN_RETRY) && ++ !pick.ptr.cached && ++ unlikely(ptr_stale(ca, &pick.ptr))) { ++ read_from_stale_dirty_pointer(trans, k, pick.ptr); ++ bch2_mark_io_failure(failed, 
&pick); ++ goto retry_pick; ++ } ++ ++ /* ++ * Unlock the iterator while the btree node's lock is still in ++ * cache, before doing the IO: ++ */ ++ bch2_trans_unlock(trans); + + if (flags & BCH_READ_NODECODE) { + /* @@ -51229,7 +53257,7 @@ index 000000000000..5a3c9eff1b50 + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(inum.inum, bvec_iter.bi_sector, snapshot), -+ BTREE_ITER_SLOTS|BTREE_ITER_FILTER_SNAPSHOTS); ++ BTREE_ITER_SLOTS); + while (1) { + unsigned bytes, sectors, offset_into_extent; + enum btree_id data_btree = BTREE_ID_extents; @@ -51270,12 +53298,6 @@ index 000000000000..5a3c9eff1b50 + */ + sectors = min(sectors, k.k->size - offset_into_extent); + -+ /* -+ * Unlock the iterator while the btree node's lock is still in -+ * cache, before doing the IO: -+ */ -+ bch2_trans_unlock(&trans); -+ + bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; + swap(bvec_iter.bi_size, bytes); + @@ -51336,8 +53358,8 @@ index 000000000000..5a3c9eff1b50 + mempool_init_page_pool(&c->bio_bounce_pages, + max_t(unsigned, + c->opts.btree_node_size, -+ c->sb.encoded_extent_max) / -+ PAGE_SECTORS, 0) || ++ c->opts.encoded_extent_max) / ++ PAGE_SIZE, 0) || + rhashtable_init(&c->promote_table, &bch_promote_params)) + return -ENOMEM; + @@ -51345,7 +53367,7 @@ index 000000000000..5a3c9eff1b50 +} diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h new file mode 100644 -index 000000000000..1aa422dccef7 +index 000000000000..fb5114518666 --- /dev/null +++ b/fs/bcachefs/io.h @@ -0,0 +1,189 @@ @@ -51401,7 +53423,7 @@ index 000000000000..1aa422dccef7 + +static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) +{ -+ return op->alloc_reserve == RESERVE_MOVINGGC ++ return op->alloc_reserve == RESERVE_movinggc + ? 
op->c->copygc_wq + : op->c->btree_update_wq; +} @@ -51430,7 +53452,7 @@ index 000000000000..1aa422dccef7 + op->compression_type = bch2_compression_opt_to_type[opts.compression]; + op->nr_replicas = 0; + op->nr_replicas_required = c->opts.data_replicas_required; -+ op->alloc_reserve = RESERVE_NONE; ++ op->alloc_reserve = RESERVE_none; + op->incompressible = 0; + op->open_buckets.nr = 0; + op->devs_have.nr = 0; @@ -51707,10 +53729,10 @@ index 000000000000..78bff13d36f2 +#endif /* _BCACHEFS_IO_TYPES_H */ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c new file mode 100644 -index 000000000000..14bea8a2535e +index 000000000000..505e8367b5f2 --- /dev/null +++ b/fs/bcachefs/journal.c -@@ -0,0 +1,1284 @@ +@@ -0,0 +1,1410 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcachefs journalling code, for btree insertions @@ -51728,23 +53750,26 @@ index 000000000000..14bea8a2535e +#include "journal.h" +#include "journal_io.h" +#include "journal_reclaim.h" ++#include "journal_sb.h" +#include "journal_seq_blacklist.h" -+#include "super-io.h" + +#include + -+static u64 last_unwritten_seq(struct journal *j) -+{ -+ union journal_res_state s = READ_ONCE(j->reservations); ++#define x(n) #n, ++static const char * const bch2_journal_watermarks[] = { ++ JOURNAL_WATERMARKS() ++ NULL ++}; + -+ lockdep_assert_held(&j->lock); -+ -+ return journal_cur_seq(j) - ((s.idx - s.unwritten_idx) & JOURNAL_BUF_MASK); -+} ++static const char * const bch2_journal_errors[] = { ++ JOURNAL_ERRORS() ++ NULL ++}; ++#undef x + +static inline bool journal_seq_unwritten(struct journal *j, u64 seq) +{ -+ return seq >= last_unwritten_seq(j); ++ return seq > j->seq_ondisk; +} + +static bool __journal_entry_is_open(union journal_res_state state) @@ -51752,6 +53777,11 @@ index 000000000000..14bea8a2535e + return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; +} + ++static inline unsigned nr_unwritten_journal_entries(struct journal *j) ++{ ++ return atomic64_read(&j->seq) - j->seq_ondisk; ++} ++ +static 
bool journal_entry_is_open(struct journal *j) +{ + return __journal_entry_is_open(j->reservations); @@ -51763,8 +53793,6 @@ index 000000000000..14bea8a2535e + struct journal_buf *buf = NULL; + + EBUG_ON(seq > journal_cur_seq(j)); -+ EBUG_ON(seq == journal_cur_seq(j) && -+ j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL); + + if (journal_seq_unwritten(j, seq)) { + buf = j->buf + (seq & JOURNAL_BUF_MASK); @@ -51782,54 +53810,6 @@ index 000000000000..14bea8a2535e + p->devs.nr = 0; +} + -+static void journal_pin_new_entry(struct journal *j) -+{ -+ /* -+ * The fifo_push() needs to happen at the same time as j->seq is -+ * incremented for journal_last_seq() to be calculated correctly -+ */ -+ atomic64_inc(&j->seq); -+ journal_pin_list_init(fifo_push_ref(&j->pin), 1); -+} -+ -+static void bch2_journal_buf_init(struct journal *j) -+{ -+ struct journal_buf *buf = journal_cur_buf(j); -+ -+ bkey_extent_init(&buf->key); -+ buf->noflush = false; -+ buf->must_flush = false; -+ buf->separate_flush = false; -+ -+ memset(buf->data, 0, sizeof(*buf->data)); -+ buf->data->seq = cpu_to_le64(journal_cur_seq(j)); -+ buf->data->u64s = 0; -+} -+ -+void bch2_journal_halt(struct journal *j) -+{ -+ union journal_res_state old, new; -+ u64 v = atomic64_read(&j->reservations.counter); -+ -+ do { -+ old.v = new.v = v; -+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) -+ return; -+ -+ new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL; -+ } while ((v = atomic64_cmpxchg(&j->reservations.counter, -+ old.v, new.v)) != old.v); -+ -+ /* -+ * XXX: we're not using j->lock here because this can be called from -+ * interrupt context, this can race with journal_write_done() -+ */ -+ if (!j->err_seq) -+ j->err_seq = journal_cur_seq(j); -+ journal_wake(j); -+ closure_wake_up(&journal_cur_buf(j)->wait); -+} -+ +/* journal entry close/open: */ + +void __bch2_journal_buf_put(struct journal *j) @@ -51845,7 +53825,7 @@ index 000000000000..14bea8a2535e + * We don't close a journal_buf until the next 
journal_buf is finished writing, + * and can be opened again - this also initializes the next journal_buf: + */ -+static bool __journal_entry_close(struct journal *j) ++static void __journal_entry_close(struct journal *j, unsigned closed_val) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_buf *buf = journal_cur_buf(j); @@ -51853,34 +53833,24 @@ index 000000000000..14bea8a2535e + u64 v = atomic64_read(&j->reservations.counter); + unsigned sectors; + ++ BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL && ++ closed_val != JOURNAL_ENTRY_ERROR_VAL); ++ + lockdep_assert_held(&j->lock); + + do { + old.v = new.v = v; -+ if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL) -+ return true; ++ new.cur_entry_offset = closed_val; + -+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) { -+ /* this entry will never be written: */ -+ closure_wake_up(&buf->wait); -+ return true; -+ } -+ -+ if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) { -+ set_bit(JOURNAL_NEED_WRITE, &j->flags); -+ j->need_write_time = local_clock(); -+ } -+ -+ new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL; -+ new.idx++; -+ -+ if (new.idx == new.unwritten_idx) -+ return false; -+ -+ BUG_ON(journal_state_count(new, new.idx)); ++ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL || ++ old.cur_entry_offset == new.cur_entry_offset) ++ return; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + ++ if (!__journal_entry_is_open(old)) ++ return; ++ + /* Close out old buffer: */ + buf->data->u64s = cpu_to_le32(old.cur_entry_offset); + @@ -51910,36 +53880,42 @@ index 000000000000..14bea8a2535e + */ + buf->last_seq = journal_last_seq(j); + buf->data->last_seq = cpu_to_le64(buf->last_seq); ++ BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq)); + + __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq)); + -+ /* Initialize new buffer: */ -+ journal_pin_new_entry(j); -+ -+ bch2_journal_buf_init(j); -+ + cancel_delayed_work(&j->write_work); -+ 
clear_bit(JOURNAL_NEED_WRITE, &j->flags); + + bch2_journal_space_available(j); + + bch2_journal_buf_put(j, old.idx); -+ return true; ++} ++ ++void bch2_journal_halt(struct journal *j) ++{ ++ spin_lock(&j->lock); ++ __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL); ++ if (!j->err_seq) ++ j->err_seq = journal_cur_seq(j); ++ spin_unlock(&j->lock); +} + +static bool journal_entry_want_write(struct journal *j) +{ -+ union journal_res_state s = READ_ONCE(j->reservations); -+ bool ret = false; ++ bool ret = !journal_entry_is_open(j) || ++ journal_cur_seq(j) == journal_last_unwritten_seq(j); + -+ /* -+ * Don't close it yet if we already have a write in flight, but do set -+ * NEED_WRITE: -+ */ -+ if (s.idx != s.unwritten_idx) -+ set_bit(JOURNAL_NEED_WRITE, &j->flags); -+ else -+ ret = __journal_entry_close(j); ++ /* Don't close it yet if we already have a write in flight: */ ++ if (ret) ++ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); ++ else if (nr_unwritten_journal_entries(j)) { ++ struct journal_buf *buf = journal_cur_buf(j); ++ ++ if (!buf->flush_time) { ++ buf->flush_time = local_clock() ?: 1; ++ buf->expires = jiffies; ++ } ++ } + + return ret; +} @@ -51968,34 +53944,71 @@ index 000000000000..14bea8a2535e +static int journal_entry_open(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct journal_buf *buf = journal_cur_buf(j); ++ struct journal_buf *buf = j->buf + ++ ((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK); + union journal_res_state old, new; + int u64s; + u64 v; + -+ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); -+ + lockdep_assert_held(&j->lock); + BUG_ON(journal_entry_is_open(j)); ++ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); + + if (j->blocked) -+ return cur_entry_blocked; ++ return JOURNAL_ERR_blocked; + + if (j->cur_entry_error) + return j->cur_entry_error; + ++ if (bch2_journal_error(j)) ++ return JOURNAL_ERR_insufficient_devices; /* -EROFS */ ++ ++ if (!fifo_free(&j->pin)) ++ return JOURNAL_ERR_journal_pin_full; ++ ++ if 
(nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) - 1) ++ return JOURNAL_ERR_max_in_flight; ++ + BUG_ON(!j->cur_entry_sectors); + ++ buf->expires = ++ (journal_cur_seq(j) == j->flushed_seq_ondisk ++ ? jiffies ++ : j->last_flush_write) + ++ msecs_to_jiffies(c->opts.journal_flush_delay); ++ + buf->u64s_reserved = j->entry_u64s_reserved; + buf->disk_sectors = j->cur_entry_sectors; + buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9); + + u64s = (int) (buf->sectors << 9) / sizeof(u64) - + journal_entry_overhead(j); -+ u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); ++ u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); + -+ if (u64s <= le32_to_cpu(buf->data->u64s)) -+ return cur_entry_journal_full; ++ if (u64s <= 0) ++ return JOURNAL_ERR_journal_full; ++ ++ if (fifo_empty(&j->pin) && j->reclaim_thread) ++ wake_up_process(j->reclaim_thread); ++ ++ /* ++ * The fifo_push() needs to happen at the same time as j->seq is ++ * incremented for journal_last_seq() to be calculated correctly ++ */ ++ atomic64_inc(&j->seq); ++ journal_pin_list_init(fifo_push_ref(&j->pin), 1); ++ ++ BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf); ++ ++ bkey_extent_init(&buf->key); ++ buf->noflush = false; ++ buf->must_flush = false; ++ buf->separate_flush = false; ++ buf->flush_time = 0; ++ ++ memset(buf->data, 0, sizeof(*buf->data)); ++ buf->data->seq = cpu_to_le64(journal_cur_seq(j)); ++ buf->data->u64s = 0; + + /* + * Must be set before marking the journal entry as open: @@ -52006,14 +54019,14 @@ index 000000000000..14bea8a2535e + do { + old.v = new.v = v; + -+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) -+ return cur_entry_insufficient_devices; ++ BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL); + -+ /* Handle any already added entries */ -+ new.cur_entry_offset = le32_to_cpu(buf->data->u64s); ++ new.idx++; ++ BUG_ON(journal_state_count(new, new.idx)); ++ BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK)); + -+ 
EBUG_ON(journal_state_count(new, new.idx)); + journal_state_inc(&new); ++ new.cur_entry_offset = 0; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + @@ -52024,15 +54037,14 @@ index 000000000000..14bea8a2535e + + mod_delayed_work(c->io_complete_wq, + &j->write_work, -+ msecs_to_jiffies(j->write_delay_ms)); ++ msecs_to_jiffies(c->opts.journal_flush_delay)); + journal_wake(j); + return 0; +} + +static bool journal_quiesced(struct journal *j) +{ -+ union journal_res_state s = READ_ONCE(j->reservations); -+ bool ret = s.idx == s.unwritten_idx && !__journal_entry_is_open(s); ++ bool ret = atomic64_read(&j->seq) == j->seq_ondisk; + + if (!ret) + journal_entry_close(j); @@ -52047,8 +54059,21 @@ index 000000000000..14bea8a2535e +static void journal_write_work(struct work_struct *work) +{ + struct journal *j = container_of(work, struct journal, write_work.work); ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ long delta; + -+ journal_entry_close(j); ++ spin_lock(&j->lock); ++ if (!__journal_entry_is_open(j->reservations)) ++ goto unlock; ++ ++ delta = journal_cur_buf(j)->expires - jiffies; ++ ++ if (delta > 0) ++ mod_delayed_work(c->io_complete_wq, &j->write_work, delta); ++ else ++ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); ++unlock: ++ spin_unlock(&j->lock); +} + +static int __journal_res_get(struct journal *j, struct journal_res *res, @@ -52077,13 +54102,12 @@ index 000000000000..14bea8a2535e + return 0; + } + -+ if (!(flags & JOURNAL_RES_GET_RESERVED) && -+ !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { ++ if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark) { + /* + * Don't want to close current journal entry, just need to + * invoke reclaim: + */ -+ ret = cur_entry_journal_full; ++ ret = JOURNAL_ERR_journal_full; + goto unlock; + } + @@ -52098,20 +54122,13 @@ index 000000000000..14bea8a2535e + buf->buf_size < JOURNAL_ENTRY_SIZE_MAX) + j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); + 
-+ if (journal_entry_is_open(j) && -+ !__journal_entry_close(j)) { -+ /* -+ * We failed to get a reservation on the current open journal -+ * entry because it's full, and we can't close it because -+ * there's still a previous one in flight: -+ */ ++ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); ++ ret = journal_entry_open(j); ++ ++ if (ret == JOURNAL_ERR_max_in_flight) + trace_journal_entry_full(c); -+ ret = cur_entry_blocked; -+ } else { -+ ret = journal_entry_open(j); -+ } +unlock: -+ if ((ret && ret != cur_entry_insufficient_devices) && ++ if ((ret && ret != JOURNAL_ERR_insufficient_devices) && + !j->res_get_blocked_start) { + j->res_get_blocked_start = local_clock() ?: 1; + trace_journal_full(c); @@ -52123,23 +54140,24 @@ index 000000000000..14bea8a2535e + if (!ret) + goto retry; + -+ if ((ret == cur_entry_journal_full || -+ ret == cur_entry_journal_pin_full) && ++ if ((ret == JOURNAL_ERR_journal_full || ++ ret == JOURNAL_ERR_journal_pin_full) && + !can_discard && -+ j->reservations.idx == j->reservations.unwritten_idx && -+ (flags & JOURNAL_RES_GET_RESERVED)) { -+ char *journal_debug_buf = kmalloc(4096, GFP_ATOMIC); ++ !nr_unwritten_journal_entries(j) && ++ (flags & JOURNAL_WATERMARK_MASK) == JOURNAL_WATERMARK_reserved) { ++ struct printbuf buf = PRINTBUF; + -+ bch_err(c, "Journal stuck!"); -+ if (journal_debug_buf) { -+ bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j); -+ bch_err(c, "%s", journal_debug_buf); ++ bch_err(c, "Journal stuck! 
Hava a pre-reservation but journal full (ret %s)", ++ bch2_journal_errors[ret]); + -+ bch2_journal_pins_to_text(&_PBUF(journal_debug_buf, 4096), j); -+ bch_err(c, "Journal pins:\n%s", journal_debug_buf); -+ kfree(journal_debug_buf); -+ } ++ bch2_journal_debug_to_text(&buf, j); ++ bch_err(c, "%s", buf.buf); + ++ printbuf_reset(&buf); ++ bch2_journal_pins_to_text(&buf, j); ++ bch_err(c, "Journal pins:\n%s", buf.buf); ++ ++ printbuf_exit(&buf); + bch2_fatal_error(c); + dump_stack(); + } @@ -52148,8 +54166,8 @@ index 000000000000..14bea8a2535e + * Journal is full - can't rely on reclaim from work item due to + * freezing: + */ -+ if ((ret == cur_entry_journal_full || -+ ret == cur_entry_journal_pin_full) && ++ if ((ret == JOURNAL_ERR_journal_full || ++ ret == JOURNAL_ERR_journal_pin_full) && + !(flags & JOURNAL_RES_GET_NONBLOCK)) { + if (can_discard) { + bch2_journal_do_discards(j); @@ -52162,7 +54180,7 @@ index 000000000000..14bea8a2535e + } + } + -+ return ret == cur_entry_insufficient_devices ? -EROFS : -EAGAIN; ++ return ret == JOURNAL_ERR_insufficient_devices ? 
-EROFS : -EAGAIN; +} + +/* @@ -52241,7 +54259,7 @@ index 000000000000..14bea8a2535e + /* + * Not enough room in current journal entry, have to flush it: + */ -+ __journal_entry_close(j); ++ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + } else { + journal_cur_buf(j)->u64s_reserved += d; + } @@ -52286,12 +54304,15 @@ index 000000000000..14bea8a2535e + } + + /* if seq was written, but not flushed - flush a newer one instead */ -+ seq = max(seq, last_unwritten_seq(j)); ++ seq = max(seq, journal_last_unwritten_seq(j)); + +recheck_need_open: -+ if (seq == journal_cur_seq(j) && !journal_entry_is_open(j)) { ++ if (seq > journal_cur_seq(j)) { + struct journal_res res = { 0 }; + ++ if (journal_entry_is_open(j)) ++ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); ++ + spin_unlock(&j->lock); + + ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); @@ -52301,7 +54322,11 @@ index 000000000000..14bea8a2535e + seq = res.seq; + buf = j->buf + (seq & JOURNAL_BUF_MASK); + buf->must_flush = true; -+ set_bit(JOURNAL_NEED_WRITE, &j->flags); ++ ++ if (!buf->flush_time) { ++ buf->flush_time = local_clock() ?: 1; ++ buf->expires = jiffies; ++ } + + if (parent && !closure_wait(&buf->wait, parent)) + BUG(); @@ -52339,6 +54364,12 @@ index 000000000000..14bea8a2535e + u64 start_time = local_clock(); + int ret, ret2; + ++ /* ++ * Don't update time_stats when @seq is already flushed: ++ */ ++ if (seq <= j->flushed_seq_ondisk) ++ return 0; ++ + ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL))); + + if (!ret) @@ -52347,8 +54378,61 @@ index 000000000000..14bea8a2535e + return ret ?: ret2 < 0 ? 
ret2 : 0; +} + ++/* ++ * bch2_journal_flush_async - if there is an open journal entry, or a journal ++ * still being written, write it and wait for the write to complete ++ */ ++void bch2_journal_flush_async(struct journal *j, struct closure *parent) ++{ ++ bch2_journal_flush_seq_async(j, atomic64_read(&j->seq), parent); ++} ++ ++int bch2_journal_flush(struct journal *j) ++{ ++ return bch2_journal_flush_seq(j, atomic64_read(&j->seq)); ++} ++ ++/* ++ * bch2_journal_noflush_seq - tell the journal not to issue any flushes before ++ * @seq ++ */ ++bool bch2_journal_noflush_seq(struct journal *j, u64 seq) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ u64 unwritten_seq; ++ bool ret = false; ++ ++ if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush))) ++ return false; ++ ++ if (seq <= c->journal.flushed_seq_ondisk) ++ return false; ++ ++ spin_lock(&j->lock); ++ if (seq <= c->journal.flushed_seq_ondisk) ++ goto out; ++ ++ for (unwritten_seq = journal_last_unwritten_seq(j); ++ unwritten_seq < seq; ++ unwritten_seq++) { ++ struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq); ++ ++ /* journal write is already in flight, and was a flush write: */ ++ if (unwritten_seq == journal_last_unwritten_seq(j) && !buf->noflush) ++ goto out; ++ ++ buf->noflush = true; ++ } ++ ++ ret = true; ++out: ++ spin_unlock(&j->lock); ++ return ret; ++} ++ +int bch2_journal_meta(struct journal *j) +{ ++ struct journal_buf *buf; + struct journal_res res; + int ret; + @@ -52358,53 +54442,50 @@ index 000000000000..14bea8a2535e + if (ret) + return ret; + ++ buf = j->buf + (res.seq & JOURNAL_BUF_MASK); ++ buf->must_flush = true; ++ ++ if (!buf->flush_time) { ++ buf->flush_time = local_clock() ?: 1; ++ buf->expires = jiffies; ++ } ++ + bch2_journal_res_put(j, &res); + + return bch2_journal_flush_seq(j, res.seq); +} + -+/* -+ * bch2_journal_flush_async - if there is an open journal entry, or a journal -+ * still being written, write it and wait for the write to 
complete -+ */ -+void bch2_journal_flush_async(struct journal *j, struct closure *parent) ++int bch2_journal_log_msg(struct journal *j, const char *fmt, ...) +{ -+ u64 seq, journal_seq; ++ struct jset_entry_log *entry; ++ struct journal_res res = { 0 }; ++ unsigned msglen, u64s; ++ va_list args; ++ int ret; + -+ spin_lock(&j->lock); -+ journal_seq = journal_cur_seq(j); ++ va_start(args, fmt); ++ msglen = vsnprintf(NULL, 0, fmt, args) + 1; ++ va_end(args); + -+ if (journal_entry_is_open(j)) { -+ seq = journal_seq; -+ } else if (journal_seq) { -+ seq = journal_seq - 1; -+ } else { -+ spin_unlock(&j->lock); -+ return; -+ } -+ spin_unlock(&j->lock); ++ u64s = jset_u64s(DIV_ROUND_UP(msglen, sizeof(u64))); + -+ bch2_journal_flush_seq_async(j, seq, parent); -+} ++ ret = bch2_journal_res_get(j, &res, u64s, 0); ++ if (ret) ++ return ret; + -+int bch2_journal_flush(struct journal *j) -+{ -+ u64 seq, journal_seq; ++ entry = container_of(journal_res_entry(j, &res), ++ struct jset_entry_log, entry);; ++ memset(entry, 0, u64s * sizeof(u64)); ++ entry->entry.type = BCH_JSET_ENTRY_log; ++ entry->entry.u64s = u64s - 1; + -+ spin_lock(&j->lock); -+ journal_seq = journal_cur_seq(j); ++ va_start(args, fmt); ++ vsnprintf(entry->d, INT_MAX, fmt, args); ++ va_end(args); + -+ if (journal_entry_is_open(j)) { -+ seq = journal_seq; -+ } else if (journal_seq) { -+ seq = journal_seq - 1; -+ } else { -+ spin_unlock(&j->lock); -+ return 0; -+ } -+ spin_unlock(&j->lock); ++ bch2_journal_res_put(j, &res); + -+ return bch2_journal_flush_seq(j, seq); ++ return bch2_journal_flush_seq(j, res.seq); +} + +/* block/unlock the journal: */ @@ -52434,28 +54515,53 @@ index 000000000000..14bea8a2535e +{ + struct bch_fs *c = ca->fs; + struct journal_device *ja = &ca->journal; -+ struct bch_sb_field_journal *journal_buckets; + u64 *new_bucket_seq = NULL, *new_buckets = NULL; ++ struct open_bucket **ob = NULL; ++ long *bu = NULL; ++ unsigned i, nr_got = 0, nr_want = nr - ja->nr; ++ unsigned old_nr = ja->nr; ++ 
unsigned old_discard_idx = ja->discard_idx; ++ unsigned old_dirty_idx_ondisk = ja->dirty_idx_ondisk; ++ unsigned old_dirty_idx = ja->dirty_idx; ++ unsigned old_cur_idx = ja->cur_idx; + int ret = 0; + -+ /* don't handle reducing nr of buckets yet: */ -+ if (nr <= ja->nr) -+ return 0; ++ if (c) { ++ bch2_journal_block(&c->journal); ++ bch2_journal_flush_all_pins(&c->journal); ++ } + ++ bu = kzalloc(nr_want * sizeof(*bu), GFP_KERNEL); ++ ob = kzalloc(nr_want * sizeof(*ob), GFP_KERNEL); + new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); + new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); -+ if (!new_buckets || !new_bucket_seq) { ++ if (!bu || !ob || !new_buckets || !new_bucket_seq) { + ret = -ENOMEM; -+ goto err; ++ goto err_unblock; + } + -+ journal_buckets = bch2_sb_resize_journal(&ca->disk_sb, -+ nr + sizeof(*journal_buckets) / sizeof(u64)); -+ if (!journal_buckets) { -+ ret = -ENOSPC; -+ goto err; ++ for (nr_got = 0; nr_got < nr_want; nr_got++) { ++ if (new_fs) { ++ bu[nr_got] = bch2_bucket_alloc_new_fs(ca); ++ if (bu[nr_got] < 0) { ++ ret = -ENOSPC; ++ break; ++ } ++ } else { ++ ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_none, ++ false, cl); ++ if (IS_ERR(ob[nr_got])) { ++ ret = cl ? 
-EAGAIN : -ENOSPC; ++ break; ++ } ++ ++ bu[nr_got] = ob[nr_got]->bucket; ++ } + } + ++ if (!nr_got) ++ goto err_unblock; ++ + /* + * We may be called from the device add path, before the new device has + * actually been added to the running filesystem: @@ -52468,54 +54574,16 @@ index 000000000000..14bea8a2535e + swap(new_buckets, ja->buckets); + swap(new_bucket_seq, ja->bucket_seq); + -+ if (!new_fs) -+ spin_unlock(&c->journal.lock); ++ for (i = 0; i < nr_got; i++) { ++ unsigned pos = ja->discard_idx ?: ja->nr; ++ long b = bu[i]; + -+ while (ja->nr < nr) { -+ struct open_bucket *ob = NULL; -+ unsigned pos; -+ long b; -+ -+ if (new_fs) { -+ if (c) -+ percpu_down_read(&c->mark_lock); -+ b = bch2_bucket_alloc_new_fs(ca); -+ if (b < 0) { -+ percpu_up_read(&c->mark_lock); -+ ret = -ENOSPC; -+ goto err; -+ } -+ } else { -+ rcu_read_lock(); -+ ob = bch2_bucket_alloc(c, ca, RESERVE_NONE, -+ false, cl); -+ rcu_read_unlock(); -+ if (IS_ERR(ob)) { -+ ret = cl ? -EAGAIN : -ENOSPC; -+ goto err; -+ } -+ -+ b = sector_to_bucket(ca, ob->ptr.offset); -+ } -+ -+ if (c) -+ spin_lock(&c->journal.lock); -+ -+ /* -+ * XXX -+ * For resize at runtime, we should be writing the new -+ * superblock before inserting into the journal array -+ */ -+ -+ pos = ja->nr ? 
(ja->cur_idx + 1) % ja->nr : 0; + __array_insert_item(ja->buckets, ja->nr, pos); + __array_insert_item(ja->bucket_seq, ja->nr, pos); -+ __array_insert_item(journal_buckets->buckets, ja->nr, pos); + ja->nr++; + + ja->buckets[pos] = b; + ja->bucket_seq[pos] = 0; -+ journal_buckets->buckets[pos] = cpu_to_le64(b); + + if (pos <= ja->discard_idx) + ja->discard_idx = (ja->discard_idx + 1) % ja->nr; @@ -52525,36 +54593,56 @@ index 000000000000..14bea8a2535e + ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; + if (pos <= ja->cur_idx) + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ++ } + -+ if (c) -+ spin_unlock(&c->journal.lock); ++ ret = bch2_journal_buckets_to_sb(c, ca); ++ if (ret) { ++ /* Revert: */ ++ swap(new_buckets, ja->buckets); ++ swap(new_bucket_seq, ja->bucket_seq); ++ ja->nr = old_nr; ++ ja->discard_idx = old_discard_idx; ++ ja->dirty_idx_ondisk = old_dirty_idx_ondisk; ++ ja->dirty_idx = old_dirty_idx; ++ ja->cur_idx = old_cur_idx; ++ } + -+ if (new_fs) { -+ bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal, -+ ca->mi.bucket_size, -+ gc_phase(GC_PHASE_SB), -+ 0); -+ if (c) -+ percpu_up_read(&c->mark_lock); -+ } else { ++ if (!new_fs) ++ spin_unlock(&c->journal.lock); ++ ++ if (c) ++ bch2_journal_unblock(&c->journal); ++ ++ if (ret) ++ goto err; ++ ++ if (!new_fs) { ++ for (i = 0; i < nr_got; i++) { + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_trans_mark_metadata_bucket(&trans, ca, -+ b, BCH_DATA_journal, ++ bu[i], BCH_DATA_journal, + ca->mi.bucket_size)); -+ -+ bch2_open_bucket_put(c, ob); -+ -+ if (ret) ++ if (ret) { ++ bch2_fs_inconsistent(c, "error marking new journal buckets: %i", ret); + goto err; ++ } + } + } +err: -+ bch2_sb_resize_journal(&ca->disk_sb, -+ ja->nr + sizeof(*journal_buckets) / sizeof(u64)); ++ if (ob && !new_fs) ++ for (i = 0; i < nr_got; i++) ++ bch2_open_bucket_put(c, ob[i]); ++ + kfree(new_bucket_seq); + kfree(new_buckets); ++ kfree(ob); ++ kfree(bu); + + return ret; ++err_unblock: ++ if (c) ++ 
bch2_journal_unblock(&c->journal); ++ goto err; +} + +/* @@ -52567,11 +54655,15 @@ index 000000000000..14bea8a2535e + struct journal_device *ja = &ca->journal; + struct closure cl; + unsigned current_nr; -+ int ret; ++ int ret = 0; ++ ++ /* don't handle reducing nr of buckets yet: */ ++ if (nr < ja->nr) ++ return 0; + + closure_init_stack(&cl); + -+ do { ++ while (ja->nr != nr && (ret == 0 || ret == -EAGAIN)) { + struct disk_reservation disk_res = { 0, 0 }; + + closure_sync(&cl); @@ -52599,7 +54691,7 @@ index 000000000000..14bea8a2535e + if (ja->nr != current_nr) + bch2_write_super(c); + mutex_unlock(&c->sb_lock); -+ } while (ret == -EAGAIN); ++ } + + return ret; +} @@ -52630,17 +54722,16 @@ index 000000000000..14bea8a2535e + +static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) +{ -+ union journal_res_state state; + bool ret = false; -+ unsigned i; ++ u64 seq; + + spin_lock(&j->lock); -+ state = READ_ONCE(j->reservations); -+ i = state.idx; ++ for (seq = journal_last_unwritten_seq(j); ++ seq <= journal_cur_seq(j) && !ret; ++ seq++) { ++ struct journal_buf *buf = journal_seq_to_buf(j, seq); + -+ while (i != state.unwritten_idx) { -+ i = (i - 1) & JOURNAL_BUF_MASK; -+ if (bch2_bkey_has_device(bkey_i_to_s_c(&j->buf[i].key), dev_idx)) ++ if (bch2_bkey_has_device(bkey_i_to_s_c(&buf->key), dev_idx)) + ret = true; + } + spin_unlock(&j->lock); @@ -52655,6 +54746,7 @@ index 000000000000..14bea8a2535e + +void bch2_fs_journal_stop(struct journal *j) +{ ++ bch2_journal_reclaim_stop(j); + bch2_journal_flush_all_pins(j); + + wait_event(j->wait, journal_entry_close(j)); @@ -52669,11 +54761,9 @@ index 000000000000..14bea8a2535e + + BUG_ON(!bch2_journal_error(j) && + test_bit(JOURNAL_REPLAY_DONE, &j->flags) && -+ (journal_entry_is_open(j) || -+ j->last_empty_seq + 1 != journal_cur_seq(j))); ++ j->last_empty_seq != journal_cur_seq(j)); + + cancel_delayed_work_sync(&j->write_work); -+ bch2_journal_reclaim_stop(j); +} + +int bch2_fs_journal_start(struct 
journal *j, u64 cur_seq, @@ -52702,10 +54792,15 @@ index 000000000000..14bea8a2535e + j->replay_journal_seq = last_seq; + j->replay_journal_seq_end = cur_seq; + j->last_seq_ondisk = last_seq; ++ j->flushed_seq_ondisk = cur_seq - 1; ++ j->seq_ondisk = cur_seq - 1; + j->pin.front = last_seq; + j->pin.back = cur_seq; + atomic64_set(&j->seq, cur_seq - 1); + ++ if (list_empty(journal_entries)) ++ j->last_empty_seq = cur_seq - 1; ++ + fifo_for_each_entry_ptr(p, &j->pin, seq) + journal_pin_list_init(p, 1); + @@ -52718,6 +54813,9 @@ index 000000000000..14bea8a2535e + if (seq < last_seq) + continue; + ++ if (journal_entry_empty(&i->j)) ++ j->last_empty_seq = le64_to_cpu(i->j.seq); ++ + p = journal_seq_pin(j, seq); + + p->devs.nr = 0; @@ -52725,16 +54823,16 @@ index 000000000000..14bea8a2535e + bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev); + } + ++ if (list_empty(journal_entries)) ++ j->last_empty_seq = cur_seq; ++ + spin_lock(&j->lock); + + set_bit(JOURNAL_STARTED, &j->flags); + j->last_flush_write = jiffies; + -+ journal_pin_new_entry(j); -+ + j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); -+ -+ bch2_journal_buf_init(j); ++ j->reservations.unwritten_idx++; + + c->last_bucket_seq_cleanup = journal_cur_seq(j); + @@ -52762,9 +54860,20 @@ index 000000000000..14bea8a2535e + struct journal_device *ja = &ca->journal; + struct bch_sb_field_journal *journal_buckets = + bch2_sb_get_journal(sb); ++ struct bch_sb_field_journal_v2 *journal_buckets_v2 = ++ bch2_sb_get_journal_v2(sb); + unsigned i; + -+ ja->nr = bch2_nr_journal_buckets(journal_buckets); ++ ja->nr = 0; ++ ++ if (journal_buckets_v2) { ++ unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); ++ ++ for (i = 0; i < nr; i++) ++ ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr); ++ } else if (journal_buckets) { ++ ja->nr = bch2_nr_journal_buckets(journal_buckets); ++ } + + ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); + if (!ja->bucket_seq) @@ -52779,8 +54888,18 @@ 
index 000000000000..14bea8a2535e + if (!ja->buckets) + return -ENOMEM; + -+ for (i = 0; i < ja->nr; i++) -+ ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); ++ if (journal_buckets_v2) { ++ unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); ++ unsigned j, dst = 0; ++ ++ for (i = 0; i < nr; i++) ++ for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++) ++ ja->buckets[dst++] = ++ le64_to_cpu(journal_buckets_v2->d[i].start) + j; ++ } else if (journal_buckets) { ++ for (i = 0; i < ja->nr; i++) ++ ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); ++ } + + return 0; +} @@ -52814,9 +54933,6 @@ index 000000000000..14bea8a2535e + + lockdep_init_map(&j->res_map, "journal res", &res_key, 0); + -+ j->write_delay_ms = 1000; -+ j->reclaim_delay_ms = 100; -+ + atomic64_set(&j->reservations.counter, + ((union journal_res_state) + { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); @@ -52848,75 +54964,81 @@ index 000000000000..14bea8a2535e + struct bch_fs *c = container_of(j, struct bch_fs, journal); + union journal_res_state s; + struct bch_dev *ca; ++ unsigned long now = jiffies; ++ u64 seq; + unsigned i; + ++ out->atomic++; ++ out->tabstops[0] = 24; ++ + rcu_read_lock(); + s = READ_ONCE(j->reservations); + -+ pr_buf(out, -+ "active journal entries:\t%llu\n" -+ "seq:\t\t\t%llu\n" -+ "last_seq:\t\t%llu\n" -+ "last_seq_ondisk:\t%llu\n" -+ "flushed_seq_ondisk:\t%llu\n" -+ "prereserved:\t\t%u/%u\n" -+ "each entry reserved:\t%u\n" -+ "nr flush writes:\t%llu\n" -+ "nr noflush writes:\t%llu\n" -+ "nr direct reclaim:\t%llu\n" -+ "nr background reclaim:\t%llu\n" -+ "reclaim kicked:\t\t%u\n" -+ "reclaim runs in:\t%u ms\n" -+ "current entry sectors:\t%u\n" -+ "current entry error:\t%u\n" -+ "current entry:\t\t", -+ fifo_used(&j->pin), -+ journal_cur_seq(j), -+ journal_last_seq(j), -+ j->last_seq_ondisk, -+ j->flushed_seq_ondisk, -+ j->prereserved.reserved, -+ j->prereserved.remaining, -+ j->entry_u64s_reserved, -+ j->nr_flush_writes, -+ 
j->nr_noflush_writes, -+ j->nr_direct_reclaim, -+ j->nr_background_reclaim, -+ j->reclaim_kicked, -+ jiffies_to_msecs(j->next_reclaim - jiffies), -+ j->cur_entry_sectors, -+ j->cur_entry_error); ++ pr_buf(out, "dirty journal entries:\t%llu/%llu\n",fifo_used(&j->pin), j->pin.size); ++ pr_buf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j)); ++ pr_buf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk); ++ pr_buf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); ++ pr_buf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); ++ pr_buf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); ++ pr_buf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining); ++ pr_buf(out, "watermark:\t\t%s\n", bch2_journal_watermarks[j->watermark]); ++ pr_buf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); ++ pr_buf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); ++ pr_buf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); ++ pr_buf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); ++ pr_buf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); ++ pr_buf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked); ++ pr_buf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) ++ ? 
jiffies_to_msecs(j->next_reclaim - jiffies) : 0); ++ pr_buf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); ++ pr_buf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); ++ pr_buf(out, "current entry:\t\t"); + + switch (s.cur_entry_offset) { + case JOURNAL_ENTRY_ERROR_VAL: -+ pr_buf(out, "error\n"); ++ pr_buf(out, "error"); + break; + case JOURNAL_ENTRY_CLOSED_VAL: -+ pr_buf(out, "closed\n"); ++ pr_buf(out, "closed"); + break; + default: -+ pr_buf(out, "%u/%u\n", -+ s.cur_entry_offset, -+ j->cur_entry_u64s); ++ pr_buf(out, "%u/%u", s.cur_entry_offset, j->cur_entry_u64s); + break; + } + -+ pr_buf(out, -+ "current entry:\t\tidx %u refcount %u\n", -+ s.idx, journal_state_count(s, s.idx)); ++ pr_newline(out); + -+ i = s.idx; -+ while (i != s.unwritten_idx) { -+ i = (i - 1) & JOURNAL_BUF_MASK; ++ for (seq = journal_cur_seq(j); ++ seq >= journal_last_unwritten_seq(j); ++ --seq) { ++ i = seq & JOURNAL_BUF_MASK; + -+ pr_buf(out, "unwritten entry:\tidx %u refcount %u sectors %u\n", -+ i, journal_state_count(s, i), j->buf[i].sectors); ++ pr_buf(out, "unwritten entry:"); ++ pr_tab(out); ++ pr_buf(out, "%llu", seq); ++ pr_newline(out); ++ pr_indent_push(out, 2); ++ ++ pr_buf(out, "refcount:"); ++ pr_tab(out); ++ pr_buf(out, "%u", journal_state_count(s, i)); ++ pr_newline(out); ++ ++ pr_buf(out, "sectors:"); ++ pr_tab(out); ++ pr_buf(out, "%u", j->buf[i].sectors); ++ pr_newline(out); ++ ++ pr_buf(out, "expires"); ++ pr_tab(out); ++ pr_buf(out, "%li jiffies", j->buf[i].expires - jiffies); ++ pr_newline(out); ++ ++ pr_indent_pop(out, 2); + } + + pr_buf(out, -+ "need write:\t\t%i\n" + "replay done:\t\t%i\n", -+ test_bit(JOURNAL_NEED_WRITE, &j->flags), + test_bit(JOURNAL_REPLAY_DONE, &j->flags)); + + pr_buf(out, "space:\n"); @@ -52943,25 +55065,19 @@ index 000000000000..14bea8a2535e + if (!ja->nr) + continue; + -+ pr_buf(out, -+ "dev %u:\n" -+ "\tnr\t\t%u\n" -+ "\tbucket size\t%u\n" -+ "\tavailable\t%u:%u\n" -+ "\tdiscard_idx\t%u\n" -+ 
"\tdirty_ondisk\t%u (seq %llu)\n" -+ "\tdirty_idx\t%u (seq %llu)\n" -+ "\tcur_idx\t\t%u (seq %llu)\n", -+ i, ja->nr, ca->mi.bucket_size, -+ bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), -+ ja->sectors_free, -+ ja->discard_idx, -+ ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk], -+ ja->dirty_idx, ja->bucket_seq[ja->dirty_idx], -+ ja->cur_idx, ja->bucket_seq[ja->cur_idx]); ++ pr_buf(out, "dev %u:\n", i); ++ pr_buf(out, "\tnr\t\t%u\n", ja->nr); ++ pr_buf(out, "\tbucket size\t%u\n", ca->mi.bucket_size); ++ pr_buf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free); ++ pr_buf(out, "\tdiscard_idx\t%u\n", ja->discard_idx); ++ pr_buf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]); ++ pr_buf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]); ++ pr_buf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]); + } + + rcu_read_unlock(); ++ ++ --out->atomic; +} + +void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) @@ -52971,36 +55087,68 @@ index 000000000000..14bea8a2535e + spin_unlock(&j->lock); +} + -+void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) ++bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) +{ + struct journal_entry_pin_list *pin_list; + struct journal_entry_pin *pin; -+ u64 i; + + spin_lock(&j->lock); -+ fifo_for_each_entry_ptr(pin_list, &j->pin, i) { -+ pr_buf(out, "%llu: count %u\n", -+ i, atomic_read(&pin_list->count)); ++ *seq = max(*seq, j->pin.front); + -+ list_for_each_entry(pin, &pin_list->list, list) -+ pr_buf(out, "\t%px %ps\n", -+ pin, pin->flush); -+ -+ if (!list_empty(&pin_list->flushed)) -+ pr_buf(out, "flushed:\n"); -+ -+ list_for_each_entry(pin, &pin_list->flushed, list) -+ pr_buf(out, "\t%px %ps\n", -+ pin, pin->flush); ++ if (*seq >= j->pin.back) { ++ 
spin_unlock(&j->lock); ++ return true; + } ++ ++ out->atomic++; ++ ++ pin_list = journal_seq_pin(j, *seq); ++ ++ pr_buf(out, "%llu: count %u", *seq, atomic_read(&pin_list->count)); ++ pr_newline(out); ++ pr_indent_push(out, 2); ++ ++ list_for_each_entry(pin, &pin_list->list, list) { ++ pr_buf(out, "\t%px %ps", pin, pin->flush); ++ pr_newline(out); ++ } ++ ++ list_for_each_entry(pin, &pin_list->key_cache_list, list) { ++ pr_buf(out, "\t%px %ps", pin, pin->flush); ++ pr_newline(out); ++ } ++ ++ if (!list_empty(&pin_list->flushed)) { ++ pr_buf(out, "flushed:"); ++ pr_newline(out); ++ } ++ ++ list_for_each_entry(pin, &pin_list->flushed, list) { ++ pr_buf(out, "\t%px %ps", pin, pin->flush); ++ pr_newline(out); ++ } ++ ++ pr_indent_pop(out, 2); ++ ++ --out->atomic; + spin_unlock(&j->lock); ++ ++ return false; ++} ++ ++void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) ++{ ++ u64 seq = 0; ++ ++ while (!bch2_journal_seq_pins_to_text(out, j, &seq)) ++ seq++; +} diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h new file mode 100644 -index 000000000000..c39cbbf1bccd +index 000000000000..e7321c327d9d --- /dev/null +++ b/fs/bcachefs/journal.h -@@ -0,0 +1,519 @@ +@@ -0,0 +1,522 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_H +#define _BCACHEFS_JOURNAL_H @@ -53144,6 +55292,11 @@ index 000000000000..c39cbbf1bccd + return j->pin.back - 1; +} + ++static inline u64 journal_last_unwritten_seq(struct journal *j) ++{ ++ return j->seq_ondisk + 1; ++} ++ +void bch2_journal_set_has_inum(struct journal *, u64, u64); + +static inline int journal_state_count(union journal_res_state s, int idx) @@ -53264,9 +55417,6 @@ index 000000000000..c39cbbf1bccd + .buf3_count = idx == 3, + }).v, &j->reservations.counter); + -+ EBUG_ON(((s.idx - idx) & 3) > -+ ((s.idx - s.unwritten_idx) & 3)); -+ + if (!journal_state_count(s, idx) && idx == s.unwritten_idx) + __bch2_journal_buf_put(j); +} @@ -53296,9 +55446,9 @@ index 000000000000..c39cbbf1bccd +int 
bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, + unsigned); + -+#define JOURNAL_RES_GET_NONBLOCK (1 << 0) -+#define JOURNAL_RES_GET_CHECK (1 << 1) -+#define JOURNAL_RES_GET_RESERVED (1 << 2) ++/* First two bits for JOURNAL_WATERMARK: */ ++#define JOURNAL_RES_GET_NONBLOCK (1 << 2) ++#define JOURNAL_RES_GET_CHECK (1 << 3) + +static inline int journal_res_get_fast(struct journal *j, + struct journal_res *res, @@ -53319,8 +55469,7 @@ index 000000000000..c39cbbf1bccd + + EBUG_ON(!journal_state_count(new, new.idx)); + -+ if (!(flags & JOURNAL_RES_GET_RESERVED) && -+ !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) ++ if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark) + return 0; + + new.cur_entry_offset += res->u64s; @@ -53373,23 +55522,27 @@ index 000000000000..c39cbbf1bccd + +/* journal_preres: */ + -+static inline bool journal_check_may_get_unreserved(struct journal *j) ++static inline void journal_set_watermark(struct journal *j) +{ + union journal_preres_state s = READ_ONCE(j->prereserved); -+ bool ret = s.reserved < s.remaining && -+ fifo_free(&j->pin) > 8; ++ unsigned watermark = JOURNAL_WATERMARK_any; + -+ lockdep_assert_held(&j->lock); ++ if (fifo_free(&j->pin) < j->pin.size / 4) ++ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_copygc); ++ if (fifo_free(&j->pin) < j->pin.size / 8) ++ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_reserved); + -+ if (ret != test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { -+ if (ret) { -+ set_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); -+ journal_wake(j); -+ } else { -+ clear_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); -+ } -+ } -+ return ret; ++ if (s.reserved > s.remaining) ++ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_copygc); ++ if (!s.remaining) ++ watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_reserved); ++ ++ if (watermark == j->watermark) ++ return; ++ ++ swap(watermark, j->watermark); ++ if (watermark > j->watermark) ++ journal_wake(j); +} + +static 
inline void bch2_journal_preres_put(struct journal *j, @@ -53409,12 +55562,8 @@ index 000000000000..c39cbbf1bccd + closure_wake_up(&j->preres_wait); + } + -+ if (s.reserved <= s.remaining && -+ !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { -+ spin_lock(&j->lock); -+ journal_check_may_get_unreserved(j); -+ spin_unlock(&j->lock); -+ } ++ if (s.reserved <= s.remaining && j->watermark) ++ journal_set_watermark(j); +} + +int __bch2_journal_preres_get(struct journal *, @@ -53435,8 +55584,7 @@ index 000000000000..c39cbbf1bccd + old.v = new.v = v; + ret = 0; + -+ if ((flags & JOURNAL_RES_GET_RESERVED) || -+ test_bit(JOURNAL_NOCHANGES, &j->flags) || ++ if ((flags & JOURNAL_WATERMARK_reserved) || + new.reserved + d < new.remaining) { + new.reserved += d; + ret = 1; @@ -53480,7 +55628,9 @@ index 000000000000..c39cbbf1bccd + +int bch2_journal_flush_seq(struct journal *, u64); +int bch2_journal_flush(struct journal *); ++bool bch2_journal_noflush_seq(struct journal *, u64); +int bch2_journal_meta(struct journal *); ++int bch2_journal_log_msg(struct journal *, const char *, ...); + +void bch2_journal_halt(struct journal *); + @@ -53504,6 +55654,7 @@ index 000000000000..c39cbbf1bccd +void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); +void bch2_journal_debug_to_text(struct printbuf *, struct journal *); +void bch2_journal_pins_to_text(struct printbuf *, struct journal *); ++bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); + +int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, + unsigned nr); @@ -53522,12 +55673,13 @@ index 000000000000..c39cbbf1bccd +#endif /* _BCACHEFS_JOURNAL_H */ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c new file mode 100644 -index 000000000000..5c8304e05abd +index 000000000000..e61b88930a7f --- /dev/null +++ b/fs/bcachefs/journal_io.c -@@ -0,0 +1,1554 @@ +@@ -0,0 +1,1700 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" ++#include "alloc_background.h" 
+#include "alloc_foreground.h" +#include "btree_io.h" +#include "btree_update_interior.h" @@ -53575,12 +55727,12 @@ index 000000000000..5c8304e05abd + * be replayed: + */ +static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, -+ struct bch_extent_ptr entry_ptr, ++ struct journal_ptr entry_ptr, + struct journal_list *jlist, struct jset *j, + bool bad) +{ + struct journal_replay *i, *pos, *dup = NULL; -+ struct bch_extent_ptr *ptr; ++ struct journal_ptr *ptr; + struct list_head *where; + size_t bytes = vstruct_bytes(j); + u64 last_seq = 0; @@ -53780,14 +55932,15 @@ index 000000000000..5c8304e05abd + invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k), + __btree_node_type(level, btree_id)); + if (invalid) { -+ char buf[160]; ++ struct printbuf buf = PRINTBUF; + -+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k)); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + mustfix_fsck_err(c, "invalid %s in %s entry offset %zi/%u: %s\n%s", + type, where, + (u64 *) k - entry->_data, + le16_to_cpu(entry->u64s), -+ invalid, buf); ++ invalid, buf.buf); ++ printbuf_exit(&buf); + + le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); + memmove(k, bkey_next(k), next - (void *) bkey_next(k)); @@ -53802,7 +55955,7 @@ index 000000000000..5c8304e05abd + return ret; +} + -+static int journal_entry_validate_btree_keys(struct bch_fs *c, ++static int journal_entry_btree_keys_validate(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) @@ -53823,7 +55976,24 @@ index 000000000000..5c8304e05abd + return 0; +} + -+static int journal_entry_validate_btree_root(struct bch_fs *c, ++static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct bkey_i *k; ++ bool first = true; ++ ++ vstruct_for_each(entry, k) { ++ if (!first) { ++ pr_newline(out); ++ pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]); ++ } ++ pr_buf(out, "btree=%s l=%u ", 
bch2_btree_ids[entry->btree_id], entry->level); ++ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); ++ first = false; ++ } ++} ++ ++static int journal_entry_btree_root_validate(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) @@ -53851,7 +56021,13 @@ index 000000000000..5c8304e05abd + return ret; +} + -+static int journal_entry_validate_prio_ptrs(struct bch_fs *c, ++static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ journal_entry_btree_keys_to_text(out, c, entry); ++} ++ ++static int journal_entry_prio_ptrs_validate(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) @@ -53860,7 +56036,12 @@ index 000000000000..5c8304e05abd + return 0; +} + -+static int journal_entry_validate_blacklist(struct bch_fs *c, ++static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++} ++ ++static int journal_entry_blacklist_validate(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) @@ -53875,7 +56056,16 @@ index 000000000000..5c8304e05abd + return ret; +} + -+static int journal_entry_validate_blacklist_v2(struct bch_fs *c, ++static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_blacklist *bl = ++ container_of(entry, struct jset_entry_blacklist, entry); ++ ++ pr_buf(out, "seq=%llu", le64_to_cpu(bl->seq)); ++} ++ ++static int journal_entry_blacklist_v2_validate(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) @@ -53901,7 +56091,18 @@ index 000000000000..5c8304e05abd + return ret; +} + -+static int journal_entry_validate_usage(struct bch_fs *c, ++static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct 
bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_blacklist_v2 *bl = ++ container_of(entry, struct jset_entry_blacklist_v2, entry); ++ ++ pr_buf(out, "start=%llu end=%llu", ++ le64_to_cpu(bl->start), ++ le64_to_cpu(bl->end)); ++} ++ ++static int journal_entry_usage_validate(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) @@ -53922,7 +56123,18 @@ index 000000000000..5c8304e05abd + return ret; +} + -+static int journal_entry_validate_data_usage(struct bch_fs *c, ++static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_usage *u = ++ container_of(entry, struct jset_entry_usage, entry); ++ ++ pr_buf(out, "type=%s v=%llu", ++ bch2_fs_usage_types[u->entry.btree_id], ++ le64_to_cpu(u->v)); ++} ++ ++static int journal_entry_data_usage_validate(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) @@ -53944,7 +56156,17 @@ index 000000000000..5c8304e05abd + return ret; +} + -+static int journal_entry_validate_clock(struct bch_fs *c, ++static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_data_usage *u = ++ container_of(entry, struct jset_entry_data_usage, entry); ++ ++ bch2_replicas_entry_to_text(out, &u->r); ++ pr_buf(out, "=%llu", le64_to_cpu(u->v)); ++} ++ ++static int journal_entry_clock_validate(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) @@ -53970,7 +56192,16 @@ index 000000000000..5c8304e05abd + return ret; +} + -+static int journal_entry_validate_dev_usage(struct bch_fs *c, ++static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_clock *clock = ++ container_of(entry, struct jset_entry_clock, entry); ++ ++ pr_buf(out, 
"%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time)); ++} ++ ++static int journal_entry_dev_usage_validate(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) @@ -54007,15 +56238,59 @@ index 000000000000..5c8304e05abd + return ret; +} + ++static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_dev_usage *u = ++ container_of(entry, struct jset_entry_dev_usage, entry); ++ unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); ++ ++ pr_buf(out, "dev=%u", le32_to_cpu(u->dev)); ++ ++ for (i = 0; i < nr_types; i++) { ++ if (i < BCH_DATA_NR) ++ pr_buf(out, " %s", bch2_data_types[i]); ++ else ++ pr_buf(out, " (unknown data type %u)", i); ++ pr_buf(out, ": buckets=%llu sectors=%llu fragmented=%llu", ++ le64_to_cpu(u->d[i].buckets), ++ le64_to_cpu(u->d[i].sectors), ++ le64_to_cpu(u->d[i].fragmented)); ++ } ++ ++ pr_buf(out, " buckets_ec: %llu buckets_unavailable: %llu", ++ le64_to_cpu(u->buckets_ec), ++ le64_to_cpu(u->buckets_unavailable)); ++} ++ ++static int journal_entry_log_validate(struct bch_fs *c, ++ const char *where, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, int write) ++{ ++ return 0; ++} ++ ++static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); ++ unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); ++ ++ pr_buf(out, "%.*s", bytes, l->d); ++} ++ +struct jset_entry_ops { + int (*validate)(struct bch_fs *, const char *, + struct jset_entry *, unsigned, int, int); ++ void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); +}; + +static const struct jset_entry_ops bch2_jset_entry_ops[] = { +#define x(f, nr) \ + [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ -+ .validate = journal_entry_validate_##f, \ ++ 
.validate = journal_entry_##f##_validate, \ ++ .to_text = journal_entry_##f##_to_text, \ + }, + BCH_JSET_ENTRY_TYPES() +#undef x @@ -54031,6 +56306,17 @@ index 000000000000..5c8304e05abd + : 0; +} + ++void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ if (entry->type < BCH_JSET_ENTRY_NR) { ++ pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]); ++ bch2_jset_entry_ops[entry->type].to_text(out, c, entry); ++ } else { ++ pr_buf(out, "(unknown type %u)", entry->type); ++ } ++} ++ +static int jset_validate_entries(struct bch_fs *c, struct jset *jset, + int write) +{ @@ -54120,9 +56406,11 @@ index 000000000000..5c8304e05abd + sector, le64_to_cpu(jset->seq))) + ret = JOURNAL_ENTRY_BAD; + -+ bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), ++ ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), + jset->encrypted_start, + vstruct_end(jset) - (void *) jset->encrypted_start); ++ bch2_fs_fatal_err_on(ret, c, ++ "error decrypting journal entry: %i", ret); +csum_done: + /* last_seq is ignored when JSET_NO_FLUSH is true */ + if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && @@ -54238,7 +56526,7 @@ index 000000000000..5c8304e05abd + case JOURNAL_ENTRY_NONE: + if (!saw_bad) + return 0; -+ sectors = c->opts.block_size; ++ sectors = block_sectors(c); + goto next_block; + case JOURNAL_ENTRY_BAD: + saw_bad = true; @@ -54247,7 +56535,7 @@ index 000000000000..5c8304e05abd + * field of the journal entry we read, so try reading + * again at next block boundary: + */ -+ sectors = c->opts.block_size; ++ sectors = block_sectors(c); + break; + default: + return ret; @@ -54265,9 +56553,12 @@ index 000000000000..5c8304e05abd + ja->bucket_seq[bucket] = le64_to_cpu(j->seq); + + mutex_lock(&jlist->lock); -+ ret = journal_entry_add(c, ca, (struct bch_extent_ptr) { -+ .dev = ca->dev_idx, -+ .offset = offset, ++ ret = journal_entry_add(c, ca, (struct journal_ptr) { ++ .dev = ca->dev_idx, ++ .bucket = bucket, ++ 
.bucket_offset = offset - ++ bucket_to_sector(ca, ja->buckets[bucket]), ++ .sector = offset, + }, jlist, j, ret != 0); + mutex_unlock(&jlist->lock); + @@ -54294,12 +56585,14 @@ index 000000000000..5c8304e05abd + struct journal_device *ja = + container_of(cl, struct journal_device, read); + struct bch_dev *ca = container_of(ja, struct bch_dev, journal); ++ struct bch_fs *c = ca->fs; + struct journal_list *jlist = + container_of(cl->parent, struct journal_list, cl); ++ struct journal_replay *r; + struct journal_read_buf buf = { NULL, 0 }; + u64 min_seq = U64_MAX; + unsigned i; -+ int ret; ++ int ret = 0; + + if (!ja->nr) + goto out; @@ -54331,11 +56624,37 @@ index 000000000000..5c8304e05abd + * allocate + */ + while (ja->bucket_seq[ja->cur_idx] > min_seq && -+ ja->bucket_seq[ja->cur_idx] > ++ ja->bucket_seq[ja->cur_idx] == + ja->bucket_seq[(ja->cur_idx + 1) % ja->nr]) + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + -+ ja->sectors_free = 0; ++ ja->sectors_free = ca->mi.bucket_size; ++ ++ mutex_lock(&jlist->lock); ++ list_for_each_entry(r, jlist->head, list) { ++ for (i = 0; i < r->nr_ptrs; i++) { ++ if (r->ptrs[i].dev == ca->dev_idx && ++ sector_to_bucket(ca, r->ptrs[i].sector) == ja->buckets[ja->cur_idx]) { ++ unsigned wrote = (r->ptrs[i].sector % ca->mi.bucket_size) + ++ vstruct_sectors(&r->j, c->block_bits); ++ ++ ja->sectors_free = min(ja->sectors_free, ++ ca->mi.bucket_size - wrote); ++ } ++ } ++ } ++ mutex_unlock(&jlist->lock); ++ ++ if (ja->bucket_seq[ja->cur_idx] && ++ ja->sectors_free == ca->mi.bucket_size) { ++ bch_err(c, "ja->sectors_free == ca->mi.bucket_size"); ++ bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr); ++ for (i = 0; i < 3; i++) { ++ unsigned idx = ja->cur_idx - 1 + i; ++ bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]); ++ } ++ ja->sectors_free = 0; ++ } + + /* + * Set dirty_idx to indicate the entire journal is full and needs to be @@ -54345,6 +56664,7 @@ index 000000000000..5c8304e05abd + ja->discard_idx = ja->dirty_idx_ondisk = + 
ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; +out: ++ bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); + kvpfree(buf.data, buf.size); + percpu_ref_put(&ca->io_ref); + closure_return(cl); @@ -54356,8 +56676,8 @@ index 000000000000..5c8304e05abd + goto out; +} + -+static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, -+ struct journal_replay *j) ++void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, ++ struct journal_replay *j) +{ + unsigned i; + @@ -54365,13 +56685,15 @@ index 000000000000..5c8304e05abd + struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev); + u64 offset; + -+ div64_u64_rem(j->ptrs[i].offset, ca->mi.bucket_size, &offset); ++ div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset); + + if (i) + pr_buf(out, " "); -+ pr_buf(out, "%u:%llu (offset %llu)", ++ pr_buf(out, "%u:%u:%u (sector %llu)", + j->ptrs[i].dev, -+ (u64) j->ptrs[i].offset, offset); ++ j->ptrs[i].bucket, ++ j->ptrs[i].bucket_offset, ++ j->ptrs[i].sector); + } +} + @@ -54382,6 +56704,7 @@ index 000000000000..5c8304e05abd + struct journal_replay *i, *t; + struct bch_dev *ca; + unsigned iter; ++ struct printbuf buf = PRINTBUF; + size_t keys = 0, entries = 0; + bool degraded = false; + u64 seq, last_seq = 0; @@ -54440,7 +56763,8 @@ index 000000000000..5c8304e05abd + + if (!last_seq) { + fsck_err(c, "journal read done, but no entries found after dropping non-flushes"); -+ return -1; ++ ret = -1; ++ goto err; + } + + /* Drop blacklisted entries and entries older than last_seq: */ @@ -54472,7 +56796,7 @@ index 000000000000..5c8304e05abd + + while (seq < le64_to_cpu(i->j.seq)) { + u64 missing_start, missing_end; -+ char buf1[200], buf2[200]; ++ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + + while (seq < le64_to_cpu(i->j.seq) && + bch2_journal_seq_is_blacklisted(c, seq, false)) @@ -54488,14 +56812,13 @@ index 000000000000..5c8304e05abd + seq++; + + if (i->list.prev != list) { -+ struct printbuf out = PBUF(buf1); 
+ struct journal_replay *p = list_prev_entry(i, list); + -+ bch2_journal_ptrs_to_text(&out, c, p); -+ pr_buf(&out, " size %llu", vstruct_sectors(&p->j, c->block_bits)); ++ bch2_journal_ptrs_to_text(&buf1, c, p); ++ pr_buf(&buf1, " size %zu", vstruct_sectors(&p->j, c->block_bits)); + } else -+ sprintf(buf1, "(none)"); -+ bch2_journal_ptrs_to_text(&PBUF(buf2), c, i); ++ pr_buf(&buf1, "(none)"); ++ bch2_journal_ptrs_to_text(&buf2, c, i); + + missing_end = seq - 1; + fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" @@ -54503,7 +56826,10 @@ index 000000000000..5c8304e05abd + " next at %s", + missing_start, missing_end, + last_seq, *blacklist_seq - 1, -+ buf1, buf2); ++ buf1.buf, buf2.buf); ++ ++ printbuf_exit(&buf1); ++ printbuf_exit(&buf2); + } + + seq++; @@ -54517,14 +56843,13 @@ index 000000000000..5c8304e05abd + .e.nr_required = 1, + }; + unsigned ptr; -+ char buf[80]; + + if (i->ignore) + continue; + + ret = jset_validate_entries(c, &i->j, READ); + if (ret) -+ goto fsck_err; ++ goto err; + + for (ptr = 0; ptr < i->nr_ptrs; ptr++) + replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev; @@ -54536,15 +56861,17 @@ index 000000000000..5c8304e05abd + * the devices - this is wrong: + */ + ++ printbuf_reset(&buf); ++ bch2_replicas_entry_to_text(&buf, &replicas.e); ++ + if (!degraded && + (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || + fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c, + "superblock not marked as containing replicas %s", -+ (bch2_replicas_entry_to_text(&PBUF(buf), -+ &replicas.e), buf)))) { ++ buf.buf))) { + ret = bch2_mark_replicas(c, &replicas.e); + if (ret) -+ return ret; ++ goto err; + } + + for_each_jset_key(k, _n, entry, &i->j) @@ -54558,7 +56885,9 @@ index 000000000000..5c8304e05abd + if (*start_seq != *blacklist_seq) + bch_info(c, "dropped unflushed entries %llu-%llu", + *blacklist_seq, *start_seq - 1); ++err: +fsck_err: ++ printbuf_exit(&buf); + return ret; +} + @@ -54685,49 +57014,6 @@ index 
000000000000..5c8304e05abd + return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS; +} + -+static void journal_write_compact(struct jset *jset) -+{ -+ struct jset_entry *i, *next, *prev = NULL; -+ -+ /* -+ * Simple compaction, dropping empty jset_entries (from journal -+ * reservations that weren't fully used) and merging jset_entries that -+ * can be. -+ * -+ * If we wanted to be really fancy here, we could sort all the keys in -+ * the jset and drop keys that were overwritten - probably not worth it: -+ */ -+ vstruct_for_each_safe(jset, i, next) { -+ unsigned u64s = le16_to_cpu(i->u64s); -+ -+ /* Empty entry: */ -+ if (!u64s) -+ continue; -+ -+ /* Can we merge with previous entry? */ -+ if (prev && -+ i->btree_id == prev->btree_id && -+ i->level == prev->level && -+ i->type == prev->type && -+ i->type == BCH_JSET_ENTRY_btree_keys && -+ le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { -+ memmove_u64s_down(vstruct_next(prev), -+ i->_data, -+ u64s); -+ le16_add_cpu(&prev->u64s, u64s); -+ continue; -+ } -+ -+ /* Couldn't merge, move i into new position (after prev): */ -+ prev = prev ? vstruct_next(prev) : jset->start; -+ if (i != prev) -+ memmove_u64s_down(prev, i, jset_u64s(u64s)); -+ } -+ -+ prev = prev ? vstruct_next(prev) : jset->start; -+ jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); -+} -+ +static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) +{ + /* we aren't holding j->lock: */ @@ -54753,7 +57039,7 @@ index 000000000000..5c8304e05abd + +static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) +{ -+ return j->buf + j->reservations.unwritten_idx; ++ return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); +} + +static void journal_write_done(struct closure *cl) @@ -54766,7 +57052,9 @@ index 000000000000..5c8304e05abd + u64 v, seq; + int err = 0; + -+ bch2_time_stats_update(j->write_time, j->write_start_time); ++ bch2_time_stats_update(!JSET_NO_FLUSH(w->data) ++ ? 
j->flush_write_time ++ : j->noflush_write_time, j->write_start_time); + + if (!w->devs_written.nr) { + bch_err(c, "unable to write journal to sufficient devices"); @@ -54788,15 +57076,18 @@ index 000000000000..5c8304e05abd + journal_seq_pin(j, seq)->devs = w->devs_written; + + if (!err) { -+ j->seq_ondisk = seq; -+ + if (!JSET_NO_FLUSH(w->data)) { + j->flushed_seq_ondisk = seq; + j->last_seq_ondisk = w->last_seq; ++ ++ bch2_do_discards(c); ++ closure_wake_up(&c->freelist_wait); + } + } else if (!j->err_seq || seq < j->err_seq) + j->err_seq = seq; + ++ j->seq_ondisk = seq; ++ + /* + * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard + * more buckets: @@ -54812,7 +57103,7 @@ index 000000000000..5c8304e05abd + v = atomic64_read(&j->reservations.counter); + do { + old.v = new.v = v; -+ BUG_ON(new.idx == new.unwritten_idx); ++ BUG_ON(journal_state_count(new, new.unwritten_idx)); + + new.unwritten_idx++; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, @@ -54823,13 +57114,24 @@ index 000000000000..5c8304e05abd + closure_wake_up(&w->wait); + journal_wake(j); + -+ if (test_bit(JOURNAL_NEED_WRITE, &j->flags)) -+ mod_delayed_work(c->io_complete_wq, &j->write_work, 0); -+ spin_unlock(&j->lock); -+ -+ if (new.unwritten_idx != new.idx && -+ !journal_state_count(new, new.unwritten_idx)) ++ if (!journal_state_count(new, new.unwritten_idx) && ++ journal_last_unwritten_seq(j) <= journal_cur_seq(j)) { + closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); ++ } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && ++ new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { ++ struct journal_buf *buf = journal_cur_buf(j); ++ long delta = buf->expires - jiffies; ++ ++ /* ++ * We don't close a journal entry to write it while there's ++ * previous entries still in flight - the current journal entry ++ * might want to be written now: ++ */ ++ ++ mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta)); ++ } ++ ++ 
spin_unlock(&j->lock); +} + +static void journal_write_endio(struct bio *bio) @@ -54911,7 +57213,7 @@ index 000000000000..5c8304e05abd + struct jset_entry *start, *end; + struct jset *jset; + struct bio *bio; -+ char *journal_debug_buf = NULL; ++ struct printbuf journal_debug_buf = PRINTBUF; + bool validate_before_checksum = false; + unsigned i, sectors, bytes, u64s, nr_rw_members = 0; + int ret; @@ -54924,10 +57226,11 @@ index 000000000000..5c8304e05abd + j->write_start_time = local_clock(); + + spin_lock(&j->lock); -+ if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) && -+ !w->must_flush && -+ (jiffies - j->last_flush_write) < msecs_to_jiffies(j->write_delay_ms) && -+ test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) { ++ if (bch2_journal_error(j) || ++ w->noflush || ++ (!w->must_flush && ++ (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && ++ test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) { + w->noflush = true; + SET_JSET_NO_FLUSH(jset, true); + jset->last_seq = 0; @@ -54964,17 +57267,15 @@ index 000000000000..5c8304e05abd + le32_add_cpu(&jset->u64s, u64s); + BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors); + -+ journal_write_compact(jset); -+ + jset->magic = cpu_to_le64(jset_magic(c)); -+ jset->version = c->sb.version < bcachefs_metadata_version_new_versioning ++ jset->version = c->sb.version < bcachefs_metadata_version_bkey_renumber + ? 
cpu_to_le32(BCH_JSET_VERSION_OLD) + : cpu_to_le32(c->sb.version); + + SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); + SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); + -+ if (journal_entry_empty(jset)) ++ if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) + j->last_empty_seq = le64_to_cpu(jset->seq); + + if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) @@ -54987,9 +57288,12 @@ index 000000000000..5c8304e05abd + jset_validate_for_write(c, jset)) + goto err; + -+ bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), ++ ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), + jset->encrypted_start, + vstruct_end(jset) - (void *) jset->encrypted_start); ++ if (bch2_fs_fatal_err_on(ret, c, ++ "error decrypting journal entry: %i", ret)) ++ goto err; + + jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), + journal_nonce(jset), jset); @@ -55014,11 +57318,8 @@ index 000000000000..5c8304e05abd + goto retry_alloc; + } + -+ if (ret) { -+ journal_debug_buf = kmalloc(4096, GFP_ATOMIC); -+ if (journal_debug_buf) -+ __bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j); -+ } ++ if (ret) ++ __bch2_journal_debug_to_text(&journal_debug_buf, j); + + /* + * write is allocated, no longer need to account for it in @@ -55035,8 +57336,8 @@ index 000000000000..5c8304e05abd + + if (ret) { + bch_err(c, "Unable to allocate journal write:\n%s", -+ journal_debug_buf); -+ kfree(journal_debug_buf); ++ journal_debug_buf.buf); ++ printbuf_exit(&journal_debug_buf); + bch2_fatal_error(c); + continue_at(cl, journal_write_done, c->io_complete_wq); + return; @@ -55044,7 +57345,7 @@ index 000000000000..5c8304e05abd + + w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); + -+ if (test_bit(JOURNAL_NOCHANGES, &j->flags)) ++ if (c->opts.nochanges) + goto no_io; + + for_each_rw_member(ca, c, i) @@ -55067,25 +57368,21 @@ index 000000000000..5c8304e05abd + } + } + -+ bch2_bucket_seq_cleanup(c); -+ + continue_at(cl, do_journal_write, c->io_complete_wq); + return; 
+no_io: -+ bch2_bucket_seq_cleanup(c); -+ + continue_at(cl, journal_write_done, c->io_complete_wq); + return; +err: -+ bch2_inconsistent_error(c); ++ bch2_fatal_error(c); + continue_at(cl, journal_write_done, c->io_complete_wq); +} diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h new file mode 100644 -index 000000000000..f34281a28f12 +index 000000000000..f2001835e43e --- /dev/null +++ b/fs/bcachefs/journal_io.h -@@ -0,0 +1,50 @@ +@@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_IO_H +#define _BCACHEFS_JOURNAL_IO_H @@ -55096,7 +57393,12 @@ index 000000000000..f34281a28f12 + */ +struct journal_replay { + struct list_head list; -+ struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX]; ++ struct journal_ptr { ++ u8 dev; ++ u32 bucket; ++ u32 bucket_offset; ++ u64 sector; ++ } ptrs[BCH_REPLICAS_MAX]; + unsigned nr_ptrs; + + /* checksum error, but we may want to try using it anyways: */ @@ -55128,8 +57430,13 @@ index 000000000000..f34281a28f12 + for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ + vstruct_for_each_safe(entry, k, _n) + -+int bch2_journal_entry_validate(struct bch_fs *, const char *, struct jset_entry *, -+ unsigned, int, int); ++int bch2_journal_entry_validate(struct bch_fs *, const char *, ++ struct jset_entry *, unsigned, int, int); ++void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, ++ struct jset_entry *); ++ ++void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *, ++ struct journal_replay *); + +int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *); + @@ -55138,10 +57445,10 @@ index 000000000000..f34281a28f12 +#endif /* _BCACHEFS_JOURNAL_IO_H */ diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c new file mode 100644 -index 000000000000..ca482c6743c3 +index 000000000000..a9f7d5a7feb2 --- /dev/null +++ b/fs/bcachefs/journal_reclaim.c -@@ -0,0 +1,849 @@ +@@ -0,0 +1,847 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include 
"bcachefs.h" @@ -55178,10 +57485,8 @@ index 000000000000..ca482c6743c3 + struct journal_device *ja, + enum journal_space_from from) +{ -+ unsigned available = !test_bit(JOURNAL_NOCHANGES, &j->flags) -+ ? ((journal_space_from(ja, from) - -+ ja->cur_idx - 1 + ja->nr) % ja->nr) -+ : ja->nr; ++ unsigned available = (journal_space_from(ja, from) - ++ ja->cur_idx - 1 + ja->nr) % ja->nr; + + /* + * Don't use the last bucket unless writing the new last_seq @@ -55205,25 +57510,13 @@ index 000000000000..ca482c6743c3 + old.v, new.v)) != old.v); +} + -+static inline unsigned get_unwritten_sectors(struct journal *j, unsigned *idx) -+{ -+ unsigned sectors = 0; -+ -+ while (!sectors && *idx != j->reservations.idx) { -+ sectors = j->buf[*idx].sectors; -+ -+ *idx = (*idx + 1) & JOURNAL_BUF_MASK; -+ } -+ -+ return sectors; -+} -+ +static struct journal_space +journal_dev_space_available(struct journal *j, struct bch_dev *ca, + enum journal_space_from from) +{ + struct journal_device *ja = &ca->journal; -+ unsigned sectors, buckets, unwritten, idx = j->reservations.unwritten_idx; ++ unsigned sectors, buckets, unwritten; ++ u64 seq; + + if (from == journal_space_total) + return (struct journal_space) { @@ -55238,7 +57531,14 @@ index 000000000000..ca482c6743c3 + * We that we don't allocate the space for a journal entry + * until we write it out - thus, account for it here: + */ -+ while ((unwritten = get_unwritten_sectors(j, &idx))) { ++ for (seq = journal_last_unwritten_seq(j); ++ seq <= journal_cur_seq(j); ++ seq++) { ++ unwritten = j->buf[seq & JOURNAL_BUF_MASK].sectors; ++ ++ if (!unwritten) ++ continue; ++ + /* entry won't fit on this device, skip: */ + if (unwritten > ca->mi.bucket_size) + continue; @@ -55346,7 +57646,7 @@ index 000000000000..ca482c6743c3 + j->can_discard = can_discard; + + if (nr_online < c->opts.metadata_replicas_required) { -+ ret = cur_entry_insufficient_devices; ++ ret = JOURNAL_ERR_insufficient_devices; + goto out; + } + @@ -55360,23 +57660,24 @@ index 
000000000000..ca482c6743c3 + total = j->space[journal_space_total].total; + + if (!clean_ondisk && -+ j->reservations.idx == -+ j->reservations.unwritten_idx) { -+ char *buf = kmalloc(4096, GFP_ATOMIC); ++ journal_cur_seq(j) == j->seq_ondisk) { ++ struct printbuf buf = PRINTBUF; + -+ bch_err(c, "journal stuck"); -+ if (buf) { -+ __bch2_journal_debug_to_text(&_PBUF(buf, 4096), j); -+ pr_err("\n%s", buf); -+ kfree(buf); -+ } ++ __bch2_journal_debug_to_text(&buf, j); ++ bch_err(c, "journal stuck\n%s", buf.buf); ++ printbuf_exit(&buf); + ++ /* ++ * Hack: bch2_fatal_error() calls bch2_journal_halt() which ++ * takes journal lock: ++ */ ++ spin_unlock(&j->lock); + bch2_fatal_error(c); -+ ret = cur_entry_journal_stuck; ++ spin_lock(&j->lock); ++ ++ ret = JOURNAL_ERR_journal_stuck; + } else if (!j->space[journal_space_discarded].next_entry) -+ ret = cur_entry_journal_full; -+ else if (!fifo_free(&j->pin)) -+ ret = cur_entry_journal_pin_full; ++ ret = JOURNAL_ERR_journal_full; + + if ((j->space[journal_space_clean_ondisk].next_entry < + j->space[journal_space_clean_ondisk].total) && @@ -55395,7 +57696,7 @@ index 000000000000..ca482c6743c3 + j->cur_entry_sectors = !ret ? 
j->space[journal_space_discarded].next_entry : 0; + j->cur_entry_error = ret; + journal_set_remaining(j, u64s_remaining); -+ journal_check_may_get_unreserved(j); ++ journal_set_watermark(j); + + if (!ret) + journal_wake(j); @@ -55430,7 +57731,8 @@ index 000000000000..ca482c6743c3 + struct journal_device *ja = &ca->journal; + + while (should_discard_bucket(j, ja)) { -+ if (ca->mi.discard && ++ if (!c->opts.nochanges && ++ ca->mi.discard && + blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) + blkdev_issue_discard(ca->disk_sb.bdev, + bucket_to_sector(ca, @@ -55517,9 +57819,6 @@ index 000000000000..ca482c6743c3 + if (atomic_dec_and_test(&pin_list->count) && + pin_list == &fifo_peek_front(&j->pin)) + bch2_journal_reclaim_fast(j); -+ else if (fifo_used(&j->pin) == 1 && -+ atomic_read(&pin_list->count) == 1) -+ journal_wake(j); +} + +void bch2_journal_pin_drop(struct journal *j, @@ -55633,9 +57932,6 @@ index 000000000000..ca482c6743c3 + u64 seq; + int err; + -+ if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)) -+ return 0; -+ + lockdep_assert_held(&j->reclaim_lock); + + while (1) { @@ -55781,7 +58077,7 @@ index 000000000000..ca482c6743c3 + * make sure to flush at least one journal pin: + */ + if (time_after(jiffies, j->last_flushed + -+ msecs_to_jiffies(j->reclaim_delay_ms))) ++ msecs_to_jiffies(c->opts.journal_reclaim_delay))) + min_nr = 1; + + if (j->prereserved.reserved * 4 > j->prereserved.remaining) @@ -55815,7 +58111,7 @@ index 000000000000..ca482c6743c3 + + if (nr_flushed) + wake_up(&j->reclaim_wait); -+ } while ((min_nr || min_key_cache) && !direct); ++ } while ((min_nr || min_key_cache) && nr_flushed && !direct); + + memalloc_noreclaim_restore(flags); + @@ -55830,13 +58126,13 @@ index 000000000000..ca482c6743c3 +static int bch2_journal_reclaim_thread(void *arg) +{ + struct journal *j = arg; ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); + unsigned long delay, now; ++ bool journal_empty; + int ret = 0; + + set_freezable(); + -+ 
kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)); -+ + j->last_flushed = jiffies; + + while (!ret && !kthread_should_stop()) { @@ -55847,7 +58143,7 @@ index 000000000000..ca482c6743c3 + mutex_unlock(&j->reclaim_lock); + + now = jiffies; -+ delay = msecs_to_jiffies(j->reclaim_delay_ms); ++ delay = msecs_to_jiffies(c->opts.journal_reclaim_delay); + j->next_reclaim = j->last_flushed + delay; + + if (!time_in_range(j->next_reclaim, now, now + delay)) @@ -55859,10 +58155,17 @@ index 000000000000..ca482c6743c3 + break; + if (j->reclaim_kicked) + break; -+ if (time_after_eq(jiffies, j->next_reclaim)) -+ break; -+ freezable_schedule_timeout(j->next_reclaim - jiffies); + ++ spin_lock(&j->lock); ++ journal_empty = fifo_empty(&j->pin); ++ spin_unlock(&j->lock); ++ ++ if (journal_empty) ++ freezable_schedule(); ++ else if (time_after(j->next_reclaim, jiffies)) ++ freezable_schedule_timeout(j->next_reclaim - jiffies); ++ else ++ break; + } + __set_current_state(TASK_RUNNING); + } @@ -55914,7 +58217,8 @@ index 000000000000..ca482c6743c3 + + mutex_lock(&j->reclaim_lock); + -+ *did_work = journal_flush_pins(j, seq_to_flush, 0, 0) != 0; ++ if (journal_flush_pins(j, seq_to_flush, 0, 0)) ++ *did_work = true; + + spin_lock(&j->lock); + /* @@ -55923,8 +58227,7 @@ index 000000000000..ca482c6743c3 + */ + ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || + journal_last_seq(j) > seq_to_flush || -+ (fifo_used(&j->pin) == 1 && -+ atomic_read(&fifo_peek_front(&j->pin).count) == 1); ++ !fifo_used(&j->pin); + + spin_unlock(&j->lock); + mutex_unlock(&j->reclaim_lock); @@ -55972,10 +58275,12 @@ index 000000000000..ca482c6743c3 + seq = 0; + + spin_lock(&j->lock); -+ while (!ret && seq < j->pin.back) { ++ while (!ret) { + struct bch_replicas_padded replicas; + + seq = max(seq, journal_last_seq(j)); ++ if (seq >= j->pin.back) ++ break; + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, + journal_seq_pin(j, seq)->devs); + seq++; @@ -56083,12 +58388,270 @@ index 
000000000000..0fd1af120db5 +int bch2_journal_flush_device_pins(struct journal *, int); + +#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ +diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c +new file mode 100644 +index 000000000000..8efe7b7e3dcb +--- /dev/null ++++ b/fs/bcachefs/journal_sb.c +@@ -0,0 +1,222 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "journal_sb.h" ++ ++#include ++ ++/* BCH_SB_FIELD_journal: */ ++ ++static int u64_cmp(const void *_l, const void *_r) ++{ ++ const u64 *l = _l; ++ const u64 *r = _r; ++ ++ return cmp_int(*l, *r); ++} ++ ++static int bch2_sb_journal_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_journal *journal = field_to_type(f, journal); ++ struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; ++ int ret = -EINVAL; ++ unsigned nr; ++ unsigned i; ++ u64 *b; ++ ++ nr = bch2_nr_journal_buckets(journal); ++ if (!nr) ++ return 0; ++ ++ b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); ++ if (!b) ++ return -ENOMEM; ++ ++ for (i = 0; i < nr; i++) ++ b[i] = le64_to_cpu(journal->buckets[i]); ++ ++ sort(b, nr, sizeof(u64), u64_cmp, NULL); ++ ++ if (!b[0]) { ++ pr_buf(err, "journal bucket at sector 0"); ++ goto err; ++ } ++ ++ if (b[0] < le16_to_cpu(m->first_bucket)) { ++ pr_buf(err, "journal bucket %llu before first bucket %u", ++ b[0], le16_to_cpu(m->first_bucket)); ++ goto err; ++ } ++ ++ if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) { ++ pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)", ++ b[nr - 1], le64_to_cpu(m->nbuckets)); ++ goto err; ++ } ++ ++ for (i = 0; i + 1 < nr; i++) ++ if (b[i] == b[i + 1]) { ++ pr_buf(err, "duplicate journal buckets %llu", b[i]); ++ goto err; ++ } ++ ++ ret = 0; ++err: ++ kfree(b); ++ return ret; ++} ++ ++static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_journal *journal = field_to_type(f, journal); 
++ unsigned i, nr = bch2_nr_journal_buckets(journal); ++ ++ pr_buf(out, "Buckets: "); ++ for (i = 0; i < nr; i++) ++ pr_buf(out, " %llu", le64_to_cpu(journal->buckets[i])); ++ pr_newline(out); ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_journal = { ++ .validate = bch2_sb_journal_validate, ++ .to_text = bch2_sb_journal_to_text, ++}; ++ ++struct u64_range { ++ u64 start; ++ u64 end; ++}; ++ ++static int u64_range_cmp(const void *_l, const void *_r) ++{ ++ const struct u64_range *l = _l; ++ const struct u64_range *r = _r; ++ ++ return cmp_int(l->start, r->start); ++} ++ ++static int bch2_sb_journal_v2_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); ++ struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; ++ int ret = -EINVAL; ++ unsigned nr; ++ unsigned i; ++ struct u64_range *b; ++ ++ nr = bch2_sb_field_journal_v2_nr_entries(journal); ++ if (!nr) ++ return 0; ++ ++ b = kmalloc_array(sizeof(*b), nr, GFP_KERNEL); ++ if (!b) ++ return -ENOMEM; ++ ++ for (i = 0; i < nr; i++) { ++ b[i].start = le64_to_cpu(journal->d[i].start); ++ b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr); ++ } ++ ++ sort(b, nr, sizeof(*b), u64_range_cmp, NULL); ++ ++ if (!b[0].start) { ++ pr_buf(err, "journal bucket at sector 0"); ++ goto err; ++ } ++ ++ if (b[0].start < le16_to_cpu(m->first_bucket)) { ++ pr_buf(err, "journal bucket %llu before first bucket %u", ++ b[0].start, le16_to_cpu(m->first_bucket)); ++ goto err; ++ } ++ ++ if (b[nr - 1].end > le64_to_cpu(m->nbuckets)) { ++ pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)", ++ b[nr - 1].end - 1, le64_to_cpu(m->nbuckets)); ++ goto err; ++ } ++ ++ for (i = 0; i + 1 < nr; i++) { ++ if (b[i].end == b[i + 1].start) { ++ pr_buf(err, "contiguous journal buckets ranges %llu-%llu, %llu-%llu", ++ b[i].start, b[i].end, b[i + 1].start, b[i + 1].end); ++ goto err; ++ } ++ ++ if (b[i].end > b[i 
+ 1].start) { ++ pr_buf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu", ++ b[i].start, b[i].end, b[i + 1].start, b[i + 1].end); ++ goto err; ++ } ++ } ++ ++ ret = 0; ++err: ++ kfree(b); ++ return ret; ++} ++ ++static void bch2_sb_journal_v2_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); ++ unsigned i, nr = bch2_sb_field_journal_v2_nr_entries(journal); ++ ++ pr_buf(out, "Buckets: "); ++ for (i = 0; i < nr; i++) ++ pr_buf(out, " %llu-%llu", ++ le64_to_cpu(journal->d[i].start), ++ le64_to_cpu(journal->d[i].start) + le64_to_cpu(journal->d[i].nr)); ++ pr_newline(out); ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = { ++ .validate = bch2_sb_journal_v2_validate, ++ .to_text = bch2_sb_journal_v2_to_text, ++}; ++ ++int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct journal_device *ja = &ca->journal; ++ struct bch_sb_field_journal_v2 *j; ++ unsigned i, dst = 0, nr = 1; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ if (!ja->nr) { ++ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); ++ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2); ++ return 0; ++ } ++ ++ for (i = 0; i + 1 < ja->nr; i++) ++ if (ja->buckets[i] + 1 != ja->buckets[i + 1]) ++ nr++; ++ ++ j = bch2_sb_resize_journal_v2(&ca->disk_sb, ++ (sizeof(*j) + sizeof(j->d[0]) * nr) / sizeof(u64)); ++ if (!j) ++ return -ENOSPC; ++ ++ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); ++ ++ j->d[dst].start = le64_to_cpu(ja->buckets[0]); ++ j->d[dst].nr = le64_to_cpu(1); ++ ++ for (i = 1; i < ja->nr; i++) { ++ if (ja->buckets[i] == ja->buckets[i - 1] + 1) { ++ le64_add_cpu(&j->d[dst].nr, 1); ++ } else { ++ dst++; ++ j->d[dst].start = le64_to_cpu(ja->buckets[i]); ++ j->d[dst].nr = le64_to_cpu(1); ++ } ++ } ++ ++ return 0; ++} +diff --git a/fs/bcachefs/journal_sb.h b/fs/bcachefs/journal_sb.h +new file mode 100644 +index 
000000000000..a39192e9f6f4 +--- /dev/null ++++ b/fs/bcachefs/journal_sb.h +@@ -0,0 +1,24 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#include "super-io.h" ++#include "vstructs.h" ++ ++static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j) ++{ ++ return j ++ ? (__le64 *) vstruct_end(&j->field) - j->buckets ++ : 0; ++} ++ ++static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_journal_v2 *j) ++{ ++ if (!j) ++ return 0; ++ ++ return (struct bch_sb_field_journal_v2_entry *) vstruct_end(&j->field) - &j->d[0]; ++} ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_journal; ++extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2; ++ ++int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c new file mode 100644 -index 000000000000..79bc0e49389b +index 000000000000..3140c8731431 --- /dev/null +++ b/fs/bcachefs/journal_seq_blacklist.c -@@ -0,0 +1,315 @@ +@@ -0,0 +1,322 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -56157,6 +58720,12 @@ index 000000000000..79bc0e49389b + return bl; +} + ++static bool bl_entry_contig_or_overlaps(struct journal_seq_blacklist_entry *e, ++ u64 start, u64 end) ++{ ++ return !(end < le64_to_cpu(e->start) || le64_to_cpu(e->end) < start); ++} ++ +int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) +{ + struct bch_sb_field_journal_seq_blacklist *bl; @@ -56167,28 +58736,21 @@ index 000000000000..79bc0e49389b + bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); + nr = blacklist_nr_entries(bl); + -+ if (bl) { -+ for (i = 0; i < nr; i++) { -+ struct journal_seq_blacklist_entry *e = -+ bl->start + i; ++ for (i = 0; i < nr; i++) { ++ struct journal_seq_blacklist_entry *e = ++ bl->start + i; + -+ if (start == le64_to_cpu(e->start) && -+ end == le64_to_cpu(e->end)) -+ goto out; ++ if (bl_entry_contig_or_overlaps(e, start, end)) { ++ e->start = 
cpu_to_le64(min(start, le64_to_cpu(e->start))); ++ e->end = cpu_to_le64(max(end, le64_to_cpu(e->end))); + -+ if (start <= le64_to_cpu(e->start) && -+ end >= le64_to_cpu(e->end)) { -+ e->start = cpu_to_le64(start); -+ e->end = cpu_to_le64(end); -+ -+ if (i + 1 < nr) -+ bl = blacklist_entry_try_merge(c, -+ bl, i); -+ if (i) -+ bl = blacklist_entry_try_merge(c, -+ bl, i - 1); -+ goto out_write_sb; -+ } ++ if (i + 1 < nr) ++ bl = blacklist_entry_try_merge(c, ++ bl, i); ++ if (i) ++ bl = blacklist_entry_try_merge(c, ++ bl, i - 1); ++ goto out_write_sb; + } + } + @@ -56280,27 +58842,34 @@ index 000000000000..79bc0e49389b + return 0; +} + -+static const char * -+bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, -+ struct bch_sb_field *f) ++static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) +{ + struct bch_sb_field_journal_seq_blacklist *bl = + field_to_type(f, journal_seq_blacklist); -+ struct journal_seq_blacklist_entry *i; -+ unsigned nr = blacklist_nr_entries(bl); ++ unsigned i, nr = blacklist_nr_entries(bl); + -+ for (i = bl->start; i < bl->start + nr; i++) { -+ if (le64_to_cpu(i->start) >= -+ le64_to_cpu(i->end)) -+ return "entry start >= end"; ++ for (i = 0; i < nr; i++) { ++ struct journal_seq_blacklist_entry *e = bl->start + i; + -+ if (i + 1 < bl->start + nr && -+ le64_to_cpu(i[0].end) > -+ le64_to_cpu(i[1].start)) -+ return "entries out of order"; ++ if (le64_to_cpu(e->start) >= ++ le64_to_cpu(e->end)) { ++ pr_buf(err, "entry %u start >= end (%llu >= %llu)", ++ i, le64_to_cpu(e->start), le64_to_cpu(e->end)); ++ return -EINVAL; ++ } ++ ++ if (i + 1 < nr && ++ le64_to_cpu(e[0].end) > ++ le64_to_cpu(e[1].start)) { ++ pr_buf(err, "entry %u out of order with next entry (%llu > %llu)", ++ i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start)); ++ return -EINVAL; ++ } + } + -+ return NULL; ++ return 0; +} + +static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, @@ -56320,6 
+58889,7 @@ index 000000000000..79bc0e49389b + le64_to_cpu(i->start), + le64_to_cpu(i->end)); + } ++ pr_newline(out); +} + +const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { @@ -56434,10 +59004,10 @@ index 000000000000..afb886ec8e25 +#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h new file mode 100644 -index 000000000000..d484513289aa +index 000000000000..a6cdb885ad41 --- /dev/null +++ b/fs/bcachefs/journal_types.h -@@ -0,0 +1,324 @@ +@@ -0,0 +1,340 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_TYPES_H +#define _BCACHEFS_JOURNAL_TYPES_H @@ -56465,6 +59035,8 @@ index 000000000000..d484513289aa + + struct closure_waitlist wait; + u64 last_seq; /* copy of data->last_seq */ ++ long expires; ++ u64 flush_time; + + unsigned buf_size; /* size in bytes of @data */ + unsigned sectors; /* maximum size for current entry */ @@ -56579,20 +59151,39 @@ index 000000000000..d484513289aa + journal_space_nr, +}; + -+/* -+ * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP, -+ * either because something's waiting on the write to complete or because it's -+ * been dirty too long and the timer's expired. 
-+ */ -+ +enum { + JOURNAL_REPLAY_DONE, + JOURNAL_STARTED, -+ JOURNAL_RECLAIM_STARTED, -+ JOURNAL_NEED_WRITE, -+ JOURNAL_MAY_GET_UNRESERVED, + JOURNAL_MAY_SKIP_FLUSH, -+ JOURNAL_NOCHANGES, ++}; ++ ++#define JOURNAL_WATERMARKS() \ ++ x(any) \ ++ x(copygc) \ ++ x(reserved) ++ ++enum journal_watermark { ++#define x(n) JOURNAL_WATERMARK_##n, ++ JOURNAL_WATERMARKS() ++#undef x ++}; ++ ++#define JOURNAL_WATERMARK_MASK 3 ++ ++/* Reasons we may fail to get a journal reservation: */ ++#define JOURNAL_ERRORS() \ ++ x(ok) \ ++ x(blocked) \ ++ x(max_in_flight) \ ++ x(journal_full) \ ++ x(journal_pin_full) \ ++ x(journal_stuck) \ ++ x(insufficient_devices) ++ ++enum journal_errors { ++#define x(n) JOURNAL_ERR_##n, ++ JOURNAL_ERRORS() ++#undef x +}; + +/* Embedded in struct bch_fs */ @@ -56602,6 +59193,7 @@ index 000000000000..d484513289aa + unsigned long flags; + + union journal_res_state reservations; ++ enum journal_watermark watermark; + + /* Max size of current journal entry */ + unsigned cur_entry_u64s; @@ -56611,14 +59203,7 @@ index 000000000000..d484513289aa + * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if + * insufficient devices: + */ -+ enum { -+ cur_entry_ok, -+ cur_entry_blocked, -+ cur_entry_journal_full, -+ cur_entry_journal_pin_full, -+ cur_entry_journal_stuck, -+ cur_entry_insufficient_devices, -+ } cur_entry_error; ++ enum journal_errors cur_entry_error; + + union journal_preres_state prereserved; + @@ -56686,6 +59271,10 @@ index 000000000000..d484513289aa + spinlock_t err_lock; + + struct mutex reclaim_lock; ++ /* ++ * Used for waiting until journal reclaim has freed up space in the ++ * journal: ++ */ + wait_queue_head_t reclaim_wait; + struct task_struct *reclaim_thread; + bool reclaim_kicked; @@ -56702,19 +59291,16 @@ index 000000000000..d484513289aa + struct mutex discard_lock; + bool can_discard; + -+ unsigned write_delay_ms; -+ unsigned reclaim_delay_ms; + unsigned long last_flush_write; + + u64 res_get_blocked_start; -+ u64 need_write_time; 
+ u64 write_start_time; + + u64 nr_flush_writes; + u64 nr_noflush_writes; + -+ struct time_stats *write_time; -+ struct time_stats *delay_time; ++ struct time_stats *flush_write_time; ++ struct time_stats *noflush_write_time; + struct time_stats *blocked_time; + struct time_stats *flush_seq_time; + @@ -56939,6 +59525,238 @@ index 000000000000..4b3ff7d8a875 +}; + +#endif /* _BCACHEFS_KEYLIST_TYPES_H */ +diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c +new file mode 100644 +index 000000000000..4f0e6960e597 +--- /dev/null ++++ b/fs/bcachefs/lru.c +@@ -0,0 +1,203 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "btree_iter.h" ++#include "btree_update.h" ++#include "error.h" ++#include "lru.h" ++#include "recovery.h" ++ ++const char *bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ const struct bch_lru *lru = bkey_s_c_to_lru(k).v; ++ ++ if (bkey_val_bytes(k.k) < sizeof(*lru)) ++ return "incorrect value size"; ++ ++ return NULL; ++} ++ ++void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ const struct bch_lru *lru = bkey_s_c_to_lru(k).v; ++ ++ pr_buf(out, "idx %llu", le64_to_cpu(lru->idx)); ++} ++ ++static int lru_delete(struct btree_trans *trans, u64 id, u64 idx, u64 time) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ u64 existing_idx; ++ int ret = 0; ++ ++ if (!time) ++ return 0; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_lru, ++ POS(id, time), ++ BTREE_ITER_INTENT| ++ BTREE_ITER_WITH_UPDATES); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_lru) { ++ bch2_fs_inconsistent(c, ++ "pointer to nonexistent lru %llu:%llu", ++ id, time); ++ ret = -EIO; ++ goto err; ++ } ++ ++ existing_idx = le64_to_cpu(bkey_s_c_to_lru(k).v->idx); ++ if (existing_idx != idx) { ++ bch2_fs_inconsistent(c, ++ "lru %llu:%llu with wrong backpointer: got %llu, 
should be %llu", ++ id, time, existing_idx, idx); ++ ret = -EIO; ++ goto err; ++ } ++ ++ ret = bch2_btree_delete_at(trans, &iter, 0); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int lru_set(struct btree_trans *trans, u64 lru_id, u64 idx, u64 *time) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_i_lru *lru; ++ int ret = 0; ++ ++ if (!*time) ++ return 0; ++ ++ for_each_btree_key_norestart(trans, iter, BTREE_ID_lru, ++ POS(lru_id, *time), ++ BTREE_ITER_SLOTS| ++ BTREE_ITER_INTENT| ++ BTREE_ITER_WITH_UPDATES, k, ret) ++ if (bkey_deleted(k.k)) ++ break; ++ ++ if (ret) ++ goto err; ++ ++ BUG_ON(iter.pos.inode != lru_id); ++ *time = iter.pos.offset; ++ ++ lru = bch2_trans_kmalloc(trans, sizeof(*lru)); ++ ret = PTR_ERR_OR_ZERO(lru); ++ if (ret) ++ goto err; ++ ++ bkey_lru_init(&lru->k_i); ++ lru->k.p = iter.pos; ++ lru->v.idx = cpu_to_le64(idx); ++ ++ ret = bch2_trans_update(trans, &iter, &lru->k_i, 0); ++ if (ret) ++ goto err; ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_lru_change(struct btree_trans *trans, u64 id, u64 idx, ++ u64 old_time, u64 *new_time) ++{ ++ if (old_time == *new_time) ++ return 0; ++ ++ return lru_delete(trans, id, idx, old_time) ?: ++ lru_set(trans, id, idx, new_time); ++} ++ ++static int bch2_check_lru_key(struct btree_trans *trans, ++ struct btree_iter *lru_iter, bool initial) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c lru_k, k; ++ struct bch_alloc_v4 a; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ u64 idx; ++ int ret; ++ ++ lru_k = bch2_btree_iter_peek(lru_iter); ++ if (!lru_k.k) ++ return 0; ++ ++ ret = bkey_err(lru_k); ++ if (ret) ++ return ret; ++ ++ idx = le64_to_cpu(bkey_s_c_to_lru(lru_k).v->idx); ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, ++ POS(lru_k.k->p.inode, idx), 0); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ 
bch2_alloc_to_v4(k, &a); ++ ++ if (fsck_err_on(bucket_state(a) != BUCKET_cached || ++ a.io_time[READ] != lru_k.k->p.offset, c, ++ "incorrect lru entry %s\n" ++ " for %s", ++ (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), ++ (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { ++ struct bkey_i *update = ++ bch2_trans_kmalloc(trans, sizeof(*update)); ++ ++ ret = PTR_ERR_OR_ZERO(update); ++ if (ret) ++ goto err; ++ ++ bkey_init(&update->k); ++ update->k.p = lru_iter->pos; ++ ++ ret = bch2_trans_update(trans, lru_iter, update, 0); ++ if (ret) ++ goto err; ++ } ++err: ++fsck_err: ++ bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); ++ return ret; ++} ++ ++int bch2_check_lrus(struct bch_fs *c, bool initial) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_lru, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_check_lru_key(&trans, &iter, initial)); ++ if (ret) ++ break; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++ ++} +diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h +new file mode 100644 +index 000000000000..4db6a8399332 +--- /dev/null ++++ b/fs/bcachefs/lru.h +@@ -0,0 +1,17 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_LRU_H ++#define _BCACHEFS_LRU_H ++ ++const char *bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_lru (struct bkey_ops) { \ ++ .key_invalid = bch2_lru_invalid, \ ++ .val_to_text = bch2_lru_to_text, \ ++} ++ ++int bch2_lru_change(struct btree_trans *, u64, u64, u64, u64 *); ++ ++int bch2_check_lrus(struct bch_fs *, bool); ++ ++#endif /* _BCACHEFS_LRU_H */ diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c new file mode 100644 index 
000000000000..6defc33322b3 @@ -57156,10 +59974,10 @@ index 000000000000..027efaa0d575 +#endif /* _BCACHEFS_MIGRATE_H */ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c new file mode 100644 -index 000000000000..64e39c10e34b +index 000000000000..1de213506adf --- /dev/null +++ b/fs/bcachefs/move.c -@@ -0,0 +1,1124 @@ +@@ -0,0 +1,1130 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -57254,10 +60072,10 @@ index 000000000000..64e39c10e34b + + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) { + struct bkey_i *update; -+ size_t i; ++ u32 *i; + -+ for (i = 0; i < s.nr; i++) -+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, s.d[i])) ++ darray_for_each(s.ids, i) ++ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, *i)) + goto next; + + update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); @@ -57287,7 +60105,7 @@ index 000000000000..64e39c10e34b + } + } + bch2_trans_iter_exit(trans, &iter); -+ kfree(s.d); ++ darray_exit(s.ids); + + return ret; +} @@ -57513,8 +60331,7 @@ index 000000000000..64e39c10e34b + } + + if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) { -+ m->op.alloc_reserve = RESERVE_MOVINGGC; -+ m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; ++ m->op.alloc_reserve = RESERVE_movinggc; + } else { + /* XXX: this should probably be passed in */ + m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; @@ -57556,10 +60373,14 @@ index 000000000000..64e39c10e34b + unsigned compressed_sectors = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ if (p.ptr.dev == data_opts.rewrite_dev && -+ !p.ptr.cached && -+ crc_is_compressed(p.crc)) -+ compressed_sectors += p.crc.compressed_size; ++ if (p.ptr.dev == data_opts.rewrite_dev) { ++ if (p.ptr.cached) ++ m->op.flags |= BCH_WRITE_CACHED; ++ ++ if (!p.ptr.cached && ++ crc_is_compressed(p.crc)) ++ compressed_sectors += p.crc.compressed_size; ++ } + + if (compressed_sectors) { + ret = bch2_disk_reservation_add(c, &m->op.res, @@ -57639,25 +60460,26 @@ index 
000000000000..64e39c10e34b + atomic_sub(io->read_sectors, &ctxt->read_sectors); + io->read_completed = true; + -+ if (next_pending_write(ctxt)) -+ wake_up(&ctxt->wait); -+ ++ wake_up(&ctxt->wait); + closure_put(&ctxt->cl); +} + -+static void do_pending_writes(struct moving_context *ctxt) ++static void do_pending_writes(struct moving_context *ctxt, struct btree_trans *trans) +{ + struct moving_io *io; + ++ if (trans) ++ bch2_trans_unlock(trans); ++ + while ((io = next_pending_write(ctxt))) { + list_del(&io->list); + closure_call(&io->cl, move_write, NULL, &ctxt->cl); + } +} + -+#define move_ctxt_wait_event(_ctxt, _cond) \ ++#define move_ctxt_wait_event(_ctxt, _trans, _cond) \ +do { \ -+ do_pending_writes(_ctxt); \ ++ do_pending_writes(_ctxt, _trans); \ + \ + if (_cond) \ + break; \ @@ -57665,11 +60487,12 @@ index 000000000000..64e39c10e34b + next_pending_write(_ctxt) || (_cond)); \ +} while (1) + -+static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) ++static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt, ++ struct btree_trans *trans) +{ + unsigned sectors_pending = atomic_read(&ctxt->write_sectors); + -+ move_ctxt_wait_event(ctxt, ++ move_ctxt_wait_event(ctxt, trans, + !atomic_read(&ctxt->write_sectors) || + atomic_read(&ctxt->write_sectors) != sectors_pending); +} @@ -57691,14 +60514,6 @@ index 000000000000..64e39c10e34b + unsigned sectors = k.k->size, pages; + int ret = -ENOMEM; + -+ move_ctxt_wait_event(ctxt, -+ atomic_read(&ctxt->write_sectors) < -+ SECTORS_IN_FLIGHT_PER_DEVICE); -+ -+ move_ctxt_wait_event(ctxt, -+ atomic_read(&ctxt->read_sectors) < -+ SECTORS_IN_FLIGHT_PER_DEVICE); -+ + /* write path might have to decompress data: */ + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); @@ -57849,26 +60664,36 @@ index 000000000000..64e39c10e34b + schedule_timeout(delay); + + if (unlikely(freezing(current))) { -+ bch2_trans_unlock(&trans); -+ move_ctxt_wait_event(ctxt, 
list_empty(&ctxt->reads)); ++ move_ctxt_wait_event(ctxt, &trans, list_empty(&ctxt->reads)); + try_to_freeze(); + } + } while (delay); + ++ move_ctxt_wait_event(ctxt, &trans, ++ atomic_read(&ctxt->write_sectors) < ++ SECTORS_IN_FLIGHT_PER_DEVICE); ++ ++ move_ctxt_wait_event(ctxt, &trans, ++ atomic_read(&ctxt->read_sectors) < ++ SECTORS_IN_FLIGHT_PER_DEVICE); ++ + bch2_trans_begin(&trans); + + k = bch2_btree_iter_peek(&iter); -+ -+ stats->pos = iter.pos; -+ + if (!k.k) + break; ++ + ret = bkey_err(k); ++ if (ret == -EINTR) ++ continue; + if (ret) + break; ++ + if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) + break; + ++ stats->pos = iter.pos; ++ + if (!bkey_extent_is_direct_data(k.k)) + goto next_nondata; + @@ -57903,22 +60728,22 @@ index 000000000000..64e39c10e34b + BUG(); + } + -+ /* unlock before doing IO: */ ++ /* ++ * The iterator gets unlocked by __bch2_read_extent - need to ++ * save a copy of @k elsewhere: ++ */ + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); -+ bch2_trans_unlock(&trans); + + ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k, + data_cmd, data_opts); + if (ret2) { -+ if (ret2 == -EINTR) { -+ bch2_trans_begin(&trans); ++ if (ret2 == -EINTR) + continue; -+ } + + if (ret2 == -ENOMEM) { + /* memory allocation failure, wait for some IO to finish */ -+ bch2_move_ctxt_wait_for_io(ctxt); ++ bch2_move_ctxt_wait_for_io(ctxt, &trans); + continue; + } + @@ -57929,8 +60754,7 @@ index 000000000000..64e39c10e34b + if (rate) + bch2_ratelimit_increment(rate, k.k->size); +next: -+ atomic64_add(k.k->size * bch2_bkey_nr_ptrs_allocated(k), -+ &stats->sectors_seen); ++ atomic64_add(k.k->size, &stats->sectors_seen); +next_nondata: + bch2_btree_iter_advance(&iter); + } @@ -58004,7 +60828,7 @@ index 000000000000..64e39c10e34b + } + + -+ move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); ++ move_ctxt_wait_event(&ctxt, NULL, list_empty(&ctxt.reads)); + closure_sync(&ctxt.cl); + + EBUG_ON(atomic_read(&ctxt.write_sectors)); @@ -58390,10 
+61214,10 @@ index 000000000000..9df6d18137a5 +#endif /* _BCACHEFS_MOVE_TYPES_H */ diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c new file mode 100644 -index 000000000000..5c9eafc026c9 +index 000000000000..cb6b81678ecc --- /dev/null +++ b/fs/bcachefs/movinggc.c -@@ -0,0 +1,385 @@ +@@ -0,0 +1,424 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Moving/copying garbage collector @@ -58402,6 +61226,7 @@ index 000000000000..5c9eafc026c9 + */ + +#include "bcachefs.h" ++#include "alloc_background.h" +#include "alloc_foreground.h" +#include "btree_iter.h" +#include "btree_update.h" @@ -58425,21 +61250,6 @@ index 000000000000..5c9eafc026c9 +#include +#include + -+/* -+ * We can't use the entire copygc reserve in one iteration of copygc: we may -+ * need the buckets we're freeing up to go back into the copygc reserve to make -+ * forward progress, but if the copygc reserve is full they'll be available for -+ * any allocation - and it's possible that in a given iteration, we free up most -+ * of the buckets we're going to free before we allocate most of the buckets -+ * we're going to allocate. 
-+ * -+ * If we only use half of the reserve per iteration, then in steady state we'll -+ * always have room in the reserve for the buckets we're going to need in the -+ * next iteration: -+ */ -+#define COPYGC_BUCKETS_PER_ITER(ca) \ -+ ((ca)->free[RESERVE_MOVINGGC].size / 2) -+ +static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) +{ + const struct copygc_heap_entry *l = _l; @@ -58465,10 +61275,14 @@ index 000000000000..5c9eafc026c9 + .dev = p.ptr.dev, + .offset = p.ptr.offset, + }; ++ ssize_t i; + -+ ssize_t i = eytzinger0_find_le(h->data, h->used, -+ sizeof(h->data[0]), -+ bucket_offset_cmp, &search); ++ if (p.ptr.cached) ++ continue; ++ ++ i = eytzinger0_find_le(h->data, h->used, ++ sizeof(h->data[0]), ++ bucket_offset_cmp, &search); +#if 0 + /* eytzinger search verify code: */ + ssize_t j = -1, k; @@ -58497,7 +61311,7 @@ index 000000000000..5c9eafc026c9 + data_opts->target = io_opts->background_target; + data_opts->nr_replicas = 1; + data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE| -+ BTREE_INSERT_JOURNAL_RESERVED; ++ JOURNAL_WATERMARK_copygc; + data_opts->rewrite_dev = p.ptr.dev; + + if (p.has_ec) @@ -58510,18 +61324,6 @@ index 000000000000..5c9eafc026c9 + return DATA_SKIP; +} + -+static bool have_copygc_reserve(struct bch_dev *ca) -+{ -+ bool ret; -+ -+ spin_lock(&ca->fs->freelist_lock); -+ ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) || -+ ca->allocator_state != ALLOCATOR_running; -+ spin_unlock(&ca->fs->freelist_lock); -+ -+ return ret; -+} -+ +static inline int fragmentation_cmp(copygc_heap *heap, + struct copygc_heap_entry l, + struct copygc_heap_entry r) @@ -58529,18 +61331,106 @@ index 000000000000..5c9eafc026c9 + return cmp_int(l.fragmentation, r.fragmentation); +} + ++static int walk_buckets_to_copygc(struct bch_fs *c) ++{ ++ copygc_heap *h = &c->copygc_heap; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_alloc_v4 a; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ 
for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, iter.pos.inode); ++ struct copygc_heap_entry e; ++ ++ bch2_alloc_to_v4(k, &a); ++ ++ if (a.data_type != BCH_DATA_user || ++ a.dirty_sectors >= ca->mi.bucket_size || ++ bch2_bucket_is_open(c, iter.pos.inode, iter.pos.offset)) ++ continue; ++ ++ e = (struct copygc_heap_entry) { ++ .dev = iter.pos.inode, ++ .gen = a.gen, ++ .replicas = 1 + a.stripe_redundancy, ++ .fragmentation = (u64) a.dirty_sectors * (1ULL << 31) ++ / ca->mi.bucket_size, ++ .sectors = a.dirty_sectors, ++ .offset = bucket_to_sector(ca, iter.pos.offset), ++ }; ++ heap_add_or_replace(h, e, -fragmentation_cmp, NULL); ++ ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int bucket_inorder_cmp(const void *_l, const void *_r) ++{ ++ const struct copygc_heap_entry *l = _l; ++ const struct copygc_heap_entry *r = _r; ++ ++ return cmp_int(l->dev, r->dev) ?: cmp_int(l->offset, r->offset); ++} ++ ++static int check_copygc_was_done(struct bch_fs *c, ++ u64 *sectors_not_moved, ++ u64 *buckets_not_moved) ++{ ++ copygc_heap *h = &c->copygc_heap; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_alloc_v4 a; ++ struct copygc_heap_entry *i; ++ int ret = 0; ++ ++ sort(h->data, h->used, sizeof(h->data[0]), bucket_inorder_cmp, NULL); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, 0); ++ ++ for (i = h->data; i < h->data + h->used; i++) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev); ++ ++ bch2_btree_iter_set_pos(&iter, POS(i->dev, sector_to_bucket(ca, i->offset))); ++ ++ ret = lockrestart_do(&trans, ++ bkey_err(k = bch2_btree_iter_peek_slot(&iter))); ++ if (ret) ++ break; ++ ++ bch2_alloc_to_v4(k, &a); ++ ++ if (a.gen == i->gen && a.dirty_sectors) { ++ *sectors_not_moved += a.dirty_sectors; ++ *buckets_not_moved += 1; ++ } 
++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ +static int bch2_copygc(struct bch_fs *c) +{ + copygc_heap *h = &c->copygc_heap; + struct copygc_heap_entry e, *i; -+ struct bucket_array *buckets; + struct bch_move_stats move_stats; -+ u64 sectors_to_move = 0, sectors_not_moved = 0; ++ u64 sectors_to_move = 0, sectors_to_write = 0, sectors_not_moved = 0; + u64 sectors_reserved = 0; + u64 buckets_to_move, buckets_not_moved = 0; + struct bch_dev *ca; + unsigned dev_idx; -+ size_t b, heap_size = 0; ++ size_t heap_size = 0; + int ret; + + bch_move_stats_init(&move_stats, "copygc"); @@ -58565,64 +61455,49 @@ index 000000000000..5c9eafc026c9 + } + + for_each_rw_member(ca, c, dev_idx) { -+ closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca)); ++ s64 avail = min(dev_buckets_available(ca, RESERVE_movinggc), ++ ca->mi.nbuckets >> 6); + -+ spin_lock(&ca->fs->freelist_lock); -+ sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size; -+ spin_unlock(&ca->fs->freelist_lock); -+ -+ down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); -+ -+ for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { -+ struct bucket *g = buckets->b + b; -+ struct bucket_mark m = READ_ONCE(g->mark); -+ struct copygc_heap_entry e; -+ -+ if (m.owned_by_allocator || -+ m.data_type != BCH_DATA_user || -+ !bucket_sectors_used(m) || -+ bucket_sectors_used(m) >= ca->mi.bucket_size) -+ continue; -+ -+ WARN_ON(m.stripe && !g->stripe_redundancy); -+ -+ e = (struct copygc_heap_entry) { -+ .dev = dev_idx, -+ .gen = m.gen, -+ .replicas = 1 + g->stripe_redundancy, -+ .fragmentation = bucket_sectors_used(m) * (1U << 15) -+ / ca->mi.bucket_size, -+ .sectors = bucket_sectors_used(m), -+ .offset = bucket_to_sector(ca, b), -+ }; -+ heap_add_or_replace(h, e, -fragmentation_cmp, NULL); -+ } -+ up_read(&ca->bucket_lock); ++ sectors_reserved += avail * ca->mi.bucket_size; + } + ++ ret = walk_buckets_to_copygc(c); ++ if (ret) { ++ 
bch2_fs_fatal_error(c, "error walking buckets to copygc!"); ++ return ret; ++ } ++ ++ if (!h->used) { ++ bch_err_ratelimited(c, "copygc requested to run but found no buckets to move!"); ++ return 0; ++ } ++ ++ /* ++ * Our btree node allocations also come out of RESERVE_movingc: ++ */ ++ sectors_reserved = (sectors_reserved * 3) / 4; + if (!sectors_reserved) { + bch2_fs_fatal_error(c, "stuck, ran out of copygc reserve!"); + return -1; + } + -+ /* -+ * Our btree node allocations also come out of RESERVE_MOVINGGC: -+ */ -+ sectors_to_move = (sectors_to_move * 3) / 4; ++ for (i = h->data; i < h->data + h->used; i++) { ++ sectors_to_move += i->sectors; ++ sectors_to_write += i->sectors * i->replicas; ++ } + -+ for (i = h->data; i < h->data + h->used; i++) -+ sectors_to_move += i->sectors * i->replicas; -+ -+ while (sectors_to_move > sectors_reserved) { ++ while (sectors_to_write > sectors_reserved) { + BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL)); -+ sectors_to_move -= e.sectors * e.replicas; ++ sectors_to_write -= e.sectors * e.replicas; + } + + buckets_to_move = h->used; + -+ if (!buckets_to_move) ++ if (!buckets_to_move) { ++ bch_err_ratelimited(c, "copygc cannot run - sectors_reserved %llu!", ++ sectors_reserved); + return 0; ++ } + + eytzinger0_sort(h->data, h->used, + sizeof(h->data[0]), @@ -58635,30 +61510,18 @@ index 000000000000..5c9eafc026c9 + writepoint_ptr(&c->copygc_write_point), + copygc_pred, NULL, + &move_stats); -+ -+ for_each_rw_member(ca, c, dev_idx) { -+ down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); -+ for (i = h->data; i < h->data + h->used; i++) { -+ struct bucket_mark m; -+ size_t b; -+ -+ if (i->dev != dev_idx) -+ continue; -+ -+ b = sector_to_bucket(ca, i->offset); -+ m = READ_ONCE(buckets->b[b].mark); -+ -+ if (i->gen == m.gen && -+ bucket_sectors_used(m)) { -+ sectors_not_moved += bucket_sectors_used(m); -+ buckets_not_moved++; -+ } -+ } -+ up_read(&ca->bucket_lock); ++ if (ret) { ++ bch_err(c, "error %i from 
bch2_move_data() in copygc", ret); ++ return ret; + } + -+ if (sectors_not_moved && !ret) ++ ret = check_copygc_was_done(c, &sectors_not_moved, &buckets_not_moved); ++ if (ret) { ++ bch_err(c, "error %i from check_copygc_was_done()", ret); ++ return ret; ++ } ++ ++ if (sectors_not_moved) + bch_warn_ratelimited(c, + "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved (move stats: moved %llu sectors, raced %llu keys, %llu sectors)", + sectors_not_moved, sectors_to_move, @@ -58696,8 +61559,8 @@ index 000000000000..5c9eafc026c9 + for_each_rw_member(ca, c, dev_idx) { + struct bch_dev_usage usage = bch2_dev_usage_read(ca); + -+ fragmented_allowed = ((__dev_buckets_reclaimable(ca, usage) * -+ ca->mi.bucket_size) >> 1); ++ fragmented_allowed = ((__dev_buckets_available(ca, usage, RESERVE_none) * ++ ca->mi.bucket_size) >> 1); + fragmented = usage.d[BCH_DATA_user].fragmented; + + wait = min(wait, max(0LL, fragmented_allowed - fragmented)); @@ -58796,10 +61659,10 @@ index 000000000000..922738247d03 +#endif /* _BCACHEFS_MOVINGGC_H */ diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c new file mode 100644 -index 000000000000..a955ef2008c9 +index 000000000000..77fbb7d2194e --- /dev/null +++ b/fs/bcachefs/opts.c -@@ -0,0 +1,470 @@ +@@ -0,0 +1,560 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include @@ -58811,7 +61674,12 @@ index 000000000000..a955ef2008c9 +#include "super-io.h" +#include "util.h" + -+#define x(t, n) #t, ++#define x(t, n) [n] = #t, ++ ++const char * const bch2_metadata_versions[] = { ++ BCH_METADATA_VERSIONS() ++ NULL ++}; ++ +const char * const bch2_error_actions[] = { + BCH_ERROR_ACTIONS() @@ -58868,13 +61736,18 @@ index 000000000000..a955ef2008c9 + NULL +}; + -+const char * const bch2_cache_replacement_policies[] = { -+ BCH_CACHE_REPLACEMENT_POLICIES() ++const char * const bch2_member_states[] = { ++ BCH_MEMBER_STATES() + NULL +}; + -+const char * const bch2_member_states[] = { -+ BCH_MEMBER_STATES() ++const char * const bch2_jset_entry_types[] = {
++ BCH_JSET_ENTRY_TYPES() ++ NULL ++}; ++ ++const char * const bch2_fs_usage_types[] = { ++ BCH_FS_USAGE_TYPES() + NULL +}; + @@ -58893,6 +61766,16 @@ index 000000000000..a955ef2008c9 + [DT_SUBVOL] = "subvol", +}; + ++u64 BCH2_NO_SB_OPT(const struct bch_sb *sb) ++{ ++ BUG(); ++} ++ ++void SET_BCH2_NO_SB_OPT(struct bch_sb *sb, u64 v) ++{ ++ BUG(); ++} ++ +void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) +{ +#define x(_name, ...) \ @@ -58943,41 +61826,27 @@ index 000000000000..a955ef2008c9 + } +} + -+/* -+ * Initial options from superblock - here we don't want any options undefined, -+ * any options the superblock doesn't specify are set to 0: -+ */ -+struct bch_opts bch2_opts_from_sb(struct bch_sb *sb) -+{ -+ struct bch_opts opts = bch2_opts_empty(); -+ -+#define x(_name, _bits, _mode, _type, _sb_opt, ...) \ -+ if (_sb_opt != NO_SB_OPT) \ -+ opt_set(opts, _name, _sb_opt(sb)); -+ BCH_OPTS() -+#undef x -+ -+ return opts; -+} -+ +const struct bch_option bch2_opt_table[] = { -+#define OPT_BOOL() .type = BCH_OPT_BOOL -+#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, .min = _min, .max = _max -+#define OPT_SECTORS(_min, _max) .type = BCH_OPT_SECTORS, .min = _min, .max = _max -+#define OPT_STR(_choices) .type = BCH_OPT_STR, .choices = _choices ++#define OPT_BOOL() .type = BCH_OPT_BOOL, .min = 0, .max = 2 ++#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \ ++ .min = _min, .max = _max ++#define OPT_STR(_choices) .type = BCH_OPT_STR, \ ++ .min = 0, .max = ARRAY_SIZE(_choices),\ ++ .choices = _choices +#define OPT_FN(_fn) .type = BCH_OPT_FN, \ + .parse = _fn##_parse, \ + .to_text = _fn##_to_text + -+#define x(_name, _bits, _mode, _type, _sb_opt, _default, _hint, _help) \ ++#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \ + [Opt_##_name] = { \ + .attr = { \ + .name = #_name, \ -+ .mode = (_mode) & OPT_RUNTIME ? 0644 : 0444, \ ++ .mode = (_flags) & OPT_RUNTIME ? 
0644 : 0444, \ + }, \ -+ .mode = _mode, \ ++ .flags = _flags, \ + .hint = _hint, \ + .help = _help, \ ++ .get_sb = _sb_opt, \ + .set_sb = SET_##_sb_opt, \ + _type \ + }, @@ -59020,8 +61889,43 @@ index 000000000000..a955ef2008c9 + return bch2_opt_lookup(name); +} + -+int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, -+ const char *val, u64 *res) ++int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err) ++{ ++ if (v < opt->min) { ++ if (err) ++ pr_buf(err, "%s: too small (min %llu)", ++ opt->attr.name, opt->min); ++ return -ERANGE; ++ } ++ ++ if (opt->max && v >= opt->max) { ++ if (err) ++ pr_buf(err, "%s: too big (max %llu)", ++ opt->attr.name, opt->max); ++ return -ERANGE; ++ } ++ ++ if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) { ++ if (err) ++ pr_buf(err, "%s: not a multiple of 512", ++ opt->attr.name); ++ return -EINVAL; ++ } ++ ++ if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) { ++ if (err) ++ pr_buf(err, "%s: must be a power of two", ++ opt->attr.name); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++int bch2_opt_parse(struct bch_fs *c, ++ const struct bch_option *opt, ++ const char *val, u64 *res, ++ struct printbuf *err) +{ + ssize_t ret; + @@ -59030,30 +61934,13 @@ index 000000000000..a955ef2008c9 + ret = kstrtou64(val, 10, res); + if (ret < 0) + return ret; -+ -+ if (*res > 1) -+ return -ERANGE; + break; + case BCH_OPT_UINT: -+ ret = kstrtou64(val, 10, res); ++ ret = opt->flags & OPT_HUMAN_READABLE ++ ? 
bch2_strtou64_h(val, res) ++ : kstrtou64(val, 10, res); + if (ret < 0) + return ret; -+ -+ if (*res < opt->min || *res >= opt->max) -+ return -ERANGE; -+ break; -+ case BCH_OPT_SECTORS: -+ ret = bch2_strtou64_h(val, res); -+ if (ret < 0) -+ return ret; -+ -+ if (*res & 511) -+ return -EINVAL; -+ -+ *res >>= 9; -+ -+ if (*res < opt->min || *res >= opt->max) -+ return -ERANGE; + break; + case BCH_OPT_STR: + ret = match_string(opt->choices, -1, val); @@ -59066,13 +61953,16 @@ index 000000000000..a955ef2008c9 + if (!c) + return 0; + -+ return opt->parse(c, val, res); ++ ret = opt->parse(c, val, res); ++ if (ret < 0) ++ return ret; + } + -+ return 0; ++ return bch2_opt_validate(opt, *res, err); +} + -+void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c, ++void bch2_opt_to_text(struct printbuf *out, ++ struct bch_fs *c, struct bch_sb *sb, + const struct bch_option *opt, u64 v, + unsigned flags) +{ @@ -59090,10 +61980,10 @@ index 000000000000..a955ef2008c9 + switch (opt->type) { + case BCH_OPT_BOOL: + case BCH_OPT_UINT: -+ pr_buf(out, "%lli", v); -+ break; -+ case BCH_OPT_SECTORS: -+ bch2_hprint(out, v); ++ if (opt->flags & OPT_HUMAN_READABLE) ++ bch2_hprint(out, v); ++ else ++ pr_buf(out, "%lli", v); + break; + case BCH_OPT_STR: + if (flags & OPT_SHOW_FULL_LIST) @@ -59102,7 +61992,7 @@ index 000000000000..a955ef2008c9 + pr_buf(out, opt->choices[v]); + break; + case BCH_OPT_FN: -+ opt->to_text(out, c, v); ++ opt->to_text(out, c, sb, v); + break; + default: + BUG(); @@ -59148,6 +62038,7 @@ index 000000000000..a955ef2008c9 + char *copied_opts, *copied_opts_start; + char *opt, *name, *val; + int ret, id; ++ struct printbuf err = PRINTBUF; + u64 v; + + if (!options) @@ -59167,7 +62058,7 @@ index 000000000000..a955ef2008c9 + if (id < 0) + goto bad_opt; + -+ ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v); ++ ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); + if (ret < 0) + goto bad_val; + } else { @@ -59187,7 +62078,7 @@ index 
000000000000..a955ef2008c9 + goto no_val; + } + -+ if (!(bch2_opt_table[id].mode & OPT_MOUNT)) ++ if (!(bch2_opt_table[id].flags & OPT_MOUNT)) + goto bad_opt; + + if (id == Opt_acl && @@ -59210,7 +62101,7 @@ index 000000000000..a955ef2008c9 + ret = -1; + goto out; +bad_val: -+ pr_err("Invalid value %s for mount option %s", val, name); ++ pr_err("Invalid mount option %s", err.buf); + ret = -1; + goto out; +no_val: @@ -59219,9 +62110,71 @@ index 000000000000..a955ef2008c9 + goto out; +out: + kfree(copied_opts_start); ++ printbuf_exit(&err); + return ret; +} + ++u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id) ++{ ++ const struct bch_option *opt = bch2_opt_table + id; ++ u64 v; ++ ++ v = opt->get_sb(sb); ++ ++ if (opt->flags & OPT_SB_FIELD_ILOG2) ++ v = 1ULL << v; ++ ++ if (opt->flags & OPT_SB_FIELD_SECTORS) ++ v <<= 9; ++ ++ return v; ++} ++ ++/* ++ * Initial options from superblock - here we don't want any options undefined, ++ * any options the superblock doesn't specify are set to 0: ++ */ ++int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) ++{ ++ unsigned id; ++ ++ for (id = 0; id < bch2_opts_nr; id++) { ++ const struct bch_option *opt = bch2_opt_table + id; ++ ++ if (opt->get_sb == BCH2_NO_SB_OPT) ++ continue; ++ ++ bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id)); ++ } ++ ++ return 0; ++} ++ ++void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v) ++{ ++ if (opt->set_sb == SET_BCH2_NO_SB_OPT) ++ return; ++ ++ if (opt->flags & OPT_SB_FIELD_SECTORS) ++ v >>= 9; ++ ++ if (opt->flags & OPT_SB_FIELD_ILOG2) ++ v = ilog2(v); ++ ++ opt->set_sb(sb, v); ++} ++ ++void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v) ++{ ++ if (opt->set_sb == SET_BCH2_NO_SB_OPT) ++ return; ++ ++ mutex_lock(&c->sb_lock); ++ __bch2_opt_set_sb(c->disk_sb.sb, opt, v); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++} ++ +/* io opts: */ + +struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) @@ 
-59272,10 +62225,10 @@ index 000000000000..a955ef2008c9 +} diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h new file mode 100644 -index 000000000000..afb1bb2a62d2 +index 000000000000..8bc67d07afb9 --- /dev/null +++ b/fs/bcachefs/opts.h -@@ -0,0 +1,470 @@ +@@ -0,0 +1,517 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_OPTS_H +#define _BCACHEFS_OPTS_H @@ -59286,6 +62239,7 @@ index 000000000000..afb1bb2a62d2 +#include +#include "bcachefs_format.h" + ++extern const char * const bch2_metadata_versions[]; +extern const char * const bch2_error_actions[]; +extern const char * const bch2_sb_features[]; +extern const char * const bch2_sb_compat[]; @@ -59297,8 +62251,9 @@ index 000000000000..afb1bb2a62d2 +extern const char * const bch2_str_hash_types[]; +extern const char * const bch2_str_hash_opts[]; +extern const char * const bch2_data_types[]; -+extern const char * const bch2_cache_replacement_policies[]; +extern const char * const bch2_member_states[]; ++extern const char * const bch2_jset_entry_types[]; ++extern const char * const bch2_fs_usage_types[]; +extern const char * const bch2_d_types[]; + +static inline const char *bch2_d_type_str(unsigned d_type) @@ -59319,21 +62274,26 @@ index 000000000000..afb1bb2a62d2 + */ + +/* dummy option, for options that aren't stored in the superblock */ -+LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0); ++u64 BCH2_NO_SB_OPT(const struct bch_sb *); ++void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64); + +/* When can be set: */ -+enum opt_mode { -+ OPT_FORMAT = (1 << 0), -+ OPT_MOUNT = (1 << 1), -+ OPT_RUNTIME = (1 << 2), -+ OPT_INODE = (1 << 3), -+ OPT_DEVICE = (1 << 4), ++enum opt_flags { ++ OPT_FS = (1 << 0), /* Filesystem option */ ++ OPT_DEVICE = (1 << 1), /* Device option */ ++ OPT_INODE = (1 << 2), /* Inode option */ ++ OPT_FORMAT = (1 << 3), /* May be specified at format time */ ++ OPT_MOUNT = (1 << 4), /* May be specified at mount time */ ++ OPT_RUNTIME = (1 << 5), /* May be specified at runtime */ ++ 
OPT_HUMAN_READABLE = (1 << 6), ++ OPT_MUST_BE_POW_2 = (1 << 7), /* Must be power of 2 */ ++ OPT_SB_FIELD_SECTORS = (1 << 8),/* Superblock field is >> 9 of actual value */ ++ OPT_SB_FIELD_ILOG2 = (1 << 9), /* Superblock field is ilog2 of actual value */ +}; + +enum opt_type { + BCH_OPT_BOOL, + BCH_OPT_UINT, -+ BCH_OPT_SECTORS, + BCH_OPT_STR, + BCH_OPT_FN, +}; @@ -59358,281 +62318,315 @@ index 000000000000..afb1bb2a62d2 + */ + +#ifdef __KERNEL__ -+#define RATELIMIT_ERRORS true ++#define RATELIMIT_ERRORS_DEFAULT true +#else -+#define RATELIMIT_ERRORS false ++#define RATELIMIT_ERRORS_DEFAULT false +#endif + +#define BCH_OPTS() \ + x(block_size, u16, \ -+ OPT_FORMAT, \ -+ OPT_SECTORS(1, 128), \ ++ OPT_FS|OPT_FORMAT| \ ++ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ ++ OPT_UINT(512, 1U << 16), \ + BCH_SB_BLOCK_SIZE, 8, \ + "size", NULL) \ -+ x(btree_node_size, u16, \ -+ OPT_FORMAT, \ -+ OPT_SECTORS(1, 512), \ ++ x(btree_node_size, u32, \ ++ OPT_FS|OPT_FORMAT| \ ++ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ ++ OPT_UINT(512, 1U << 20), \ + BCH_SB_BTREE_NODE_SIZE, 512, \ + "size", "Btree node size, default 256k") \ + x(errors, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_error_actions), \ + BCH_SB_ERROR_ACTION, BCH_ON_ERROR_ro, \ + NULL, "Action to take on filesystem error") \ + x(metadata_replicas, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + BCH_SB_META_REPLICAS_WANT, 1, \ + "#", "Number of metadata replicas") \ + x(data_replicas, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + BCH_SB_DATA_REPLICAS_WANT, 1, \ + "#", "Number of data replicas") \ + x(metadata_replicas_required, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + 
BCH_SB_META_REPLICAS_REQ, 1, \ + "#", NULL) \ + x(data_replicas_required, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + BCH_SB_DATA_REPLICAS_REQ, 1, \ + "#", NULL) \ ++ x(encoded_extent_max, u32, \ ++ OPT_FS|OPT_FORMAT| \ ++ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS|OPT_SB_FIELD_ILOG2,\ ++ OPT_UINT(4096, 2U << 20), \ ++ BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10, \ ++ "size", "Maximum size of checksummed/compressed extents")\ + x(metadata_checksum, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_csum_opts), \ + BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ + NULL, NULL) \ + x(data_checksum, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_csum_opts), \ + BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ + NULL, NULL) \ + x(compression, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_compression_opts), \ + BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \ + NULL, NULL) \ + x(background_compression, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_compression_opts), \ + BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \ + NULL, NULL) \ + x(str_hash, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_str_hash_opts), \ + BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \ + NULL, "Hash function for directory entries and xattrs")\ + x(metadata_target, u16, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_METADATA_TARGET, 0, \ + "(target)", "Device or disk group for metadata writes") \ + x(foreground_target, u16, \ -+ 
OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_FOREGROUND_TARGET, 0, \ + "(target)", "Device or disk group for foreground writes") \ + x(background_target, u16, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_BACKGROUND_TARGET, 0, \ + "(target)", "Device or disk group to move data to in the background")\ + x(promote_target, u16, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_PROMOTE_TARGET, 0, \ + "(target)", "Device or disk group to promote data to on read")\ + x(erasure_code, u16, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_ERASURE_CODE, false, \ + NULL, "Enable erasure coding (DO NOT USE YET)") \ + x(inodes_32bit, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_INODE_32BIT, true, \ + NULL, "Constrain inode numbers to 32 bits") \ + x(shard_inode_numbers, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_SHARD_INUMS, true, \ + NULL, "Shard new inode numbers by CPU id") \ + x(inodes_use_key_cache, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_INODES_USE_KEY_CACHE, true, \ + NULL, "Use the btree key cache for the inodes btree") \ + x(btree_node_mem_ptr_optimization, u8, \ -+ OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ -+ NO_SB_OPT, true, \ ++ BCH2_NO_SB_OPT, true, \ + NULL, "Stash pointer to in memory btree node in btree ptr")\ + x(gc_reserve_percent, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(5, 21), \ + BCH_SB_GC_RESERVE, 8, \ + 
"%", "Percentage of disk space to reserve for copygc")\ + x(gc_reserve_bytes, u64, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_SECTORS(0, U64_MAX), \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME| \ ++ OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \ ++ OPT_UINT(0, U64_MAX), \ + BCH_SB_GC_RESERVE_BYTES, 0, \ + "%", "Amount of disk space to reserve for copygc\n" \ + "Takes precedence over gc_reserve_percent if set")\ + x(root_reserve_percent, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_UINT(0, 100), \ + BCH_SB_ROOT_RESERVE, 0, \ + "%", "Percentage of disk space to reserve for superuser")\ + x(wide_macs, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_128_BIT_MACS, false, \ + NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\ + x(inline_data, u8, \ -+ OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ -+ NO_SB_OPT, true, \ ++ BCH2_NO_SB_OPT, true, \ + NULL, "Enable inline data extents") \ + x(acl, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_POSIX_ACL, true, \ + NULL, "Enable POSIX acls") \ + x(usrquota, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_USRQUOTA, false, \ + NULL, "Enable user quotas") \ + x(grpquota, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_GRPQUOTA, false, \ + NULL, "Enable group quotas") \ + x(prjquota, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_PRJQUOTA, false, \ + NULL, "Enable project quotas") \ + x(degraded, u8, \ -+ OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Allow mounting in degraded mode") \ + x(very_degraded, u8, \ -+ OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Allow mounting in when data 
will be missing") \ + x(discard, u8, \ -+ OPT_MOUNT|OPT_DEVICE, \ ++ OPT_FS|OPT_MOUNT|OPT_DEVICE, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, true, \ + NULL, "Enable discard/TRIM support") \ + x(verbose, u8, \ -+ OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Extra debugging information during mount/recovery")\ ++ x(journal_flush_delay, u32, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_UINT(1, U32_MAX), \ ++ BCH_SB_JOURNAL_FLUSH_DELAY, 1000, \ ++ NULL, "Delay in milliseconds before automatic journal commits")\ + x(journal_flush_disabled, u8, \ -+ OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH_SB_JOURNAL_FLUSH_DISABLED,false, \ + NULL, "Disable journal flush on sync/fsync\n" \ + "If enabled, writes can be lost, but only since the\n"\ + "last journal write (default 1 second)") \ ++ x(journal_reclaim_delay, u32, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_UINT(0, U32_MAX), \ ++ BCH_SB_JOURNAL_RECLAIM_DELAY, 100, \ ++ NULL, "Delay in milliseconds before automatic journal reclaim")\ + x(fsck, u8, \ -+ OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Run fsck on mount") \ + x(fix_errors, u8, \ -+ OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Fix errors during fsck without asking") \ + x(ratelimit_errors, u8, \ -+ OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ -+ NO_SB_OPT, RATELIMIT_ERRORS, \ ++ BCH2_NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \ + NULL, "Ratelimit error messages during fsck") \ + x(nochanges, u8, \ -+ OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Super read only mode - no writes at all will be issued,\n"\ + "even if we have to replay the journal") \ + x(norecovery, u8, \ -+ OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ -+ 
NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Don't replay the journal") \ + x(rebuild_replicas, u8, \ -+ OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Rebuild the superblock replicas section") \ + x(keep_journal, u8, \ -+ OPT_MOUNT, \ ++ 0, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Don't free journal entries/keys after startup")\ + x(read_entire_journal, u8, \ + 0, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Read all journal entries, not just dirty ones")\ -+ x(noexcl, u8, \ -+ OPT_MOUNT, \ ++ x(read_journal_only, u8, \ ++ 0, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Only read the journal, skip the rest of recovery")\ ++ x(journal_transaction_names, u8, \ ++ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \ ++ NULL, "Log transaction function names in journal") \ ++ x(noexcl, u8, \ ++ OPT_FS|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Don't open device in exclusive mode") \ + x(sb, u64, \ + OPT_MOUNT, \ + OPT_UINT(0, S64_MAX), \ -+ NO_SB_OPT, BCH_SB_SECTOR, \ ++ BCH2_NO_SB_OPT, BCH_SB_SECTOR, \ + "offset", "Sector offset of superblock") \ + x(read_only, u8, \ -+ 0, \ ++ OPT_FS, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, NULL) \ + x(nostart, u8, \ + 0, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Don\'t start filesystem, only open devices") \ + x(reconstruct_alloc, u8, \ -+ OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Reconstruct alloc btree") \ + x(version_upgrade, u8, \ -+ OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, "Set superblock to latest version,\n" \ + "allowing any new features to be used") \ ++ 
x(buckets_nouse, u8, \ ++ 0, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Allocate the buckets_nouse bitmap") \ + x(project, u8, \ + OPT_INODE, \ + OPT_BOOL(), \ -+ NO_SB_OPT, false, \ ++ BCH2_NO_SB_OPT, false, \ + NULL, NULL) \ + x(fs_size, u64, \ + OPT_DEVICE, \ -+ OPT_SECTORS(0, S64_MAX), \ -+ NO_SB_OPT, 0, \ ++ OPT_UINT(0, S64_MAX), \ ++ BCH2_NO_SB_OPT, 0, \ + "size", "Size of filesystem on device") \ + x(bucket, u32, \ + OPT_DEVICE, \ -+ OPT_SECTORS(0, S64_MAX), \ -+ NO_SB_OPT, 0, \ ++ OPT_UINT(0, S64_MAX), \ ++ BCH2_NO_SB_OPT, 0, \ + "size", "Size of filesystem on device") \ + x(durability, u8, \ + OPT_DEVICE, \ + OPT_UINT(0, BCH_REPLICAS_MAX), \ -+ NO_SB_OPT, 1, \ ++ BCH2_NO_SB_OPT, 1, \ + "n", "Data written to this device will be considered\n"\ + "to have already been replicated n times") + @@ -59685,20 +62679,21 @@ index 000000000000..afb1bb2a62d2 + +struct bch_option { + struct attribute attr; ++ u64 (*get_sb)(const struct bch_sb *); + void (*set_sb)(struct bch_sb *, u64); -+ enum opt_mode mode; + enum opt_type type; ++ enum opt_flags flags; ++ u64 min, max; + + union { + struct { -+ u64 min, max; + }; + struct { + const char * const *choices; + }; + struct { + int (*parse)(struct bch_fs *, const char *, u64 *); -+ void (*to_text)(struct printbuf *, struct bch_fs *, u64); ++ void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); + }; + }; + @@ -59713,15 +62708,20 @@ index 000000000000..afb1bb2a62d2 +u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); +void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); + -+struct bch_opts bch2_opts_from_sb(struct bch_sb *); ++u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id); ++int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *); ++void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64); ++void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64); + +int bch2_opt_lookup(const char *); -+int bch2_opt_parse(struct bch_fs *, 
const struct bch_option *, const char *, u64 *); ++int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *); ++int bch2_opt_parse(struct bch_fs *, const struct bch_option *, ++ const char *, u64 *, struct printbuf *); + +#define OPT_SHOW_FULL_LIST (1 << 0) +#define OPT_SHOW_MOUNT_STYLE (1 << 1) + -+void bch2_opt_to_text(struct printbuf *, struct bch_fs *, ++void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, + const struct bch_option *, u64, unsigned); + +int bch2_opt_check_may_set(struct bch_fs *, int, u64); @@ -59748,10 +62748,10 @@ index 000000000000..afb1bb2a62d2 +#endif /* _BCACHEFS_OPTS_H */ diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c new file mode 100644 -index 000000000000..8f8f4b0accd6 +index 000000000000..ca029a00e7b8 --- /dev/null +++ b/fs/bcachefs/quota.c -@@ -0,0 +1,821 @@ +@@ -0,0 +1,852 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "btree_update.h" @@ -59760,19 +62760,55 @@ index 000000000000..8f8f4b0accd6 +#include "subvolume.h" +#include "super-io.h" + -+static const char *bch2_sb_validate_quota(struct bch_sb *sb, -+ struct bch_sb_field *f) ++static const char * const bch2_quota_types[] = { ++ "user", ++ "group", ++ "project", ++}; ++ ++static const char * const bch2_quota_counters[] = { ++ "space", ++ "inodes", ++}; ++ ++static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f, ++ struct printbuf *err) +{ + struct bch_sb_field_quota *q = field_to_type(f, quota); + -+ if (vstruct_bytes(&q->field) != sizeof(*q)) -+ return "invalid field quota: wrong size"; ++ if (vstruct_bytes(&q->field) < sizeof(*q)) { ++ pr_buf(err, "wrong size (got %zu should be %zu)", ++ vstruct_bytes(&q->field), sizeof(*q)); ++ return -EINVAL; ++ } + -+ return NULL; ++ return 0; ++} ++ ++static void bch2_sb_quota_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_quota *q = field_to_type(f, quota); ++ unsigned qtyp, counter; ++ ++ 
for (qtyp = 0; qtyp < ARRAY_SIZE(q->q); qtyp++) { ++ pr_buf(out, "%s: flags %llx", ++ bch2_quota_types[qtyp], ++ le64_to_cpu(q->q[qtyp].flags)); ++ ++ for (counter = 0; counter < Q_COUNTERS; counter++) ++ pr_buf(out, " %s timelimit %u warnlimit %u", ++ bch2_quota_counters[counter], ++ le32_to_cpu(q->q[qtyp].c[counter].timelimit), ++ le32_to_cpu(q->q[qtyp].c[counter].warnlimit)); ++ ++ pr_newline(out); ++ } +} + +const struct bch_sb_field_ops bch_sb_field_ops_quota = { -+ .validate = bch2_sb_validate_quota, ++ .validate = bch2_sb_quota_validate, ++ .to_text = bch2_sb_quota_to_text, +}; + +const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k) @@ -59786,11 +62822,6 @@ index 000000000000..8f8f4b0accd6 + return NULL; +} + -+static const char * const bch2_quota_counters[] = { -+ "space", -+ "inodes", -+}; -+ +void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ @@ -60324,7 +63355,7 @@ index 000000000000..8f8f4b0accd6 + ret = bch2_btree_delete_range(c, BTREE_ID_quotas, + POS(QTYP_USR, 0), + POS(QTYP_USR + 1, 0), -+ NULL); ++ 0, NULL); + if (ret) + return ret; + } @@ -60336,7 +63367,7 @@ index 000000000000..8f8f4b0accd6 + ret = bch2_btree_delete_range(c, BTREE_ID_quotas, + POS(QTYP_GRP, 0), + POS(QTYP_GRP + 1, 0), -+ NULL); ++ 0, NULL); + if (ret) + return ret; + } @@ -60348,7 +63379,7 @@ index 000000000000..8f8f4b0accd6 + ret = bch2_btree_delete_range(c, BTREE_ID_quotas, + POS(QTYP_PRJ, 0), + POS(QTYP_PRJ + 1, 0), -+ NULL); ++ 0, NULL); + if (ret) + return ret; + } @@ -60701,10 +63732,10 @@ index 000000000000..6a136083d389 +#endif /* _BCACHEFS_QUOTA_TYPES_H */ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c new file mode 100644 -index 000000000000..a573fede05b1 +index 000000000000..d914892f5339 --- /dev/null +++ b/fs/bcachefs/rebalance.c -@@ -0,0 +1,337 @@ +@@ -0,0 +1,349 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -60964,35 +63995,47 @@ index 000000000000..a573fede05b1 +{ + 
struct bch_fs_rebalance *r = &c->rebalance; + struct rebalance_work w = rebalance_work(c); -+ char h1[21], h2[21]; + -+ bch2_hprint(&PBUF(h1), w.dev_most_full_work << 9); -+ bch2_hprint(&PBUF(h2), w.dev_most_full_capacity << 9); -+ pr_buf(out, "fullest_dev (%i):\t%s/%s\n", -+ w.dev_most_full_idx, h1, h2); ++ out->tabstops[0] = 20; + -+ bch2_hprint(&PBUF(h1), w.total_work << 9); -+ bch2_hprint(&PBUF(h2), c->capacity << 9); -+ pr_buf(out, "total work:\t\t%s/%s\n", h1, h2); ++ pr_buf(out, "fullest_dev (%i):", w.dev_most_full_idx); ++ pr_tab(out); + -+ pr_buf(out, "rate:\t\t\t%u\n", r->pd.rate.rate); ++ bch2_hprint(out, w.dev_most_full_work << 9); ++ pr_buf(out, "/"); ++ bch2_hprint(out, w.dev_most_full_capacity << 9); ++ pr_newline(out); ++ ++ pr_buf(out, "total work:"); ++ pr_tab(out); ++ ++ bch2_hprint(out, w.total_work << 9); ++ pr_buf(out, "/"); ++ bch2_hprint(out, c->capacity << 9); ++ pr_newline(out); ++ ++ pr_buf(out, "rate:"); ++ pr_tab(out); ++ pr_buf(out, "%u", r->pd.rate.rate); ++ pr_newline(out); + + switch (r->state) { + case REBALANCE_WAITING: -+ pr_buf(out, "waiting\n"); ++ pr_buf(out, "waiting"); + break; + case REBALANCE_THROTTLED: -+ bch2_hprint(&PBUF(h1), ++ pr_buf(out, "throttled for %lu sec or ", ++ (r->throttled_until_cputime - jiffies) / HZ); ++ bch2_hprint(out, + (r->throttled_until_iotime - + atomic64_read(&c->io_clock[WRITE].now)) << 9); -+ pr_buf(out, "throttled for %lu sec or %s io\n", -+ (r->throttled_until_cputime - jiffies) / HZ, -+ h1); ++ pr_buf(out, " io"); + break; + case REBALANCE_RUNNING: -+ pr_buf(out, "running\n"); ++ pr_buf(out, "running"); + break; + } ++ pr_newline(out); +} + +void bch2_rebalance_stop(struct bch_fs *c) @@ -61110,10 +64153,10 @@ index 000000000000..7462a92e9598 +#endif /* _BCACHEFS_REBALANCE_TYPES_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c new file mode 100644 -index 000000000000..c3b4d116275c +index 000000000000..ca92fe84c248 --- /dev/null +++ b/fs/bcachefs/recovery.c -@@ -0,0 +1,1498 @@ 
+@@ -0,0 +1,1472 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -61132,6 +64175,7 @@ index 000000000000..c3b4d116275c +#include "journal_io.h" +#include "journal_reclaim.h" +#include "journal_seq_blacklist.h" ++#include "lru.h" +#include "move.h" +#include "quota.h" +#include "recovery.h" @@ -61175,23 +64219,21 @@ index 000000000000..c3b4d116275c +static int __journal_key_cmp(enum btree_id l_btree_id, + unsigned l_level, + struct bpos l_pos, -+ struct journal_key *r) ++ const struct journal_key *r) +{ + return (cmp_int(l_btree_id, r->btree_id) ?: + cmp_int(l_level, r->level) ?: + bpos_cmp(l_pos, r->k->k.p)); +} + -+static int journal_key_cmp(struct journal_key *l, struct journal_key *r) ++static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) +{ -+ return (cmp_int(l->btree_id, r->btree_id) ?: -+ cmp_int(l->level, r->level) ?: -+ bpos_cmp(l->k->k.p, r->k->k.p)); ++ return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); +} + -+static size_t journal_key_search(struct journal_keys *journal_keys, -+ enum btree_id id, unsigned level, -+ struct bpos pos) ++size_t bch2_journal_key_search(struct journal_keys *journal_keys, ++ enum btree_id id, unsigned level, ++ struct bpos pos) +{ + size_t l = 0, r = journal_keys->nr, m; + @@ -61212,6 +64254,24 @@ index 000000000000..c3b4d116275c + return l; +} + ++struct bkey_i *bch2_journal_keys_peek(struct bch_fs *c, enum btree_id btree_id, ++ unsigned level, struct bpos pos) ++{ ++ struct journal_keys *keys = &c->journal_keys; ++ struct journal_key *end = keys->d + keys->nr; ++ struct journal_key *k = keys->d + ++ bch2_journal_key_search(keys, btree_id, level, pos); ++ ++ while (k < end && k->overwritten) ++ k++; ++ ++ if (k < end && ++ k->btree_id == btree_id && ++ k->level == level) ++ return k->k; ++ return NULL; ++} ++ +static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsigned idx) +{ + struct bkey_i *n = iter->keys->d[idx].k; @@ -61225,18 +64285,25 
@@ index 000000000000..c3b4d116275c + iter->idx++; +} + -+int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, -+ unsigned level, struct bkey_i *k) ++int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bkey_i *k) +{ + struct journal_key n = { + .btree_id = id, + .level = level, + .k = k, -+ .allocated = true ++ .allocated = true, ++ /* ++ * Ensure these keys are done last by journal replay, to unblock ++ * journal reclaim: ++ */ ++ .journal_seq = U32_MAX, + }; + struct journal_keys *keys = &c->journal_keys; + struct journal_iter *iter; -+ unsigned idx = journal_key_search(keys, id, level, k->k.p); ++ size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); ++ ++ BUG_ON(test_bit(BCH_FS_RW, &c->flags)); + + if (idx < keys->nr && + journal_key_cmp(&n, &keys->d[idx]) == 0) { @@ -61273,38 +64340,66 @@ index 000000000000..c3b4d116275c + return 0; +} + ++/* ++ * Can only be used from the recovery thread while we're still RO - can't be ++ * used once we've got RW, as journal_keys is at that point used by multiple ++ * threads: ++ */ ++int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bkey_i *k) ++{ ++ struct bkey_i *n; ++ int ret; ++ ++ n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); ++ if (!n) ++ return -ENOMEM; ++ ++ bkey_copy(n, k); ++ ret = bch2_journal_key_insert_take(c, id, level, n); ++ if (ret) ++ kfree(n); ++ return ret; ++} ++ +int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, + unsigned level, struct bpos pos) +{ -+ struct bkey_i *whiteout = -+ kmalloc(sizeof(struct bkey), GFP_KERNEL); -+ int ret; ++ struct bkey_i whiteout; + -+ if (!whiteout) { -+ bch_err(c, "%s: error allocating new key", __func__); -+ return -ENOMEM; -+ } ++ bkey_init(&whiteout.k); ++ whiteout.k.p = pos; + -+ bkey_init(&whiteout->k); -+ whiteout->k.p = pos; ++ return bch2_journal_key_insert(c, id, level, &whiteout); ++} + -+ ret = bch2_journal_key_insert(c, id, level, whiteout); -+ if 
(ret) -+ kfree(whiteout); -+ return ret; ++void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, ++ unsigned level, struct bpos pos) ++{ ++ struct journal_keys *keys = &c->journal_keys; ++ size_t idx = bch2_journal_key_search(keys, btree, level, pos); ++ ++ if (idx < keys->nr && ++ keys->d[idx].btree_id == btree && ++ keys->d[idx].level == level && ++ !bpos_cmp(keys->d[idx].k->k.p, pos)) ++ keys->d[idx].overwritten = true; +} + +static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) +{ -+ struct journal_key *k = iter->idx - iter->keys->nr -+ ? iter->keys->d + iter->idx : NULL; ++ struct journal_key *k = iter->keys->d + iter->idx; + -+ if (k && -+ k->btree_id == iter->btree_id && -+ k->level == iter->level) -+ return k->k; ++ while (k < iter->keys->d + iter->keys->nr && ++ k->btree_id == iter->btree_id && ++ k->level == iter->level) { ++ if (!k->overwritten) ++ return k->k; ++ ++ iter->idx++; ++ k = iter->keys->d + iter->idx; ++ } + -+ iter->idx = iter->keys->nr; + return NULL; +} + @@ -61327,8 +64422,7 @@ index 000000000000..c3b4d116275c + iter->btree_id = id; + iter->level = level; + iter->keys = &c->journal_keys; -+ iter->idx = journal_key_search(&c->journal_keys, id, level, pos); -+ list_add(&iter->list, &c->journal_iters); ++ iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos); +} + +static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) @@ -61414,106 +64508,33 @@ index 000000000000..c3b4d116275c + bch2_journal_iter_exit(&iter->journal); +} + -+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, -+ struct bch_fs *c, -+ struct btree *b) ++void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, ++ struct bch_fs *c, ++ struct btree *b, ++ struct btree_node_iter node_iter, ++ struct bpos pos) +{ + memset(iter, 0, sizeof(*iter)); + + iter->b = b; -+ bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b); -+ 
bch2_journal_iter_init(c, &iter->journal, -+ b->c.btree_id, b->c.level, b->data->min_key); ++ iter->node_iter = node_iter; ++ bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos); ++ INIT_LIST_HEAD(&iter->journal.list); +} + -+/* Walk btree, overlaying keys from the journal: */ -+ -+static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b, -+ struct btree_and_journal_iter iter) ++/* ++ * this version is used by btree_gc before filesystem has gone RW and ++ * multithreaded, so uses the journal_iters list: ++ */ ++void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, ++ struct bch_fs *c, ++ struct btree *b) +{ -+ unsigned i = 0, nr = b->c.level > 1 ? 2 : 16; -+ struct bkey_s_c k; -+ struct bkey_buf tmp; ++ struct btree_node_iter node_iter; + -+ BUG_ON(!b->c.level); -+ -+ bch2_bkey_buf_init(&tmp); -+ -+ while (i < nr && -+ (k = bch2_btree_and_journal_iter_peek(&iter)).k) { -+ bch2_bkey_buf_reassemble(&tmp, c, k); -+ -+ bch2_btree_node_prefetch(c, NULL, NULL, tmp.k, -+ b->c.btree_id, b->c.level - 1); -+ -+ bch2_btree_and_journal_iter_advance(&iter); -+ i++; -+ } -+ -+ bch2_bkey_buf_exit(&tmp, c); -+} -+ -+static int bch2_btree_and_journal_walk_recurse(struct btree_trans *trans, struct btree *b, -+ enum btree_id btree_id, -+ btree_walk_key_fn key_fn) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_and_journal_iter iter; -+ struct bkey_s_c k; -+ struct bkey_buf tmp; -+ struct btree *child; -+ int ret = 0; -+ -+ bch2_bkey_buf_init(&tmp); -+ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); -+ -+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { -+ if (b->c.level) { -+ bch2_bkey_buf_reassemble(&tmp, c, k); -+ -+ child = bch2_btree_node_get_noiter(c, tmp.k, -+ b->c.btree_id, b->c.level - 1, -+ false); -+ -+ ret = PTR_ERR_OR_ZERO(child); -+ if (ret) -+ break; -+ -+ btree_and_journal_iter_prefetch(c, b, iter); -+ -+ ret = bch2_btree_and_journal_walk_recurse(trans, child, -+ btree_id, 
key_fn); -+ six_unlock_read(&child->c.lock); -+ } else { -+ ret = key_fn(trans, k); -+ } -+ -+ if (ret) -+ break; -+ -+ bch2_btree_and_journal_iter_advance(&iter); -+ } -+ -+ bch2_btree_and_journal_iter_exit(&iter); -+ bch2_bkey_buf_exit(&tmp, c); -+ return ret; -+} -+ -+int bch2_btree_and_journal_walk(struct btree_trans *trans, enum btree_id btree_id, -+ btree_walk_key_fn key_fn) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree *b = c->btree_roots[btree_id].b; -+ int ret = 0; -+ -+ if (btree_node_fake(b)) -+ return 0; -+ -+ six_lock_read(&b->c.lock, NULL, NULL); -+ ret = bch2_btree_and_journal_walk_recurse(trans, b, btree_id, key_fn); -+ six_unlock_read(&b->c.lock); -+ -+ return ret; ++ bch2_btree_node_iter_init_from_start(&node_iter, b); ++ __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key); ++ list_add(&iter->journal.list, &c->journal_iters); +} + +/* sort and dedup all keys in the journal: */ @@ -61538,9 +64559,7 @@ index 000000000000..c3b4d116275c + const struct journal_key *l = _l; + const struct journal_key *r = _r; + -+ return cmp_int(l->btree_id, r->btree_id) ?: -+ cmp_int(l->level, r->level) ?: -+ bpos_cmp(l->k->k.p, r->k->k.p) ?: ++ return journal_key_cmp(l, r) ?: + cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->journal_offset, r->journal_offset); +} @@ -61633,8 +64652,8 @@ index 000000000000..c3b4d116275c + bch2_journal_pin_put(j, j->replay_journal_seq++); +} + -+static int __bch2_journal_replay_key(struct btree_trans *trans, -+ struct journal_key *k) ++static int bch2_journal_replay_key(struct btree_trans *trans, ++ struct journal_key *k) +{ + struct btree_iter iter; + unsigned iter_flags = @@ -61643,111 +64662,75 @@ index 000000000000..c3b4d116275c + int ret; + + if (!k->level && k->btree_id == BTREE_ID_alloc) -+ iter_flags |= BTREE_ITER_CACHED|BTREE_ITER_CACHED_NOFILL; ++ iter_flags |= BTREE_ITER_CACHED; + + bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, + BTREE_MAX_DEPTH, k->level, + 
iter_flags); -+ ret = bch2_btree_iter_traverse(&iter) ?: -+ bch2_trans_update(trans, &iter, k->k, BTREE_TRIGGER_NORUN); ++ ret = bch2_btree_iter_traverse(&iter); ++ if (ret) ++ goto out; ++ ++ /* Must be checked with btree locked: */ ++ if (k->overwritten) ++ goto out; ++ ++ ret = bch2_trans_update(trans, &iter, k->k, BTREE_TRIGGER_NORUN); ++out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + -+static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k) -+{ -+ unsigned commit_flags = -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_JOURNAL_RESERVED; -+ -+ if (!k->allocated) -+ commit_flags |= BTREE_INSERT_JOURNAL_REPLAY; -+ -+ return bch2_trans_do(c, NULL, NULL, commit_flags, -+ __bch2_journal_replay_key(&trans, k)); -+} -+ +static int journal_sort_seq_cmp(const void *_l, const void *_r) +{ -+ const struct journal_key *l = _l; -+ const struct journal_key *r = _r; ++ const struct journal_key *l = *((const struct journal_key **)_l); ++ const struct journal_key *r = *((const struct journal_key **)_r); + -+ return cmp_int(r->level, l->level) ?: -+ cmp_int(l->journal_seq, r->journal_seq) ?: -+ cmp_int(l->btree_id, r->btree_id) ?: -+ bpos_cmp(l->k->k.p, r->k->k.p); ++ return cmp_int(l->journal_seq, r->journal_seq); +} + -+static int bch2_journal_replay(struct bch_fs *c, -+ struct journal_keys keys) ++static int bch2_journal_replay(struct bch_fs *c) +{ ++ struct journal_keys *keys = &c->journal_keys; ++ struct journal_key **keys_sorted, *k; + struct journal *j = &c->journal; -+ struct journal_key *i; -+ u64 seq; ++ size_t i; + int ret; + -+ sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL); ++ keys_sorted = kvmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL); ++ if (!keys_sorted) ++ return -ENOMEM; + -+ if (keys.nr) -+ replay_now_at(j, keys.journal_seq_base); ++ for (i = 0; i < keys->nr; i++) ++ keys_sorted[i] = &keys->d[i]; + -+ seq = j->replay_journal_seq; ++ sort(keys_sorted, keys->nr, ++ 
sizeof(keys_sorted[0]), ++ journal_sort_seq_cmp, NULL); ++ ++ if (keys->nr) ++ replay_now_at(j, keys->journal_seq_base); ++ ++ for (i = 0; i < keys->nr; i++) { ++ k = keys_sorted[i]; + -+ /* -+ * First replay updates to the alloc btree - these will only update the -+ * btree key cache: -+ */ -+ for_each_journal_key(keys, i) { + cond_resched(); + -+ if (!i->level && i->btree_id == BTREE_ID_alloc) { -+ j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; -+ ret = bch2_journal_replay_key(c, i); -+ if (ret) -+ goto err; -+ } -+ } ++ if (!k->allocated) ++ replay_now_at(j, keys->journal_seq_base + k->journal_seq); + -+ /* -+ * Next replay updates to interior btree nodes: -+ */ -+ for_each_journal_key(keys, i) { -+ cond_resched(); -+ -+ if (i->level) { -+ j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; -+ ret = bch2_journal_replay_key(c, i); -+ if (ret) -+ goto err; -+ } -+ } -+ -+ /* -+ * Now that the btree is in a consistent state, we can start journal -+ * reclaim (which will be flushing entries from the btree key cache back -+ * to the btree: -+ */ -+ set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); -+ set_bit(JOURNAL_RECLAIM_STARTED, &j->flags); -+ journal_reclaim_kick(j); -+ -+ j->replay_journal_seq = seq; -+ -+ /* -+ * Now replay leaf node updates: -+ */ -+ for_each_journal_key(keys, i) { -+ cond_resched(); -+ -+ if (i->level || i->btree_id == BTREE_ID_alloc) -+ continue; -+ -+ replay_now_at(j, keys.journal_seq_base + i->journal_seq); -+ -+ ret = bch2_journal_replay_key(c, i); -+ if (ret) ++ ret = bch2_trans_do(c, NULL, NULL, ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_NOFAIL| ++ (!k->allocated ++ ? 
BTREE_INSERT_JOURNAL_REPLAY|JOURNAL_WATERMARK_reserved ++ : 0), ++ bch2_journal_replay_key(&trans, k)); ++ if (ret) { ++ bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", ++ ret, bch2_btree_ids[k->btree_id], k->level); + goto err; ++ } + } + + replay_now_at(j, j->replay_journal_seq_end); @@ -61755,10 +64738,12 @@ index 000000000000..c3b4d116275c + + bch2_journal_set_replay_done(j); + bch2_journal_flush_all_pins(j); -+ return bch2_journal_error(j); ++ ret = bch2_journal_error(j); ++ ++ if (keys->nr && !ret) ++ bch2_journal_log_msg(&c->journal, "journal replay finished"); +err: -+ bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", -+ ret, bch2_btree_ids[i->btree_id], i->level); ++ kvfree(keys_sorted); + return ret; +} + @@ -61796,15 +64781,15 @@ index 000000000000..c3b4d116275c + container_of(entry, struct jset_entry_usage, entry); + + switch (entry->btree_id) { -+ case FS_USAGE_RESERVED: ++ case BCH_FS_USAGE_reserved: + if (entry->level < BCH_REPLICAS_MAX) + c->usage_base->persistent_reserved[entry->level] = + le64_to_cpu(u->v); + break; -+ case FS_USAGE_INODES: ++ case BCH_FS_USAGE_inodes: + c->usage_base->nr_inodes = le64_to_cpu(u->v); + break; -+ case FS_USAGE_KEY_VERSION: ++ case BCH_FS_USAGE_key_version: + atomic64_set(&c->key_version, + le64_to_cpu(u->v)); + break; @@ -61824,10 +64809,7 @@ index 000000000000..c3b4d116275c + struct jset_entry_dev_usage *u = + container_of(entry, struct jset_entry_dev_usage, entry); + struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev)); -+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); -+ unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) / -+ sizeof(struct jset_entry_dev_usage_type); -+ unsigned i; ++ unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); + + ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec); + ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable); @@ -61942,6 +64924,8 @@ index 
000000000000..c3b4d116275c +{ + unsigned i; + struct bch_sb_field_clean *clean = *cleanp; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; + int ret = 0; + + if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, @@ -61954,7 +64938,6 @@ index 000000000000..c3b4d116275c + } + + for (i = 0; i < BTREE_ID_NR; i++) { -+ char buf1[200], buf2[200]; + struct bkey_i *k1, *k2; + unsigned l1 = 0, l2 = 0; + @@ -61964,6 +64947,19 @@ index 000000000000..c3b4d116275c + if (!k1 && !k2) + continue; + ++ printbuf_reset(&buf1); ++ printbuf_reset(&buf2); ++ ++ if (k1) ++ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1)); ++ else ++ pr_buf(&buf1, "(none)"); ++ ++ if (k2) ++ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2)); ++ else ++ pr_buf(&buf2, "(none)"); ++ + mustfix_fsck_err_on(!k1 || !k2 || + IS_ERR(k1) || + IS_ERR(k2) || @@ -61973,10 +64969,12 @@ index 000000000000..c3b4d116275c + "superblock btree root %u doesn't match journal after clean shutdown\n" + "sb: l=%u %s\n" + "journal: l=%u %s\n", i, -+ l1, (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(k1)), buf1), -+ l2, (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(k2)), buf2)); ++ l1, buf1.buf, ++ l2, buf2.buf); + } +fsck_err: ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); + return ret; +} + @@ -62003,7 +65001,7 @@ index 000000000000..c3b4d116275c + return ERR_PTR(-ENOMEM); + } + -+ ret = bch2_sb_clean_validate(c, clean, READ); ++ ret = bch2_sb_clean_validate_late(c, clean, READ); + if (ret) { + mutex_unlock(&c->sb_lock); + return ERR_PTR(ret); @@ -62099,7 +65097,6 @@ index 000000000000..c3b4d116275c + +static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) +{ -+ struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_inode_unpacked inode; @@ -62113,7 +65110,7 @@ index 000000000000..c3b4d116275c + goto err; + + if (!bkey_is_inode(k.k)) { -+ bch_err(c, "root inode not found"); ++ bch_err(trans->c, "root inode not found"); + ret = -ENOENT; + 
goto err; + } @@ -62191,8 +65188,8 @@ index 000000000000..c3b4d116275c + bch_info(c, "filesystem version is prior to subvol_dirent - upgrading"); + c->opts.version_upgrade = true; + c->opts.fsck = true; -+ } else if (c->sb.version < bcachefs_metadata_version_inode_v2) { -+ bch_info(c, "filesystem version is prior to inode_v2 - upgrading"); ++ } else if (c->sb.version < bcachefs_metadata_version_alloc_v4) { ++ bch_info(c, "filesystem version is prior to alloc_v4 - upgrading"); + c->opts.version_upgrade = true; + } + } @@ -62206,6 +65203,7 @@ index 000000000000..c3b4d116275c + if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { + struct journal_replay *i; + ++ bch_verbose(c, "starting journal read"); + ret = bch2_journal_read(c, &c->journal_entries, + &blacklist_seq, &journal_seq); + if (ret) @@ -62254,6 +65252,9 @@ index 000000000000..c3b4d116275c + blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1; + } + ++ if (c->opts.read_journal_only) ++ goto out; ++ + if (c->opts.reconstruct_alloc) { + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + drop_alloc_keys(&c->journal_keys); @@ -62295,7 +65296,11 @@ index 000000000000..c3b4d116275c + + bch_verbose(c, "starting alloc read"); + err = "error reading allocation information"; ++ ++ down_read(&c->gc_lock); + ret = bch2_alloc_read(c); ++ up_read(&c->gc_lock); ++ + if (ret) + goto err; + bch_verbose(c, "alloc read done"); @@ -62307,7 +65312,12 @@ index 000000000000..c3b4d116275c + goto err; + bch_verbose(c, "stripes_read done"); + -+ set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); ++ /* ++ * If we're not running fsck, this ensures bch2_fsck_err() calls are ++ * instead interpreted as bch2_inconsistent_err() calls: ++ */ ++ if (!c->opts.fsck) ++ set_bit(BCH_FS_FSCK_DONE, &c->flags); + + if (c->opts.fsck || + !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) || @@ -62315,18 +65325,32 @@ index 000000000000..c3b4d116275c + test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { + bool metadata_only = 
c->opts.norecovery; + -+ bch_info(c, "starting mark and sweep"); -+ err = "error in mark and sweep"; ++ bch_info(c, "checking allocations"); ++ err = "error checking allocations"; + ret = bch2_gc(c, true, metadata_only); + if (ret) + goto err; -+ bch_verbose(c, "mark and sweep done"); ++ bch_verbose(c, "done checking allocations"); ++ } ++ ++ if (c->opts.fsck) { ++ bch_info(c, "checking need_discard and freespace btrees"); ++ err = "error checking need_discard and freespace btrees"; ++ ret = bch2_check_alloc_info(c, true); ++ if (ret) ++ goto err; ++ ++ ret = bch2_check_lrus(c, true); ++ if (ret) ++ goto err; ++ bch_verbose(c, "done checking need_discard and freespace btrees"); + } + + bch2_stripes_heap_start(c); + + clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); + set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); ++ set_bit(BCH_FS_MAY_GO_RW, &c->flags); + + /* + * Skip past versions that might have possibly been used (as nonces), @@ -62338,30 +65362,18 @@ index 000000000000..c3b4d116275c + if (c->opts.norecovery) + goto out; + -+ bch_verbose(c, "starting journal replay"); ++ bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr); + err = "journal replay failed"; -+ ret = bch2_journal_replay(c, c->journal_keys); ++ ret = bch2_journal_replay(c); + if (ret) + goto err; -+ bch_verbose(c, "journal replay done"); ++ if (c->opts.verbose || !c->sb.clean) ++ bch_info(c, "journal replay done"); + -+ if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) && -+ !c->opts.nochanges) { -+ /* -+ * note that even when filesystem was clean there might be work -+ * to do here, if we ran gc (because of fsck) which recalculated -+ * oldest_gen: -+ */ -+ bch_verbose(c, "writing allocation info"); -+ err = "error writing out alloc info"; -+ ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW) ?: -+ bch2_alloc_write(c, BTREE_INSERT_LAZY_RW); -+ if (ret) { -+ bch_err(c, "error writing alloc info"); -+ goto err; -+ } -+ bch_verbose(c, "alloc write done"); -+ } ++ err = "error 
initializing freespace"; ++ ret = bch2_fs_freespace_init(c); ++ if (ret) ++ goto err; + + if (c->sb.version < bcachefs_metadata_version_snapshot_2) { + bch2_fs_lazy_rw(c); @@ -62412,23 +65424,6 @@ index 000000000000..c3b4d116275c + bch_verbose(c, "quotas done"); + } + -+ if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || -+ !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) { -+ struct bch_move_stats stats; -+ -+ bch_move_stats_init(&stats, "recovery"); -+ -+ bch_info(c, "scanning for old btree nodes"); -+ ret = bch2_fs_read_write(c); -+ if (ret) -+ goto err; -+ -+ ret = bch2_scan_old_btree_nodes(c, &stats); -+ if (ret) -+ goto err; -+ bch_info(c, "scanning for old btree nodes done"); -+ } -+ + mutex_lock(&c->sb_lock); + if (c->opts.version_upgrade) { + c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); @@ -62453,6 +65448,24 @@ index 000000000000..c3b4d116275c + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + ++ if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || ++ !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) || ++ le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written) { ++ struct bch_move_stats stats; ++ ++ bch_move_stats_init(&stats, "recovery"); ++ ++ bch_info(c, "scanning for old btree nodes"); ++ ret = bch2_fs_read_write(c); ++ if (ret) ++ goto err; ++ ++ ret = bch2_scan_old_btree_nodes(c, &stats); ++ if (ret) ++ goto err; ++ bch_info(c, "scanning for old btree nodes done"); ++ } ++ + if (c->journal_seq_blacklist_table && + c->journal_seq_blacklist_table->nr > 128) + queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); @@ -62500,20 +65513,15 @@ index 000000000000..c3b4d116275c + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); + bch2_write_super(c); + } -+ -+ for_each_online_member(ca, c, i) -+ bch2_mark_dev_superblock(c, ca, 0); + mutex_unlock(&c->sb_lock); + -+ set_bit(BCH_FS_ALLOC_READ_DONE, 
&c->flags); + set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); ++ set_bit(BCH_FS_MAY_GO_RW, &c->flags); ++ set_bit(BCH_FS_FSCK_DONE, &c->flags); + + for (i = 0; i < BTREE_ID_NR; i++) + bch2_btree_root_alloc(c, i); + -+ set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); -+ set_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); -+ + err = "unable to allocate journal buckets"; + for_each_online_member(ca, c, i) { + ret = bch2_dev_journal_alloc(ca); @@ -62539,6 +65547,7 @@ index 000000000000..c3b4d116275c + * Write out the superblock and journal buckets, now that we can do + * btree updates + */ ++ bch_verbose(c, "marking superblocks"); + err = "error marking superblock and journal"; + for_each_member_device(ca, c, i) { + ret = bch2_trans_mark_dev_sb(c, ca); @@ -62546,8 +65555,16 @@ index 000000000000..c3b4d116275c + percpu_ref_put(&ca->ref); + goto err; + } ++ ++ ca->new_fs_bucket_idx = 0; + } + ++ bch_verbose(c, "initializing freespace"); ++ err = "error initializing freespace"; ++ ret = bch2_fs_freespace_init(c); ++ if (ret) ++ goto err; ++ + err = "error creating root snapshot node"; + ret = bch2_fs_initialize_subvolumes(c); + if (ret) @@ -62614,10 +65631,10 @@ index 000000000000..c3b4d116275c +} diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h new file mode 100644 -index 000000000000..e45c70b3693f +index 000000000000..e6927a918df3 --- /dev/null +++ b/fs/bcachefs/recovery.h -@@ -0,0 +1,58 @@ +@@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_RECOVERY_H +#define _BCACHEFS_RECOVERY_H @@ -62651,24 +65668,32 @@ index 000000000000..e45c70b3693f + } last; +}; + ++size_t bch2_journal_key_search(struct journal_keys *, enum btree_id, ++ unsigned, struct bpos); ++struct bkey_i *bch2_journal_keys_peek(struct bch_fs *, enum btree_id, ++ unsigned, struct bpos pos); ++ ++int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, ++ unsigned, struct bkey_i *); +int bch2_journal_key_insert(struct bch_fs *, enum btree_id, + unsigned, struct 
bkey_i *); +int bch2_journal_key_delete(struct bch_fs *, enum btree_id, + unsigned, struct bpos); ++void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, ++ unsigned, struct bpos); + +void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); +struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); +struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *); + +void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); ++void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, ++ struct bch_fs *, struct btree *, ++ struct btree_node_iter, struct bpos); +void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, + struct bch_fs *, + struct btree *); + -+typedef int (*btree_walk_key_fn)(struct btree_trans *, struct bkey_s_c); -+ -+int bch2_btree_and_journal_walk(struct btree_trans *, enum btree_id, btree_walk_key_fn); -+ +void bch2_journal_keys_free(struct journal_keys *); +void bch2_journal_entries_free(struct list_head *); + @@ -62678,10 +65703,10 @@ index 000000000000..e45c70b3693f +#endif /* _BCACHEFS_RECOVERY_H */ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c new file mode 100644 -index 000000000000..8dcac7815c9f +index 000000000000..6824730945d4 --- /dev/null +++ b/fs/bcachefs/reflink.c -@@ -0,0 +1,367 @@ +@@ -0,0 +1,404 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "bkey_buf.h" @@ -62782,6 +65807,24 @@ index 000000000000..8dcac7815c9f + return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); +} + ++int bch2_trans_mark_reflink_v(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_i *new, ++ unsigned flags) ++{ ++ if (!(flags & BTREE_TRIGGER_OVERWRITE)) { ++ struct bkey_i_reflink_v *r = bkey_i_to_reflink_v(new); ++ ++ if (!r->v.refcount) { ++ r->k.type = KEY_TYPE_deleted; ++ r->k.size = 0; ++ set_bkey_val_u64s(&r->k, 0); ++ return 0; ++ } ++ } ++ ++ return 
bch2_trans_mark_extent(trans, old, new, flags); ++} ++ +/* indirect inline data */ + +const char *bch2_indirect_inline_data_invalid(const struct bch_fs *c, @@ -62803,6 +65846,24 @@ index 000000000000..8dcac7815c9f + min(datalen, 32U), d.v->data); +} + ++int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans, ++ struct bkey_s_c old, struct bkey_i *new, ++ unsigned flags) ++{ ++ if (!(flags & BTREE_TRIGGER_OVERWRITE)) { ++ struct bkey_i_indirect_inline_data *r = ++ bkey_i_to_indirect_inline_data(new); ++ ++ if (!r->v.refcount) { ++ r->k.type = KEY_TYPE_deleted; ++ r->k.size = 0; ++ set_bkey_val_u64s(&r->k, 0); ++ } ++ } ++ ++ return 0; ++} ++ +static int bch2_make_extent_indirect(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct bkey_i *orig) @@ -62868,7 +65929,8 @@ index 000000000000..8dcac7815c9f + + r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); + -+ ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, 0); ++ ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); +err: + c->reflink_hint = reflink_iter.pos.offset; + bch2_trans_iter_exit(trans, &reflink_iter); @@ -63051,10 +66113,10 @@ index 000000000000..8dcac7815c9f +} diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h new file mode 100644 -index 000000000000..3745873fd88d +index 000000000000..8eb41c0292eb --- /dev/null +++ b/fs/bcachefs/reflink.h -@@ -0,0 +1,63 @@ +@@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REFLINK_H +#define _BCACHEFS_REFLINK_H @@ -63067,27 +66129,37 @@ index 000000000000..3745873fd88d +#define bch2_bkey_ops_reflink_p (struct bkey_ops) { \ + .key_invalid = bch2_reflink_p_invalid, \ + .val_to_text = bch2_reflink_p_to_text, \ -+ .key_merge = bch2_reflink_p_merge, \ ++ .key_merge = bch2_reflink_p_merge, \ ++ .trans_trigger = bch2_trans_mark_reflink_p, \ ++ .atomic_trigger = bch2_mark_reflink_p, \ +} + +const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c); 
+void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); ++int bch2_trans_mark_reflink_v(struct btree_trans *, struct bkey_s_c, ++ struct bkey_i *, unsigned); + +#define bch2_bkey_ops_reflink_v (struct bkey_ops) { \ + .key_invalid = bch2_reflink_v_invalid, \ + .val_to_text = bch2_reflink_v_to_text, \ + .swab = bch2_ptr_swab, \ ++ .trans_trigger = bch2_trans_mark_reflink_v, \ ++ .atomic_trigger = bch2_mark_extent, \ +} + +const char *bch2_indirect_inline_data_invalid(const struct bch_fs *, + struct bkey_s_c); +void bch2_indirect_inline_data_to_text(struct printbuf *, + struct bch_fs *, struct bkey_s_c); ++int bch2_trans_mark_indirect_inline_data(struct btree_trans *, ++ struct bkey_s_c, struct bkey_i *, ++ unsigned); + +#define bch2_bkey_ops_indirect_inline_data (struct bkey_ops) { \ + .key_invalid = bch2_indirect_inline_data_invalid, \ + .val_to_text = bch2_indirect_inline_data_to_text, \ ++ .trans_trigger = bch2_trans_mark_indirect_inline_data, \ +} + +static inline const __le64 *bkey_refcount_c(struct bkey_s_c k) @@ -63120,10 +66192,10 @@ index 000000000000..3745873fd88d +#endif /* _BCACHEFS_REFLINK_H */ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c new file mode 100644 -index 000000000000..002006593044 +index 000000000000..c2771112d573 --- /dev/null +++ b/fs/bcachefs/replicas.c -@@ -0,0 +1,1094 @@ +@@ -0,0 +1,1073 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -63162,23 +66234,40 @@ index 000000000000..002006593044 + eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); +} + ++void bch2_replicas_entry_v0_to_text(struct printbuf *out, ++ struct bch_replicas_entry_v0 *e) ++{ ++ unsigned i; ++ ++ if (e->data_type < BCH_DATA_NR) ++ pr_buf(out, "%s", bch2_data_types[e->data_type]); ++ else ++ pr_buf(out, "(invalid data type %u)", e->data_type); ++ ++ pr_buf(out, ": %u [", e->nr_devs); ++ for (i = 0; i < e->nr_devs; i++) ++ pr_buf(out, i ? 
" %u" : "%u", e->devs[i]); ++ pr_buf(out, "]"); ++} ++ +void bch2_replicas_entry_to_text(struct printbuf *out, + struct bch_replicas_entry *e) +{ + unsigned i; + -+ pr_buf(out, "%s: %u/%u [", -+ bch2_data_types[e->data_type], -+ e->nr_required, -+ e->nr_devs); ++ if (e->data_type < BCH_DATA_NR) ++ pr_buf(out, "%s", bch2_data_types[e->data_type]); ++ else ++ pr_buf(out, "(invalid data type %u)", e->data_type); + ++ pr_buf(out, ": %u/%u [", e->nr_required, e->nr_devs); + for (i = 0; i < e->nr_devs; i++) + pr_buf(out, i ? " %u" : "%u", e->devs[i]); + pr_buf(out, "]"); +} + +void bch2_cpu_replicas_to_text(struct printbuf *out, -+ struct bch_replicas_cpu *r) ++ struct bch_replicas_cpu *r) +{ + struct bch_replicas_entry *e; + bool first = true; @@ -63539,75 +66628,14 @@ index 000000000000..002006593044 + goto out; +} + -+static int __bch2_mark_replicas(struct bch_fs *c, -+ struct bch_replicas_entry *r, -+ bool check) -+{ -+ return likely(bch2_replicas_marked(c, r)) ? 0 -+ : check ? -1 -+ : bch2_mark_replicas_slowpath(c, r); -+} -+ +int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r) +{ -+ return __bch2_mark_replicas(c, r, false); -+} -+ -+static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k, -+ bool check) -+{ -+ struct bch_replicas_padded search; -+ struct bch_devs_list cached = bch2_bkey_cached_devs(k); -+ unsigned i; -+ int ret; -+ -+ memset(&search, 0, sizeof(search)); -+ -+ for (i = 0; i < cached.nr; i++) { -+ bch2_replicas_entry_cached(&search.e, cached.devs[i]); -+ -+ ret = __bch2_mark_replicas(c, &search.e, check); -+ if (ret) -+ return ret; -+ } -+ -+ bch2_bkey_to_replicas(&search.e, k); -+ -+ ret = __bch2_mark_replicas(c, &search.e, check); -+ if (ret) -+ return ret; -+ -+ if (search.e.data_type == BCH_DATA_parity) { -+ search.e.data_type = BCH_DATA_cached; -+ ret = __bch2_mark_replicas(c, &search.e, check); -+ if (ret) -+ return ret; -+ -+ search.e.data_type = BCH_DATA_user; -+ ret = __bch2_mark_replicas(c, &search.e, 
check); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; ++ return likely(bch2_replicas_marked(c, r)) ++ ? 0 : bch2_mark_replicas_slowpath(c, r); +} + +/* replicas delta list: */ + -+bool bch2_replicas_delta_list_marked(struct bch_fs *c, -+ struct replicas_delta_list *r) -+{ -+ struct replicas_delta *d = r->d; -+ struct replicas_delta *top = (void *) r->d + r->used; -+ -+ percpu_rwsem_assert_held(&c->mark_lock); -+ -+ for (d = r->d; d != top; d = replicas_delta_next(d)) -+ if (bch2_replicas_entry_idx(c, &d->r) < 0) -+ return false; -+ return true; -+} -+ +int bch2_replicas_delta_list_mark(struct bch_fs *c, + struct replicas_delta_list *r) +{ @@ -63620,19 +66648,6 @@ index 000000000000..002006593044 + return ret; +} + -+/* bkey replicas: */ -+ -+bool bch2_bkey_replicas_marked(struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ return __bch2_mark_bkey_replicas(c, k, true) == 0; -+} -+ -+int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) -+{ -+ return __bch2_mark_bkey_replicas(c, k, false); -+} -+ +/* + * Old replicas_gc mechanism: only used for journal replicas entries now, should + * die at some point: @@ -64000,67 +67015,78 @@ index 000000000000..002006593044 + return 0; +} + -+static const char *check_dup_replicas_entries(struct bch_replicas_cpu *cpu_r) ++static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, ++ struct bch_sb *sb, ++ struct printbuf *err) +{ -+ unsigned i; ++ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); ++ unsigned i, j; + + sort_cmp_size(cpu_r->entries, + cpu_r->nr, + cpu_r->entry_size, + memcmp, NULL); + -+ for (i = 0; i + 1 < cpu_r->nr; i++) { -+ struct bch_replicas_entry *l = ++ for (i = 0; i < cpu_r->nr; i++) { ++ struct bch_replicas_entry *e = + cpu_replicas_entry(cpu_r, i); -+ struct bch_replicas_entry *r = -+ cpu_replicas_entry(cpu_r, i + 1); + -+ BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0); ++ if (e->data_type >= BCH_DATA_NR) { ++ pr_buf(err, "invalid data type in entry "); ++ 
bch2_replicas_entry_to_text(err, e); ++ return -EINVAL; ++ } + -+ if (!memcmp(l, r, cpu_r->entry_size)) -+ return "duplicate replicas entry"; ++ if (!e->nr_devs) { ++ pr_buf(err, "no devices in entry "); ++ bch2_replicas_entry_to_text(err, e); ++ return -EINVAL; ++ } ++ ++ if (e->nr_required > 1 && ++ e->nr_required >= e->nr_devs) { ++ pr_buf(err, "bad nr_required in entry "); ++ bch2_replicas_entry_to_text(err, e); ++ return -EINVAL; ++ } ++ ++ for (j = 0; j < e->nr_devs; j++) ++ if (!bch2_dev_exists(sb, mi, e->devs[j])) { ++ pr_buf(err, "invalid device %u in entry ", e->devs[j]); ++ bch2_replicas_entry_to_text(err, e); ++ return -EINVAL; ++ } ++ ++ if (i + 1 < cpu_r->nr) { ++ struct bch_replicas_entry *n = ++ cpu_replicas_entry(cpu_r, i + 1); ++ ++ BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0); ++ ++ if (!memcmp(e, n, cpu_r->entry_size)) { ++ pr_buf(err, "duplicate replicas entry "); ++ bch2_replicas_entry_to_text(err, e); ++ return -EINVAL; ++ } ++ } + } + -+ return NULL; ++ return 0; +} + -+static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f) ++static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f, ++ struct printbuf *err) +{ + struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); -+ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); -+ struct bch_replicas_cpu cpu_r = { .entries = NULL }; -+ struct bch_replicas_entry *e; -+ const char *err; -+ unsigned i; ++ struct bch_replicas_cpu cpu_r; ++ int ret; + -+ for_each_replicas_entry(sb_r, e) { -+ err = "invalid replicas entry: invalid data type"; -+ if (e->data_type >= BCH_DATA_NR) -+ goto err; -+ -+ err = "invalid replicas entry: no devices"; -+ if (!e->nr_devs) -+ goto err; -+ -+ err = "invalid replicas entry: bad nr_required"; -+ if (e->nr_required > 1 && -+ e->nr_required >= e->nr_devs) -+ goto err; -+ -+ err = "invalid replicas entry: invalid device"; -+ for (i = 0; i < e->nr_devs; i++) -+ if (!bch2_dev_exists(sb, mi, e->devs[i])) -+ 
goto err; -+ } -+ -+ err = "cannot allocate memory"; + if (__bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r)) -+ goto err; ++ return -ENOMEM; + -+ err = check_dup_replicas_entries(&cpu_r); -+err: ++ ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); + kfree(cpu_r.entries); -+ return err; ++ return ret; +} + +static void bch2_sb_replicas_to_text(struct printbuf *out, @@ -64078,49 +67104,50 @@ index 000000000000..002006593044 + + bch2_replicas_entry_to_text(out, e); + } ++ pr_newline(out); +} + +const struct bch_sb_field_ops bch_sb_field_ops_replicas = { -+ .validate = bch2_sb_validate_replicas, ++ .validate = bch2_sb_replicas_validate, + .to_text = bch2_sb_replicas_to_text, +}; + -+static const char *bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f) ++static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f, ++ struct printbuf *err) +{ + struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); -+ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); -+ struct bch_replicas_cpu cpu_r = { .entries = NULL }; -+ struct bch_replicas_entry_v0 *e; -+ const char *err; -+ unsigned i; ++ struct bch_replicas_cpu cpu_r; ++ int ret; + -+ for_each_replicas_entry_v0(sb_r, e) { -+ err = "invalid replicas entry: invalid data type"; -+ if (e->data_type >= BCH_DATA_NR) -+ goto err; -+ -+ err = "invalid replicas entry: no devices"; -+ if (!e->nr_devs) -+ goto err; -+ -+ err = "invalid replicas entry: invalid device"; -+ for (i = 0; i < e->nr_devs; i++) -+ if (!bch2_dev_exists(sb, mi, e->devs[i])) -+ goto err; -+ } -+ -+ err = "cannot allocate memory"; + if (__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r)) -+ goto err; ++ return -ENOMEM; + -+ err = check_dup_replicas_entries(&cpu_r); -+err: ++ ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); + kfree(cpu_r.entries); -+ return err; ++ return ret; ++} ++ ++static void bch2_sb_replicas_v0_to_text(struct printbuf *out, ++ struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ 
++ struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); ++ struct bch_replicas_entry_v0 *e; ++ bool first = true; ++ ++ for_each_replicas_entry(sb_r, e) { ++ if (!first) ++ pr_buf(out, " "); ++ first = false; ++ ++ bch2_replicas_entry_v0_to_text(out, e); ++ } ++ pr_newline(out); +} + +const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { -+ .validate = bch2_sb_validate_replicas_v0, ++ .validate = bch2_sb_replicas_v0_validate, ++ .to_text = bch2_sb_replicas_v0_to_text, +}; + +/* Query replicas: */ @@ -64161,11 +67188,12 @@ index 000000000000..002006593044 + + if (dflags & ~flags) { + if (print) { -+ char buf[100]; ++ struct printbuf buf = PRINTBUF; + -+ bch2_replicas_entry_to_text(&PBUF(buf), e); ++ bch2_replicas_entry_to_text(&buf, e); + bch_err(c, "insufficient devices online (%u) for replicas entry %s", -+ nr_online, buf); ++ nr_online, buf.buf); ++ printbuf_exit(&buf); + } + ret = false; + break; @@ -64177,19 +67205,42 @@ index 000000000000..002006593044 + return ret; +} + ++unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) ++{ ++ struct bch_sb_field_replicas *replicas; ++ struct bch_sb_field_replicas_v0 *replicas_v0; ++ unsigned i, data_has = 0; ++ ++ replicas = bch2_sb_get_replicas(sb); ++ replicas_v0 = bch2_sb_get_replicas_v0(sb); ++ ++ if (replicas) { ++ struct bch_replicas_entry *r; ++ ++ for_each_replicas_entry(replicas, r) ++ for (i = 0; i < r->nr_devs; i++) ++ if (r->devs[i] == dev) ++ data_has |= 1 << r->data_type; ++ } else if (replicas_v0) { ++ struct bch_replicas_entry_v0 *r; ++ ++ for_each_replicas_entry_v0(replicas_v0, r) ++ for (i = 0; i < r->nr_devs; i++) ++ if (r->devs[i] == dev) ++ data_has |= 1 << r->data_type; ++ } ++ ++ ++ return data_has; ++} ++ +unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) +{ -+ struct bch_replicas_entry *e; -+ unsigned i, ret = 0; ++ unsigned ret; + -+ percpu_down_read(&c->mark_lock); -+ -+ for_each_cpu_replicas_entry(&c->replicas, e) -+ for (i = 0; i < 
e->nr_devs; i++) -+ if (e->devs[i] == ca->dev_idx) -+ ret |= 1 << e->data_type; -+ -+ percpu_up_read(&c->mark_lock); ++ mutex_lock(&c->sb_lock); ++ ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx); ++ mutex_unlock(&c->sb_lock); + + return ret; +} @@ -64220,10 +67271,10 @@ index 000000000000..002006593044 +} diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h new file mode 100644 -index 000000000000..72ac544f16d8 +index 000000000000..87820b2e1ad3 --- /dev/null +++ b/fs/bcachefs/replicas.h -@@ -0,0 +1,108 @@ +@@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REPLICAS_H +#define _BCACHEFS_REPLICAS_H @@ -64274,12 +67325,9 @@ index 000000000000..72ac544f16d8 + return (void *) d + replicas_entry_bytes(&d->r) + 8; +} + -+bool bch2_replicas_delta_list_marked(struct bch_fs *, struct replicas_delta_list *); +int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *); + +void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); -+bool bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c); -+int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c); + +static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, + unsigned dev) @@ -64293,6 +67341,7 @@ index 000000000000..72ac544f16d8 +bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask, + unsigned, bool); + ++unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned); +unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); + +int bch2_replicas_gc_end(struct bch_fs *, int); @@ -64622,10 +67671,10 @@ index 000000000000..3dfaf34a43b2 +#endif /* _SIPHASH_H_ */ diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h new file mode 100644 -index 000000000000..57d636740d2f +index 000000000000..591bbb9f8beb --- /dev/null +++ b/fs/bcachefs/str_hash.h -@@ -0,0 +1,358 @@ +@@ -0,0 +1,351 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_STR_HASH_H +#define _BCACHEFS_STR_HASH_H @@ -64791,12 +67840,10 
@@ index 000000000000..57d636740d2f + if (ret) + return ret; + -+ for_each_btree_key_norestart(trans, *iter, desc.btree_id, ++ for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, + SPOS(inum.inum, desc.hash_key(info, key), snapshot), ++ POS(inum.inum, U64_MAX), + BTREE_ITER_SLOTS|flags, k, ret) { -+ if (iter->pos.inode != inum.inum) -+ break; -+ + if (is_visible_key(desc, inum, k)) { + if (!desc.cmp_key(k, key)) + return 0; @@ -64827,15 +67874,12 @@ index 000000000000..57d636740d2f + if (ret) + return ret; + -+ for_each_btree_key_norestart(trans, *iter, desc.btree_id, ++ for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, + SPOS(inum.inum, desc.hash_key(info, key), snapshot), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { -+ if (iter->pos.inode != inum.inum) -+ break; -+ ++ POS(inum.inum, U64_MAX), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) + if (!is_visible_key(desc, inum, k)) + return 0; -+ } + bch2_trans_iter_exit(trans, iter); + + return ret ?: -ENOSPC; @@ -64888,14 +67932,12 @@ index 000000000000..57d636740d2f + if (ret) + return ret; + -+ for_each_btree_key_norestart(trans, iter, desc.btree_id, ++ for_each_btree_key_upto_norestart(trans, iter, desc.btree_id, + SPOS(inum.inum, + desc.hash_bkey(info, bkey_i_to_s_c(insert)), + snapshot), ++ POS(inum.inum, U64_MAX), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { -+ if (iter.pos.inode != inum.inum) -+ break; -+ + if (is_visible_key(desc, inum, k)) { + if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) + goto found; @@ -64986,10 +68028,10 @@ index 000000000000..57d636740d2f +#endif /* _BCACHEFS_STR_HASH_H */ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c new file mode 100644 -index 000000000000..7e909a118189 +index 000000000000..cdb89ba216cc --- /dev/null +++ b/fs/bcachefs/subvolume.c -@@ -0,0 +1,1084 @@ +@@ -0,0 +1,1075 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -65131,7 +68173,7 @@ index 000000000000..7e909a118189 + for_each_btree_key(trans, iter, 
BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + u32 id = k.k->p.offset, child[2]; -+ unsigned nr_live = 0, live_idx; ++ unsigned nr_live = 0, live_idx = 0; + + if (k.k->type != KEY_TYPE_snapshot) + continue; @@ -65143,7 +68185,7 @@ index 000000000000..7e909a118189 + for (i = 0; i < 2; i++) { + ret = snapshot_live(trans, child[i]); + if (ret < 0) -+ break; ++ goto err; + + if (ret) + live_idx = i; @@ -65154,6 +68196,7 @@ index 000000000000..7e909a118189 + ? snapshot_t(c, child[live_idx])->equiv + : id; + } ++err: + bch2_trans_iter_exit(trans, &iter); + + if (ret) @@ -65448,10 +68491,10 @@ index 000000000000..7e909a118189 + return ret; +} + -+static int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, -+ u32 *new_snapids, -+ u32 *snapshot_subvols, -+ unsigned nr_snapids) ++int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, ++ u32 *new_snapids, ++ u32 *snapshot_subvols, ++ unsigned nr_snapids) +{ + struct btree_iter iter; + struct bkey_i_snapshot *n; @@ -65480,7 +68523,7 @@ index 000000000000..7e909a118189 + n = bch2_trans_kmalloc(trans, sizeof(*n)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) -+ return ret; ++ goto err; + + bkey_snapshot_init(&n->k_i); + n->k.p = iter.pos; @@ -65490,11 +68533,10 @@ index 000000000000..7e909a118189 + n->v.pad = 0; + SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); + -+ bch2_trans_update(trans, &iter, &n->k_i, 0); -+ -+ ret = bch2_mark_snapshot(trans, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); ++ ret = bch2_trans_update(trans, &iter, &n->k_i, 0) ?: ++ bch2_mark_snapshot(trans, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); + if (ret) -+ break; ++ goto err; + + new_snapids[i] = iter.pos.offset; + } @@ -65515,7 +68557,7 @@ index 000000000000..7e909a118189 + n = bch2_trans_kmalloc(trans, sizeof(*n)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) -+ return ret; ++ goto err; + + bkey_reassemble(&n->k_i, k); + @@ -65528,43 +68570,30 @@ index 000000000000..7e909a118189 + n->v.children[0] = cpu_to_le32(new_snapids[0]); + n->v.children[1] 
= cpu_to_le32(new_snapids[1]); + SET_BCH_SNAPSHOT_SUBVOL(&n->v, false); -+ bch2_trans_update(trans, &iter, &n->k_i, 0); ++ ret = bch2_trans_update(trans, &iter, &n->k_i, 0); ++ if (ret) ++ goto err; + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + -+static int snapshot_id_add(struct snapshot_id_list *s, u32 id) ++static int snapshot_id_add(snapshot_id_list *s, u32 id) +{ + BUG_ON(snapshot_list_has_id(s, id)); + -+ if (s->nr == s->size) { -+ size_t new_size = max(8U, s->size * 2); -+ void *n = krealloc(s->d, -+ new_size * sizeof(s->d[0]), -+ GFP_KERNEL); -+ if (!n) { -+ pr_err("error allocating snapshot ID list"); -+ return -ENOMEM; -+ } -+ -+ s->d = n; -+ s->size = new_size; -+ }; -+ -+ s->d[s->nr++] = id; -+ return 0; ++ return darray_push(*s, id); +} + +static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans, -+ struct snapshot_id_list *deleted, ++ snapshot_id_list *deleted, + enum btree_id btree_id) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; -+ struct snapshot_id_list equiv_seen = { 0 }; ++ snapshot_id_list equiv_seen = { 0 }; + struct bpos last_pos = POS_MIN; + int ret = 0; + @@ -65611,7 +68640,7 @@ index 000000000000..7e909a118189 + } + bch2_trans_iter_exit(trans, &iter); + -+ kfree(equiv_seen.d); ++ darray_exit(equiv_seen); + + return ret; +} @@ -65623,7 +68652,7 @@ index 000000000000..7e909a118189 + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_snapshot snap; -+ struct snapshot_id_list deleted = { 0 }; ++ snapshot_id_list deleted = { 0 }; + u32 i, id, children[2]; + int ret = 0; + @@ -65703,15 +68732,15 @@ index 000000000000..7e909a118189 + + for (i = 0; i < deleted.nr; i++) { + ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ bch2_snapshot_node_delete(&trans, deleted.d[i])); ++ bch2_snapshot_node_delete(&trans, deleted.data[i])); + if (ret) { + bch_err(c, "error deleting snapshot %u: %i", -+ deleted.d[i], ret); ++ deleted.data[i], ret); + goto err; + } + } +err: -+ 
kfree(deleted.d); ++ darray_exit(deleted); + bch2_trans_exit(&trans); + percpu_ref_put(&c->writes); +} @@ -65866,14 +68895,14 @@ index 000000000000..7e909a118189 +{ + struct bch_fs *c = container_of(work, struct bch_fs, + snapshot_wait_for_pagecache_and_delete_work); -+ struct snapshot_id_list s; ++ snapshot_id_list s; + u32 *id; + int ret = 0; + + while (!ret) { + mutex_lock(&c->snapshots_unlinked_lock); + s = c->snapshots_unlinked; -+ memset(&c->snapshots_unlinked, 0, sizeof(c->snapshots_unlinked)); ++ darray_init(c->snapshots_unlinked); + mutex_unlock(&c->snapshots_unlinked_lock); + + if (!s.nr) @@ -65881,7 +68910,7 @@ index 000000000000..7e909a118189 + + bch2_evict_subvolume_inodes(c, &s); + -+ for (id = s.d; id < s.d + s.nr; id++) { ++ for (id = s.data; id < s.data + s.nr; id++) { + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_subvolume_delete(&trans, *id)); + if (ret) { @@ -65890,7 +68919,7 @@ index 000000000000..7e909a118189 + } + } + -+ kfree(s.d); ++ darray_exit(s); + } + + percpu_ref_put(&c->writes); @@ -66041,7 +69070,9 @@ index 000000000000..7e909a118189 + + if (src_subvolid) { + src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]); -+ bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0); ++ ret = bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0); ++ if (ret) ++ goto err; + } + + new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol)); @@ -66056,7 +69087,9 @@ index 000000000000..7e909a118189 + SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro); + SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0); + new_subvol->k.p = dst_iter.pos; -+ bch2_trans_update(trans, &dst_iter, &new_subvol->k_i, 0); ++ ret = bch2_trans_update(trans, &dst_iter, &new_subvol->k_i, 0); ++ if (ret) ++ goto err; + + *new_subvolid = new_subvol->k.p.offset; + *new_snapshotid = new_nodes[0]; @@ -66076,14 +69109,15 @@ index 000000000000..7e909a118189 +} diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h new file mode 100644 -index 
000000000000..e4c3fdcdf22f +index 000000000000..f609291acafa --- /dev/null +++ b/fs/bcachefs/subvolume.h -@@ -0,0 +1,132 @@ +@@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUBVOLUME_H +#define _BCACHEFS_SUBVOLUME_H + ++#include "darray.h" +#include "subvolume_types.h" + +void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -66140,15 +69174,13 @@ index 000000000000..e4c3fdcdf22f + +struct snapshots_seen { + struct bpos pos; -+ size_t nr; -+ size_t size; -+ u32 *d; ++ DARRAY(u32) ids; +}; + +static inline void snapshots_seen_exit(struct snapshots_seen *s) +{ -+ kfree(s->d); -+ s->d = NULL; ++ kfree(s->ids.data); ++ s->ids.data = NULL; +} + +static inline void snapshots_seen_init(struct snapshots_seen *s) @@ -66158,30 +69190,19 @@ index 000000000000..e4c3fdcdf22f + +static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id) +{ -+ if (s->nr == s->size) { -+ size_t new_size = max(s->size, (size_t) 128) * 2; -+ u32 *d = krealloc(s->d, new_size * sizeof(s->d[0]), GFP_KERNEL); -+ -+ if (!d) { -+ bch_err(c, "error reallocating snapshots_seen table (new size %zu)", -+ new_size); -+ return -ENOMEM; -+ } -+ -+ s->size = new_size; -+ s->d = d; -+ } -+ -+ s->d[s->nr++] = id; -+ return 0; ++ int ret = darray_push(s->ids, id); ++ if (ret) ++ bch_err(c, "error reallocating snapshots_seen table (size %zu)", ++ s->ids.size); ++ return ret; +} + -+static inline bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id) ++static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) +{ -+ unsigned i; ++ u32 *i; + -+ for (i = 0; i < s->nr; i++) -+ if (id == s->d[i]) ++ darray_for_each(*s, i) ++ if (*i == id) + return true; + return false; +} @@ -66204,6 +69225,10 @@ index 000000000000..e4c3fdcdf22f + struct bch_subvolume *); +int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); + ++/* only exported for tests: */ ++int bch2_snapshot_node_create(struct btree_trans *, u32, ++ 
u32 *, u32 *, unsigned); ++ +int bch2_subvolume_delete(struct btree_trans *, u32); +int bch2_subvolume_unlink(struct btree_trans *, u32); +int bch2_subvolume_create(struct btree_trans *, u64, u32, @@ -66214,27 +69239,25 @@ index 000000000000..e4c3fdcdf22f +#endif /* _BCACHEFS_SUBVOLUME_H */ diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h new file mode 100644 -index 000000000000..9410b9587591 +index 000000000000..f7562b5d51df --- /dev/null +++ b/fs/bcachefs/subvolume_types.h -@@ -0,0 +1,11 @@ +@@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUBVOLUME_TYPES_H +#define _BCACHEFS_SUBVOLUME_TYPES_H + -+struct snapshot_id_list { -+ u32 nr; -+ u32 size; -+ u32 *d; -+}; ++#include "darray.h" ++ ++typedef DARRAY(u32) snapshot_id_list; + +#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c new file mode 100644 -index 000000000000..88a8e54fbd7a +index 000000000000..71abf87114df --- /dev/null +++ b/fs/bcachefs/super-io.c -@@ -0,0 +1,1202 @@ +@@ -0,0 +1,1601 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -66247,6 +69270,7 @@ index 000000000000..88a8e54fbd7a +#include "io.h" +#include "journal.h" +#include "journal_io.h" ++#include "journal_sb.h" +#include "journal_seq_blacklist.h" +#include "replicas.h" +#include "quota.h" @@ -66264,8 +69288,8 @@ index 000000000000..88a8e54fbd7a + NULL +}; + -+static const char *bch2_sb_field_validate(struct bch_sb *, -+ struct bch_sb_field *); ++static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *, ++ struct printbuf *); + +struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb, + enum bch_sb_field_type type) @@ -66439,22 +69463,31 @@ index 000000000000..88a8e54fbd7a + BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); +} + -+static const char *validate_sb_layout(struct bch_sb_layout *layout) ++static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out) +{ + u64 offset, 
prev_offset, max_sectors; + unsigned i; + -+ if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) -+ return "Not a bcachefs superblock layout"; ++ if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) { ++ pr_buf(out, "Not a bcachefs superblock layout"); ++ return -EINVAL; ++ } + -+ if (layout->layout_type != 0) -+ return "Invalid superblock layout type"; ++ if (layout->layout_type != 0) { ++ pr_buf(out, "Invalid superblock layout type %u", ++ layout->layout_type); ++ return -EINVAL; ++ } + -+ if (!layout->nr_superblocks) -+ return "Invalid superblock layout: no superblocks"; ++ if (!layout->nr_superblocks) { ++ pr_buf(out, "Invalid superblock layout: no superblocks"); ++ return -EINVAL; ++ } + -+ if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) -+ return "Invalid superblock layout: too many superblocks"; ++ if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) { ++ pr_buf(out, "Invalid superblock layout: too many superblocks"); ++ return -EINVAL; ++ } + + max_sectors = 1 << layout->sb_max_size_bits; + @@ -66463,126 +69496,163 @@ index 000000000000..88a8e54fbd7a + for (i = 1; i < layout->nr_superblocks; i++) { + offset = le64_to_cpu(layout->sb_offset[i]); + -+ if (offset < prev_offset + max_sectors) -+ return "Invalid superblock layout: superblocks overlap"; ++ if (offset < prev_offset + max_sectors) { ++ pr_buf(out, "Invalid superblock layout: superblocks overlap\n" ++ " (sb %u ends at %llu next starts at %llu", ++ i - 1, prev_offset + max_sectors, offset); ++ return -EINVAL; ++ } + prev_offset = offset; + } + -+ return NULL; ++ return 0; +} + -+const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) ++static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, ++ int rw) +{ + struct bch_sb *sb = disk_sb->sb; + struct bch_sb_field *f; + struct bch_sb_field_members *mi; -+ const char *err; ++ enum bch_opt_id opt_id; + u32 version, version_min; + u16 block_size; ++ int ret; + + version = le16_to_cpu(sb->version); -+ version_min = version 
>= bcachefs_metadata_version_new_versioning ++ version_min = version >= bcachefs_metadata_version_bkey_renumber + ? le16_to_cpu(sb->version_min) + : version; + -+ if (version >= bcachefs_metadata_version_max || -+ version_min < bcachefs_metadata_version_min) -+ return "Unsupported superblock version"; ++ if (version >= bcachefs_metadata_version_max) { ++ pr_buf(out, "Unsupported superblock version %u (min %u, max %u)", ++ version, bcachefs_metadata_version_min, bcachefs_metadata_version_max); ++ return -EINVAL; ++ } + -+ if (version_min > version) -+ return "Bad minimum version"; ++ if (version_min < bcachefs_metadata_version_min) { ++ pr_buf(out, "Unsupported superblock version %u (min %u, max %u)", ++ version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max); ++ return -EINVAL; ++ } ++ ++ if (version_min > version) { ++ pr_buf(out, "Bad minimum version %u, greater than version field %u", ++ version_min, version); ++ return -EINVAL; ++ } + + if (sb->features[1] || -+ (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) -+ return "Filesystem has incompatible features"; ++ (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) { ++ pr_buf(out, "Filesystem has incompatible features"); ++ return -EINVAL; ++ } + + block_size = le16_to_cpu(sb->block_size); + -+ if (!is_power_of_2(block_size) || -+ block_size > PAGE_SECTORS) -+ return "Bad block size"; ++ if (block_size > PAGE_SECTORS) { ++ pr_buf(out, "Block size too big (got %u, max %u)", ++ block_size, PAGE_SECTORS); ++ return -EINVAL; ++ } + -+ if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le))) -+ return "Bad user UUID"; ++ if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le))) { ++ pr_buf(out, "Bad user UUID (got zeroes)"); ++ return -EINVAL; ++ } + -+ if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le))) -+ return "Bad internal UUID"; ++ if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le))) { ++ pr_buf(out, "Bad intenal UUID (got zeroes)"); ++ return -EINVAL; ++ } + + if (!sb->nr_devices || -+ 
sb->nr_devices <= sb->dev_idx || -+ sb->nr_devices > BCH_SB_MEMBERS_MAX) -+ return "Bad number of member devices"; ++ sb->nr_devices > BCH_SB_MEMBERS_MAX) { ++ pr_buf(out, "Bad number of member devices %u (max %u)", ++ sb->nr_devices, BCH_SB_MEMBERS_MAX); ++ return -EINVAL; ++ } + -+ if (!BCH_SB_META_REPLICAS_WANT(sb) || -+ BCH_SB_META_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX) -+ return "Invalid number of metadata replicas"; -+ -+ if (!BCH_SB_META_REPLICAS_REQ(sb) || -+ BCH_SB_META_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX) -+ return "Invalid number of metadata replicas"; -+ -+ if (!BCH_SB_DATA_REPLICAS_WANT(sb) || -+ BCH_SB_DATA_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX) -+ return "Invalid number of data replicas"; -+ -+ if (!BCH_SB_DATA_REPLICAS_REQ(sb) || -+ BCH_SB_DATA_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX) -+ return "Invalid number of data replicas"; -+ -+ if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) -+ return "Invalid metadata checksum type"; -+ -+ if (BCH_SB_DATA_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) -+ return "Invalid metadata checksum type"; -+ -+ if (BCH_SB_COMPRESSION_TYPE(sb) >= BCH_COMPRESSION_OPT_NR) -+ return "Invalid compression type"; -+ -+ if (!BCH_SB_BTREE_NODE_SIZE(sb)) -+ return "Btree node size not set"; -+ -+ if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb))) -+ return "Btree node size not a power of two"; -+ -+ if (BCH_SB_GC_RESERVE(sb) < 5) -+ return "gc reserve percentage too small"; ++ if (sb->dev_idx >= sb->nr_devices) { ++ pr_buf(out, "Bad dev_idx (got %u, nr_devices %u)", ++ sb->dev_idx, sb->nr_devices); ++ return -EINVAL; ++ } + + if (!sb->time_precision || -+ le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) -+ return "invalid time precision"; ++ le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) { ++ pr_buf(out, "Invalid time precision: %u (min 1, max %lu)", ++ le32_to_cpu(sb->time_precision), NSEC_PER_SEC); ++ return -EINVAL; ++ } ++ ++ if (rw == READ) { ++ /* ++ * Been seeing a bug where these are getting inexplicably ++ * zeroed, so we'r now validating 
them, but we have to be ++ * careful not to preven people's filesystems from mounting: ++ */ ++ if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb)) ++ SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); ++ if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) ++ SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 1000); ++ } ++ ++ for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { ++ const struct bch_option *opt = bch2_opt_table + opt_id; ++ ++ if (opt->get_sb != BCH2_NO_SB_OPT) { ++ u64 v = bch2_opt_from_sb(sb, opt_id); ++ ++ pr_buf(out, "Invalid option "); ++ ret = bch2_opt_validate(opt, v, out); ++ if (ret) ++ return ret; ++ ++ printbuf_reset(out); ++ } ++ } + + /* validate layout */ -+ err = validate_sb_layout(&sb->layout); -+ if (err) -+ return err; ++ ret = validate_sb_layout(&sb->layout, out); ++ if (ret) ++ return ret; + + vstruct_for_each(sb, f) { -+ if (!f->u64s) -+ return "Invalid superblock: invalid optional field"; ++ if (!f->u64s) { ++ pr_buf(out, "Invalid superblock: optional with size 0 (type %u)", ++ le32_to_cpu(f->type)); ++ return -EINVAL; ++ } + -+ if (vstruct_next(f) > vstruct_last(sb)) -+ return "Invalid superblock: invalid optional field"; ++ if (vstruct_next(f) > vstruct_last(sb)) { ++ pr_buf(out, "Invalid superblock: optional field extends past end of superblock (type %u)", ++ le32_to_cpu(f->type)); ++ return -EINVAL; ++ } + } + + /* members must be validated first: */ + mi = bch2_sb_get_members(sb); -+ if (!mi) -+ return "Invalid superblock: member info area missing"; ++ if (!mi) { ++ pr_buf(out, "Invalid superblock: member info area missing"); ++ return -EINVAL; ++ } + -+ err = bch2_sb_field_validate(sb, &mi->field); -+ if (err) -+ return err; ++ ret = bch2_sb_field_validate(sb, &mi->field, out); ++ if (ret) ++ return ret; + + vstruct_for_each(sb, f) { + if (le32_to_cpu(f->type) == BCH_SB_FIELD_members) + continue; + -+ err = bch2_sb_field_validate(sb, f); -+ if (err) -+ return err; ++ ret = bch2_sb_field_validate(sb, f, out); ++ if (ret) ++ return ret; + } + -+ return NULL; ++ return 0; +} 
+ +/* device open: */ @@ -66603,7 +69673,6 @@ index 000000000000..88a8e54fbd7a + c->sb.nr_devices = src->nr_devices; + c->sb.clean = BCH_SB_CLEAN(src); + c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); -+ c->sb.encoded_extent_max= 1 << BCH_SB_ENCODED_EXTENT_MAX_BITS(src); + + c->sb.nsec_per_time_unit = le32_to_cpu(src->time_precision); + c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit; @@ -66645,7 +69714,7 @@ index 000000000000..88a8e54fbd7a + memcpy(dst->compat, src->compat, sizeof(dst->compat)); + + for (i = 0; i < BCH_SB_FIELD_NR; i++) { -+ if (i == BCH_SB_FIELD_journal) ++ if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS) + continue; + + src_f = bch2_sb_field_get(src, i); @@ -66676,9 +69745,6 @@ index 000000000000..88a8e54fbd7a + + __copy_super(&c->disk_sb, src); + -+ if (BCH_SB_INITIALIZED(c->disk_sb.sb)) -+ set_bit(BCH_FS_INITIALIZED, &c->flags); -+ + ret = bch2_sb_replicas_to_cpu_replicas(c); + if (ret) + return ret; @@ -66712,10 +69778,12 @@ index 000000000000..88a8e54fbd7a + +/* read superblock: */ + -+static const char *read_one_super(struct bch_sb_handle *sb, u64 offset) ++static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err) +{ + struct bch_csum csum; ++ u32 version, version_min; + size_t bytes; ++ int ret; +reread: + bio_reset(sb->bio); + bio_set_dev(sb->bio, sb->bdev); @@ -66723,40 +69791,65 @@ index 000000000000..88a8e54fbd7a + bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); + bch2_bio_map(sb->bio, sb->sb, sb->buffer_size); + -+ if (submit_bio_wait(sb->bio)) -+ return "IO error"; ++ ret = submit_bio_wait(sb->bio); ++ if (ret) { ++ pr_buf(err, "IO error: %i", ret); ++ return ret; ++ } + -+ if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) -+ return "Not a bcachefs superblock"; ++ if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) { ++ pr_buf(err, "Not a bcachefs superblock"); ++ return -EINVAL; ++ } + -+ if (le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_min || -+ 
le16_to_cpu(sb->sb->version) >= bcachefs_metadata_version_max) -+ return "Unsupported superblock version"; ++ version = le16_to_cpu(sb->sb->version); ++ version_min = version >= bcachefs_metadata_version_bkey_renumber ++ ? le16_to_cpu(sb->sb->version_min) ++ : version; ++ ++ if (version >= bcachefs_metadata_version_max) { ++ pr_buf(err, "Unsupported superblock version %u (min %u, max %u)", ++ version, bcachefs_metadata_version_min, bcachefs_metadata_version_max); ++ return -EINVAL; ++ } ++ ++ if (version_min < bcachefs_metadata_version_min) { ++ pr_buf(err, "Unsupported superblock version %u (min %u, max %u)", ++ version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max); ++ return -EINVAL; ++ } + + bytes = vstruct_bytes(sb->sb); + -+ if (bytes > 512 << sb->sb->layout.sb_max_size_bits) -+ return "Bad superblock: too big"; ++ if (bytes > 512 << sb->sb->layout.sb_max_size_bits) { ++ pr_buf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)", ++ bytes, 512UL << sb->sb->layout.sb_max_size_bits); ++ return -EINVAL; ++ } + + if (bytes > sb->buffer_size) { + if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s))) -+ return "cannot allocate memory"; ++ return -ENOMEM; + goto reread; + } + -+ if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) -+ return "unknown csum type"; ++ if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) { ++ pr_buf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); ++ return -EINVAL; ++ } + + /* XXX: verify MACs */ + csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb), + null_nonce(), sb->sb); + -+ if (bch2_crc_cmp(csum, sb->sb->csum)) -+ return "bad checksum reading superblock"; ++ if (bch2_crc_cmp(csum, sb->sb->csum)) { ++ pr_buf(err, "bad checksum"); ++ return -EINVAL; ++ } + + sb->seq = le64_to_cpu(sb->sb->seq); + -+ return NULL; ++ return 0; +} + +int bch2_read_super(const char *path, struct bch_opts *opts, @@ -66764,7 +69857,7 @@ index 000000000000..88a8e54fbd7a +{ + u64 offset = opt_get(*opts, sb); + struct 
bch_sb_layout layout; -+ const char *err; ++ struct printbuf err = PRINTBUF; + __le64 *i; + int ret; + @@ -66796,25 +69889,28 @@ index 000000000000..88a8e54fbd7a + goto out; + } + -+ err = "cannot allocate memory"; + ret = bch2_sb_realloc(sb, 0); -+ if (ret) ++ if (ret) { ++ pr_buf(&err, "error allocating memory for superblock"); + goto err; ++ } + -+ ret = -EFAULT; -+ err = "dynamic fault"; -+ if (bch2_fs_init_fault("read_super")) ++ if (bch2_fs_init_fault("read_super")) { ++ pr_buf(&err, "dynamic fault"); ++ ret = -EFAULT; + goto err; ++ } + -+ ret = -EINVAL; -+ err = read_one_super(sb, offset); -+ if (!err) ++ ret = read_one_super(sb, offset, &err); ++ if (!ret) + goto got_super; + + if (opt_defined(*opts, sb)) + goto err; + -+ pr_err("error reading default superblock: %s", err); ++ printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s", ++ path, err.buf); ++ printbuf_reset(&err); + + /* + * Error reading primary superblock - read location of backup @@ -66830,13 +69926,15 @@ index 000000000000..88a8e54fbd7a + */ + bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout)); + -+ err = "IO error"; -+ if (submit_bio_wait(sb->bio)) ++ ret = submit_bio_wait(sb->bio); ++ if (ret) { ++ pr_buf(&err, "IO error: %i", ret); + goto err; ++ } + + memcpy(&layout, sb->sb, sizeof(layout)); -+ err = validate_sb_layout(&layout); -+ if (err) ++ ret = validate_sb_layout(&layout, &err); ++ if (ret) + goto err; + + for (i = layout.sb_offset; @@ -66846,29 +69944,41 @@ index 000000000000..88a8e54fbd7a + if (offset == opt_get(*opts, sb)) + continue; + -+ err = read_one_super(sb, offset); -+ if (!err) ++ ret = read_one_super(sb, offset, &err); ++ if (!ret) + goto got_super; + } + -+ ret = -EINVAL; + goto err; + +got_super: -+ err = "Superblock block size smaller than device block size"; -+ ret = -EINVAL; + if (le16_to_cpu(sb->sb->block_size) << 9 < -+ bdev_logical_block_size(sb->bdev)) ++ bdev_logical_block_size(sb->bdev)) { ++ pr_buf(&err, "block size (%u) smaller than 
device block size (%u)", ++ le16_to_cpu(sb->sb->block_size) << 9, ++ bdev_logical_block_size(sb->bdev)); ++ ret = -EINVAL; + goto err; ++ } + + ret = 0; + sb->have_layout = true; ++ ++ ret = bch2_sb_validate(sb, &err, READ); ++ if (ret) { ++ printk(KERN_ERR "bcachefs (%s): error validating superblock: %s", ++ path, err.buf); ++ goto err_no_print; ++ } +out: + pr_verbose_init(*opts, "ret %i", ret); ++ printbuf_exit(&err); + return ret; +err: ++ printk(KERN_ERR "bcachefs (%s): error reading superblock: %s", ++ path, err.buf); ++err_no_print: + bch2_free_super(sb); -+ pr_err("error reading superblock: %s", err); + goto out; +} + @@ -66940,8 +70050,8 @@ index 000000000000..88a8e54fbd7a +{ + struct closure *cl = &c->sb_write; + struct bch_dev *ca; ++ struct printbuf err = PRINTBUF; + unsigned i, sb = 0, nr_wrote; -+ const char *err; + struct bch_devs_mask sb_written; + bool wrote, can_mount_without_written, can_mount_with_written; + unsigned degraded_flags = BCH_FORCE_IF_DEGRADED; @@ -66968,10 +70078,12 @@ index 000000000000..88a8e54fbd7a + bch2_sb_from_fs(c, ca); + + for_each_online_member(ca, c, i) { -+ err = bch2_sb_validate(&ca->disk_sb); -+ if (err) { -+ bch2_fs_inconsistent(c, "sb invalid before write: %s", err); -+ ret = -1; ++ printbuf_reset(&err); ++ ++ ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE); ++ if (ret) { ++ bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); ++ percpu_ref_put(&ca->io_ref); + goto out; + } + } @@ -66989,11 +70101,24 @@ index 000000000000..88a8e54fbd7a + closure_sync(cl); + + for_each_online_member(ca, c, i) { -+ if (!ca->sb_write_error && -+ ca->disk_sb.seq != -+ le64_to_cpu(ca->sb_read_scratch->seq)) { ++ if (ca->sb_write_error) ++ continue; ++ ++ if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) { + bch2_fs_fatal_error(c, -+ "Superblock modified by another process"); ++ "Superblock write was silently dropped! 
(seq %llu expected %llu)", ++ le64_to_cpu(ca->sb_read_scratch->seq), ++ ca->disk_sb.seq); ++ percpu_ref_put(&ca->io_ref); ++ ret = -EROFS; ++ goto out; ++ } ++ ++ if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) { ++ bch2_fs_fatal_error(c, ++ "Superblock modified by another process (seq %llu expected %llu)", ++ le64_to_cpu(ca->sb_read_scratch->seq), ++ ca->disk_sb.seq); + percpu_ref_put(&ca->io_ref); + ret = -EROFS; + goto out; @@ -67048,6 +70173,7 @@ index 000000000000..88a8e54fbd7a +out: + /* Make new options visible after they're persistent: */ + bch2_sb_update(c); ++ printbuf_exit(&err); + return ret; +} + @@ -67062,133 +70188,218 @@ index 000000000000..88a8e54fbd7a + mutex_unlock(&c->sb_lock); +} + -+/* BCH_SB_FIELD_journal: */ -+ -+static int u64_cmp(const void *_l, const void *_r) -+{ -+ u64 l = *((const u64 *) _l), r = *((const u64 *) _r); -+ -+ return l < r ? -1 : l > r ? 1 : 0; -+} -+ -+static const char *bch2_sb_validate_journal(struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_journal *journal = field_to_type(f, journal); -+ struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; -+ const char *err; -+ unsigned nr; -+ unsigned i; -+ u64 *b; -+ -+ journal = bch2_sb_get_journal(sb); -+ if (!journal) -+ return NULL; -+ -+ nr = bch2_nr_journal_buckets(journal); -+ if (!nr) -+ return NULL; -+ -+ b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); -+ if (!b) -+ return "cannot allocate memory"; -+ -+ for (i = 0; i < nr; i++) -+ b[i] = le64_to_cpu(journal->buckets[i]); -+ -+ sort(b, nr, sizeof(u64), u64_cmp, NULL); -+ -+ err = "journal bucket at sector 0"; -+ if (!b[0]) -+ goto err; -+ -+ err = "journal bucket before first bucket"; -+ if (m && b[0] < le16_to_cpu(m->first_bucket)) -+ goto err; -+ -+ err = "journal bucket past end of device"; -+ if (m && b[nr - 1] >= le64_to_cpu(m->nbuckets)) -+ goto err; -+ -+ err = "duplicate journal buckets"; -+ for (i = 0; i + 1 < nr; i++) -+ if (b[i] == b[i + 1]) -+ goto err; -+ 
-+ err = NULL; -+err: -+ kfree(b); -+ return err; -+} -+ -+static const struct bch_sb_field_ops bch_sb_field_ops_journal = { -+ .validate = bch2_sb_validate_journal, -+}; -+ +/* BCH_SB_FIELD_members: */ + -+static const char *bch2_sb_validate_members(struct bch_sb *sb, -+ struct bch_sb_field *f) ++static int bch2_sb_members_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) +{ + struct bch_sb_field_members *mi = field_to_type(f, members); -+ struct bch_member *m; ++ unsigned i; + + if ((void *) (mi->members + sb->nr_devices) > -+ vstruct_end(&mi->field)) -+ return "Invalid superblock: bad member info"; ++ vstruct_end(&mi->field)) { ++ pr_buf(err, "too many devices for section size"); ++ return -EINVAL; ++ } ++ ++ for (i = 0; i < sb->nr_devices; i++) { ++ struct bch_member *m = mi->members + i; + -+ for (m = mi->members; -+ m < mi->members + sb->nr_devices; -+ m++) { + if (!bch2_member_exists(m)) + continue; + -+ if (le64_to_cpu(m->nbuckets) > LONG_MAX) -+ return "Too many buckets"; ++ if (le64_to_cpu(m->nbuckets) > LONG_MAX) { ++ pr_buf(err, "device %u: too many buckets (got %llu, max %lu)", ++ i, le64_to_cpu(m->nbuckets), LONG_MAX); ++ return -EINVAL; ++ } + + if (le64_to_cpu(m->nbuckets) - -+ le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) -+ return "Not enough buckets"; ++ le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) { ++ pr_buf(err, "device %u: not enough buckets (got %llu, max %u)", ++ i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS); ++ return -EINVAL; ++ } + + if (le16_to_cpu(m->bucket_size) < -+ le16_to_cpu(sb->block_size)) -+ return "bucket size smaller than block size"; ++ le16_to_cpu(sb->block_size)) { ++ pr_buf(err, "device %u: bucket size %u smaller than block size %u", ++ i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size)); ++ return -EINVAL; ++ } + + if (le16_to_cpu(m->bucket_size) < -+ BCH_SB_BTREE_NODE_SIZE(sb)) -+ return "bucket size smaller than btree node size"; ++ BCH_SB_BTREE_NODE_SIZE(sb)) { 
++ pr_buf(err, "device %u: bucket size %u smaller than btree node size %llu", ++ i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb)); ++ return -EINVAL; ++ } + } + -+ return NULL; ++ return 0; ++} ++ ++static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_members *mi = field_to_type(f, members); ++ struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb); ++ unsigned i; ++ ++ for (i = 0; i < sb->nr_devices; i++) { ++ struct bch_member *m = mi->members + i; ++ unsigned data_have = bch2_sb_dev_has_data(sb, i); ++ u64 bucket_size = le16_to_cpu(m->bucket_size); ++ u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size; ++ ++ if (!bch2_member_exists(m)) ++ continue; ++ ++ pr_buf(out, "Device:"); ++ pr_tab(out); ++ pr_buf(out, "%u", i); ++ pr_newline(out); ++ ++ pr_indent_push(out, 2); ++ ++ pr_buf(out, "UUID:"); ++ pr_tab(out); ++ pr_uuid(out, m->uuid.b); ++ pr_newline(out); ++ ++ pr_buf(out, "Size:"); ++ pr_tab(out); ++ pr_units(out, device_size, device_size << 9); ++ pr_newline(out); ++ ++ pr_buf(out, "Bucket size:"); ++ pr_tab(out); ++ pr_units(out, bucket_size, bucket_size << 9); ++ pr_newline(out); ++ ++ pr_buf(out, "First bucket:"); ++ pr_tab(out); ++ pr_buf(out, "%u", le16_to_cpu(m->first_bucket)); ++ pr_newline(out); ++ ++ pr_buf(out, "Buckets:"); ++ pr_tab(out); ++ pr_buf(out, "%llu", le64_to_cpu(m->nbuckets)); ++ pr_newline(out); ++ ++ pr_buf(out, "Last mount:"); ++ pr_tab(out); ++ if (m->last_mount) ++ pr_time(out, le64_to_cpu(m->last_mount)); ++ else ++ pr_buf(out, "(never)"); ++ pr_newline(out); ++ ++ pr_buf(out, "State:"); ++ pr_tab(out); ++ pr_buf(out, "%s", ++ BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR ++ ? 
bch2_member_states[BCH_MEMBER_STATE(m)] ++ : "unknown"); ++ pr_newline(out); ++ ++ pr_buf(out, "Group:"); ++ pr_tab(out); ++ if (BCH_MEMBER_GROUP(m)) { ++ unsigned idx = BCH_MEMBER_GROUP(m) - 1; ++ ++ if (idx < disk_groups_nr(gi)) ++ pr_buf(out, "%s (%u)", ++ gi->entries[idx].label, idx); ++ else ++ pr_buf(out, "(bad disk labels section)"); ++ } else { ++ pr_buf(out, "(none)"); ++ } ++ pr_newline(out); ++ ++ pr_buf(out, "Data allowed:"); ++ pr_tab(out); ++ if (BCH_MEMBER_DATA_ALLOWED(m)) ++ bch2_flags_to_text(out, bch2_data_types, ++ BCH_MEMBER_DATA_ALLOWED(m)); ++ else ++ pr_buf(out, "(none)"); ++ pr_newline(out); ++ ++ pr_buf(out, "Has data:"); ++ pr_tab(out); ++ if (data_have) ++ bch2_flags_to_text(out, bch2_data_types, data_have); ++ else ++ pr_buf(out, "(none)"); ++ pr_newline(out); ++ ++ pr_buf(out, "Discard:"); ++ pr_tab(out); ++ pr_buf(out, "%llu", BCH_MEMBER_DISCARD(m)); ++ pr_newline(out); ++ ++ pr_buf(out, "Freespace initialized:"); ++ pr_tab(out); ++ pr_buf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m)); ++ pr_newline(out); ++ ++ pr_indent_pop(out, 2); ++ } +} + +static const struct bch_sb_field_ops bch_sb_field_ops_members = { -+ .validate = bch2_sb_validate_members, ++ .validate = bch2_sb_members_validate, ++ .to_text = bch2_sb_members_to_text, +}; + +/* BCH_SB_FIELD_crypt: */ + -+static const char *bch2_sb_validate_crypt(struct bch_sb *sb, -+ struct bch_sb_field *f) ++static int bch2_sb_crypt_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) +{ + struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); + -+ if (vstruct_bytes(&crypt->field) != sizeof(*crypt)) -+ return "invalid field crypt: wrong size"; ++ if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) { ++ pr_buf(err, "wrong size (got %zu should be %zu)", ++ vstruct_bytes(&crypt->field), sizeof(*crypt)); ++ return -EINVAL; ++ } + -+ if (BCH_CRYPT_KDF_TYPE(crypt)) -+ return "invalid field crypt: bad kdf type"; ++ if (BCH_CRYPT_KDF_TYPE(crypt)) { ++ pr_buf(err, 
"bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt)); ++ return -EINVAL; ++ } + -+ return NULL; ++ return 0; ++} ++ ++static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); ++ ++ pr_buf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt)); ++ pr_newline(out); ++ pr_buf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt)); ++ pr_newline(out); ++ pr_buf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt)); ++ pr_newline(out); ++ pr_buf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt)); ++ pr_newline(out); +} + +static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { -+ .validate = bch2_sb_validate_crypt, ++ .validate = bch2_sb_crypt_validate, ++ .to_text = bch2_sb_crypt_to_text, +}; + +/* BCH_SB_FIELD_clean: */ + -+int bch2_sb_clean_validate(struct bch_fs *c, struct bch_sb_field_clean *clean, int write) ++int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, int write) +{ + struct jset_entry *entry; + int ret; @@ -67264,7 +70475,7 @@ index 000000000000..88a8e54fbd7a + struct jset_entry_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_usage; -+ u->entry.btree_id = FS_USAGE_INODES; ++ u->entry.btree_id = BCH_FS_USAGE_inodes; + u->v = cpu_to_le64(c->usage_base->nr_inodes); + } + @@ -67274,7 +70485,7 @@ index 000000000000..88a8e54fbd7a + struct jset_entry_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_usage; -+ u->entry.btree_id = FS_USAGE_KEY_VERSION; ++ u->entry.btree_id = BCH_FS_USAGE_key_version; + u->v = cpu_to_le64(atomic64_read(&c->key_version)); + } + @@ -67284,7 +70495,7 @@ index 000000000000..88a8e54fbd7a + struct jset_entry_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_usage; -+ u->entry.btree_id = FS_USAGE_RESERVED; ++ u->entry.btree_id = BCH_FS_USAGE_reserved; + u->entry.level = i; + u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); + } @@ -67360,7 +70571,7 @@ index 000000000000..88a8e54fbd7a + } + + sb_clean->flags 
= 0; -+ sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1); ++ sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq)); + + /* Trying to catch outstanding bug: */ + BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); @@ -67377,7 +70588,7 @@ index 000000000000..88a8e54fbd7a + * this should be in the write path, and we should be validating every + * superblock section: + */ -+ ret = bch2_sb_clean_validate(c, sb_clean, WRITE); ++ ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE); + if (ret) { + bch_err(c, "error writing marking filesystem clean: validate error"); + goto out; @@ -67388,19 +70599,47 @@ index 000000000000..88a8e54fbd7a + mutex_unlock(&c->sb_lock); +} + -+static const char *bch2_sb_validate_clean(struct bch_sb *sb, -+ struct bch_sb_field *f) ++static int bch2_sb_clean_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) +{ + struct bch_sb_field_clean *clean = field_to_type(f, clean); + -+ if (vstruct_bytes(&clean->field) < sizeof(*clean)) -+ return "invalid field crypt: wrong size"; ++ if (vstruct_bytes(&clean->field) < sizeof(*clean)) { ++ pr_buf(err, "wrong size (got %zu should be %zu)", ++ vstruct_bytes(&clean->field), sizeof(*clean)); ++ return -EINVAL; ++ } + -+ return NULL; ++ return 0; ++} ++ ++static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_clean *clean = field_to_type(f, clean); ++ struct jset_entry *entry; ++ ++ pr_buf(out, "flags: %x", le32_to_cpu(clean->flags)); ++ pr_newline(out); ++ pr_buf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq)); ++ pr_newline(out); ++ ++ for (entry = clean->start; ++ entry != vstruct_end(&clean->field); ++ entry = vstruct_next(entry)) { ++ if (entry->type == BCH_JSET_ENTRY_btree_keys && ++ !entry->u64s) ++ continue; ++ ++ bch2_journal_entry_to_text(out, NULL, entry); ++ pr_newline(out); ++ } +} + +static const struct bch_sb_field_ops bch_sb_field_ops_clean = { -+ 
.validate = bch2_sb_validate_clean, ++ .validate = bch2_sb_clean_validate, ++ .to_text = bch2_sb_clean_to_text, +}; + +static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { @@ -67410,14 +70649,27 @@ index 000000000000..88a8e54fbd7a +#undef x +}; + -+static const char *bch2_sb_field_validate(struct bch_sb *sb, -+ struct bch_sb_field *f) ++static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, ++ struct printbuf *err) +{ + unsigned type = le32_to_cpu(f->type); ++ struct printbuf field_err = PRINTBUF; ++ int ret; + -+ return type < BCH_SB_FIELD_NR -+ ? bch2_sb_field_ops[type]->validate(sb, f) -+ : NULL; ++ if (type >= BCH_SB_FIELD_NR) ++ return 0; ++ ++ ret = bch2_sb_field_ops[type]->validate(sb, f, &field_err); ++ if (ret) { ++ pr_buf(err, "Invalid superblock section %s: %s", ++ bch2_sb_fields[type], ++ field_err.buf); ++ pr_newline(err); ++ bch2_sb_field_to_text(err, sb, f); ++ } ++ ++ printbuf_exit(&field_err); ++ return ret; +} + +void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, @@ -67427,22 +70679,192 @@ index 000000000000..88a8e54fbd7a + const struct bch_sb_field_ops *ops = type < BCH_SB_FIELD_NR + ? 
bch2_sb_field_ops[type] : NULL; + ++ if (!out->tabstops[0]) ++ out->tabstops[0] = 32; ++ + if (ops) + pr_buf(out, "%s", bch2_sb_fields[type]); + else + pr_buf(out, "(unknown field %u)", type); + -+ pr_buf(out, " (size %llu):", vstruct_bytes(f)); ++ pr_buf(out, " (size %zu):", vstruct_bytes(f)); ++ pr_newline(out); + -+ if (ops && ops->to_text) ++ if (ops && ops->to_text) { ++ pr_indent_push(out, 2); + bch2_sb_field_ops[type]->to_text(out, sb, f); ++ pr_indent_pop(out, 2); ++ } ++} ++ ++void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l) ++{ ++ unsigned i; ++ ++ pr_buf(out, "Type: %u", l->layout_type); ++ pr_newline(out); ++ ++ pr_buf(out, "Superblock max size: "); ++ pr_units(out, ++ 1 << l->sb_max_size_bits, ++ 512 << l->sb_max_size_bits); ++ pr_newline(out); ++ ++ pr_buf(out, "Nr superblocks: %u", l->nr_superblocks); ++ pr_newline(out); ++ ++ pr_buf(out, "Offsets: "); ++ for (i = 0; i < l->nr_superblocks; i++) { ++ if (i) ++ pr_buf(out, ", "); ++ pr_buf(out, "%llu", le64_to_cpu(l->sb_offset[i])); ++ } ++ pr_newline(out); ++} ++ ++void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, ++ bool print_layout, unsigned fields) ++{ ++ struct bch_sb_field_members *mi; ++ struct bch_sb_field *f; ++ u64 fields_have = 0; ++ unsigned nr_devices = 0; ++ ++ if (!out->tabstops[0]) ++ out->tabstops[0] = 32; ++ ++ mi = bch2_sb_get_members(sb); ++ if (mi) { ++ struct bch_member *m; ++ ++ for (m = mi->members; ++ m < mi->members + sb->nr_devices; ++ m++) ++ nr_devices += bch2_member_exists(m); ++ } ++ ++ pr_buf(out, "External UUID:"); ++ pr_tab(out); ++ pr_uuid(out, sb->user_uuid.b); ++ pr_newline(out); ++ ++ pr_buf(out, "Internal UUID:"); ++ pr_tab(out); ++ pr_uuid(out, sb->uuid.b); ++ pr_newline(out); ++ ++ pr_buf(out, "Device index:"); ++ pr_tab(out); ++ pr_buf(out, "%u", sb->dev_idx); ++ pr_newline(out); ++ ++ pr_buf(out, "Label:"); ++ pr_tab(out); ++ pr_buf(out, "%.*s", (int) sizeof(sb->label), sb->label); ++ pr_newline(out); ++ ++ pr_buf(out, 
"Version:"); ++ pr_tab(out); ++ pr_buf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version)]); ++ pr_newline(out); ++ ++ pr_buf(out, "Oldest version on disk:"); ++ pr_tab(out); ++ pr_buf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version_min)]); ++ pr_newline(out); ++ ++ pr_buf(out, "Created:"); ++ pr_tab(out); ++ if (sb->time_base_lo) ++ pr_time(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); ++ else ++ pr_buf(out, "(not set)"); ++ pr_newline(out); ++ ++ pr_buf(out, "Sequence number:"); ++ pr_tab(out); ++ pr_buf(out, "%llu", le64_to_cpu(sb->seq)); ++ pr_newline(out); ++ ++ pr_buf(out, "Superblock size:"); ++ pr_tab(out); ++ pr_buf(out, "%zu", vstruct_bytes(sb)); ++ pr_newline(out); ++ ++ pr_buf(out, "Clean:"); ++ pr_tab(out); ++ pr_buf(out, "%llu", BCH_SB_CLEAN(sb)); ++ pr_newline(out); ++ ++ pr_buf(out, "Devices:"); ++ pr_tab(out); ++ pr_buf(out, "%u", nr_devices); ++ pr_newline(out); ++ ++ pr_buf(out, "Sections:"); ++ vstruct_for_each(sb, f) ++ fields_have |= 1 << le32_to_cpu(f->type); ++ pr_tab(out); ++ bch2_flags_to_text(out, bch2_sb_fields, fields_have); ++ pr_newline(out); ++ ++ pr_buf(out, "Features:"); ++ pr_tab(out); ++ bch2_flags_to_text(out, bch2_sb_features, ++ le64_to_cpu(sb->features[0])); ++ pr_newline(out); ++ ++ pr_buf(out, "Compat features:"); ++ pr_tab(out); ++ bch2_flags_to_text(out, bch2_sb_compat, ++ le64_to_cpu(sb->compat[0])); ++ pr_newline(out); ++ ++ pr_newline(out); ++ pr_buf(out, "Options:"); ++ pr_newline(out); ++ pr_indent_push(out, 2); ++ { ++ enum bch_opt_id id; ++ ++ for (id = 0; id < bch2_opts_nr; id++) { ++ const struct bch_option *opt = bch2_opt_table + id; ++ ++ if (opt->get_sb != BCH2_NO_SB_OPT) { ++ u64 v = bch2_opt_from_sb(sb, id); ++ ++ pr_buf(out, "%s:", opt->attr.name); ++ pr_tab(out); ++ bch2_opt_to_text(out, NULL, sb, opt, v, ++ OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST); ++ pr_newline(out); ++ } ++ } ++ } ++ ++ pr_indent_pop(out, 2); ++ ++ if (print_layout) { ++ pr_newline(out); ++ pr_buf(out, 
"layout:"); ++ pr_newline(out); ++ pr_indent_push(out, 2); ++ bch2_sb_layout_to_text(out, &sb->layout); ++ pr_indent_pop(out, 2); ++ } ++ ++ vstruct_for_each(sb, f) ++ if (fields & (1 << le32_to_cpu(f->type))) { ++ pr_newline(out); ++ bch2_sb_field_to_text(out, sb, f); ++ } +} diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h new file mode 100644 -index 000000000000..b64ac2fbbf8b +index 000000000000..14a25f6fe29a --- /dev/null +++ b/fs/bcachefs/super-io.h -@@ -0,0 +1,136 @@ +@@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUPER_IO_H +#define _BCACHEFS_SUPER_IO_H @@ -67483,9 +70905,8 @@ index 000000000000..b64ac2fbbf8b +extern const char * const bch2_sb_fields[]; + +struct bch_sb_field_ops { -+ const char * (*validate)(struct bch_sb *, struct bch_sb_field *); -+ void (*to_text)(struct printbuf *, struct bch_sb *, -+ struct bch_sb_field *); ++ int (*validate)(struct bch_sb *, struct bch_sb_field *, struct printbuf *); ++ void (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *); +}; + +static inline __le64 bch2_sb_magic(struct bch_fs *c) @@ -67511,8 +70932,6 @@ index 000000000000..b64ac2fbbf8b +void bch2_free_super(struct bch_sb_handle *); +int bch2_sb_realloc(struct bch_sb_handle *, unsigned); + -+const char *bch2_sb_validate(struct bch_sb_handle *); -+ +int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); +int bch2_write_super(struct bch_fs *); +void __bch2_check_set_feature(struct bch_fs *, unsigned); @@ -67523,15 +70942,6 @@ index 000000000000..b64ac2fbbf8b + __bch2_check_set_feature(c, feat); +} + -+/* BCH_SB_FIELD_journal: */ -+ -+static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j) -+{ -+ return j -+ ? 
(__le64 *) vstruct_end(&j->field) - j->buckets -+ : 0; -+} -+ +/* BCH_SB_FIELD_members: */ + +static inline bool bch2_member_exists(struct bch_member *m) @@ -67555,12 +70965,12 @@ index 000000000000..b64ac2fbbf8b + .bucket_size = le16_to_cpu(mi->bucket_size), + .group = BCH_MEMBER_GROUP(mi), + .state = BCH_MEMBER_STATE(mi), -+ .replacement = BCH_MEMBER_REPLACEMENT(mi), + .discard = BCH_MEMBER_DISCARD(mi), + .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi), + .durability = BCH_MEMBER_DURABILITY(mi) + ? BCH_MEMBER_DURABILITY(mi) - 1 + : 1, ++ .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), + .valid = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)), + }; +} @@ -67570,21 +70980,23 @@ index 000000000000..b64ac2fbbf8b +void bch2_journal_super_entries_add_common(struct bch_fs *, + struct jset_entry **, u64); + -+int bch2_sb_clean_validate(struct bch_fs *, struct bch_sb_field_clean *, int); ++int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int); + +int bch2_fs_mark_dirty(struct bch_fs *); +void bch2_fs_mark_clean(struct bch_fs *); + +void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, + struct bch_sb_field *); ++void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *); ++void bch2_sb_to_text(struct printbuf *, struct bch_sb *, bool, unsigned); + +#endif /* _BCACHEFS_SUPER_IO_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c new file mode 100644 -index 000000000000..3744b6d519a7 +index 000000000000..4a071711d363 --- /dev/null +++ b/fs/bcachefs/super.c -@@ -0,0 +1,2110 @@ +@@ -0,0 +1,1966 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcachefs setup/teardown code, and some metadata io - read a superblock and @@ -67603,6 +71015,7 @@ index 000000000000..3744b6d519a7 +#include "btree_key_cache.h" +#include "btree_update_interior.h" +#include "btree_io.h" ++#include "buckets_waiting_for_journal.h" +#include "chardev.h" +#include "checksum.h" +#include "clock.h" @@ -67785,17 +71198,9 @@ index 
000000000000..3744b6d519a7 + */ + bch2_journal_flush_all_pins(&c->journal); + -+ /* -+ * If the allocator threads didn't all start up, the btree updates to -+ * write out alloc info aren't going to work: -+ */ -+ if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags)) -+ goto nowrote_alloc; -+ + bch_verbose(c, "flushing journal and stopping allocators"); + + bch2_journal_flush_all_pins(&c->journal); -+ set_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); + + do { + clean_passes++; @@ -67820,17 +71225,11 @@ index 000000000000..3744b6d519a7 + bch_verbose(c, "flushing journal and stopping allocators complete"); + + set_bit(BCH_FS_ALLOC_CLEAN, &c->flags); -+nowrote_alloc: ++ + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_nr_pending(c)); + flush_work(&c->btree_interior_update_work); + -+ for_each_member_device(ca, c, i) -+ bch2_dev_allocator_stop(ca); -+ -+ clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); -+ clear_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); -+ + bch2_fs_journal_stop(&c->journal); + + /* @@ -67866,10 +71265,6 @@ index 000000000000..3744b6d519a7 + /* + * Block new foreground-end write operations from starting - any new + * writes will return -EROFS: -+ * -+ * (This is really blocking new _allocations_, writes to previously -+ * allocated space can still happen until stopping the allocator in -+ * bch2_dev_allocator_stop()). 
+ */ + percpu_ref_kill(&c->writes); + @@ -67998,19 +71393,7 @@ index 000000000000..3744b6d519a7 + bch2_dev_allocator_add(c, ca); + bch2_recalc_capacity(c); + -+ for_each_rw_member(ca, c, i) { -+ ret = bch2_dev_allocator_start(ca); -+ if (ret) { -+ bch_err(c, "error starting allocator threads"); -+ percpu_ref_put(&ca->io_ref); -+ goto err; -+ } -+ } -+ -+ set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); -+ -+ for_each_rw_member(ca, c, i) -+ bch2_wake_allocator(ca); ++ bch2_do_discards(c); + + if (!early) { + ret = bch2_fs_read_write_late(c); @@ -68055,6 +71438,7 @@ index 000000000000..3744b6d519a7 + bch2_fs_ec_exit(c); + bch2_fs_encryption_exit(c); + bch2_fs_io_exit(c); ++ bch2_fs_buckets_waiting_for_journal_exit(c); + bch2_fs_btree_interior_update_exit(c); + bch2_fs_btree_iter_exit(c); + bch2_fs_btree_key_cache_exit(&c->btree_key_cache); @@ -68259,6 +71643,7 @@ index 000000000000..3744b6d519a7 + INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); + + init_rwsem(&c->gc_lock); ++ mutex_init(&c->gc_gens_lock); + + for (i = 0; i < BCH_TIME_STAT_NR; i++) + bch2_time_stats_init(&c->times[i]); @@ -68309,10 +71694,10 @@ index 000000000000..3744b6d519a7 + c->rebalance.enabled = 1; + c->promote_whole_extents = true; + -+ c->journal.write_time = &c->times[BCH_TIME_journal_write]; -+ c->journal.delay_time = &c->times[BCH_TIME_journal_delay]; -+ c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal]; -+ c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; ++ c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write]; ++ c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; ++ c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal]; ++ c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; + + bch2_fs_btree_cache_init_early(&c->btree_cache); + @@ -68329,13 +71714,32 @@ index 000000000000..3744b6d519a7 + if (ret) + goto err; + -+ scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid); ++ 
uuid_unparse_lower(c->sb.user_uuid.b, c->name); ++ ++ /* Compat: */ ++ if (sb->version <= bcachefs_metadata_version_inode_v2 && ++ !BCH_SB_JOURNAL_FLUSH_DELAY(sb)) ++ SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); ++ ++ if (sb->version <= bcachefs_metadata_version_inode_v2 && ++ !BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) ++ SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100); + + c->opts = bch2_opts_default; -+ bch2_opts_apply(&c->opts, bch2_opts_from_sb(sb)); ++ ret = bch2_opts_from_sb(&c->opts, sb); ++ if (ret) ++ goto err; ++ + bch2_opts_apply(&c->opts, opts); + -+ c->block_bits = ilog2(c->opts.block_size); ++ /* key cache currently disabled for inodes, because of snapshots: */ ++ c->opts.inodes_use_key_cache = 0; ++ ++ c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc; ++ if (c->opts.inodes_use_key_cache) ++ c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes; ++ ++ c->block_bits = ilog2(block_sectors(c)); + c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); + + if (bch2_fs_init_fault("fs_alloc")) { @@ -68385,6 +71789,7 @@ index 000000000000..3744b6d519a7 + bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: + bch2_fs_btree_iter_init(c) ?: + bch2_fs_btree_interior_update_init(c) ?: ++ bch2_fs_buckets_waiting_for_journal_init(c); + bch2_fs_subvolumes_init(c) ?: + bch2_fs_io_init(c) ?: + bch2_fs_encryption_init(c) ?: @@ -68394,9 +71799,6 @@ index 000000000000..3744b6d519a7 + if (ret) + goto err; + -+ if (c->opts.nochanges) -+ set_bit(JOURNAL_NOCHANGES, &c->journal.flags); -+ + mi = bch2_sb_get_members(c->disk_sb.sb); + for (i = 0; i < c->sb.nr_devices; i++) + if (bch2_dev_exists(c->disk_sb.sb, mi, i) && @@ -68432,12 +71834,9 @@ index 000000000000..3744b6d519a7 +static void print_mount_opts(struct bch_fs *c) +{ + enum bch_opt_id i; -+ char buf[512]; -+ struct printbuf p = PBUF(buf); ++ struct printbuf p = PRINTBUF; + bool first = true; + -+ strcpy(buf, "(null)"); -+ + if (c->opts.read_only) { + pr_buf(&p, "ro"); + first = false; @@ -68447,7 +71846,7 @@ index 
000000000000..3744b6d519a7 + const struct bch_option *opt = &bch2_opt_table[i]; + u64 v = bch2_opt_get_by_id(&c->opts, i); + -+ if (!(opt->mode & OPT_MOUNT)) ++ if (!(opt->flags & OPT_MOUNT)) + continue; + + if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) @@ -68456,10 +71855,14 @@ index 000000000000..3744b6d519a7 + if (!first) + pr_buf(&p, ","); + first = false; -+ bch2_opt_to_text(&p, c, opt, v, OPT_SHOW_MOUNT_STYLE); ++ bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE); + } + -+ bch_info(c, "mounted with opts: %s", buf); ++ if (!p.pos) ++ pr_buf(&p, "(null)"); ++ ++ bch_info(c, "mounted version=%s opts=%s", bch2_metadata_versions[c->sb.version], p.buf); ++ printbuf_exit(&p); +} + +int bch2_fs_start(struct bch_fs *c) @@ -68507,20 +71910,6 @@ index 000000000000..3744b6d519a7 + + set_bit(BCH_FS_STARTED, &c->flags); + -+ /* -+ * Allocator threads don't start filling copygc reserve until after we -+ * set BCH_FS_STARTED - wake them now: -+ * -+ * XXX ugly hack: -+ * Need to set ca->allocator_state here instead of relying on the -+ * allocator threads to do it to avoid racing with the copygc threads -+ * checking it and thinking they have no alloc reserve: -+ */ -+ for_each_online_member(ca, c, i) { -+ ca->allocator_state = ALLOCATOR_running; -+ bch2_wake_allocator(ca); -+ } -+ + if (c->opts.read_only || c->opts.nochanges) { + bch2_fs_read_only(c); + } else { @@ -68573,7 +71962,7 @@ index 000000000000..3744b6d519a7 + if (!sb_mi) + return "Invalid superblock: member info area missing"; + -+ if (le16_to_cpu(sb->block_size) != c->opts.block_size) ++ if (le16_to_cpu(sb->block_size) != block_sectors(c)) + return "mismatched block size"; + + if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) < @@ -68612,8 +72001,6 @@ index 000000000000..3744b6d519a7 + +static void bch2_dev_free(struct bch_dev *ca) +{ -+ bch2_dev_allocator_stop(ca); -+ + cancel_work_sync(&ca->io_error_work); + + if (ca->kobj.state_in_sysfs && @@ -68728,8 +72115,8 @@ index 
000000000000..3744b6d519a7 + ca->mi = bch2_mi_to_cpu(member); + ca->uuid = member->uuid; + -+ if (opt_defined(c->opts, discard)) -+ ca->mi.discard = opt_get(c->opts, discard); ++ ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, ++ ca->mi.bucket_size / btree_sectors(c)); + + if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, + 0, GFP_KERNEL) || @@ -68780,12 +72167,6 @@ index 000000000000..3744b6d519a7 + + ca->fs = c; + -+ if (ca->mi.state == BCH_MEMBER_STATE_rw && -+ bch2_dev_allocator_start(ca)) { -+ bch2_dev_free(ca); -+ goto err; -+ } -+ + bch2_dev_attach(c, ca, dev_idx); +out: + pr_verbose_init(c->opts, "ret %i", ret); @@ -68831,6 +72212,8 @@ index 000000000000..3744b6d519a7 + ca->disk_sb.bdev->bd_holder = ca; + memset(sb, 0, sizeof(*sb)); + ++ ca->dev = ca->disk_sb.bdev->bd_dev; ++ + percpu_ref_reinit(&ca->io_ref); + + return 0; @@ -68969,14 +72352,13 @@ index 000000000000..3744b6d519a7 + /* + * The allocator thread itself allocates btree nodes, so stop it first: + */ -+ bch2_dev_allocator_stop(ca); + bch2_dev_allocator_remove(c, ca); + bch2_dev_journal_stop(&c->journal, ca); + + bch2_copygc_start(c); +} + -+static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) ++static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) +{ + lockdep_assert_held(&c->state_lock); + @@ -68984,8 +72366,6 @@ index 000000000000..3744b6d519a7 + + bch2_dev_allocator_add(c, ca); + bch2_recalc_capacity(c); -+ -+ return bch2_dev_allocator_start(ca); +} + +int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, @@ -69012,7 +72392,7 @@ index 000000000000..3744b6d519a7 + mutex_unlock(&c->sb_lock); + + if (new_state == BCH_MEMBER_STATE_rw) -+ ret = __bch2_dev_read_write(c, ca); ++ __bch2_dev_read_write(c, ca); + + rebalance_wakeup(c); + @@ -69035,30 +72415,20 @@ index 000000000000..3744b6d519a7 + +static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) +{ -+ struct btree_trans trans; -+ size_t i; ++ struct bpos start = 
POS(ca->dev_idx, 0); ++ struct bpos end = POS(ca->dev_idx, U64_MAX); + int ret; + -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for (i = 0; i < ca->mi.nbuckets; i++) { -+ ret = lockrestart_do(&trans, -+ bch2_btree_key_cache_flush(&trans, -+ BTREE_ID_alloc, POS(ca->dev_idx, i))); -+ if (ret) -+ break; -+ } -+ bch2_trans_exit(&trans); -+ -+ if (ret) { ++ ret = bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, ++ BTREE_TRIGGER_NORUN, NULL) ?: ++ bch2_btree_delete_range(c, BTREE_ID_freespace, start, end, ++ BTREE_TRIGGER_NORUN, NULL) ?: ++ bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, ++ BTREE_TRIGGER_NORUN, NULL); ++ if (ret) + bch_err(c, "error %i removing dev alloc info", ret); -+ return ret; -+ } + -+ return bch2_btree_delete_range(c, BTREE_ID_alloc, -+ POS(ca->dev_idx, 0), -+ POS(ca->dev_idx + 1, 0), -+ NULL); ++ return ret; +} + +int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) @@ -69123,11 +72493,11 @@ index 000000000000..3744b6d519a7 + + data = bch2_dev_has_data(c, ca); + if (data) { -+ char data_has_str[100]; ++ struct printbuf data_has = PRINTBUF; + -+ bch2_flags_to_text(&PBUF(data_has_str), -+ bch2_data_types, data); -+ bch_err(ca, "Remove failed, still has data (%s)", data_has_str); ++ bch2_flags_to_text(&data_has, bch2_data_types, data); ++ bch_err(ca, "Remove failed, still has data (%s)", data_has.buf); ++ printbuf_exit(&data_has); + ret = -EBUSY; + goto err; + } @@ -69175,69 +72545,59 @@ index 000000000000..3744b6d519a7 + struct bch_dev *ca = NULL; + struct bch_sb_field_members *mi; + struct bch_member dev_mi; -+ struct bucket_array *buckets; -+ struct bucket *g; + unsigned dev_idx, nr_devices, u64s; ++ struct printbuf errbuf = PRINTBUF; + int ret; + + ret = bch2_read_super(path, &opts, &sb); -+ if (ret) -+ return ret; -+ -+ err = bch2_sb_validate(&sb); -+ if (err) -+ return -EINVAL; ++ if (ret) { ++ bch_err(c, "device add error: error reading super: %i", ret); ++ goto err; ++ } + + dev_mi = 
bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx]; + + err = bch2_dev_may_add(sb.sb, c); -+ if (err) -+ return -EINVAL; ++ if (err) { ++ bch_err(c, "device add error: %s", err); ++ ret = -EINVAL; ++ goto err; ++ } + + ca = __bch2_dev_alloc(c, &dev_mi); + if (!ca) { + bch2_free_super(&sb); -+ return -ENOMEM; ++ ret = -ENOMEM; ++ goto err; + } + + ret = __bch2_dev_attach_bdev(ca, &sb); + if (ret) { + bch2_dev_free(ca); -+ return ret; ++ goto err; + } + -+ /* -+ * We want to allocate journal on the new device before adding the new -+ * device to the filesystem because allocating after we attach requires -+ * spinning up the allocator thread, and the allocator thread requires -+ * doing btree writes, which if the existing devices are RO isn't going -+ * to work -+ * -+ * So we have to mark where the superblocks are, but marking allocated -+ * data normally updates the filesystem usage too, so we have to mark, -+ * allocate the journal, reset all the marks, then remark after we -+ * attach... 
-+ */ -+ bch2_mark_dev_superblock(NULL, ca, 0); -+ -+ err = "journal alloc failed"; + ret = bch2_dev_journal_alloc(ca); -+ if (ret) ++ if (ret) { ++ bch_err(c, "device add error: journal alloc failed"); + goto err; ++ } + + down_write(&c->state_lock); + mutex_lock(&c->sb_lock); + -+ err = "insufficient space in new superblock"; + ret = bch2_sb_from_fs(c, ca); -+ if (ret) ++ if (ret) { ++ bch_err(c, "device add error: new device superblock too small"); + goto err_unlock; ++ } + + mi = bch2_sb_get_members(ca->disk_sb.sb); + + if (!bch2_sb_resize_members(&ca->disk_sb, + le32_to_cpu(mi->field.u64s) + + sizeof(dev_mi) / sizeof(u64))) { ++ bch_err(c, "device add error: new device superblock too small"); + ret = -ENOSPC; + goto err_unlock; + } @@ -69250,7 +72610,7 @@ index 000000000000..3744b6d519a7 + if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx)) + goto have_slot; +no_slot: -+ err = "no slots available in superblock"; ++ bch_err(c, "device add error: already have maximum number of devices"); + ret = -ENOSPC; + goto err_unlock; + @@ -69259,12 +72619,12 @@ index 000000000000..3744b6d519a7 + u64s = (sizeof(struct bch_sb_field_members) + + sizeof(struct bch_member) * nr_devices) / sizeof(u64); + -+ err = "no space in superblock for member info"; -+ ret = -ENOSPC; -+ + mi = bch2_sb_resize_members(&c->disk_sb, u64s); -+ if (!mi) ++ if (!mi) { ++ bch_err(c, "device add error: no room in superblock for member info"); ++ ret = -ENOSPC; + goto err_unlock; ++ } + + /* success: */ + @@ -69280,27 +72640,23 @@ index 000000000000..3744b6d519a7 + + bch2_dev_usage_journal_reserve(c); + -+ /* -+ * Clear marks before marking transactionally in the btree, so that -+ * per-device accounting gets done correctly: -+ */ -+ down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); -+ for_each_bucket(g, buckets) -+ atomic64_set(&g->_mark.v, 0); -+ up_read(&ca->bucket_lock); -+ -+ err = "error marking superblock"; + ret = bch2_trans_mark_dev_sb(c, ca); -+ if (ret) ++ if (ret) { ++ bch_err(c, 
"device add error: error marking new superblock: %i", ret); + goto err_late; -+ -+ if (ca->mi.state == BCH_MEMBER_STATE_rw) { -+ ret = __bch2_dev_read_write(c, ca); -+ if (ret) -+ goto err_late; + } + ++ ret = bch2_fs_freespace_init(c); ++ if (ret) { ++ bch_err(c, "device add error: error initializing free space: %i", ret); ++ goto err_late; ++ } ++ ++ ca->new_fs_bucket_idx = 0; ++ ++ if (ca->mi.state == BCH_MEMBER_STATE_rw) ++ __bch2_dev_read_write(c, ca); ++ + up_write(&c->state_lock); + return 0; + @@ -69311,12 +72667,12 @@ index 000000000000..3744b6d519a7 + if (ca) + bch2_dev_free(ca); + bch2_free_super(&sb); -+ bch_err(c, "Unable to add device: %s", err); ++ printbuf_exit(&errbuf); + return ret; +err_late: + up_write(&c->state_lock); -+ bch_err(c, "Error going rw after adding device: %s", err); -+ return -EINVAL; ++ ca = NULL; ++ goto err; +} + +/* Hot add existing device to running filesystem: */ @@ -69359,11 +72715,8 @@ index 000000000000..3744b6d519a7 + goto err; + } + -+ if (ca->mi.state == BCH_MEMBER_STATE_rw) { -+ ret = __bch2_dev_read_write(c, ca); -+ if (ret) -+ goto err; -+ } ++ if (ca->mi.state == BCH_MEMBER_STATE_rw) ++ __bch2_dev_read_write(c, ca); + + mutex_lock(&c->sb_lock); + mi = bch2_sb_get_members(c->disk_sb.sb); @@ -69450,20 +72803,14 @@ index 000000000000..3744b6d519a7 +} + +/* return with ref on ca->ref: */ -+struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path) ++struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) +{ + struct bch_dev *ca; -+ dev_t dev; + unsigned i; -+ int ret; -+ -+ ret = lookup_bdev(path, &dev); -+ if (ret) -+ return ERR_PTR(ret); + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, NULL) -+ if (ca->disk_sb.bdev->bd_dev == dev) ++ if (!strcmp(name, ca->name)) + goto found; + ca = ERR_PTR(-ENOENT); +found: @@ -69482,18 +72829,17 @@ index 000000000000..3744b6d519a7 + struct bch_sb_field_members *mi; + unsigned i, best_sb = 0; + const char *err; ++ struct printbuf errbuf = PRINTBUF; + 
int ret = 0; + ++ if (!try_module_get(THIS_MODULE)) ++ return ERR_PTR(-ENODEV); ++ + pr_verbose_init(opts, ""); + + if (!nr_devices) { -+ c = ERR_PTR(-EINVAL); -+ goto out2; -+ } -+ -+ if (!try_module_get(THIS_MODULE)) { -+ c = ERR_PTR(-ENODEV); -+ goto out2; ++ ret = -EINVAL; ++ goto err; + } + + sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); @@ -69507,9 +72853,6 @@ index 000000000000..3744b6d519a7 + if (ret) + goto err; + -+ err = bch2_sb_validate(&sb[i]); -+ if (err) -+ goto err_print; + } + + for (i = 1; i < nr_devices; i++) @@ -69564,8 +72907,8 @@ index 000000000000..3744b6d519a7 + } +out: + kfree(sb); ++ printbuf_exit(&errbuf); + module_put(THIS_MODULE); -+out2: + pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); + return c; +err_print: @@ -69582,81 +72925,6 @@ index 000000000000..3744b6d519a7 + goto out; +} + -+static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb, -+ struct bch_opts opts) -+{ -+ const char *err; -+ struct bch_fs *c; -+ bool allocated_fs = false; -+ int ret; -+ -+ err = bch2_sb_validate(sb); -+ if (err) -+ return err; -+ -+ mutex_lock(&bch_fs_list_lock); -+ c = __bch2_uuid_to_fs(sb->sb->uuid); -+ if (c) { -+ closure_get(&c->cl); -+ -+ err = bch2_dev_in_fs(c->disk_sb.sb, sb->sb); -+ if (err) -+ goto err; -+ } else { -+ allocated_fs = true; -+ c = bch2_fs_alloc(sb->sb, opts); -+ -+ err = "bch2_fs_alloc() error"; -+ if (IS_ERR(c)) -+ goto err; -+ } -+ -+ err = "bch2_dev_online() error"; -+ -+ mutex_lock(&c->sb_lock); -+ if (bch2_dev_attach_bdev(c, sb)) { -+ mutex_unlock(&c->sb_lock); -+ goto err; -+ } -+ mutex_unlock(&c->sb_lock); -+ -+ if (!c->opts.nostart && bch2_fs_may_start(c)) { -+ err = "error starting filesystem"; -+ ret = bch2_fs_start(c); -+ if (ret) -+ goto err; -+ } -+ -+ closure_put(&c->cl); -+ mutex_unlock(&bch_fs_list_lock); -+ -+ return NULL; -+err: -+ mutex_unlock(&bch_fs_list_lock); -+ -+ if (allocated_fs && !IS_ERR(c)) -+ bch2_fs_stop(c); -+ else if (c) -+ closure_put(&c->cl); -+ -+ return err; -+} -+ 
-+const char *bch2_fs_open_incremental(const char *path) -+{ -+ struct bch_sb_handle sb; -+ struct bch_opts opts = bch2_opts_empty(); -+ const char *err; -+ -+ if (bch2_read_super(path, &opts, &sb)) -+ return "error reading superblock"; -+ -+ err = __bch2_fs_open_incremental(&sb, opts); -+ bch2_free_super(&sb); -+ -+ return err; -+} -+ +/* Global interfaces/init */ + +static void bcachefs_exit(void) @@ -69697,10 +72965,10 @@ index 000000000000..3744b6d519a7 +module_init(bcachefs_init); diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h new file mode 100644 -index 000000000000..739e8fd18176 +index 000000000000..6d3efda26e63 --- /dev/null +++ b/fs/bcachefs/super.h -@@ -0,0 +1,238 @@ +@@ -0,0 +1,264 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUPER_H +#define _BCACHEFS_SUPER_H @@ -69729,6 +72997,12 @@ index 000000000000..739e8fd18176 + return remainder; +} + ++static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, ++ u32 *offset) ++{ ++ return div_u64_rem(s, ca->mi.bucket_size, offset); ++} ++ +static inline bool bch2_dev_is_online(struct bch_dev *ca) +{ + return !percpu_ref_is_zero(&ca->io_ref); @@ -69897,6 +73171,27 @@ index 000000000000..739e8fd18176 + return devs; +} + ++static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) ++{ ++ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; ++ u64 b_offset = bucket_to_sector(ca, b); ++ u64 b_end = bucket_to_sector(ca, b + 1); ++ unsigned i; ++ ++ if (!b) ++ return true; ++ ++ for (i = 0; i < layout->nr_superblocks; i++) { ++ u64 offset = le64_to_cpu(layout->sb_offset[i]); ++ u64 end = offset + (1 << layout->sb_max_size_bits); ++ ++ if (!(offset >= b_end || end <= b_offset)) ++ return true; ++ } ++ ++ return false; ++} ++ +struct bch_fs *bch2_dev_to_fs(dev_t); +struct bch_fs *bch2_uuid_to_fs(uuid_le); + @@ -69936,12 +73231,11 @@ index 000000000000..739e8fd18176 + +int bch2_fs_start(struct bch_fs *); +struct bch_fs *bch2_fs_open(char * const *, unsigned, 
struct bch_opts); -+const char *bch2_fs_open_incremental(const char *path); + +#endif /* _BCACHEFS_SUPER_H */ diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h new file mode 100644 -index 000000000000..96023f37afea +index 000000000000..89419fc7930d --- /dev/null +++ b/fs/bcachefs/super_types.h @@ -0,0 +1,51 @@ @@ -69976,10 +73270,10 @@ index 000000000000..96023f37afea + u16 bucket_size; /* sectors */ + u16 group; + u8 state; -+ u8 replacement; + u8 discard; + u8 data_allowed; + u8 durability; ++ u8 freespace_initialized; + u8 valid; +}; + @@ -69998,10 +73292,10 @@ index 000000000000..96023f37afea +#endif /* _BCACHEFS_SUPER_TYPES_H */ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c new file mode 100644 -index 000000000000..864be8601868 +index 000000000000..2594fec4b821 --- /dev/null +++ b/fs/bcachefs/sysfs.c -@@ -0,0 +1,1009 @@ +@@ -0,0 +1,889 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcache sysfs interfaces @@ -70014,6 +73308,7 @@ index 000000000000..864be8601868 + +#include "bcachefs.h" +#include "alloc_background.h" ++#include "alloc_foreground.h" +#include "sysfs.h" +#include "btree_cache.h" +#include "btree_io.h" @@ -70049,8 +73344,28 @@ index 000000000000..864be8601868 +} + +#define SHOW(fn) \ ++static ssize_t fn ## _to_text(struct printbuf *, \ ++ struct kobject *, struct attribute *);\ ++ \ +static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ + char *buf) \ ++{ \ ++ struct printbuf out = PRINTBUF; \ ++ ssize_t ret = fn ## _to_text(&out, kobj, attr); \ ++ \ ++ if (!ret && out.allocation_failure) \ ++ ret = -ENOMEM; \ ++ \ ++ if (!ret) { \ ++ ret = min_t(size_t, out.pos, PAGE_SIZE - 1); \ ++ memcpy(buf, out.buf, ret); \ ++ } \ ++ printbuf_exit(&out); \ ++ return ret; \ ++} \ ++ \ ++static ssize_t fn ## _to_text(struct printbuf *out, struct kobject *kobj,\ ++ struct attribute *attr) + +#define STORE(fn) \ +static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ @@ -70067,22 +73382,19 @@ 
index 000000000000..864be8601868 +#define sysfs_printf(file, fmt, ...) \ +do { \ + if (attr == &sysfs_ ## file) \ -+ return scnprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__);\ ++ pr_buf(out, fmt "\n", __VA_ARGS__); \ +} while (0) + +#define sysfs_print(file, var) \ +do { \ + if (attr == &sysfs_ ## file) \ -+ return snprint(buf, PAGE_SIZE, var); \ ++ snprint(out, var); \ +} while (0) + +#define sysfs_hprint(file, val) \ +do { \ -+ if (attr == &sysfs_ ## file) { \ -+ bch2_hprint(&out, val); \ -+ pr_buf(&out, "\n"); \ -+ return out.pos - buf; \ -+ } \ ++ if (attr == &sysfs_ ## file) \ ++ bch2_hprint(out, val); \ +} while (0) + +#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var)) @@ -70135,7 +73447,6 @@ index 000000000000..864be8601868 + return strtoi_h(buf, &var) ?: (ssize_t) size; \ +} while (0) + -+write_attribute(trigger_journal_flush); +write_attribute(trigger_gc); +write_attribute(prune_cache); +rw_attribute(btree_gc_periodic); @@ -70144,8 +73455,6 @@ index 000000000000..864be8601868 +read_attribute(uuid); +read_attribute(minor); +read_attribute(bucket_size); -+read_attribute(block_size); -+read_attribute(btree_node_size); +read_attribute(first_bucket); +read_attribute(nbuckets); +read_attribute(durability); @@ -70159,13 +73468,10 @@ index 000000000000..864be8601868 + +read_attribute(btree_avg_write_size); + -+read_attribute(reserve_stats); +read_attribute(btree_cache_size); +read_attribute(compression_stats); +read_attribute(journal_debug); -+read_attribute(journal_pins); +read_attribute(btree_updates); -+read_attribute(dirty_btree_nodes); +read_attribute(btree_cache); +read_attribute(btree_key_cache); +read_attribute(btree_transactions); @@ -70176,17 +73482,13 @@ index 000000000000..864be8601868 + +read_attribute(has_data); +read_attribute(alloc_debug); -+write_attribute(wake_allocator); + +read_attribute(read_realloc_races); +read_attribute(extent_migrate_done); +read_attribute(extent_migrate_raced); -+ -+rw_attribute(journal_write_delay_ms); 
-+rw_attribute(journal_reclaim_delay_ms); ++read_attribute(bucket_alloc_fail); + +rw_attribute(discard); -+rw_attribute(cache_replacement_policy); +rw_attribute(label); + +rw_attribute(copy_gc_enabled); @@ -70202,7 +73504,7 @@ index 000000000000..864be8601868 +read_attribute(io_timers_read); +read_attribute(io_timers_write); + -+read_attribute(data_op_data_progress); ++read_attribute(data_jobs); + +#ifdef CONFIG_BCACHEFS_TESTS +write_attribute(perf_test); @@ -70240,59 +73542,36 @@ index 000000000000..864be8601868 + return nr ? div64_u64(sectors, nr) : 0; +} + -+static long stats_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bch_move_stats *stats) -+{ -+ pr_buf(out, "%s: data type %s btree_id %s position: ", -+ stats->name, -+ bch2_data_types[stats->data_type], -+ bch2_btree_ids[stats->btree_id]); -+ bch2_bpos_to_text(out, stats->pos); -+ pr_buf(out, "%s", "\n"); -+ -+ return 0; -+} -+ +static long data_progress_to_text(struct printbuf *out, struct bch_fs *c) +{ + long ret = 0; -+ struct bch_move_stats *iter; ++ struct bch_move_stats *stats; + + mutex_lock(&c->data_progress_lock); -+ -+ if (list_empty(&c->data_progress_list)) -+ pr_buf(out, "%s", "no progress to report\n"); -+ else -+ list_for_each_entry(iter, &c->data_progress_list, list) { -+ stats_to_text(out, c, iter); -+ } ++ list_for_each_entry(stats, &c->data_progress_list, list) { ++ pr_buf(out, "%s: data type %s btree_id %s position: ", ++ stats->name, ++ bch2_data_types[stats->data_type], ++ bch2_btree_ids[stats->btree_id]); ++ bch2_bpos_to_text(out, stats->pos); ++ pr_buf(out, "%s", "\n"); ++ } + + mutex_unlock(&c->data_progress_lock); + return ret; +} + -+static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ struct bch_fs_usage_online *fs_usage = bch2_fs_usage_read(c); -+ -+ if (!fs_usage) -+ return -ENOMEM; -+ -+ bch2_fs_usage_to_text(out, c, fs_usage); -+ -+ percpu_up_read(&c->mark_lock); -+ -+ kfree(fs_usage); -+ return 0; -+} -+ +static int 
bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; -+ u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0, ++ enum btree_id id; ++ u64 nr_uncompressed_extents = 0, + nr_compressed_extents = 0, ++ nr_incompressible_extents = 0, ++ uncompressed_sectors = 0, ++ incompressible_sectors = 0, + compressed_sectors_compressed = 0, + compressed_sectors_uncompressed = 0; + int ret; @@ -70302,47 +73581,72 @@ index 000000000000..864be8601868 + + bch2_trans_init(&trans, c, 0, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, 0, k, ret) -+ if (k.k->type == KEY_TYPE_extent) { -+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ for (id = 0; id < BTREE_ID_NR; id++) { ++ if (!((1U << id) & BTREE_ID_HAS_PTRS)) ++ continue; ++ ++ for_each_btree_key(&trans, iter, id, POS_MIN, ++ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; ++ bool compressed = false, uncompressed = false, incompressible = false; + -+ extent_for_each_ptr_decode(e, p, entry) { -+ if (!crc_is_compressed(p.crc)) { -+ nr_uncompressed_extents++; -+ uncompressed_sectors += e.k->size; -+ } else { -+ nr_compressed_extents++; ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ switch (p.crc.compression_type) { ++ case BCH_COMPRESSION_TYPE_none: ++ uncompressed = true; ++ uncompressed_sectors += k.k->size; ++ break; ++ case BCH_COMPRESSION_TYPE_incompressible: ++ incompressible = true; ++ incompressible_sectors += k.k->size; ++ break; ++ default: + compressed_sectors_compressed += + p.crc.compressed_size; + compressed_sectors_uncompressed += + p.crc.uncompressed_size; ++ compressed = true; ++ break; + } -+ -+ /* only looking at the first ptr */ -+ break; + } ++ ++ if (incompressible) ++ nr_incompressible_extents++; ++ else if (uncompressed) ++ nr_uncompressed_extents++; ++ else if (compressed) 
++ nr_compressed_extents++; + } -+ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_iter_exit(&trans, &iter); ++ } + + bch2_trans_exit(&trans); ++ + if (ret) + return ret; + -+ pr_buf(out, -+ "uncompressed data:\n" -+ " nr extents: %llu\n" -+ " size (bytes): %llu\n" -+ "compressed data:\n" -+ " nr extents: %llu\n" -+ " compressed size (bytes): %llu\n" -+ " uncompressed size (bytes): %llu\n", -+ nr_uncompressed_extents, -+ uncompressed_sectors << 9, -+ nr_compressed_extents, -+ compressed_sectors_compressed << 9, -+ compressed_sectors_uncompressed << 9); ++ pr_buf(out, "uncompressed:\n"); ++ pr_buf(out, " nr extents: %llu\n", nr_uncompressed_extents); ++ pr_buf(out, " size: "); ++ bch2_hprint(out, uncompressed_sectors << 9); ++ pr_buf(out, "\n"); ++ ++ pr_buf(out, "compressed:\n"); ++ pr_buf(out, " nr extents: %llu\n", nr_compressed_extents); ++ pr_buf(out, " compressed size: "); ++ bch2_hprint(out, compressed_sectors_compressed << 9); ++ pr_buf(out, "\n"); ++ pr_buf(out, " uncompressed size: "); ++ bch2_hprint(out, compressed_sectors_uncompressed << 9); ++ pr_buf(out, "\n"); ++ ++ pr_buf(out, "incompressible:\n"); ++ pr_buf(out, " nr extents: %llu\n", nr_incompressible_extents); ++ pr_buf(out, " size: "); ++ bch2_hprint(out, incompressible_sectors << 9); ++ pr_buf(out, "\n"); + return 0; +} + @@ -70356,16 +73660,10 @@ index 000000000000..864be8601868 +SHOW(bch2_fs) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); -+ struct printbuf out = _PBUF(buf, PAGE_SIZE); + + sysfs_print(minor, c->minor); + sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); + -+ sysfs_print(journal_write_delay_ms, c->journal.write_delay_ms); -+ sysfs_print(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); -+ -+ sysfs_print(block_size, block_bytes(c)); -+ sysfs_print(btree_node_size, btree_bytes(c)); + sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); + sysfs_hprint(btree_avg_write_size, bch2_btree_avg_write_size(c)); + @@ -70375,13 +73673,13 @@ index 
000000000000..864be8601868 + atomic_long_read(&c->extent_migrate_done)); + sysfs_print(extent_migrate_raced, + atomic_long_read(&c->extent_migrate_raced)); ++ sysfs_print(bucket_alloc_fail, ++ atomic_long_read(&c->bucket_alloc_fail)); + + sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); + -+ if (attr == &sysfs_gc_gens_pos) { -+ bch2_gc_gens_pos_to_text(&out, c); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_gc_gens_pos) ++ bch2_gc_gens_pos_to_text(out, c); + + sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); + @@ -70391,86 +73689,48 @@ index 000000000000..864be8601868 + max(0LL, c->copygc_wait - + atomic64_read(&c->io_clock[WRITE].now)) << 9); + -+ if (attr == &sysfs_rebalance_work) { -+ bch2_rebalance_work_to_text(&out, c); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_rebalance_work) ++ bch2_rebalance_work_to_text(out, c); + + sysfs_print(promote_whole_extents, c->promote_whole_extents); + + /* Debugging: */ + -+ if (attr == &sysfs_alloc_debug) -+ return fs_alloc_debug_to_text(&out, c) ?: out.pos - buf; ++ if (attr == &sysfs_journal_debug) ++ bch2_journal_debug_to_text(out, &c->journal); + -+ if (attr == &sysfs_journal_debug) { -+ bch2_journal_debug_to_text(&out, &c->journal); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_btree_updates) ++ bch2_btree_updates_to_text(out, c); + -+ if (attr == &sysfs_journal_pins) { -+ bch2_journal_pins_to_text(&out, &c->journal); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_btree_cache) ++ bch2_btree_cache_to_text(out, c); + -+ if (attr == &sysfs_btree_updates) { -+ bch2_btree_updates_to_text(&out, c); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_btree_key_cache) ++ bch2_btree_key_cache_to_text(out, &c->btree_key_cache); + -+ if (attr == &sysfs_dirty_btree_nodes) { -+ bch2_dirty_btree_nodes_to_text(&out, c); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_btree_transactions) ++ bch2_btree_trans_to_text(out, c); + -+ if (attr == &sysfs_btree_cache) { -+ 
bch2_btree_cache_to_text(&out, c); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_stripes_heap) ++ bch2_stripes_heap_to_text(out, c); + -+ if (attr == &sysfs_btree_key_cache) { -+ bch2_btree_key_cache_to_text(&out, &c->btree_key_cache); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_open_buckets) ++ bch2_open_buckets_to_text(out, c); + -+ if (attr == &sysfs_btree_transactions) { -+ bch2_btree_trans_to_text(&out, c); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_compression_stats) ++ bch2_compression_stats_to_text(out, c); + -+ if (attr == &sysfs_stripes_heap) { -+ bch2_stripes_heap_to_text(&out, c); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_new_stripes) ++ bch2_new_stripes_to_text(out, c); + -+ if (attr == &sysfs_open_buckets) { -+ bch2_open_buckets_to_text(&out, c); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_io_timers_read) ++ bch2_io_timers_to_text(out, &c->io_clock[READ]); + -+ if (attr == &sysfs_compression_stats) { -+ bch2_compression_stats_to_text(&out, c); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_io_timers_write) ++ bch2_io_timers_to_text(out, &c->io_clock[WRITE]); + -+ if (attr == &sysfs_new_stripes) { -+ bch2_new_stripes_to_text(&out, c); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_io_timers_read) { -+ bch2_io_timers_to_text(&out, &c->io_clock[READ]); -+ return out.pos - buf; -+ } -+ if (attr == &sysfs_io_timers_write) { -+ bch2_io_timers_to_text(&out, &c->io_clock[WRITE]); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_data_op_data_progress) { -+ data_progress_to_text(&out, c); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_data_jobs) ++ data_progress_to_text(out, c); + + return 0; +} @@ -70479,9 +73739,6 @@ index 000000000000..864be8601868 +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); + -+ sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms); -+ sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); -+ + if (attr == 
&sysfs_btree_gc_periodic) { + ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic) + ?: (ssize_t) size; @@ -70518,8 +73775,16 @@ index 000000000000..864be8601868 + + /* Debugging: */ + -+ if (attr == &sysfs_trigger_journal_flush) -+ bch2_journal_meta(&c->journal); ++ if (!test_bit(BCH_FS_RW, &c->flags)) ++ return -EROFS; ++ ++ if (attr == &sysfs_prune_cache) { ++ struct shrink_control sc; ++ ++ sc.gfp_mask = GFP_KERNEL; ++ sc.nr_to_scan = strtoul_or_return(buf); ++ c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); ++ } + + if (attr == &sysfs_trigger_gc) { + /* @@ -70534,14 +73799,6 @@ index 000000000000..864be8601868 +#endif + } + -+ if (attr == &sysfs_prune_cache) { -+ struct shrink_control sc; -+ -+ sc.gfp_mask = GFP_KERNEL; -+ sc.nr_to_scan = strtoul_or_return(buf); -+ c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); -+ } -+ +#ifdef CONFIG_BCACHEFS_TESTS + if (attr == &sysfs_perf_test) { + char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; @@ -70568,14 +73825,9 @@ index 000000000000..864be8601868 + +struct attribute *bch2_fs_files[] = { + &sysfs_minor, -+ &sysfs_block_size, -+ &sysfs_btree_node_size, + &sysfs_btree_cache_size, + &sysfs_btree_avg_write_size, + -+ &sysfs_journal_write_delay_ms, -+ &sysfs_journal_reclaim_delay_ms, -+ + &sysfs_promote_whole_extents, + + &sysfs_compression_stats, @@ -70591,7 +73843,7 @@ index 000000000000..864be8601868 +SHOW(bch2_fs_internal) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, internal); -+ return bch2_fs_show(&c->kobj, attr, buf); ++ return bch2_fs_to_text(out, &c->kobj, attr); +} + +STORE(bch2_fs_internal) @@ -70602,25 +73854,26 @@ index 000000000000..864be8601868 +SYSFS_OPS(bch2_fs_internal); + +struct attribute *bch2_fs_internal_files[] = { -+ &sysfs_alloc_debug, + &sysfs_journal_debug, -+ &sysfs_journal_pins, + &sysfs_btree_updates, -+ &sysfs_dirty_btree_nodes, + &sysfs_btree_cache, + &sysfs_btree_key_cache, + &sysfs_btree_transactions, ++ &sysfs_new_stripes, + 
&sysfs_stripes_heap, + &sysfs_open_buckets, ++ &sysfs_io_timers_read, ++ &sysfs_io_timers_write, ++ ++ &sysfs_trigger_gc, ++ &sysfs_prune_cache, + + &sysfs_read_realloc_races, + &sysfs_extent_migrate_done, + &sysfs_extent_migrate_raced, ++ &sysfs_bucket_alloc_fail, + -+ &sysfs_trigger_journal_flush, -+ &sysfs_trigger_gc, + &sysfs_gc_gens_pos, -+ &sysfs_prune_cache, + + &sysfs_copy_gc_enabled, + &sysfs_copy_gc_wait, @@ -70629,12 +73882,7 @@ index 000000000000..864be8601868 + &sysfs_rebalance_work, + sysfs_pd_controller_files(rebalance), + -+ &sysfs_new_stripes, -+ -+ &sysfs_io_timers_read, -+ &sysfs_io_timers_write, -+ -+ &sysfs_data_op_data_progress, ++ &sysfs_data_jobs, + + &sysfs_internal_uuid, + NULL @@ -70644,47 +73892,49 @@ index 000000000000..864be8601868 + +SHOW(bch2_fs_opts_dir) +{ -+ struct printbuf out = _PBUF(buf, PAGE_SIZE); + struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); + const struct bch_option *opt = container_of(attr, struct bch_option, attr); + int id = opt - bch2_opt_table; + u64 v = bch2_opt_get_by_id(&c->opts, id); + -+ bch2_opt_to_text(&out, c, opt, v, OPT_SHOW_FULL_LIST); -+ pr_buf(&out, "\n"); ++ bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST); ++ pr_char(out, '\n'); + -+ return out.pos - buf; ++ return 0; +} + +STORE(bch2_fs_opts_dir) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); + const struct bch_option *opt = container_of(attr, struct bch_option, attr); -+ int ret, id = opt - bch2_opt_table; ++ int ret = size, id = opt - bch2_opt_table; + char *tmp; + u64 v; + -+ tmp = kstrdup(buf, GFP_KERNEL); -+ if (!tmp) -+ return -ENOMEM; ++ /* ++ * We don't need to take c->writes for correctness, but it eliminates an ++ * unsightly error message in the dmesg log when we're RO: ++ */ ++ if (unlikely(!percpu_ref_tryget(&c->writes))) ++ return -EROFS; + -+ ret = bch2_opt_parse(c, opt, strim(tmp), &v); ++ tmp = kstrdup(buf, GFP_KERNEL); ++ if (!tmp) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ 
ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL); + kfree(tmp); + + if (ret < 0) -+ return ret; ++ goto err; + + ret = bch2_opt_check_may_set(c, id, v); + if (ret < 0) -+ return ret; -+ -+ if (opt->set_sb != SET_NO_SB_OPT) { -+ mutex_lock(&c->sb_lock); -+ opt->set_sb(c->disk_sb.sb, v); -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ } ++ goto err; + ++ bch2_opt_set_sb(c, opt, v); + bch2_opt_set_by_id(&c->opts, id, v); + + if ((id == Opt_background_target || @@ -70692,8 +73942,9 @@ index 000000000000..864be8601868 + bch2_rebalance_add_work(c, S64_MAX); + rebalance_wakeup(c); + } -+ -+ return size; ++err: ++ percpu_ref_put(&c->writes); ++ return ret; +} +SYSFS_OPS(bch2_fs_opts_dir); + @@ -70707,7 +73958,7 @@ index 000000000000..864be8601868 + for (i = bch2_opt_table; + i < bch2_opt_table + bch2_opts_nr; + i++) { -+ if (!(i->mode & (OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME))) ++ if (!(i->flags & OPT_FS)) + continue; + + ret = sysfs_create_file(kobj, &i->attr); @@ -70723,13 +73974,10 @@ index 000000000000..864be8601868 +SHOW(bch2_fs_time_stats) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); -+ struct printbuf out = _PBUF(buf, PAGE_SIZE); + +#define x(name) \ -+ if (attr == &sysfs_time_stat_##name) { \ -+ bch2_time_stats_to_text(&out, &c->times[BCH_TIME_##name]);\ -+ return out.pos - buf; \ -+ } ++ if (attr == &sysfs_time_stat_##name) \ ++ bch2_time_stats_to_text(out, &c->times[BCH_TIME_##name]); + BCH_TIME_STATS() +#undef x + @@ -70750,24 +73998,6 @@ index 000000000000..864be8601868 + NULL +}; + -+static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca) -+{ -+ enum alloc_reserve i; -+ -+ spin_lock(&ca->fs->freelist_lock); -+ -+ pr_buf(out, "free_inc:\t%zu\t%zu\n", -+ fifo_used(&ca->free_inc), -+ ca->free_inc.size); -+ -+ for (i = 0; i < RESERVE_NR; i++) -+ pr_buf(out, "free[%u]:\t%zu\t%zu\n", i, -+ fifo_used(&ca->free[i]), -+ ca->free[i].size); -+ -+ spin_unlock(&ca->fs->freelist_lock); -+} -+ +static void 
dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; @@ -70777,7 +74007,7 @@ index 000000000000..864be8601868 + memset(nr, 0, sizeof(nr)); + + for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) -+ nr[c->open_buckets[i].type]++; ++ nr[c->open_buckets[i].data_type]++; + + pr_buf(out, + "\t\t buckets\t sectors fragmented\n" @@ -70793,9 +74023,6 @@ index 000000000000..864be8601868 + "ec\t%16llu\n" + "available%15llu\n" + "\n" -+ "free_inc\t\t%zu/%zu\n" -+ "free[RESERVE_MOVINGGC]\t%zu/%zu\n" -+ "free[RESERVE_NONE]\t%zu/%zu\n" + "freelist_wait\t\t%s\n" + "open buckets allocated\t%u\n" + "open buckets this dev\t%u\n" @@ -70803,13 +74030,9 @@ index 000000000000..864be8601868 + "open_buckets_wait\t%s\n" + "open_buckets_btree\t%u\n" + "open_buckets_user\t%u\n" -+ "btree reserve cache\t%u\n" -+ "thread state:\t\t%s\n", ++ "btree reserve cache\t%u\n", + stats.buckets_ec, -+ __dev_buckets_available(ca, stats), -+ fifo_used(&ca->free_inc), ca->free_inc.size, -+ fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, -+ fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, ++ __dev_buckets_available(ca, stats, RESERVE_none), + c->freelist_wait.list.first ? "waiting" : "empty", + OPEN_BUCKETS_COUNT - c->open_buckets_nr_free, + ca->nr_open_buckets, @@ -70817,8 +74040,7 @@ index 000000000000..864be8601868 + c->open_buckets_wait.list.first ? 
"waiting" : "empty", + nr[BCH_DATA_btree], + nr[BCH_DATA_user], -+ c->btree_reserve_cache_nr, -+ bch2_allocator_states[ca->allocator_state]); ++ c->btree_reserve_cache_nr); +} + +static const char * const bch2_rw[] = { @@ -70845,12 +74067,10 @@ index 000000000000..864be8601868 +{ + struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); + struct bch_fs *c = ca->fs; -+ struct printbuf out = _PBUF(buf, PAGE_SIZE); + + sysfs_printf(uuid, "%pU\n", ca->uuid.b); + + sysfs_print(bucket_size, bucket_bytes(ca)); -+ sysfs_print(block_size, block_bytes(c)); + sysfs_print(first_bucket, ca->mi.first_bucket); + sysfs_print(nbuckets, ca->mi.nbuckets); + sysfs_print(durability, ca->mi.durability); @@ -70859,66 +74079,44 @@ index 000000000000..864be8601868 + if (attr == &sysfs_label) { + if (ca->mi.group) { + mutex_lock(&c->sb_lock); -+ bch2_disk_path_to_text(&out, &c->disk_sb, ++ bch2_disk_path_to_text(out, c->disk_sb.sb, + ca->mi.group - 1); + mutex_unlock(&c->sb_lock); + } + -+ pr_buf(&out, "\n"); -+ return out.pos - buf; ++ pr_char(out, '\n'); + } + + if (attr == &sysfs_has_data) { -+ bch2_flags_to_text(&out, bch2_data_types, ++ bch2_flags_to_text(out, bch2_data_types, + bch2_dev_has_data(c, ca)); -+ pr_buf(&out, "\n"); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_cache_replacement_policy) { -+ bch2_string_opt_to_text(&out, -+ bch2_cache_replacement_policies, -+ ca->mi.replacement); -+ pr_buf(&out, "\n"); -+ return out.pos - buf; ++ pr_char(out, '\n'); + } + + if (attr == &sysfs_state_rw) { -+ bch2_string_opt_to_text(&out, bch2_member_states, ++ bch2_string_opt_to_text(out, bch2_member_states, + ca->mi.state); -+ pr_buf(&out, "\n"); -+ return out.pos - buf; ++ pr_char(out, '\n'); + } + -+ if (attr == &sysfs_iodone) { -+ dev_iodone_to_text(&out, ca); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_iodone) ++ dev_iodone_to_text(out, ca); + + sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ])); + sysfs_print(io_latency_write, 
atomic64_read(&ca->cur_latency[WRITE])); + -+ if (attr == &sysfs_io_latency_stats_read) { -+ bch2_time_stats_to_text(&out, &ca->io_latency[READ]); -+ return out.pos - buf; -+ } -+ if (attr == &sysfs_io_latency_stats_write) { -+ bch2_time_stats_to_text(&out, &ca->io_latency[WRITE]); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_io_latency_stats_read) ++ bch2_time_stats_to_text(out, &ca->io_latency[READ]); ++ ++ if (attr == &sysfs_io_latency_stats_write) ++ bch2_time_stats_to_text(out, &ca->io_latency[WRITE]); + + sysfs_printf(congested, "%u%%", + clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) + * 100 / CONGESTED_MAX); + -+ if (attr == &sysfs_reserve_stats) { -+ reserve_stats_to_text(&out, ca); -+ return out.pos - buf; -+ } -+ if (attr == &sysfs_alloc_debug) { -+ dev_alloc_debug_to_text(&out, ca); -+ return out.pos - buf; -+ } ++ if (attr == &sysfs_alloc_debug) ++ dev_alloc_debug_to_text(out, ca); + + return 0; +} @@ -70942,22 +74140,6 @@ index 000000000000..864be8601868 + mutex_unlock(&c->sb_lock); + } + -+ if (attr == &sysfs_cache_replacement_policy) { -+ ssize_t v = __sysfs_match_string(bch2_cache_replacement_policies, -1, buf); -+ -+ if (v < 0) -+ return v; -+ -+ mutex_lock(&c->sb_lock); -+ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; -+ -+ if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) { -+ SET_BCH_MEMBER_REPLACEMENT(mi, v); -+ bch2_write_super(c); -+ } -+ mutex_unlock(&c->sb_lock); -+ } -+ + if (attr == &sysfs_label) { + char *tmp; + int ret; @@ -70972,9 +74154,6 @@ index 000000000000..864be8601868 + return ret; + } + -+ if (attr == &sysfs_wake_allocator) -+ bch2_wake_allocator(ca); -+ + return size; +} +SYSFS_OPS(bch2_dev); @@ -70982,14 +74161,12 @@ index 000000000000..864be8601868 +struct attribute *bch2_dev_files[] = { + &sysfs_uuid, + &sysfs_bucket_size, -+ &sysfs_block_size, + &sysfs_first_bucket, + &sysfs_nbuckets, + &sysfs_durability, + + /* settings: */ + &sysfs_discard, -+ &sysfs_cache_replacement_policy, + 
&sysfs_state_rw, + &sysfs_label, + @@ -71002,11 +74179,8 @@ index 000000000000..864be8601868 + &sysfs_io_latency_stats_write, + &sysfs_congested, + -+ &sysfs_reserve_stats, -+ + /* debug: */ + &sysfs_alloc_debug, -+ &sysfs_wake_allocator, + NULL +}; + @@ -71063,16 +74237,17 @@ index 000000000000..525fd05d91f7 +#endif /* _BCACHEFS_SYSFS_H_ */ diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c new file mode 100644 -index 000000000000..d5a74f4db64d +index 000000000000..4369bfc55a94 --- /dev/null +++ b/fs/bcachefs/tests.c -@@ -0,0 +1,871 @@ +@@ -0,0 +1,947 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifdef CONFIG_BCACHEFS_TESTS + +#include "bcachefs.h" +#include "btree_update.h" +#include "journal_reclaim.h" ++#include "subvolume.h" +#include "tests.h" + +#include "linux/kthread.h" @@ -71083,13 +74258,14 @@ index 000000000000..d5a74f4db64d + int ret; + + ret = bch2_btree_delete_range(c, BTREE_ID_extents, -+ POS(0, 0), POS(0, U64_MAX), ++ SPOS(0, 0, U32_MAX), SPOS_MAX, ++ 0, + NULL); + BUG_ON(ret); + + ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, -+ POS(0, 0), POS(0, U64_MAX), -+ NULL); ++ SPOS(0, 0, U32_MAX), SPOS_MAX, ++ 0, NULL); + BUG_ON(ret); +} + @@ -71213,7 +74389,7 @@ index 000000000000..d5a74f4db64d + i = 0; + + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, -+ POS_MIN, 0, k, ret) { ++ SPOS(0, 0, U32_MAX), 0, k, ret) { + if (k.k->p.inode) + break; + @@ -71269,7 +74445,7 @@ index 000000000000..d5a74f4db64d + i = 0; + + for_each_btree_key(&trans, iter, BTREE_ID_extents, -+ POS_MIN, 0, k, ret) { ++ SPOS(0, 0, U32_MAX), 0, k, ret) { + BUG_ON(bkey_start_offset(k.k) != i); + i = k.k->p.offset; + } @@ -71323,8 +74499,8 @@ index 000000000000..d5a74f4db64d + + i = 0; + -+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, -+ 0, k, ret) { ++ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0, k, ret) { + if (k.k->p.inode) + break; + @@ -71339,7 +74515,8 @@ index 000000000000..d5a74f4db64d + + i = 0; + -+ 
for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, ++ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), + BTREE_ITER_SLOTS, k, ret) { + BUG_ON(k.k->p.offset != i); + BUG_ON(bkey_deleted(k.k) != (i & 1)); @@ -71388,8 +74565,8 @@ index 000000000000..d5a74f4db64d + + i = 0; + -+ for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, -+ 0, k, ret) { ++ for_each_btree_key(&trans, iter, BTREE_ID_extents, ++ SPOS(0, 0, U32_MAX), 0, k, ret) { + BUG_ON(bkey_start_offset(k.k) != i + 8); + BUG_ON(k.k->size != 8); + i += 16; @@ -71402,7 +74579,8 @@ index 000000000000..d5a74f4db64d + + i = 0; + -+ for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, ++ for_each_btree_key(&trans, iter, BTREE_ID_extents, ++ SPOS(0, 0, U32_MAX), + BTREE_ITER_SLOTS, k, ret) { + BUG_ON(bkey_deleted(k.k) != !(i % 16)); + @@ -71430,7 +74608,8 @@ index 000000000000..d5a74f4db64d + struct bkey_s_c k; + + bch2_trans_init(&trans, c, 0, 0); -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0); + + k = bch2_btree_iter_peek(&iter); + BUG_ON(k.k); @@ -71450,7 +74629,8 @@ index 000000000000..d5a74f4db64d + struct bkey_s_c k; + + bch2_trans_init(&trans, c, 0, 0); -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, ++ SPOS(0, 0, U32_MAX), 0); + + k = bch2_btree_iter_peek(&iter); + BUG_ON(k.k); @@ -71473,8 +74653,6 @@ index 000000000000..d5a74f4db64d + struct bkey_i_cookie k; + int ret; + -+ //pr_info("inserting %llu-%llu v %llu", start, end, test_version); -+ + bkey_cookie_init(&k.k_i); + k.k_i.k.p.offset = end; + k.k_i.k.p.snapshot = U32_MAX; @@ -71526,6 +74704,70 @@ index 000000000000..d5a74f4db64d + __test_extent_overwrite(c, 32, 64, 32, 128); +} + ++/* snapshot unit tests */ ++ ++/* Test skipping over keys in unrelated snapshots: */ ++static int test_snapshot_filter(struct bch_fs *c, u32 
snapid_lo, u32 snapid_hi) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_i_cookie cookie; ++ int ret; ++ ++ bkey_cookie_init(&cookie.k_i); ++ cookie.k.p.snapshot = snapid_hi; ++ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, ++ NULL, NULL, 0); ++ if (ret) ++ return ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, ++ SPOS(0, 0, snapid_lo), 0); ++ k = bch2_btree_iter_peek(&iter); ++ ++ BUG_ON(k.k->p.snapshot != U32_MAX); ++ ++ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++static int test_snapshots(struct bch_fs *c, u64 nr) ++{ ++ struct bkey_i_cookie cookie; ++ u32 snapids[2]; ++ u32 snapid_subvols[2] = { 1, 1 }; ++ int ret; ++ ++ bkey_cookie_init(&cookie.k_i); ++ cookie.k.p.snapshot = U32_MAX; ++ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, ++ NULL, NULL, 0); ++ if (ret) ++ return ret; ++ ++ ret = bch2_trans_do(c, NULL, NULL, 0, ++ bch2_snapshot_node_create(&trans, U32_MAX, ++ snapids, ++ snapid_subvols, ++ 2)); ++ if (ret) ++ return ret; ++ ++ if (snapids[0] > snapids[1]) ++ swap(snapids[0], snapids[1]); ++ ++ ret = test_snapshot_filter(c, snapids[0], snapids[1]); ++ if (ret) { ++ bch_err(c, "err %i from test_snapshot_filter", ret); ++ return ret; ++ } ++ ++ return 0; ++} ++ +/* perf tests */ + +static u64 test_rand(void) @@ -71610,10 +74852,11 @@ index 000000000000..d5a74f4db64d + u64 i; + + bch2_trans_init(&trans, c, 0, 0); -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0); + + for (i = 0; i < nr; i++) { -+ bch2_btree_iter_set_pos(&iter, POS(0, test_rand())); ++ bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX)); + + k = bch2_btree_iter_peek(&iter); + ret = bkey_err(k); @@ -71636,7 +74879,7 @@ index 000000000000..d5a74f4db64d + struct bkey_s_c k; + int ret; + -+ 
bch2_btree_iter_set_pos(iter, POS(0, pos)); ++ bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX)); + + k = bch2_btree_iter_peek(iter); + ret = bkey_err(k); @@ -71648,10 +74891,10 @@ index 000000000000..d5a74f4db64d + if (!(i & 3) && k.k) { + bkey_cookie_init(&cookie->k_i); + cookie->k.p = iter->pos; -+ bch2_trans_update(trans, iter, &cookie->k_i, 0); ++ ret = bch2_trans_update(trans, iter, &cookie->k_i, 0); + } + -+ return 0; ++ return ret; +} + +static int rand_mixed(struct bch_fs *c, u64 nr) @@ -71663,7 +74906,8 @@ index 000000000000..d5a74f4db64d + u64 i, rand; + + bch2_trans_init(&trans, c, 0, 0); -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0); ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0); + + for (i = 0; i < nr; i++) { + rand = test_rand(); @@ -71683,7 +74927,6 @@ index 000000000000..d5a74f4db64d +static int __do_delete(struct btree_trans *trans, struct bpos pos) +{ + struct btree_iter iter; -+ struct bkey_i delete; + struct bkey_s_c k; + int ret = 0; + @@ -71697,10 +74940,7 @@ index 000000000000..d5a74f4db64d + if (!k.k) + goto err; + -+ bkey_init(&delete.k); -+ delete.k.p = k.k->p; -+ -+ ret = bch2_trans_update(trans, &iter, &delete, 0); ++ ret = bch2_btree_delete_at(trans, &iter, 0); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; @@ -71715,7 +74955,7 @@ index 000000000000..d5a74f4db64d + bch2_trans_init(&trans, c, 0, 0); + + for (i = 0; i < nr; i++) { -+ struct bpos pos = POS(0, test_rand()); ++ struct bpos pos = SPOS(0, test_rand(), U32_MAX); + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + __do_delete(&trans, pos)); @@ -71742,7 +74982,7 @@ index 000000000000..d5a74f4db64d + + bch2_trans_init(&trans, c, 0, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, ++ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + insert.k.p = iter.pos; + @@ -71772,7 +75012,8 @@ index 000000000000..d5a74f4db64d + + 
bch2_trans_init(&trans, c, 0, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 0, k, ret) ++ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), 0, k, ret) + ; + bch2_trans_iter_exit(&trans, &iter); + @@ -71789,7 +75030,8 @@ index 000000000000..d5a74f4db64d + + bch2_trans_init(&trans, c, 0, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, ++ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), + BTREE_ITER_INTENT, k, ret) { + struct bkey_i_cookie u; + @@ -71814,8 +75056,8 @@ index 000000000000..d5a74f4db64d + int ret; + + ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, -+ POS(0, 0), POS(0, U64_MAX), -+ NULL); ++ SPOS(0, 0, U32_MAX), SPOS_MAX, ++ 0, NULL); + if (ret) + bch_err(c, "error in seq_delete: %i", ret); + return ret; @@ -71853,8 +75095,10 @@ index 000000000000..d5a74f4db64d + } + + ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads)); -+ if (ret) ++ if (ret) { ++ bch_err(j->c, "%ps: error %i", j->fn, ret); + j->ret = ret; ++ } + + if (atomic_dec_and_test(&j->done)) { + j->finish = sched_clock(); @@ -71868,7 +75112,9 @@ index 000000000000..d5a74f4db64d + u64 nr, unsigned nr_threads) +{ + struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads }; -+ char name_buf[20], nr_buf[20], per_sec_buf[20]; ++ char name_buf[20]; ++ struct printbuf nr_buf = PRINTBUF; ++ struct printbuf per_sec_buf = PRINTBUF; + unsigned i; + u64 time; + @@ -71907,6 +75153,8 @@ index 000000000000..d5a74f4db64d + perf_test(test_extent_overwrite_middle); + perf_test(test_extent_overwrite_all); + ++ perf_test(test_snapshots); ++ + if (!j.fn) { + pr_err("unknown test %s", testname); + return -EINVAL; @@ -71927,13 +75175,15 @@ index 000000000000..d5a74f4db64d + time = j.finish - j.start; + + scnprintf(name_buf, sizeof(name_buf), "%s:", testname); -+ bch2_hprint(&PBUF(nr_buf), nr); -+ bch2_hprint(&PBUF(per_sec_buf), div64_u64(nr * NSEC_PER_SEC, time)); ++ bch2_hprint(&nr_buf, nr); ++ 
bch2_hprint(&per_sec_buf, div64_u64(nr * NSEC_PER_SEC, time)); + printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n", -+ name_buf, nr_buf, nr_threads, ++ name_buf, nr_buf.buf, nr_threads, + div_u64(time, NSEC_PER_SEC), + div_u64(time * nr_threads, nr), -+ per_sec_buf); ++ per_sec_buf.buf); ++ printbuf_exit(&per_sec_buf); ++ printbuf_exit(&nr_buf); + return j.ret; +} + @@ -71979,10 +75229,10 @@ index 000000000000..59e8dfa3d245 +#include diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c new file mode 100644 -index 000000000000..52de7c49cacb +index 000000000000..37fc20413764 --- /dev/null +++ b/fs/bcachefs/util.c -@@ -0,0 +1,912 @@ +@@ -0,0 +1,984 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * random utiility code, for bcache but in theory not specific to bcache @@ -72084,6 +75334,71 @@ index 000000000000..52de7c49cacb +STRTO_H(strtoull, unsigned long long) +STRTO_H(strtou64, u64) + ++static int bch2_printbuf_realloc(struct printbuf *out, unsigned extra) ++{ ++ unsigned new_size; ++ char *buf; ++ ++ if (out->pos + extra + 1 < out->size) ++ return 0; ++ ++ new_size = roundup_pow_of_two(out->size + extra); ++ buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_ATOMIC); ++ ++ if (!buf) { ++ out->allocation_failure = true; ++ return -ENOMEM; ++ } ++ ++ out->buf = buf; ++ out->size = new_size; ++ return 0; ++} ++ ++void bch2_pr_buf(struct printbuf *out, const char *fmt, ...) ++{ ++ va_list args; ++ int len; ++ ++ do { ++ va_start(args, fmt); ++ len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args); ++ va_end(args); ++ } while (len + 1 >= printbuf_remaining(out) && ++ !bch2_printbuf_realloc(out, len + 1)); ++ ++ len = min_t(size_t, len, ++ printbuf_remaining(out) ? 
printbuf_remaining(out) - 1 : 0); ++ out->pos += len; ++} ++ ++void bch2_pr_tab_rjust(struct printbuf *buf) ++{ ++ BUG_ON(buf->tabstop > ARRAY_SIZE(buf->tabstops)); ++ ++ if (printbuf_linelen(buf) < buf->tabstops[buf->tabstop]) { ++ unsigned move = buf->pos - buf->last_field; ++ unsigned shift = buf->tabstops[buf->tabstop] - ++ printbuf_linelen(buf); ++ ++ bch2_printbuf_realloc(buf, shift); ++ ++ if (buf->last_field + shift + 1 < buf->size) { ++ move = min(move, buf->size - 1 - buf->last_field - shift); ++ ++ memmove(buf->buf + buf->last_field + shift, ++ buf->buf + buf->last_field, ++ move); ++ memset(buf->buf + buf->last_field, ' ', shift); ++ buf->pos += shift; ++ buf->buf[buf->pos] = 0; ++ } ++ } ++ ++ buf->last_field = buf->pos; ++ buf->tabstop++; ++} ++ +void bch2_hprint(struct printbuf *buf, s64 v) +{ + int u, t = 0; @@ -72099,10 +75414,25 @@ index 000000000000..52de7c49cacb + * 103 is magic: t is in the range [-1023, 1023] and we want + * to turn it into [-9, 9] + */ -+ if (u && v < 100 && v > -100) ++ if (u && t && v < 100 && v > -100) + pr_buf(buf, ".%i", t / 103); + if (u) -+ pr_buf(buf, "%c", si_units[u]); ++ pr_char(buf, si_units[u]); ++} ++ ++void bch2_pr_units(struct printbuf *out, s64 raw, s64 bytes) ++{ ++ switch (out->units) { ++ case PRINTBUF_UNITS_RAW: ++ pr_buf(out, "%llu", raw); ++ break; ++ case PRINTBUF_UNITS_BYTES: ++ pr_buf(out, "%llu", bytes); ++ break; ++ case PRINTBUF_UNITS_HUMAN_READABLE: ++ bch2_hprint(out, bytes); ++ break; ++ } +} + +void bch2_string_opt_to_text(struct printbuf *out, @@ -72121,9 +75451,6 @@ index 000000000000..52de7c49cacb + unsigned bit, nr = 0; + bool first = true; + -+ if (out->pos != out->end) -+ *out->pos = '\0'; -+ + while (list[nr]) + nr++; + @@ -72452,36 +75779,44 @@ index 000000000000..52de7c49cacb + pd->backpressure = 1; +} + -+size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf) ++void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_controller *pd) +{ -+ /* 
2^64 - 1 is 20 digits, plus null byte */ -+ char rate[21]; -+ char actual[21]; -+ char target[21]; -+ char proportional[21]; -+ char derivative[21]; -+ char change[21]; -+ s64 next_io; ++ out->tabstops[0] = 20; + -+ bch2_hprint(&PBUF(rate), pd->rate.rate); -+ bch2_hprint(&PBUF(actual), pd->last_actual); -+ bch2_hprint(&PBUF(target), pd->last_target); -+ bch2_hprint(&PBUF(proportional), pd->last_proportional); -+ bch2_hprint(&PBUF(derivative), pd->last_derivative); -+ bch2_hprint(&PBUF(change), pd->last_change); ++ pr_buf(out, "rate:"); ++ pr_tab(out); ++ bch2_hprint(out, pd->rate.rate); ++ pr_newline(out); + -+ next_io = div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC); ++ pr_buf(out, "target:"); ++ pr_tab(out); ++ bch2_hprint(out, pd->last_target); ++ pr_newline(out); + -+ return sprintf(buf, -+ "rate:\t\t%s/sec\n" -+ "target:\t\t%s\n" -+ "actual:\t\t%s\n" -+ "proportional:\t%s\n" -+ "derivative:\t%s\n" -+ "change:\t\t%s/sec\n" -+ "next io:\t%llims\n", -+ rate, target, actual, proportional, -+ derivative, change, next_io); ++ pr_buf(out, "actual:"); ++ pr_tab(out); ++ bch2_hprint(out, pd->last_actual); ++ pr_newline(out); ++ ++ pr_buf(out, "proportional:"); ++ pr_tab(out); ++ bch2_hprint(out, pd->last_proportional); ++ pr_newline(out); ++ ++ pr_buf(out, "derivative:"); ++ pr_tab(out); ++ bch2_hprint(out, pd->last_derivative); ++ pr_newline(out); ++ ++ pr_buf(out, "change:"); ++ pr_tab(out); ++ bch2_hprint(out, pd->last_change); ++ pr_newline(out); ++ ++ pr_buf(out, "next io:"); ++ pr_tab(out); ++ pr_buf(out, "%llims", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC)); ++ pr_newline(out); +} + +/* misc: */ @@ -72564,19 +75899,6 @@ index 000000000000..52de7c49cacb + } +} + -+void bch_scnmemcpy(struct printbuf *out, -+ const char *src, size_t len) -+{ -+ size_t n = printbuf_remaining(out); -+ -+ if (n) { -+ n = min(n - 1, len); -+ memcpy(out->pos, src, n); -+ out->pos += n; -+ *out->pos = '\0'; -+ } -+} -+ +#include "eytzinger.h" + +static int 
alignment_ok(const void *base, size_t align) @@ -72897,10 +76219,10 @@ index 000000000000..52de7c49cacb +} diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h new file mode 100644 -index 000000000000..80402b398442 +index 000000000000..888693703c75 --- /dev/null +++ b/fs/bcachefs/util.h -@@ -0,0 +1,749 @@ +@@ -0,0 +1,877 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_UTIL_H +#define _BCACHEFS_UTIL_H @@ -73113,9 +76435,11 @@ index 000000000000..80402b398442 + \ + BUG_ON(_i >= (h)->used); \ + (h)->used--; \ -+ heap_swap(h, _i, (h)->used, set_backpointer); \ -+ heap_sift_up(h, _i, cmp, set_backpointer); \ -+ heap_sift_down(h, _i, cmp, set_backpointer); \ ++ if ((_i) < (h)->used) { \ ++ heap_swap(h, _i, (h)->used, set_backpointer); \ ++ heap_sift_up(h, _i, cmp, set_backpointer); \ ++ heap_sift_down(h, _i, cmp, set_backpointer); \ ++ } \ +} while (0) + +#define heap_pop(h, d, cmp, set_backpointer) \ @@ -73138,31 +76462,157 @@ index 000000000000..80402b398442 +#define ANYSINT_MAX(t) \ + ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) + -+struct printbuf { -+ char *pos; -+ char *end; ++enum printbuf_units { ++ PRINTBUF_UNITS_RAW, ++ PRINTBUF_UNITS_BYTES, ++ PRINTBUF_UNITS_HUMAN_READABLE, +}; + ++struct printbuf { ++ char *buf; ++ unsigned size; ++ unsigned pos; ++ unsigned last_newline; ++ unsigned last_field; ++ unsigned indent; ++ enum printbuf_units units:8; ++ u8 atomic; ++ bool allocation_failure:1; ++ u8 tabstop; ++ u8 tabstops[4]; ++}; ++ ++#define PRINTBUF ((struct printbuf) { NULL }) ++ ++static inline void printbuf_exit(struct printbuf *buf) ++{ ++ kfree(buf->buf); ++ buf->buf = ERR_PTR(-EINTR); /* poison value */ ++} ++ ++static inline void printbuf_reset(struct printbuf *buf) ++{ ++ buf->pos = 0; ++ buf->last_newline = 0; ++ buf->last_field = 0; ++ buf->indent = 0; ++ buf->tabstop = 0; ++} ++ +static inline size_t printbuf_remaining(struct printbuf *buf) +{ -+ return buf->end - buf->pos; ++ return buf->size - buf->pos; +} + -+#define 
_PBUF(_buf, _len) \ -+ ((struct printbuf) { \ -+ .pos = _buf, \ -+ .end = _buf + _len, \ -+ }) ++static inline size_t printbuf_linelen(struct printbuf *buf) ++{ ++ return buf->pos - buf->last_newline; ++} + -+#define PBUF(_buf) _PBUF(_buf, sizeof(_buf)) ++void bch2_pr_buf(struct printbuf *out, const char *fmt, ...) ++ __attribute__ ((format (printf, 2, 3))); + -+#define pr_buf(_out, ...) \ -+do { \ -+ (_out)->pos += scnprintf((_out)->pos, printbuf_remaining(_out), \ -+ __VA_ARGS__); \ -+} while (0) ++#define pr_buf(_out, ...) bch2_pr_buf(_out, __VA_ARGS__) + -+void bch_scnmemcpy(struct printbuf *, const char *, size_t); ++static inline void pr_char(struct printbuf *out, char c) ++{ ++ bch2_pr_buf(out, "%c", c); ++} ++ ++static inline void pr_indent_push(struct printbuf *buf, unsigned spaces) ++{ ++ buf->indent += spaces; ++ while (spaces--) ++ pr_char(buf, ' '); ++} ++ ++static inline void pr_indent_pop(struct printbuf *buf, unsigned spaces) ++{ ++ if (buf->last_newline + buf->indent == buf->pos) { ++ buf->pos -= spaces; ++ buf->buf[buf->pos] = 0; ++ } ++ buf->indent -= spaces; ++} ++ ++static inline void pr_newline(struct printbuf *buf) ++{ ++ unsigned i; ++ ++ pr_char(buf, '\n'); ++ ++ buf->last_newline = buf->pos; ++ ++ for (i = 0; i < buf->indent; i++) ++ pr_char(buf, ' '); ++ ++ buf->last_field = buf->pos; ++ buf->tabstop = 0; ++} ++ ++static inline void pr_tab(struct printbuf *buf) ++{ ++ BUG_ON(buf->tabstop > ARRAY_SIZE(buf->tabstops)); ++ ++ while (printbuf_remaining(buf) > 1 && ++ printbuf_linelen(buf) < buf->tabstops[buf->tabstop]) ++ pr_char(buf, ' '); ++ ++ buf->last_field = buf->pos; ++ buf->tabstop++; ++} ++ ++void bch2_pr_tab_rjust(struct printbuf *); ++ ++static inline void pr_tab_rjust(struct printbuf *buf) ++{ ++ bch2_pr_tab_rjust(buf); ++} ++ ++void bch2_pr_units(struct printbuf *, s64, s64); ++#define pr_units(...) 
bch2_pr_units(__VA_ARGS__) ++ ++static inline void pr_sectors(struct printbuf *out, u64 v) ++{ ++ bch2_pr_units(out, v, v << 9); ++} ++ ++#ifdef __KERNEL__ ++static inline void pr_time(struct printbuf *out, u64 time) ++{ ++ pr_buf(out, "%llu", time); ++} ++#else ++#include ++static inline void pr_time(struct printbuf *out, u64 _time) ++{ ++ char time_str[64]; ++ time_t time = _time; ++ struct tm *tm = localtime(&time); ++ size_t err = strftime(time_str, sizeof(time_str), "%c", tm); ++ if (!err) ++ pr_buf(out, "(formatting error)"); ++ else ++ pr_buf(out, "%s", time_str); ++} ++#endif ++ ++#ifdef __KERNEL__ ++static inline void uuid_unparse_lower(u8 *uuid, char *out) ++{ ++ sprintf(out, "%pUb", uuid); ++} ++#else ++#include ++#endif ++ ++static inline void pr_uuid(struct printbuf *out, u8 *uuid) ++{ ++ char uuid_str[40]; ++ ++ uuid_unparse_lower(uuid, uuid_str); ++ pr_buf(out, uuid_str); ++} + +int bch2_strtoint_h(const char *, int *); +int bch2_strtouint_h(const char *, unsigned int *); @@ -73226,8 +76676,8 @@ index 000000000000..80402b398442 + _r; \ +}) + -+#define snprint(buf, size, var) \ -+ snprintf(buf, size, \ ++#define snprint(out, var) \ ++ pr_buf(out, \ + type_is(var, int) ? "%i\n" \ + : type_is(var, unsigned) ? "%u\n" \ + : type_is(var, long) ? 
"%li\n" \ @@ -73344,7 +76794,7 @@ index 000000000000..80402b398442 + +void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int); +void bch2_pd_controller_init(struct bch_pd_controller *); -+size_t bch2_pd_controller_print_debug(struct bch_pd_controller *, char *); ++void bch2_pd_controller_debug_to_text(struct printbuf *, struct bch_pd_controller *); + +#define sysfs_pd_controller_attribute(name) \ + rw_attribute(name##_rate); \ @@ -73368,7 +76818,7 @@ index 000000000000..80402b398442 + sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \ + \ + if (attr == &sysfs_##name##_rate_debug) \ -+ return bch2_pd_controller_print_debug(var, buf); \ ++ bch2_pd_controller_debug_to_text(out, var); \ +} while (0) + +#define sysfs_pd_controller_store(name, var) \ @@ -73795,7 +77245,7 @@ index 000000000000..92a182fb3d7a +#endif /* _BCACHEFS_VARINT_H */ diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h new file mode 100644 -index 000000000000..c099cdc0605f +index 000000000000..53a694d71967 --- /dev/null +++ b/fs/bcachefs/vstructs.h @@ -0,0 +1,63 @@ @@ -73821,7 +77271,7 @@ index 000000000000..c099cdc0605f +({ \ + BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \ + \ -+ (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \ ++ (size_t) (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \ +}) + +#define vstruct_bytes(_s) \ @@ -73864,7 +77314,7 @@ index 000000000000..c099cdc0605f +#endif /* _VSTRUCTS_H */ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c new file mode 100644 -index 000000000000..464ed68318e7 +index 000000000000..8d23b4c2449e --- /dev/null +++ b/fs/bcachefs/xattr.c @@ -0,0 +1,629 @@ @@ -73981,11 +77431,11 @@ index 000000000000..464ed68318e7 + else + pr_buf(out, "(unknown type %u)", xattr.v->x_type); + -+ bch_scnmemcpy(out, xattr.v->x_name, -+ xattr.v->x_name_len); -+ pr_buf(out, ":"); -+ bch_scnmemcpy(out, xattr_val(xattr.v), -+ le16_to_cpu(xattr.v->x_val_len)); ++ pr_buf(out, "%.*s:%.*s", ++ xattr.v->x_name_len, ++ 
xattr.v->x_name, ++ le16_to_cpu(xattr.v->x_val_len), ++ (char *) xattr_val(xattr.v)); +} + +static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode, @@ -74181,13 +77631,9 @@ index 000000000000..464ed68318e7 + if (ret) + goto err; + -+ for_each_btree_key_norestart(&trans, iter, BTREE_ID_xattrs, -+ SPOS(inum, offset, snapshot), 0, k, ret) { -+ BUG_ON(k.k->p.inode < inum); -+ -+ if (k.k->p.inode > inum) -+ break; -+ ++ for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_xattrs, ++ SPOS(inum, offset, snapshot), ++ POS(inum, U64_MAX), 0, k, ret) { + if (k.k->type != KEY_TYPE_xattr) + continue; + @@ -74296,9 +77742,8 @@ index 000000000000..464ed68318e7 + bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode)); + const struct bch_option *opt; + int id, inode_opt_id; -+ char buf[512]; -+ struct printbuf out = PBUF(buf); -+ unsigned val_len; ++ struct printbuf out = PRINTBUF; ++ int ret; + u64 v; + + id = bch2_opt_lookup(name); @@ -74319,16 +77764,21 @@ index 000000000000..464ed68318e7 + return -ENODATA; + + v = bch2_opt_get_by_id(&opts, id); -+ bch2_opt_to_text(&out, c, opt, v, 0); ++ bch2_opt_to_text(&out, c, c->disk_sb.sb, opt, v, 0); + -+ val_len = out.pos - buf; ++ ret = out.pos; + -+ if (buffer && val_len > size) -+ return -ERANGE; ++ if (out.allocation_failure) { ++ ret = -ENOMEM; ++ } else if (buffer) { ++ if (out.pos > size) ++ ret = -ERANGE; ++ else ++ memcpy(buffer, out.buf, out.pos); ++ } + -+ if (buffer) -+ memcpy(buffer, buf, val_len); -+ return val_len; ++ printbuf_exit(&out); ++ return ret; +} + +static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, @@ -74395,7 +77845,7 @@ index 000000000000..464ed68318e7 + memcpy(buf, value, size); + buf[size] = '\0'; + -+ ret = bch2_opt_parse(c, opt, buf, &v); ++ ret = bch2_opt_parse(c, opt, buf, &v, NULL); + kfree(buf); + + if (ret < 0) @@ -74583,7 +78033,7 @@ index cf871a81f4fd..30910dae37ad 100644 } EXPORT_SYMBOL(d_tmpfile); diff --git a/fs/inode.c 
b/fs/inode.c -index ed0cab8a32db..900927eab51c 100644 +index 8279c700a2b7..f6aa9ec4382b 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -56,8 +56,23 @@ @@ -74612,7 +78062,7 @@ index ed0cab8a32db..900927eab51c 100644 /* * Empty aops. Can be used for the cases where the user does not -@@ -394,7 +409,7 @@ EXPORT_SYMBOL(address_space_init_once); +@@ -393,7 +408,7 @@ EXPORT_SYMBOL(address_space_init_once); void inode_init_once(struct inode *inode) { memset(inode, 0, sizeof(*inode)); @@ -74621,7 +78071,7 @@ index ed0cab8a32db..900927eab51c 100644 INIT_LIST_HEAD(&inode->i_devices); INIT_LIST_HEAD(&inode->i_io_list); INIT_LIST_HEAD(&inode->i_wb_list); -@@ -478,14 +493,15 @@ static inline void inode_sb_list_del(struct inode *inode) +@@ -477,14 +492,15 @@ static inline void inode_sb_list_del(struct inode *inode) } } @@ -74644,7 +78094,7 @@ index ed0cab8a32db..900927eab51c 100644 } /** -@@ -498,13 +514,13 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval) +@@ -497,13 +513,13 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval) */ void __insert_inode_hash(struct inode *inode, unsigned long hashval) { @@ -74662,7 +78112,7 @@ index ed0cab8a32db..900927eab51c 100644 } EXPORT_SYMBOL(__insert_inode_hash); -@@ -516,11 +532,44 @@ EXPORT_SYMBOL(__insert_inode_hash); +@@ -515,11 +531,44 @@ EXPORT_SYMBOL(__insert_inode_hash); */ void __remove_inode_hash(struct inode *inode) { @@ -74712,7 +78162,7 @@ index ed0cab8a32db..900927eab51c 100644 } EXPORT_SYMBOL(__remove_inode_hash); -@@ -817,26 +866,28 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) +@@ -816,26 +865,28 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) return freed; } @@ -74745,7 +78195,7 @@ index ed0cab8a32db..900927eab51c 100644 goto repeat; } if (unlikely(inode->i_state & I_CREATING)) { -@@ -855,19 +906,20 @@ static struct inode *find_inode(struct super_block *sb, +@@ -854,19 +905,20 @@ static struct inode *find_inode(struct 
super_block *sb, * iget_locked for details. */ static struct inode *find_inode_fast(struct super_block *sb, @@ -74769,7 +78219,7 @@ index ed0cab8a32db..900927eab51c 100644 goto repeat; } if (unlikely(inode->i_state & I_CREATING)) { -@@ -1076,26 +1128,26 @@ EXPORT_SYMBOL(unlock_two_nondirectories); +@@ -1075,26 +1127,26 @@ EXPORT_SYMBOL(unlock_two_nondirectories); * return it locked, hashed, and with the I_NEW flag set. The file system gets * to fill it in before unlocking it via unlock_new_inode(). * @@ -74802,7 +78252,7 @@ index ed0cab8a32db..900927eab51c 100644 if (IS_ERR(old)) return NULL; wait_on_inode(old); -@@ -1117,12 +1169,12 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, +@@ -1116,12 +1168,12 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, */ spin_lock(&inode->i_lock); inode->i_state |= I_NEW; @@ -74817,7 +78267,7 @@ index ed0cab8a32db..900927eab51c 100644 return inode; } -@@ -1183,12 +1235,12 @@ EXPORT_SYMBOL(iget5_locked); +@@ -1182,12 +1234,12 @@ EXPORT_SYMBOL(iget5_locked); */ struct inode *iget_locked(struct super_block *sb, unsigned long ino) { @@ -74834,7 +78284,7 @@ index ed0cab8a32db..900927eab51c 100644 if (inode) { if (IS_ERR(inode)) return NULL; -@@ -1204,17 +1256,17 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) +@@ -1203,17 +1255,17 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) if (inode) { struct inode *old; @@ -74856,7 +78306,7 @@ index ed0cab8a32db..900927eab51c 100644 /* Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents -@@ -1227,7 +1279,7 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) +@@ -1226,7 +1278,7 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) * us. Use the old inode instead of the one we just * allocated. 
*/ @@ -74865,7 +78315,7 @@ index ed0cab8a32db..900927eab51c 100644 destroy_inode(inode); if (IS_ERR(old)) return NULL; -@@ -1251,10 +1303,11 @@ EXPORT_SYMBOL(iget_locked); +@@ -1250,10 +1302,11 @@ EXPORT_SYMBOL(iget_locked); */ static int test_inode_iunique(struct super_block *sb, unsigned long ino) { @@ -74879,7 +78329,7 @@ index ed0cab8a32db..900927eab51c 100644 if (inode->i_ino == ino && inode->i_sb == sb) return 0; } -@@ -1338,12 +1391,12 @@ EXPORT_SYMBOL(igrab); +@@ -1337,12 +1390,12 @@ EXPORT_SYMBOL(igrab); struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { @@ -74896,7 +78346,7 @@ index ed0cab8a32db..900927eab51c 100644 return IS_ERR(inode) ? NULL : inode; } -@@ -1393,12 +1446,12 @@ EXPORT_SYMBOL(ilookup5); +@@ -1392,12 +1445,12 @@ EXPORT_SYMBOL(ilookup5); */ struct inode *ilookup(struct super_block *sb, unsigned long ino) { @@ -74913,7 +78363,7 @@ index ed0cab8a32db..900927eab51c 100644 if (inode) { if (IS_ERR(inode)) -@@ -1442,12 +1495,13 @@ struct inode *find_inode_nowait(struct super_block *sb, +@@ -1441,12 +1494,13 @@ struct inode *find_inode_nowait(struct super_block *sb, void *), void *data) { @@ -74930,7 +78380,7 @@ index ed0cab8a32db..900927eab51c 100644 if (inode->i_sb != sb) continue; mval = match(inode, hashval, data); -@@ -1458,7 +1512,7 @@ struct inode *find_inode_nowait(struct super_block *sb, +@@ -1457,7 +1511,7 @@ struct inode *find_inode_nowait(struct super_block *sb, goto out; } out: @@ -74939,7 +78389,7 @@ index ed0cab8a32db..900927eab51c 100644 return ret_inode; } EXPORT_SYMBOL(find_inode_nowait); -@@ -1487,13 +1541,14 @@ EXPORT_SYMBOL(find_inode_nowait); +@@ -1486,13 +1540,14 @@ EXPORT_SYMBOL(find_inode_nowait); struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { @@ -74956,7 +78406,7 @@ index ed0cab8a32db..900927eab51c 100644 if (inode->i_sb == sb && !(READ_ONCE(inode->i_state) & 
(I_FREEING | I_WILL_FREE)) && test(inode, data)) -@@ -1525,13 +1580,14 @@ EXPORT_SYMBOL(find_inode_rcu); +@@ -1524,13 +1579,14 @@ EXPORT_SYMBOL(find_inode_rcu); struct inode *find_inode_by_ino_rcu(struct super_block *sb, unsigned long ino) { @@ -74973,7 +78423,7 @@ index ed0cab8a32db..900927eab51c 100644 if (inode->i_ino == ino && inode->i_sb == sb && !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE))) -@@ -1545,39 +1601,42 @@ int insert_inode_locked(struct inode *inode) +@@ -1544,39 +1600,42 @@ int insert_inode_locked(struct inode *inode) { struct super_block *sb = inode->i_sb; ino_t ino = inode->i_ino; @@ -75090,10 +78540,10 @@ index 00952e92eae1..ae18dabd3fe7 100644 extern const char *bio_devname(struct bio *bio, char *buffer); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index 12b9dbcc980e..2f5c517209b1 100644 +index 67344dfe07a7..0447a4213315 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h -@@ -891,6 +891,7 @@ extern const char *blk_op_str(unsigned int op); +@@ -897,6 +897,7 @@ extern const char *blk_op_str(unsigned int op); int blk_status_to_errno(blk_status_t status); blk_status_t errno_to_blk_status(int errno); @@ -75213,10 +78663,10 @@ index c88cdc4ae4ec..36b4a83f9b77 100644 + #endif /* _LINUX_CLOSURE_H */ diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h -index e6ec63403965..4e9c50282193 100644 +index 3de06a8fae73..88142e91593a 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h -@@ -305,4 +305,9 @@ +@@ -315,4 +315,9 @@ */ #define __weak __attribute__((__weak__)) @@ -75256,10 +78706,10 @@ index 3260fe714846..bac82bd72626 100644 * 128 bit child FID (struct lu_fid) * 128 bit parent FID (struct lu_fid) diff --git a/include/linux/fs.h b/include/linux/fs.h -index e7a633353fd2..08f3c4ab0828 100644 +index fd4c450dc612..19c3c0d8c718 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h -@@ -675,7 +675,8 @@ struct inode { +@@ -676,7 +676,8 @@ 
struct inode { unsigned long dirtied_when; /* jiffies of first dirtying */ unsigned long dirtied_time_when; @@ -75269,7 +78719,7 @@ index e7a633353fd2..08f3c4ab0828 100644 struct list_head i_io_list; /* backing dev IO list */ #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *i_wb; /* the associated cgroup wb */ -@@ -741,7 +742,7 @@ static inline unsigned int i_blocksize(const struct inode *node) +@@ -742,7 +743,7 @@ static inline unsigned int i_blocksize(const struct inode *node) static inline int inode_unhashed(struct inode *inode) { @@ -75278,7 +78728,7 @@ index e7a633353fd2..08f3c4ab0828 100644 } /* -@@ -752,7 +753,7 @@ static inline int inode_unhashed(struct inode *inode) +@@ -753,7 +754,7 @@ static inline int inode_unhashed(struct inode *inode) */ static inline void inode_fake_hash(struct inode *inode) { @@ -75287,7 +78737,7 @@ index e7a633353fd2..08f3c4ab0828 100644 } /* -@@ -3187,7 +3188,7 @@ static inline void insert_inode_hash(struct inode *inode) +@@ -3129,7 +3130,7 @@ static inline void insert_inode_hash(struct inode *inode) extern void __remove_inode_hash(struct inode *); static inline void remove_inode_hash(struct inode *inode) { @@ -75347,10 +78797,10 @@ index ae1b541446c9..8ee2bf5af131 100644 { bit_spin_lock(0, (unsigned long *)b); diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h -index 9fe165beb0f9..1113f6ed0eb4 100644 +index aa0ecfc6cdb4..65b6d7da4345 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h -@@ -336,6 +336,8 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); +@@ -340,6 +340,8 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); #define lockdep_repin_lock(l,c) lock_repin_lock(&(l)->dep_map, (c)) #define lockdep_unpin_lock(l,c) lock_unpin_lock(&(l)->dep_map, (c)) @@ -75359,7 +78809,7 @@ index 9fe165beb0f9..1113f6ed0eb4 100644 #else /* !CONFIG_LOCKDEP */ static inline void lockdep_init_task(struct task_struct *task) -@@ -423,6 +425,8 @@ extern int 
lockdep_is_held(const void *); +@@ -427,6 +429,8 @@ extern int lockdep_is_held(const void *); #define lockdep_repin_lock(l, c) do { (void)(l); (void)(c); } while (0) #define lockdep_unpin_lock(l, c) do { (void)(l); (void)(c); } while (0) @@ -75369,7 +78819,7 @@ index 9fe165beb0f9..1113f6ed0eb4 100644 enum xhlock_context_t { diff --git a/include/linux/sched.h b/include/linux/sched.h -index c1a927ddec64..10a353e3313f 100644 +index e418935f8db6..6c7427c7d547 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -856,6 +856,7 @@ struct task_struct { @@ -75590,10 +79040,10 @@ index 000000000000..477c33eb00d7 + +#endif /* _LINUX_SIX_H */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h -index 671d402c3778..94417aac15c7 100644 +index 5535be1012a2..8ffb67b9e118 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h -@@ -141,6 +141,7 @@ extern void *vzalloc(unsigned long size); +@@ -148,6 +148,7 @@ extern void *vzalloc(unsigned long size); extern void *vmalloc_user(unsigned long size); extern void *vmalloc_node(unsigned long size, int node); extern void *vzalloc_node(unsigned long size, int node); @@ -75603,10 +79053,10 @@ index 671d402c3778..94417aac15c7 100644 extern void *__vmalloc(unsigned long size, gfp_t gfp_mask); diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h new file mode 100644 -index 000000000000..fce3146378f9 +index 000000000000..f63a7c87265d --- /dev/null +++ b/include/trace/events/bcachefs.h -@@ -0,0 +1,800 @@ +@@ -0,0 +1,1034 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bcachefs @@ -75791,6 +79241,40 @@ index 000000000000..fce3146378f9 + __entry->nr_flushed) +); + ++/* allocator: */ ++ ++TRACE_EVENT(do_discards, ++ TP_PROTO(struct bch_fs *c, u64 seen, u64 open, ++ u64 need_journal_commit, u64 discarded, int ret), ++ TP_ARGS(c, seen, open, need_journal_commit, discarded, ret), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __field(u64, seen ) ++ 
__field(u64, open ) ++ __field(u64, need_journal_commit ) ++ __field(u64, discarded ) ++ __field(int, ret ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = c->dev; ++ __entry->seen = seen; ++ __entry->open = open; ++ __entry->need_journal_commit = need_journal_commit; ++ __entry->discarded = discarded; ++ __entry->ret = ret; ++ ), ++ ++ TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu ret %i", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->seen, ++ __entry->open, ++ __entry->need_journal_commit, ++ __entry->discarded, ++ __entry->ret) ++); ++ +/* bset.c: */ + +DEFINE_EVENT(bpos, bkey_pack_pos_fail, @@ -75927,6 +79411,80 @@ index 000000000000..fce3146378f9 + TP_ARGS(c, b) +); + ++TRACE_EVENT(btree_cache_scan, ++ TP_PROTO(unsigned long nr_to_scan_pages, ++ unsigned long nr_to_scan_nodes, ++ unsigned long can_free_nodes, ++ long ret), ++ TP_ARGS(nr_to_scan_pages, nr_to_scan_nodes, can_free_nodes, ret), ++ ++ TP_STRUCT__entry( ++ __field(unsigned long, nr_to_scan_pages ) ++ __field(unsigned long, nr_to_scan_nodes ) ++ __field(unsigned long, can_free_nodes ) ++ __field(long, ret ) ++ ), ++ ++ TP_fast_assign( ++ __entry->nr_to_scan_pages = nr_to_scan_pages; ++ __entry->nr_to_scan_nodes = nr_to_scan_nodes; ++ __entry->can_free_nodes = can_free_nodes; ++ __entry->ret = ret; ++ ), ++ ++ TP_printk("scanned for %lu pages, %lu nodes, can free %lu nodes, ret %li", ++ __entry->nr_to_scan_pages, ++ __entry->nr_to_scan_nodes, ++ __entry->can_free_nodes, ++ __entry->ret) ++); ++ ++TRACE_EVENT(btree_node_relock_fail, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos, ++ unsigned long node, ++ u32 iter_lock_seq, ++ u32 node_lock_seq), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos, node, iter_lock_seq, node_lock_seq), ++ ++ TP_STRUCT__entry( ++ __array(char, trans_fn, 24 ) ++ __field(unsigned long, caller_ip ) ++ __field(u8, btree_id ) ++ __field(u64, pos_inode ) ++ __field(u64, pos_offset ) 
++ __field(u32, pos_snapshot ) ++ __field(unsigned long, node ) ++ __field(u32, iter_lock_seq ) ++ __field(u32, node_lock_seq ) ++ ), ++ ++ TP_fast_assign( ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); ++ __entry->caller_ip = caller_ip; ++ __entry->btree_id = btree_id; ++ __entry->pos_inode = pos->inode; ++ __entry->pos_offset = pos->offset; ++ __entry->pos_snapshot = pos->snapshot; ++ __entry->node = node; ++ __entry->iter_lock_seq = iter_lock_seq; ++ __entry->node_lock_seq = node_lock_seq; ++ ), ++ ++ TP_printk("%s %pS btree %u pos %llu:%llu:%u, node %lu iter seq %u lock seq %u", ++ __entry->trans_fn, ++ (void *) __entry->caller_ip, ++ __entry->btree_id, ++ __entry->pos_inode, ++ __entry->pos_offset, ++ __entry->pos_snapshot, ++ __entry->node, ++ __entry->iter_lock_seq, ++ __entry->node_lock_seq) ++); ++ +/* Garbage collection */ + +DEFINE_EVENT(btree_node, btree_gc_rewrite_node, @@ -75968,7 +79526,7 @@ index 000000000000..fce3146378f9 + ), + + TP_fast_assign( -+ __entry->dev = ca->disk_sb.bdev->bd_dev; ++ __entry->dev = ca->dev; + __entry->found = found; + __entry->inc_gen = inc_gen; + __entry->inc_gen_skipped = inc_gen_skipped; @@ -75990,7 +79548,7 @@ index 000000000000..fce3146378f9 + ), + + TP_fast_assign( -+ __entry->dev = ca->disk_sb.bdev->bd_dev; ++ __entry->dev = ca->dev; + __entry->offset = offset, + __entry->sectors = sectors; + ), @@ -76003,37 +79561,79 @@ index 000000000000..fce3146378f9 +); + +DECLARE_EVENT_CLASS(bucket_alloc, -+ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), -+ TP_ARGS(ca, reserve), ++ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve), ++ TP_ARGS(ca, alloc_reserve), + + TP_STRUCT__entry( + __field(dev_t, dev ) -+ __field(enum alloc_reserve, reserve ) ++ __array(char, reserve, 16 ) + ), + + TP_fast_assign( -+ __entry->dev = ca->disk_sb.bdev->bd_dev; -+ __entry->reserve = reserve; ++ __entry->dev = ca->dev; ++ strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); + ), + -+ 
TP_printk("%d,%d reserve %d", ++ TP_printk("%d,%d reserve %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->reserve) +); + +DEFINE_EVENT(bucket_alloc, bucket_alloc, -+ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), -+ TP_ARGS(ca, reserve) ++ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve), ++ TP_ARGS(ca, alloc_reserve) +); + -+DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, -+ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), -+ TP_ARGS(ca, reserve) ++TRACE_EVENT(bucket_alloc_fail, ++ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, ++ u64 avail, ++ u64 seen, ++ u64 open, ++ u64 need_journal_commit, ++ u64 nouse, ++ bool nonblocking, ++ int ret), ++ TP_ARGS(ca, alloc_reserve, avail, seen, open, need_journal_commit, nouse, nonblocking, ret), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __array(char, reserve, 16 ) ++ __field(u64, avail ) ++ __field(u64, seen ) ++ __field(u64, open ) ++ __field(u64, need_journal_commit ) ++ __field(u64, nouse ) ++ __field(bool, nonblocking ) ++ __field(int, ret ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = ca->dev; ++ strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); ++ __entry->avail = avail; ++ __entry->seen = seen; ++ __entry->open = open; ++ __entry->need_journal_commit = need_journal_commit; ++ __entry->nouse = nouse; ++ __entry->nonblocking = nonblocking; ++ __entry->ret = ret; ++ ), ++ ++ TP_printk("%d,%d reserve %s avail %llu seen %llu open %llu need_journal_commit %llu nouse %llu nonblocking %u ret %i", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->reserve, ++ __entry->avail, ++ __entry->seen, ++ __entry->open, ++ __entry->need_journal_commit, ++ __entry->nouse, ++ __entry->nonblocking, ++ __entry->ret) +); + +DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail, -+ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), -+ TP_ARGS(ca, reserve) ++ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve), ++ TP_ARGS(ca, alloc_reserve) +); + +/* Moving 
IO */ @@ -76127,94 +79727,87 @@ index 000000000000..fce3146378f9 + __entry->wait_amount, __entry->until) +); + -+TRACE_EVENT(transaction_restart_ip, -+ TP_PROTO(unsigned long caller, unsigned long ip), -+ TP_ARGS(caller, ip), -+ -+ TP_STRUCT__entry( -+ __field(unsigned long, caller ) -+ __field(unsigned long, ip ) -+ ), -+ -+ TP_fast_assign( -+ __entry->caller = caller; -+ __entry->ip = ip; -+ ), -+ -+ TP_printk("%ps %pS", (void *) __entry->caller, (void *) __entry->ip) -+); -+ +DECLARE_EVENT_CLASS(transaction_restart, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip), -+ TP_ARGS(trans_ip, caller_ip), ++ TP_ARGS(trans_fn, caller_ip), + + TP_STRUCT__entry( -+ __field(unsigned long, trans_ip ) ++ __array(char, trans_fn, 24 ) + __field(unsigned long, caller_ip ) + ), + + TP_fast_assign( -+ __entry->trans_ip = trans_ip; ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + ), + -+ TP_printk("%ps %pS", -+ (void *) __entry->trans_ip, -+ (void *) __entry->caller_ip) ++ TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip) ++); ++ ++DEFINE_EVENT(transaction_restart, transaction_restart_ip, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) +); + +DEFINE_EVENT(transaction_restart, trans_blocked_journal_reclaim, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip), -+ TP_ARGS(trans_ip, caller_ip) ++ TP_ARGS(trans_fn, caller_ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip), -+ TP_ARGS(trans_ip, caller_ip) ++ TP_ARGS(trans_fn, caller_ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip), -+ TP_ARGS(trans_ip, caller_ip) ++ TP_ARGS(trans_fn, 
caller_ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_journal_reclaim, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip), -+ TP_ARGS(trans_ip, caller_ip) ++ TP_ARGS(trans_fn, caller_ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_fault_inject, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip), -+ TP_ARGS(trans_ip, caller_ip) ++ TP_ARGS(trans_fn, caller_ip) +); + +DEFINE_EVENT(transaction_restart, trans_traverse_all, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip), -+ TP_ARGS(trans_ip, caller_ip) ++ TP_ARGS(trans_fn, caller_ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip), -+ TP_ARGS(trans_ip, caller_ip) ++ TP_ARGS(trans_fn, caller_ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_key_cache_raced, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip), ++ TP_ARGS(trans_fn, caller_ip) +); + +DECLARE_EVENT_CLASS(transaction_restart_iter, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), -+ TP_ARGS(trans_ip, caller_ip, btree_id, pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos), + + TP_STRUCT__entry( -+ __field(unsigned long, trans_ip ) ++ __array(char, trans_fn, 24 ) + __field(unsigned long, caller_ip ) + __field(u8, btree_id ) + __field(u64, pos_inode ) @@ -76223,7 +79816,7 @@ index 000000000000..fce3146378f9 + ), + + TP_fast_assign( -+ __entry->trans_ip = trans_ip; ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = btree_id; + __entry->pos_inode = pos->inode; @@ -76231,8 +79824,8 @@ index 000000000000..fce3146378f9 + __entry->pos_snapshot = pos->snapshot; + ), + -+ TP_printk("%ps %pS btree %u pos 
%llu:%llu:%u", -+ (void *) __entry->trans_ip, ++ TP_printk("%s %pS btree %u pos %llu:%llu:%u", ++ __entry->trans_fn, + (void *) __entry->caller_ip, + __entry->btree_id, + __entry->pos_inode, @@ -76241,63 +79834,111 @@ index 000000000000..fce3146378f9 +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_reused, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), -+ TP_ARGS(trans_ip, caller_ip, btree_id, pos) ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), -+ TP_ARGS(trans_ip, caller_ip, btree_id, pos) -+); -+ -+DEFINE_EVENT(transaction_restart_iter, trans_restart_mark, -+ TP_PROTO(unsigned long trans_ip, -+ unsigned long caller_ip, -+ enum btree_id btree_id, -+ struct bpos *pos), -+ TP_ARGS(trans_ip, caller_ip, btree_id, pos) ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_upgrade, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), -+ TP_ARGS(trans_ip, caller_ip, btree_id, pos) ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_iter_upgrade, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), -+ TP_ARGS(trans_ip, caller_ip, btree_id, pos) ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_relock, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), -+ TP_ARGS(trans_ip, caller_ip, btree_id, pos) ++ 
TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), -+ TP_ARGS(trans_ip, caller_ip, btree_id, pos) ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos) ++); ++ ++DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos), ++ 
TP_ARGS(trans_fn, caller_ip, btree_id, pos) +); + +TRACE_EVENT(trans_restart_would_deadlock, -+ TP_PROTO(unsigned long trans_ip, ++ TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + bool in_traverse_all, + unsigned reason, @@ -76307,12 +79948,12 @@ index 000000000000..fce3146378f9 + enum btree_id want_btree_id, + unsigned want_iter_type, + struct bpos *want_pos), -+ TP_ARGS(trans_ip, caller_ip, in_traverse_all, reason, ++ TP_ARGS(trans_fn, caller_ip, in_traverse_all, reason, + have_btree_id, have_iter_type, have_pos, + want_btree_id, want_iter_type, want_pos), + + TP_STRUCT__entry( -+ __field(unsigned long, trans_ip ) ++ __array(char, trans_fn, 24 ) + __field(unsigned long, caller_ip ) + __field(u8, in_traverse_all ) + __field(u8, reason ) @@ -76330,7 +79971,7 @@ index 000000000000..fce3146378f9 + ), + + TP_fast_assign( -+ __entry->trans_ip = trans_ip; ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->in_traverse_all = in_traverse_all; + __entry->reason = reason; @@ -76348,8 +79989,8 @@ index 000000000000..fce3146378f9 + __entry->want_pos_snapshot = want_pos->snapshot; + ), + -+ TP_printk("%ps %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u", -+ (void *) __entry->trans_ip, ++ TP_printk("%s %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u", ++ __entry->trans_fn, + (void *) __entry->caller_ip, + __entry->in_traverse_all, + __entry->reason, @@ -76366,43 +80007,86 @@ index 000000000000..fce3146378f9 +); + +TRACE_EVENT(trans_restart_would_deadlock_write, -+ TP_PROTO(unsigned long trans_ip), -+ TP_ARGS(trans_ip), ++ TP_PROTO(const char *trans_fn), ++ TP_ARGS(trans_fn), + + TP_STRUCT__entry( -+ __field(unsigned long, trans_ip ) ++ __array(char, trans_fn, 24 ) + ), + + TP_fast_assign( -+ __entry->trans_ip = trans_ip; ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + ), + -+ TP_printk("%ps", (void *) __entry->trans_ip) ++ 
TP_printk("%s", __entry->trans_fn) +); + +TRACE_EVENT(trans_restart_mem_realloced, -+ TP_PROTO(unsigned long trans_ip, unsigned long caller_ip, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, + unsigned long bytes), -+ TP_ARGS(trans_ip, caller_ip, bytes), ++ TP_ARGS(trans_fn, caller_ip, bytes), + + TP_STRUCT__entry( -+ __field(unsigned long, trans_ip ) ++ __array(char, trans_fn, 24 ) + __field(unsigned long, caller_ip ) + __field(unsigned long, bytes ) + ), + + TP_fast_assign( -+ __entry->trans_ip = trans_ip; ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->bytes = bytes; + ), + -+ TP_printk("%ps %pS bytes %lu", -+ (void *) __entry->trans_ip, ++ TP_printk("%s %pS bytes %lu", ++ __entry->trans_fn, + (void *) __entry->caller_ip, + __entry->bytes) +); + ++TRACE_EVENT(trans_restart_key_cache_key_realloced, ++ TP_PROTO(const char *trans_fn, ++ unsigned long caller_ip, ++ enum btree_id btree_id, ++ struct bpos *pos, ++ unsigned old_u64s, ++ unsigned new_u64s), ++ TP_ARGS(trans_fn, caller_ip, btree_id, pos, old_u64s, new_u64s), ++ ++ TP_STRUCT__entry( ++ __array(char, trans_fn, 24 ) ++ __field(unsigned long, caller_ip ) ++ __field(enum btree_id, btree_id ) ++ __field(u64, inode ) ++ __field(u64, offset ) ++ __field(u32, snapshot ) ++ __field(u32, old_u64s ) ++ __field(u32, new_u64s ) ++ ), ++ ++ TP_fast_assign( ++ strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); ++ __entry->caller_ip = caller_ip; ++ __entry->btree_id = btree_id; ++ __entry->inode = pos->inode; ++ __entry->offset = pos->offset; ++ __entry->snapshot = pos->snapshot; ++ __entry->old_u64s = old_u64s; ++ __entry->new_u64s = new_u64s; ++ ), ++ ++ TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u", ++ __entry->trans_fn, ++ (void *) __entry->caller_ip, ++ bch2_btree_ids[__entry->btree_id], ++ __entry->inode, ++ __entry->offset, ++ __entry->snapshot, ++ __entry->old_u64s, ++ __entry->new_u64s) ++); ++ 
+#endif /* _TRACE_BCACHE_H */ + +/* This part must be outside protection */ @@ -76440,10 +80124,10 @@ index d51cabf28f38..cadbf6520c4b 100644 obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o +obj-$(CONFIG_SIXLOCKS) += six.o diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c -index bf1c00c881e4..22e7c15ebab4 100644 +index e6a282bc1665..641d3d50780f 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c -@@ -6438,6 +6438,26 @@ void debug_check_no_locks_held(void) +@@ -6443,6 +6443,26 @@ void debug_check_no_locks_held(void) } EXPORT_SYMBOL_GPL(debug_check_no_locks_held); @@ -77236,7 +80920,7 @@ index 000000000000..fca1208720b6 +} +EXPORT_SYMBOL_GPL(six_lock_pcpu_alloc); diff --git a/kernel/module.c b/kernel/module.c -index 5c26a76e800b..155f9c1536b0 100644 +index ef79f4dbda87..17af02711eda 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2835,9 +2835,7 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) @@ -77251,10 +80935,10 @@ index 5c26a76e800b..155f9c1536b0 100644 bool __weak module_init_section(const char *name) diff --git a/lib/Kconfig b/lib/Kconfig -index 5e7165e6a346..32786f287f46 100644 +index baa977e003b7..aa1c7f286bad 100644 --- a/lib/Kconfig +++ b/lib/Kconfig -@@ -481,6 +481,9 @@ config ASSOCIATIVE_ARRAY +@@ -485,6 +485,9 @@ config ASSOCIATIVE_ARRAY for more information. 
@@ -77265,10 +80949,10 @@ index 5e7165e6a346..32786f287f46 100644 bool depends on !NO_IOMEM diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug -index 2a9b6dcdac4f..1c2f33b8a526 100644 +index 1699b2124558..5ce59387de9e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug -@@ -1666,6 +1666,15 @@ config DEBUG_CREDENTIALS +@@ -1672,6 +1672,15 @@ config DEBUG_CREDENTIALS source "kernel/rcu/Kconfig.debug" @@ -77285,7 +80969,7 @@ index 2a9b6dcdac4f..1c2f33b8a526 100644 bool "Force round-robin CPU selection for unbound work items" depends on DEBUG_KERNEL diff --git a/lib/Makefile b/lib/Makefile -index a841be5244ac..9759073b2c27 100644 +index 0868cb67e5b0..f8b06256f03a 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -239,6 +239,8 @@ obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o @@ -77451,10 +81135,10 @@ index f25eb111c051..7dfa88282b00 100644 objs_per_page; if (i == GENRADIX_ARY) diff --git a/mm/filemap.c b/mm/filemap.c -index dae481293b5d..0dc615e48c8c 100644 +index dbc461703ff4..75a38c04c94f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c -@@ -2186,6 +2186,7 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, +@@ -2187,6 +2187,7 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, return ret; } @@ -77492,10 +81176,10 @@ index 02d2427b8f9e..2f4894f9b7e3 100644 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) * @size: allocation size diff --git a/mm/vmalloc.c b/mm/vmalloc.c -index e8a807c78110..5fb4311fdc07 100644 +index 8375eecc55de..9f7ac3851caa 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c -@@ -3200,6 +3200,27 @@ void *vzalloc_node(unsigned long size, int node) +@@ -3201,6 +3201,27 @@ void *vzalloc_node(unsigned long size, int node) } EXPORT_SYMBOL(vzalloc_node); @@ -77523,3 +81207,6 @@ index e8a807c78110..5fb4311fdc07 100644 #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) +-- 
+2.38.1.385.g3b08839926 + diff --git a/linux-tkg-patches/6.0/0008-6.0-bcachefs.patch b/linux-tkg-patches/6.0/0008-6.0-bcachefs.patch index e3d463e..1a153c2 100644 --- a/linux-tkg-patches/6.0/0008-6.0-bcachefs.patch +++ b/linux-tkg-patches/6.0/0008-6.0-bcachefs.patch @@ -1,12 +1,13 @@ -From 3affa82eb5db6a292711c1d9febffa8ff7332f55 Mon Sep 17 00:00:00 2001 +From 3522899e342d546a6da80a5a6a621e9bd69e8536 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 7 Oct 2022 19:45:17 +0200 +Date: Sun, 6 Nov 2022 10:46:22 +0100 Subject: [PATCH] bcachefs Signed-off-by: Peter Jung --- .github/ISSUE_TEMPLATE/bug_report.md | 61 + Documentation/core-api/printk-formats.rst | 22 + + MAINTAINERS | 9 + arch/powerpc/kernel/process.c | 16 +- arch/powerpc/kernel/security.c | 75 +- arch/powerpc/platforms/pseries/papr_scm.c | 34 +- @@ -26,48 +27,50 @@ Signed-off-by: Peter Jung drivers/pci/p2pdma.c | 21 +- fs/Kconfig | 1 + fs/Makefile | 1 + - fs/bcachefs/Kconfig | 59 + + fs/bcachefs/Kconfig | 60 + fs/bcachefs/Makefile | 70 + fs/bcachefs/acl.c | 406 ++ fs/bcachefs/acl.h | 58 + - fs/bcachefs/alloc_background.c | 1551 ++++++++ + fs/bcachefs/alloc_background.c | 1585 ++++++++ fs/bcachefs/alloc_background.h | 183 + - fs/bcachefs/alloc_foreground.c | 1383 +++++++ - fs/bcachefs/alloc_foreground.h | 181 + - fs/bcachefs/alloc_types.h | 87 + - fs/bcachefs/backpointers.c | 898 +++++ - fs/bcachefs/backpointers.h | 38 + - fs/bcachefs/bcachefs.h | 1001 +++++ - fs/bcachefs/bcachefs_format.h | 2122 ++++++++++ + fs/bcachefs/alloc_foreground.c | 1398 +++++++ + fs/bcachefs/alloc_foreground.h | 174 + + fs/bcachefs/alloc_types.h | 92 + + fs/bcachefs/backpointers.c | 1103 +++++ + fs/bcachefs/backpointers.h | 58 + + fs/bcachefs/bbpos.h | 48 + + fs/bcachefs/bcachefs.h | 1019 +++++ + fs/bcachefs/bcachefs_format.h | 2172 ++++++++++ fs/bcachefs/bcachefs_ioctl.h | 368 ++ - fs/bcachefs/bkey.c | 1203 ++++++ - fs/bcachefs/bkey.h | 571 +++ - fs/bcachefs/bkey_buf.h | 60 + - fs/bcachefs/bkey_methods.c | 503 +++ + 
fs/bcachefs/bkey.c | 1098 +++++ + fs/bcachefs/bkey.h | 666 +++ + fs/bcachefs/bkey_buf.h | 61 + + fs/bcachefs/bkey_cmp.h | 129 + + fs/bcachefs/bkey_methods.c | 505 +++ fs/bcachefs/bkey_methods.h | 175 + - fs/bcachefs/bkey_sort.c | 198 + + fs/bcachefs/bkey_sort.c | 199 + fs/bcachefs/bkey_sort.h | 44 + - fs/bcachefs/bset.c | 1598 ++++++++ - fs/bcachefs/bset.h | 615 +++ - fs/bcachefs/btree_cache.c | 1149 ++++++ - fs/bcachefs/btree_cache.h | 105 + + fs/bcachefs/bset.c | 1601 ++++++++ + fs/bcachefs/bset.h | 521 +++ + fs/bcachefs/btree_cache.c | 1204 ++++++ + fs/bcachefs/btree_cache.h | 106 + fs/bcachefs/btree_gc.c | 2106 ++++++++++ fs/bcachefs/btree_gc.h | 112 + - fs/bcachefs/btree_io.c | 2154 +++++++++++ - fs/bcachefs/btree_io.h | 222 ++ - fs/bcachefs/btree_iter.c | 3043 +++++++++++++++ - fs/bcachefs/btree_iter.h | 564 +++ - fs/bcachefs/btree_key_cache.c | 983 +++++ - fs/bcachefs/btree_key_cache.h | 47 + - fs/bcachefs/btree_locking.c | 676 ++++ - fs/bcachefs/btree_locking.h | 418 ++ - fs/bcachefs/btree_types.h | 696 ++++ + fs/bcachefs/btree_io.c | 2203 ++++++++++ + fs/bcachefs/btree_io.h | 228 ++ + fs/bcachefs/btree_iter.c | 3121 ++++++++++++++ + fs/bcachefs/btree_iter.h | 599 +++ + fs/bcachefs/btree_key_cache.c | 1034 +++++ + fs/bcachefs/btree_key_cache.h | 48 + + fs/bcachefs/btree_locking.c | 679 ++++ + fs/bcachefs/btree_locking.h | 419 ++ + fs/bcachefs/btree_types.h | 708 ++++ fs/bcachefs/btree_update.h | 158 + - fs/bcachefs/btree_update_interior.c | 2352 ++++++++++++ - fs/bcachefs/btree_update_interior.h | 322 ++ - fs/bcachefs/btree_update_leaf.c | 1745 +++++++++ - fs/bcachefs/buckets.c | 2113 ++++++++++ - fs/bcachefs/buckets.h | 300 ++ + fs/bcachefs/btree_update_interior.c | 2437 +++++++++++ + fs/bcachefs/btree_update_interior.h | 324 ++ + fs/bcachefs/btree_update_leaf.c | 1760 ++++++++ + fs/bcachefs/buckets.c | 2117 ++++++++++ + fs/bcachefs/buckets.h | 326 ++ fs/bcachefs/buckets_types.h | 103 + fs/bcachefs/buckets_waiting_for_journal.c | 167 + 
fs/bcachefs/buckets_waiting_for_journal.h | 15 + @@ -75,7 +78,7 @@ Signed-off-by: Peter Jung fs/bcachefs/chardev.c | 760 ++++ fs/bcachefs/chardev.h | 31 + fs/bcachefs/checksum.c | 712 ++++ - fs/bcachefs/checksum.h | 204 + + fs/bcachefs/checksum.h | 212 + fs/bcachefs/clock.c | 191 + fs/bcachefs/clock.h | 38 + fs/bcachefs/clock_types.h | 37 + @@ -84,103 +87,103 @@ Signed-off-by: Peter Jung fs/bcachefs/counters.c | 107 + fs/bcachefs/counters.h | 17 + fs/bcachefs/darray.h | 77 + - fs/bcachefs/data_update.c | 373 ++ - fs/bcachefs/data_update.h | 38 + - fs/bcachefs/debug.c | 831 ++++ + fs/bcachefs/data_update.c | 387 ++ + fs/bcachefs/data_update.h | 41 + + fs/bcachefs/debug.c | 811 ++++ fs/bcachefs/debug.h | 30 + fs/bcachefs/dirent.c | 565 +++ fs/bcachefs/dirent.h | 67 + fs/bcachefs/disk_groups.c | 505 +++ fs/bcachefs/disk_groups.h | 91 + - fs/bcachefs/ec.c | 1673 ++++++++ - fs/bcachefs/ec.h | 230 ++ + fs/bcachefs/ec.c | 1680 ++++++++ + fs/bcachefs/ec.h | 224 ++ fs/bcachefs/ec_types.h | 46 + - fs/bcachefs/errcode.c | 62 + - fs/bcachefs/errcode.h | 96 + - fs/bcachefs/error.c | 218 ++ - fs/bcachefs/error.h | 222 ++ + fs/bcachefs/errcode.c | 63 + + fs/bcachefs/errcode.h | 97 + + fs/bcachefs/error.c | 221 + + fs/bcachefs/error.h | 222 + fs/bcachefs/extent_update.c | 178 + fs/bcachefs/extent_update.h | 12 + - fs/bcachefs/extents.c | 1324 +++++++ - fs/bcachefs/extents.h | 685 ++++ + fs/bcachefs/extents.c | 1324 ++++++ + fs/bcachefs/extents.h | 689 ++++ fs/bcachefs/extents_types.h | 40 + fs/bcachefs/eytzinger.h | 281 ++ fs/bcachefs/fifo.h | 127 + - fs/bcachefs/fs-common.c | 496 +++ + fs/bcachefs/fs-common.c | 501 +++ fs/bcachefs/fs-common.h | 43 + - fs/bcachefs/fs-io.c | 3421 +++++++++++++++++ + fs/bcachefs/fs-io.c | 3577 +++++++++++++++++ fs/bcachefs/fs-io.h | 54 + - fs/bcachefs/fs-ioctl.c | 539 +++ + fs/bcachefs/fs-ioctl.c | 555 +++ fs/bcachefs/fs-ioctl.h | 81 + - fs/bcachefs/fs.c | 1942 ++++++++++ + fs/bcachefs/fs.c | 1941 +++++++++ fs/bcachefs/fs.h | 208 + - 
fs/bcachefs/fsck.c | 2395 ++++++++++++ + fs/bcachefs/fsck.c | 2395 +++++++++++ fs/bcachefs/fsck.h | 8 + - fs/bcachefs/inode.c | 771 ++++ - fs/bcachefs/inode.h | 189 + - fs/bcachefs/io.c | 2436 ++++++++++++ - fs/bcachefs/io.h | 190 + - fs/bcachefs/io_types.h | 161 + + fs/bcachefs/inode.c | 892 ++++ + fs/bcachefs/inode.h | 202 + + fs/bcachefs/io.c | 2469 ++++++++++++ + fs/bcachefs/io.h | 183 + + fs/bcachefs/io_types.h | 156 + fs/bcachefs/journal.c | 1436 +++++++ - fs/bcachefs/journal.h | 521 +++ - fs/bcachefs/journal_io.c | 1759 +++++++++ + fs/bcachefs/journal.h | 540 +++ + fs/bcachefs/journal_io.c | 1807 +++++++++ fs/bcachefs/journal_io.h | 59 + fs/bcachefs/journal_reclaim.c | 853 ++++ fs/bcachefs/journal_reclaim.h | 86 + - fs/bcachefs/journal_sb.c | 220 ++ + fs/bcachefs/journal_sb.c | 220 + fs/bcachefs/journal_sb.h | 24 + fs/bcachefs/journal_seq_blacklist.c | 322 ++ fs/bcachefs/journal_seq_blacklist.h | 22 + fs/bcachefs/journal_types.h | 340 ++ - fs/bcachefs/keylist.c | 67 + - fs/bcachefs/keylist.h | 76 + + fs/bcachefs/keylist.c | 68 + + fs/bcachefs/keylist.h | 75 + fs/bcachefs/keylist_types.h | 16 + fs/bcachefs/lru.c | 206 + fs/bcachefs/lru.h | 19 + fs/bcachefs/migrate.c | 186 + fs/bcachefs/migrate.h | 7 + - fs/bcachefs/move.c | 954 +++++ + fs/bcachefs/move.c | 1011 +++++ fs/bcachefs/move.h | 67 + fs/bcachefs/move_types.h | 19 + fs/bcachefs/movinggc.c | 285 ++ fs/bcachefs/movinggc.h | 10 + fs/bcachefs/opts.c | 578 +++ fs/bcachefs/opts.h | 509 +++ - fs/bcachefs/quota.c | 823 ++++ + fs/bcachefs/quota.c | 978 +++++ fs/bcachefs/quota.h | 71 + fs/bcachefs/quota_types.h | 43 + fs/bcachefs/rebalance.c | 362 ++ fs/bcachefs/rebalance.h | 28 + fs/bcachefs/rebalance_types.h | 26 + - fs/bcachefs/recovery.c | 1587 ++++++++ + fs/bcachefs/recovery.c | 1606 ++++++++ fs/bcachefs/recovery.h | 58 + fs/bcachefs/reflink.c | 422 ++ fs/bcachefs/reflink.h | 76 + - fs/bcachefs/replicas.c | 1071 ++++++ - fs/bcachefs/replicas.h | 106 + - fs/bcachefs/replicas_types.h | 10 + + 
fs/bcachefs/replicas.c | 1071 +++++ + fs/bcachefs/replicas.h | 107 + + fs/bcachefs/replicas_types.h | 11 + fs/bcachefs/siphash.c | 173 + fs/bcachefs/siphash.h | 87 + fs/bcachefs/str_hash.h | 370 ++ - fs/bcachefs/subvolume.c | 1110 ++++++ + fs/bcachefs/subvolume.c | 1111 +++++ fs/bcachefs/subvolume.h | 137 + fs/bcachefs/subvolume_types.h | 9 + - fs/bcachefs/super-io.c | 1603 ++++++++ + fs/bcachefs/super-io.c | 1601 ++++++++ fs/bcachefs/super-io.h | 126 + - fs/bcachefs/super.c | 1964 ++++++++++ + fs/bcachefs/super.c | 1961 +++++++++ fs/bcachefs/super.h | 264 ++ fs/bcachefs/super_types.h | 51 + - fs/bcachefs/sysfs.c | 954 +++++ + fs/bcachefs/sysfs.c | 963 +++++ fs/bcachefs/sysfs.h | 48 + - fs/bcachefs/tests.c | 976 +++++ + fs/bcachefs/tests.c | 973 +++++ fs/bcachefs/tests.h | 15 + fs/bcachefs/trace.c | 14 + - fs/bcachefs/util.c | 993 +++++ - fs/bcachefs/util.h | 787 ++++ + fs/bcachefs/util.c | 1104 +++++ + fs/bcachefs/util.h | 793 ++++ fs/bcachefs/varint.c | 121 + fs/bcachefs/varint.h | 11 + fs/bcachefs/vstructs.h | 63 + - fs/bcachefs/xattr.c | 650 ++++ + fs/bcachefs/xattr.c | 654 +++ fs/bcachefs/xattr.h | 50 + fs/d_path.c | 35 + fs/dcache.c | 10 +- @@ -197,23 +200,24 @@ Signed-off-by: Peter Jung include/linux/list_bl.h | 22 + include/linux/lockdep.h | 10 + include/linux/lockdep_types.h | 2 +- + include/linux/mean_and_variance.h | 170 + include/linux/pretty-printers.h | 10 + include/linux/printbuf.h | 306 ++ include/linux/sched.h | 1 + include/linux/seq_buf.h | 162 - include/linux/shrinker.h | 9 +- - include/linux/six.h | 222 ++ + include/linux/six.h | 222 + include/linux/string.h | 5 + include/linux/string_helpers.h | 8 +- include/linux/trace_events.h | 2 +- include/linux/trace_seq.h | 17 +- include/linux/vmalloc.h | 1 + - include/trace/events/bcachefs.h | 1101 ++++++ + include/trace/events/bcachefs.h | 1105 +++++ init/init_task.c | 1 + kernel/Kconfig.locks | 3 + kernel/locking/Makefile | 1 + kernel/locking/lockdep.c | 45 + - kernel/locking/six.c | 748 ++++ + 
kernel/locking/six.c | 757 ++++ kernel/module/main.c | 4 +- kernel/stacktrace.c | 2 + kernel/trace/trace.c | 45 +- @@ -224,20 +228,25 @@ Signed-off-by: Peter Jung kernel/trace/trace_kprobe.c | 2 +- kernel/trace/trace_seq.c | 111 +- lib/Kconfig | 3 + - lib/Kconfig.debug | 9 + + lib/Kconfig.debug | 18 + lib/Makefile | 8 +- {drivers/md/bcache => lib}/closure.c | 35 +- lib/errname.c | 1 + lib/generic-radix-tree.c | 76 +- lib/hexdump.c | 246 +- + lib/math/Kconfig | 3 + + lib/math/Makefile | 2 + + lib/math/mean_and_variance.c | 178 + + lib/math/mean_and_variance_test.c | 152 + lib/pretty-printers.c | 60 + lib/printbuf.c | 368 ++ lib/seq_buf.c | 397 -- lib/string_helpers.c | 224 +- lib/test_hexdump.c | 30 +- lib/test_printf.c | 33 +- - lib/vsprintf.c | 1740 ++++----- + lib/vsprintf.c | 1741 ++++---- mm/Makefile | 2 +- + mm/filemap.c | 1 + mm/memcontrol.c | 54 +- mm/nommu.c | 18 + mm/oom_kill.c | 23 - @@ -247,7 +256,7 @@ Signed-off-by: Peter Jung mm/vmalloc.c | 21 + mm/vmscan.c | 105 +- tools/testing/nvdimm/test/ndtest.c | 22 +- - 242 files changed, 85340 insertions(+), 2187 deletions(-) + 251 files changed, 87492 insertions(+), 2187 deletions(-) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 fs/bcachefs/Kconfig create mode 100644 fs/bcachefs/Makefile @@ -260,12 +269,14 @@ Signed-off-by: Peter Jung create mode 100644 fs/bcachefs/alloc_types.h create mode 100644 fs/bcachefs/backpointers.c create mode 100644 fs/bcachefs/backpointers.h + create mode 100644 fs/bcachefs/bbpos.h create mode 100644 fs/bcachefs/bcachefs.h create mode 100644 fs/bcachefs/bcachefs_format.h create mode 100644 fs/bcachefs/bcachefs_ioctl.h create mode 100644 fs/bcachefs/bkey.c create mode 100644 fs/bcachefs/bkey.h create mode 100644 fs/bcachefs/bkey_buf.h + create mode 100644 fs/bcachefs/bkey_cmp.h create mode 100644 fs/bcachefs/bkey_methods.c create mode 100644 fs/bcachefs/bkey_methods.h create mode 100644 fs/bcachefs/bkey_sort.c @@ -406,6 +417,7 @@ Signed-off-by: Peter Jung 
create mode 100644 fs/bcachefs/xattr.c create mode 100644 fs/bcachefs/xattr.h rename {drivers/md/bcache => include/linux}/closure.h (94%) + create mode 100644 include/linux/mean_and_variance.h create mode 100644 include/linux/pretty-printers.h create mode 100644 include/linux/printbuf.h delete mode 100644 include/linux/seq_buf.h @@ -413,6 +425,8 @@ Signed-off-by: Peter Jung create mode 100644 include/trace/events/bcachefs.h create mode 100644 kernel/locking/six.c rename {drivers/md/bcache => lib}/closure.c (88%) + create mode 100644 lib/math/mean_and_variance.c + create mode 100644 lib/math/mean_and_variance_test.c create mode 100644 lib/pretty-printers.c create mode 100644 lib/printbuf.c delete mode 100644 lib/seq_buf.c @@ -486,7 +500,7 @@ index 000000000000..8af34357dd98 +* provide the output of `bcachefs list_journal -a | zstd -f -T0 -o ../journal.log.zst` +*compress & upload all the `metdata.dump.*` files from: bcachefs dump -o metadata.dump diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst -index 5e89497ba314..4f4a35b3aadc 100644 +index 5e89497ba314..608eb514f171 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst @@ -625,6 +625,28 @@ Examples:: @@ -503,7 +517,7 @@ index 5e89497ba314..4f4a35b3aadc 100644 + +For calling generic pretty printers. A pretty printer is a function that takes +as its first argument a pointer to a printbuf, and then zero or more additional -+pointer arguments. For example: ++pointer arguments. 
For example:: + + void foo_to_text(struct printbuf *out, struct foo *foo) + { @@ -518,6 +532,26 @@ index 5e89497ba314..4f4a35b3aadc 100644 Thanks ====== +diff --git a/MAINTAINERS b/MAINTAINERS +index 72b9654f764c..06bb50e760df 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -12505,6 +12505,15 @@ F: Documentation/devicetree/bindings/net/ieee802154/mcr20a.txt + F: drivers/net/ieee802154/mcr20a.c + F: drivers/net/ieee802154/mcr20a.h + ++MEAN AND VARIANCE LIBRARY ++M: Daniel B. Hill ++M: Kent Overstreet ++S: Maintained ++T: git https://github.com/YellowOnion/linux/ ++F: include/linux/mean_and_variance.h ++F: lib/math/mean_and_variance.c ++F: lib/math/mean_and_variance_test.c ++ + MEASUREMENT COMPUTING CIO-DAC IIO DRIVER + M: William Breathitt Gray + L: linux-iio@vger.kernel.org diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 0fbda89cd1bb..05654dbeb2c4 100644 --- a/arch/powerpc/kernel/process.c @@ -880,7 +914,7 @@ index f276aff521e8..50c12711a249 100644 ret = rdtgroup_setup_root(); if (ret) diff --git a/block/bio.c b/block/bio.c -index 3d3a2678fea2..ed9a4df9ea36 100644 +index 77e3b764a078..cdb26dc0d638 100644 --- a/block/bio.c +++ b/block/bio.c @@ -582,15 +582,15 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) @@ -902,7 +936,7 @@ index 3d3a2678fea2..ed9a4df9ea36 100644 /** * bio_truncate - truncate the bio to small size of @new_size -@@ -1200,7 +1200,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) +@@ -1198,7 +1198,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) struct page **pages = (struct page **)bv; ssize_t size, left; unsigned len, i = 0; @@ -911,7 +945,7 @@ index 3d3a2678fea2..ed9a4df9ea36 100644 int ret = 0; /* -@@ -1225,10 +1225,12 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) +@@ -1223,10 +1223,12 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) nr_pages = DIV_ROUND_UP(offset + 
size, PAGE_SIZE); @@ -927,7 +961,7 @@ index 3d3a2678fea2..ed9a4df9ea36 100644 if (unlikely(!size)) { ret = -EFAULT; goto out; -@@ -1437,6 +1439,7 @@ void bio_set_pages_dirty(struct bio *bio) +@@ -1435,6 +1437,7 @@ void bio_set_pages_dirty(struct bio *bio) set_page_dirty_lock(bvec->bv_page); } } @@ -935,7 +969,7 @@ index 3d3a2678fea2..ed9a4df9ea36 100644 /* * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. -@@ -1496,6 +1499,7 @@ void bio_check_pages_dirty(struct bio *bio) +@@ -1494,6 +1497,7 @@ void bio_check_pages_dirty(struct bio *bio) spin_unlock_irqrestore(&bio_dirty_lock, flags); schedule_work(&bio_dirty_work); } @@ -956,7 +990,7 @@ index 651057c4146b..10bf4ac26bed 100644 /** * blk_sync_queue - cancel any pending callbacks on a queue diff --git a/block/blk.h b/block/blk.h -index d7142c4d2fef..1b22813ee530 100644 +index 52432eab621e..2af5287b97de 100644 --- a/block/blk.h +++ b/block/blk.h @@ -250,7 +250,6 @@ static inline void blk_integrity_del(struct gendisk *disk) @@ -1296,10 +1330,10 @@ index 93b80529f8e8..2b8d04016a20 100644 obj-$(CONFIG_EFIVAR_FS) += efivarfs/ diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig new file mode 100644 -index 000000000000..008886967841 +index 000000000000..2b9387ac1bca --- /dev/null +++ b/fs/bcachefs/Kconfig -@@ -0,0 +1,59 @@ +@@ -0,0 +1,60 @@ + +config BCACHEFS_FS + tristate "bcachefs filesystem support" @@ -1325,6 +1359,7 @@ index 000000000000..008886967841 + select XXHASH + select SRCU + select SYMBOLIC_ERRNAME ++ select MEAN_AND_VARIANCE + help + The bcachefs filesystem - a modern, copy on write filesystem, with + support for multiple devices, compression, checksumming, etc. 
@@ -1437,7 +1472,7 @@ index 000000000000..8124d356baa1 +bcachefs-$(CONFIG_BCACHEFS_POSIX_ACL) += acl.o diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c new file mode 100644 -index 000000000000..5c6ccf685094 +index 000000000000..9592541f7b5c --- /dev/null +++ b/fs/bcachefs/acl.c @@ -0,0 +1,406 @@ @@ -1616,7 +1651,7 @@ index 000000000000..5c6ccf685094 + bkey_xattr_init(&xattr->k_i); + xattr->k.u64s = u64s; + xattr->v.x_type = acl_to_xattr_type(type); -+ xattr->v.x_name_len = 0, ++ xattr->v.x_name_len = 0; + xattr->v.x_val_len = cpu_to_le16(acl_len); + + acl_header = xattr_val(&xattr->v); @@ -1913,10 +1948,10 @@ index 000000000000..2d76a4897ba8 +#endif /* _BCACHEFS_ACL_H */ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c new file mode 100644 -index 000000000000..d0d7690a4940 +index 000000000000..ccd3f72ae19f --- /dev/null +++ b/fs/bcachefs/alloc_background.c -@@ -0,0 +1,1551 @@ +@@ -0,0 +1,1585 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc_background.h" @@ -2129,31 +2164,6 @@ index 000000000000..d0d7690a4940 + return ret; +} + -+struct bkey_i_alloc_v4 * -+bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, -+ struct bpos pos) -+{ -+ struct bkey_s_c k; -+ struct bkey_i_alloc_v4 *a; -+ int ret; -+ -+ bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos, -+ BTREE_ITER_WITH_UPDATES| -+ BTREE_ITER_CACHED| -+ BTREE_ITER_INTENT); -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) { -+ bch2_trans_iter_exit(trans, iter); -+ return ERR_PTR(ret); -+ } -+ -+ a = bch2_alloc_to_v4_mut(trans, k); -+ if (IS_ERR(a)) -+ bch2_trans_iter_exit(trans, iter); -+ return a; -+} -+ +static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) +{ + unsigned i, bytes = offsetof(struct bch_alloc, data); @@ -2223,6 +2233,18 @@ index 000000000000..d0d7690a4940 + return -EINVAL; + } + ++ if (rw == WRITE && test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { ++ unsigned i, 
bp_len = 0; ++ ++ for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++) ++ bp_len += alloc_v4_backpointers_c(a.v)[i].bucket_len; ++ ++ if (bp_len > a.v->dirty_sectors) { ++ prt_printf(err, "too many backpointers"); ++ return -EINVAL; ++ } ++ } ++ + if (rw == WRITE) { + if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) { + prt_printf(err, "invalid data type (got %u should be %u)", @@ -2394,12 +2416,13 @@ index 000000000000..d0d7690a4940 + } +} + -+struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) ++static noinline struct bkey_i_alloc_v4 * ++__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) +{ ++ struct bkey_i_alloc_v4 *ret; + unsigned bytes = k.k->type == KEY_TYPE_alloc_v4 + ? bkey_bytes(k.k) + : sizeof(struct bkey_i_alloc_v4); -+ struct bkey_i_alloc_v4 *ret; + + /* + * Reserve space for one more backpointer here: @@ -2410,20 +2433,18 @@ index 000000000000..d0d7690a4940 + return ret; + + if (k.k->type == KEY_TYPE_alloc_v4) { ++ struct bch_backpointer *src, *dst; ++ + bkey_reassemble(&ret->k_i, k); + -+ if (BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v) < BCH_ALLOC_V4_U64s) { -+ struct bch_backpointer *src, *dst; ++ src = alloc_v4_backpointers(&ret->v); ++ SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s); ++ dst = alloc_v4_backpointers(&ret->v); + -+ src = alloc_v4_backpointers(&ret->v); -+ SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s); -+ dst = alloc_v4_backpointers(&ret->v); -+ -+ memmove(dst, src, BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v) * -+ sizeof(struct bch_backpointer)); -+ memset(src, 0, dst - src); -+ set_alloc_v4_u64s(ret); -+ } ++ memmove(dst, src, BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v) * ++ sizeof(struct bch_backpointer)); ++ memset(src, 0, dst - src); ++ set_alloc_v4_u64s(ret); + } else { + bkey_alloc_v4_init(&ret->k_i); + ret->k.p = k.k->p; @@ -2432,6 +2453,54 @@ index 000000000000..d0d7690a4940 + return ret; +} + ++static inline struct bkey_i_alloc_v4 
*bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k) ++{ ++ if (likely(k.k->type == KEY_TYPE_alloc_v4) && ++ BCH_ALLOC_V4_BACKPOINTERS_START(bkey_s_c_to_alloc_v4(k).v) == BCH_ALLOC_V4_U64s) { ++ /* ++ * Reserve space for one more backpointer here: ++ * Not sketchy at doing it this way, nope... ++ */ ++ struct bkey_i_alloc_v4 *ret = ++ bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(struct bch_backpointer)); ++ if (!IS_ERR(ret)) ++ bkey_reassemble(&ret->k_i, k); ++ return ret; ++ } ++ ++ return __bch2_alloc_to_v4_mut(trans, k); ++} ++ ++struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) ++{ ++ return bch2_alloc_to_v4_mut_inlined(trans, k); ++} ++ ++struct bkey_i_alloc_v4 * ++bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, ++ struct bpos pos) ++{ ++ struct bkey_s_c k; ++ struct bkey_i_alloc_v4 *a; ++ int ret; ++ ++ bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos, ++ BTREE_ITER_WITH_UPDATES| ++ BTREE_ITER_CACHED| ++ BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) { ++ bch2_trans_iter_exit(trans, iter); ++ return ERR_PTR(ret); ++ } ++ ++ a = bch2_alloc_to_v4_mut_inlined(trans, k); ++ if (IS_ERR(a)) ++ bch2_trans_iter_exit(trans, iter); ++ return a; ++} ++ +int bch2_alloc_read(struct bch_fs *c) +{ + struct btree_trans trans; @@ -3470,7 +3539,7 @@ index 000000000000..d0d7690a4940 +} diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h new file mode 100644 -index 000000000000..044bc72992d4 +index 000000000000..ee683bdde956 --- /dev/null +++ b/fs/bcachefs/alloc_background.h @@ -0,0 +1,183 @@ @@ -3579,34 +3648,34 @@ index 000000000000..044bc72992d4 +void bch2_alloc_v4_swab(struct bkey_s); +void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + -+#define bch2_bkey_ops_alloc (struct bkey_ops) { \ ++#define bch2_bkey_ops_alloc ((struct bkey_ops) { \ + .key_invalid = 
bch2_alloc_v1_invalid, \ + .val_to_text = bch2_alloc_to_text, \ + .trans_trigger = bch2_trans_mark_alloc, \ + .atomic_trigger = bch2_mark_alloc, \ -+} ++}) + -+#define bch2_bkey_ops_alloc_v2 (struct bkey_ops) { \ ++#define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \ + .key_invalid = bch2_alloc_v2_invalid, \ + .val_to_text = bch2_alloc_to_text, \ + .trans_trigger = bch2_trans_mark_alloc, \ + .atomic_trigger = bch2_mark_alloc, \ -+} ++}) + -+#define bch2_bkey_ops_alloc_v3 (struct bkey_ops) { \ ++#define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \ + .key_invalid = bch2_alloc_v3_invalid, \ + .val_to_text = bch2_alloc_to_text, \ + .trans_trigger = bch2_trans_mark_alloc, \ + .atomic_trigger = bch2_mark_alloc, \ -+} ++}) + -+#define bch2_bkey_ops_alloc_v4 (struct bkey_ops) { \ ++#define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \ + .key_invalid = bch2_alloc_v4_invalid, \ + .val_to_text = bch2_alloc_to_text, \ + .swab = bch2_alloc_v4_swab, \ + .trans_trigger = bch2_trans_mark_alloc, \ + .atomic_trigger = bch2_mark_alloc, \ -+} ++}) + +static inline bool bkey_is_alloc(const struct bkey *k) +{ @@ -3659,10 +3728,10 @@ index 000000000000..044bc72992d4 +#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c new file mode 100644 -index 000000000000..e89999cf9238 +index 000000000000..55708d2da960 --- /dev/null +++ b/fs/bcachefs/alloc_foreground.c -@@ -0,0 +1,1383 @@ +@@ -0,0 +1,1398 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2012 Google, Inc. 
@@ -3679,6 +3748,7 @@ index 000000000000..e89999cf9238 +#include "bcachefs.h" +#include "alloc_background.h" +#include "alloc_foreground.h" ++#include "backpointers.h" +#include "btree_iter.h" +#include "btree_update.h" +#include "btree_gc.h" @@ -3998,6 +4068,29 @@ index 000000000000..e89999cf9238 + goto err; + } + ++ if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { ++ struct bch_backpointer bp; ++ u64 bp_offset = 0; ++ ++ ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1, ++ &bp_offset, &bp, ++ BTREE_ITER_NOPRESERVE); ++ if (ret) { ++ ob = ERR_PTR(ret); ++ goto err; ++ } ++ ++ if (bp_offset != U64_MAX) { ++ /* ++ * Bucket may have data in it - we don't call ++ * bc2h_trans_inconnsistent() because fsck hasn't ++ * finished yet ++ */ ++ ob = NULL; ++ goto err; ++ } ++ } ++ + ob = __try_alloc_bucket(c, ca, b, reserve, &a, + skipped_open, + skipped_need_journal_commit, @@ -4154,16 +4247,16 @@ index 000000000000..e89999cf9238 + * bch_bucket_alloc - allocate a single bucket from a specific device + * + * Returns index of bucket on success, 0 on failure -+ * */ ++ */ +static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, + struct bch_dev *ca, + enum alloc_reserve reserve, + bool may_alloc_partial, -+ struct closure *cl) ++ struct closure *cl, ++ struct bch_dev_usage *usage) +{ + struct bch_fs *c = trans->c; + struct open_bucket *ob = NULL; -+ struct bch_dev_usage usage; + bool freespace_initialized = READ_ONCE(ca->mi.freespace_initialized); + u64 start = freespace_initialized ? 
0 : ca->bucket_alloc_trans_early_cursor; + u64 avail; @@ -4174,16 +4267,16 @@ index 000000000000..e89999cf9238 + u64 skipped_nouse = 0; + bool waiting = false; +again: -+ usage = bch2_dev_usage_read(ca); -+ avail = dev_buckets_free(ca, usage, reserve); ++ bch2_dev_usage_read_fast(ca, usage); ++ avail = dev_buckets_free(ca, *usage, reserve); + -+ if (usage.d[BCH_DATA_need_discard].buckets > avail) ++ if (usage->d[BCH_DATA_need_discard].buckets > avail) + bch2_do_discards(c); + -+ if (usage.d[BCH_DATA_need_gc_gens].buckets > avail) ++ if (usage->d[BCH_DATA_need_gc_gens].buckets > avail) + bch2_do_gc_gens(c); + -+ if (should_invalidate_buckets(ca, usage)) ++ if (should_invalidate_buckets(ca, *usage)) + bch2_do_invalidates(c); + + if (!avail) { @@ -4242,10 +4335,10 @@ index 000000000000..e89999cf9238 + if (!IS_ERR(ob)) + trace_and_count(c, bucket_alloc, ca, bch2_alloc_reserves[reserve], + may_alloc_partial, ob->bucket); -+ else ++ else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart)) + trace_and_count(c, bucket_alloc_fail, + ca, bch2_alloc_reserves[reserve], -+ usage.d[BCH_DATA_free].buckets, ++ usage->d[BCH_DATA_free].buckets, + avail, + bch2_copygc_wait_amount(c), + c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), @@ -4264,11 +4357,12 @@ index 000000000000..e89999cf9238 + bool may_alloc_partial, + struct closure *cl) +{ ++ struct bch_dev_usage usage; + struct open_bucket *ob; + + bch2_trans_do(c, NULL, NULL, 0, + PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, reserve, -+ may_alloc_partial, cl))); ++ may_alloc_partial, cl, &usage))); + return ob; +} + @@ -4295,8 +4389,9 @@ index 000000000000..e89999cf9238 + return ret; +} + -+void bch2_dev_stripe_increment(struct bch_dev *ca, -+ struct dev_stripe_state *stripe) ++static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca, ++ struct dev_stripe_state *stripe, ++ struct bch_dev_usage *usage) +{ + u64 *v = stripe->next_alloc + ca->dev_idx; + u64 free_space = 
dev_buckets_available(ca, RESERVE_none); @@ -4315,6 +4410,15 @@ index 000000000000..e89999cf9238 + *v = *v < scale ? 0 : *v - scale; +} + ++void bch2_dev_stripe_increment(struct bch_dev *ca, ++ struct dev_stripe_state *stripe) ++{ ++ struct bch_dev_usage usage; ++ ++ bch2_dev_usage_read_fast(ca, &usage); ++ bch2_dev_stripe_increment_inlined(ca, stripe, &usage); ++} ++ +#define BUCKET_MAY_ALLOC_PARTIAL (1 << 0) +#define BUCKET_ALLOC_USE_DURABILITY (1 << 1) + @@ -4359,6 +4463,7 @@ index 000000000000..e89999cf9238 + BUG_ON(*nr_effective >= nr_replicas); + + for (i = 0; i < devs_sorted.nr; i++) { ++ struct bch_dev_usage usage; + struct open_bucket *ob; + + dev = devs_sorted.devs[i]; @@ -4378,9 +4483,9 @@ index 000000000000..e89999cf9238 + } + + ob = bch2_bucket_alloc_trans(trans, ca, reserve, -+ flags & BUCKET_MAY_ALLOC_PARTIAL, cl); ++ flags & BUCKET_MAY_ALLOC_PARTIAL, cl, &usage); + if (!IS_ERR(ob)) -+ bch2_dev_stripe_increment(ca, stripe); ++ bch2_dev_stripe_increment_inlined(ca, stripe, &usage); + percpu_ref_put(&ca->ref); + + if (IS_ERR(ob)) { @@ -4775,23 +4880,24 @@ index 000000000000..e89999cf9238 + hlist_add_head_rcu(&wp->node, head); + mutex_unlock(&c->write_points_hash_lock); +out: -+ wp->last_used = sched_clock(); ++ wp->last_used = local_clock(); + return wp; +} + +/* + * Get us an open_bucket we can allocate from, return with it locked: + */ -+struct write_point *bch2_alloc_sectors_start_trans(struct btree_trans *trans, -+ unsigned target, -+ unsigned erasure_code, -+ struct write_point_specifier write_point, -+ struct bch_devs_list *devs_have, -+ unsigned nr_replicas, -+ unsigned nr_replicas_required, -+ enum alloc_reserve reserve, -+ unsigned flags, -+ struct closure *cl) ++int bch2_alloc_sectors_start_trans(struct btree_trans *trans, ++ unsigned target, ++ unsigned erasure_code, ++ struct write_point_specifier write_point, ++ struct bch_devs_list *devs_have, ++ unsigned nr_replicas, ++ unsigned nr_replicas_required, ++ enum alloc_reserve reserve, ++ 
unsigned flags, ++ struct closure *cl, ++ struct write_point **wp_ret) +{ + struct bch_fs *c = trans->c; + struct write_point *wp; @@ -4813,7 +4919,7 @@ index 000000000000..e89999cf9238 + write_points_nr = c->write_points_nr; + have_cache = false; + -+ wp = writepoint_find(trans, write_point.v); ++ *wp_ret = wp = writepoint_find(trans, write_point.v); + + if (wp->data_type == BCH_DATA_user) + ob_flags |= BUCKET_MAY_ALLOC_PARTIAL; @@ -4870,7 +4976,7 @@ index 000000000000..e89999cf9238 + + BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); + -+ return wp; ++ return 0; +err: + open_bucket_for_each(c, &wp->ptrs, ob, i) + if (ptrs.nr < ARRAY_SIZE(ptrs.v)) @@ -4888,39 +4994,13 @@ index 000000000000..e89999cf9238 + if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) || + bch2_err_matches(ret, BCH_ERR_freelist_empty)) + return cl -+ ? ERR_PTR(-EAGAIN) -+ : ERR_PTR(-BCH_ERR_ENOSPC_bucket_alloc); ++ ? -EAGAIN ++ : -BCH_ERR_ENOSPC_bucket_alloc; + + if (bch2_err_matches(ret, BCH_ERR_insufficient_devices)) -+ return ERR_PTR(-EROFS); -+ -+ return ERR_PTR(ret); -+} -+ -+struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, -+ unsigned target, -+ unsigned erasure_code, -+ struct write_point_specifier write_point, -+ struct bch_devs_list *devs_have, -+ unsigned nr_replicas, -+ unsigned nr_replicas_required, -+ enum alloc_reserve reserve, -+ unsigned flags, -+ struct closure *cl) -+{ -+ struct write_point *wp; -+ -+ bch2_trans_do(c, NULL, NULL, 0, -+ PTR_ERR_OR_ZERO(wp = bch2_alloc_sectors_start_trans(&trans, target, -+ erasure_code, -+ write_point, -+ devs_have, -+ nr_replicas, -+ nr_replicas_required, -+ reserve, -+ flags, cl))); -+ return wp; ++ return -EROFS; + ++ return ret; +} + +struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) @@ -4991,6 +5071,10 @@ index 000000000000..e89999cf9238 +{ + mutex_init(&wp->lock); + wp->data_type = type; ++ ++ INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates); ++ 
INIT_LIST_HEAD(&wp->writes); ++ spin_lock_init(&wp->writes_lock); +} + +void bch2_fs_allocator_foreground_init(struct bch_fs *c) @@ -5021,7 +5105,7 @@ index 000000000000..e89999cf9238 + wp < c->write_points + c->write_points_nr; wp++) { + writepoint_init(wp, BCH_DATA_user); + -+ wp->last_used = sched_clock(); ++ wp->last_used = local_clock(); + wp->write_point = (unsigned long) wp; + hlist_add_head_rcu(&wp->node, + writepoint_hash(c, wp->write_point)); @@ -5048,10 +5132,10 @@ index 000000000000..e89999cf9238 +} diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h new file mode 100644 -index 000000000000..6de63a351fa8 +index 000000000000..16490ffbd2c7 --- /dev/null +++ b/fs/bcachefs/alloc_foreground.h -@@ -0,0 +1,181 @@ +@@ -0,0 +1,174 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_FOREGROUND_H +#define _BCACHEFS_ALLOC_FOREGROUND_H @@ -5190,22 +5274,15 @@ index 000000000000..6de63a351fa8 + unsigned, unsigned *, bool *, enum alloc_reserve, + unsigned, struct closure *); + -+struct write_point *bch2_alloc_sectors_start_trans(struct btree_trans *, -+ unsigned, unsigned, -+ struct write_point_specifier, -+ struct bch_devs_list *, -+ unsigned, unsigned, -+ enum alloc_reserve, -+ unsigned, -+ struct closure *); -+struct write_point *bch2_alloc_sectors_start(struct bch_fs *, -+ unsigned, unsigned, -+ struct write_point_specifier, -+ struct bch_devs_list *, -+ unsigned, unsigned, -+ enum alloc_reserve, -+ unsigned, -+ struct closure *); ++int bch2_alloc_sectors_start_trans(struct btree_trans *, ++ unsigned, unsigned, ++ struct write_point_specifier, ++ struct bch_devs_list *, ++ unsigned, unsigned, ++ enum alloc_reserve, ++ unsigned, ++ struct closure *, ++ struct write_point **); + +struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *); +void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, @@ -5235,10 +5312,10 @@ index 000000000000..6de63a351fa8 +#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ 
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h new file mode 100644 -index 000000000000..e078584d46f6 +index 000000000000..3df98b22bb15 --- /dev/null +++ b/fs/bcachefs/alloc_types.h -@@ -0,0 +1,87 @@ +@@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_TYPES_H +#define _BCACHEFS_ALLOC_TYPES_H @@ -5319,6 +5396,11 @@ index 000000000000..e078584d46f6 + + struct open_buckets ptrs; + struct dev_stripe_state stripe; ++ ++ struct work_struct index_update_work; ++ ++ struct list_head writes; ++ spinlock_t writes_lock; +}; + +struct write_point_specifier { @@ -5328,19 +5410,20 @@ index 000000000000..e078584d46f6 +#endif /* _BCACHEFS_ALLOC_TYPES_H */ diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c new file mode 100644 -index 000000000000..955f3ee96cc0 +index 000000000000..614811eafa59 --- /dev/null +++ b/fs/bcachefs/backpointers.c -@@ -0,0 +1,898 @@ +@@ -0,0 +1,1103 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" ++#include "bbpos.h" +#include "alloc_background.h" +#include "backpointers.h" +#include "btree_cache.h" +#include "btree_update.h" +#include "error.h" + -+#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10 ++#include + +/* + * Convert from pos in backpointer btree to pos of corresponding bucket in alloc @@ -5363,31 +5446,15 @@ index 000000000000..955f3ee96cc0 + u64 bucket_offset) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); ++ struct bpos ret; + -+ return POS(bucket.inode, -+ (bucket_to_sector(ca, bucket.offset) << -+ MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset); -+} ++ ret = POS(bucket.inode, ++ (bucket_to_sector(ca, bucket.offset) << ++ MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset); + -+void bch2_extent_ptr_to_bp(struct bch_fs *c, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c k, struct extent_ptr_decoded p, -+ struct bpos *bucket_pos, struct bch_backpointer *bp) -+{ -+ enum bch_data_type data_type = level ? 
BCH_DATA_btree : BCH_DATA_user; -+ s64 sectors = level ? btree_sectors(c) : k.k->size; -+ u32 bucket_offset; ++ BUG_ON(bkey_cmp(bucket, bp_pos_to_bucket(c, ret))); + -+ *bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset); -+ *bp = (struct bch_backpointer) { -+ .btree_id = btree_id, -+ .level = level, -+ .data_type = data_type, -+ .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + -+ p.crc.offset, -+ .bucket_len = ptr_disk_sectors(sectors, p), -+ .pos = k.k->p, -+ }; ++ return ret; +} + +static bool extent_matches_bp(struct bch_fs *c, @@ -5740,20 +5807,24 @@ index 000000000000..955f3ee96cc0 +int bch2_get_next_backpointer(struct btree_trans *trans, + struct bpos bucket, int gen, + u64 *bp_offset, -+ struct bch_backpointer *dst) ++ struct bch_backpointer *dst, ++ unsigned iter_flags) +{ + struct bch_fs *c = trans->c; -+ struct bpos bp_pos = -+ bucket_pos_to_bp(c, bucket, -+ max(*bp_offset, BACKPOINTER_OFFSET_MAX) - BACKPOINTER_OFFSET_MAX); -+ struct bpos bp_end_pos = -+ bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0); ++ struct bpos bp_pos, bp_end_pos; + struct btree_iter alloc_iter, bp_iter = { NULL }; + struct bkey_s_c k; + struct bkey_s_c_alloc_v4 a; + size_t i; + int ret; + ++ if (*bp_offset == U64_MAX) ++ return 0; ++ ++ bp_pos = bucket_pos_to_bp(c, bucket, ++ max(*bp_offset, BACKPOINTER_OFFSET_MAX) - BACKPOINTER_OFFSET_MAX); ++ bp_end_pos = bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0); ++ + bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, + bucket, BTREE_ITER_CACHED); + k = bch2_btree_iter_peek_slot(&alloc_iter); @@ -5857,7 +5928,7 @@ index 000000000000..955f3ee96cc0 + if (bp.level == c->btree_roots[bp.btree_id].level + 1) + k = bkey_i_to_s_c(&c->btree_roots[bp.btree_id].key); + -+ if (extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp)) ++ if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp)) + return k; + + bch2_trans_iter_exit(trans, iter); @@ -5907,12 +5978,12 @@ index 
000000000000..955f3ee96cc0 + if (IS_ERR(b)) + goto err; + -+ if (extent_matches_bp(c, bp.btree_id, bp.level, -+ bkey_i_to_s_c(&b->key), -+ bucket, bp)) ++ if (b && extent_matches_bp(c, bp.btree_id, bp.level, ++ bkey_i_to_s_c(&b->key), ++ bucket, bp)) + return b; + -+ if (btree_node_will_make_reachable(b)) { ++ if (b && btree_node_will_make_reachable(b)) { + b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); + } else { + backpointer_not_found(trans, bucket, bp_offset, bp, @@ -5981,7 +6052,9 @@ index 000000000000..955f3ee96cc0 +static int check_bp_exists(struct btree_trans *trans, + struct bpos bucket_pos, + struct bch_backpointer bp, -+ struct bkey_s_c orig_k) ++ struct bkey_s_c orig_k, ++ struct bpos bucket_start, ++ struct bpos bucket_end) +{ + struct bch_fs *c = trans->c; + struct btree_iter alloc_iter, bp_iter = { NULL }; @@ -5989,6 +6062,10 @@ index 000000000000..955f3ee96cc0 + struct bkey_s_c alloc_k, bp_k; + int ret; + ++ if (bpos_cmp(bucket_pos, bucket_start) < 0 || ++ bpos_cmp(bucket_pos, bucket_end) > 0) ++ return 0; ++ + bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, bucket_pos, 0); + alloc_k = bch2_btree_iter_peek_slot(&alloc_iter); + ret = bkey_err(alloc_k); @@ -6051,7 +6128,9 @@ index 000000000000..955f3ee96cc0 +} + +static int check_extent_to_backpointers(struct btree_trans *trans, -+ struct btree_iter *iter) ++ struct btree_iter *iter, ++ struct bpos bucket_start, ++ struct bpos bucket_end) +{ + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs; @@ -6078,7 +6157,7 @@ index 000000000000..955f3ee96cc0 + bch2_extent_ptr_to_bp(c, iter->btree_id, iter->path->level, + k, p, &bucket_pos, &bp); + -+ ret = check_bp_exists(trans, bucket_pos, bp, k); ++ ret = check_bp_exists(trans, bucket_pos, bp, k, bucket_start, bucket_end); + if (ret) + return ret; + } @@ -6087,7 +6166,9 @@ index 000000000000..955f3ee96cc0 +} + +static int check_btree_root_to_backpointers(struct btree_trans *trans, -+ enum btree_id btree_id) ++ enum btree_id 
btree_id, ++ struct bpos bucket_start, ++ struct bpos bucket_end) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; @@ -6119,7 +6200,7 @@ index 000000000000..955f3ee96cc0 + bch2_extent_ptr_to_bp(c, iter.btree_id, iter.path->level + 1, + k, p, &bucket_pos, &bp); + -+ ret = check_bp_exists(trans, bucket_pos, bp, k); ++ ret = check_bp_exists(trans, bucket_pos, bp, k, bucket_start, bucket_end); + if (ret) + goto err; + } @@ -6128,60 +6209,222 @@ index 000000000000..955f3ee96cc0 + return ret; +} + -+int bch2_check_extents_to_backpointers(struct bch_fs *c) ++static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) ++{ ++ return (struct bbpos) { ++ .btree = bp.btree_id, ++ .pos = bp.pos, ++ }; ++} ++ ++static size_t btree_nodes_fit_in_ram(struct bch_fs *c) ++{ ++ struct sysinfo i; ++ u64 mem_bytes; ++ ++ si_meminfo(&i); ++ mem_bytes = i.totalram * i.mem_unit; ++ return (mem_bytes >> 1) / btree_bytes(c); ++} ++ ++int bch2_get_btree_in_memory_pos(struct btree_trans *trans, ++ unsigned btree_leaf_mask, ++ unsigned btree_interior_mask, ++ struct bbpos start, struct bbpos *end) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); ++ enum btree_id btree; ++ int ret = 0; ++ ++ for (btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) { ++ unsigned depth = ((1U << btree) & btree_leaf_mask) ? 1 : 2; ++ ++ if (!((1U << btree) & btree_leaf_mask) && ++ !((1U << btree) & btree_interior_mask)) ++ continue; ++ ++ bch2_trans_node_iter_init(trans, &iter, btree, ++ btree == start.btree ? 
start.pos : POS_MIN, ++ 0, depth, 0); ++ /* ++ * for_each_btree_key_contineu() doesn't check the return value ++ * from bch2_btree_iter_advance(), which is needed when ++ * iterating over interior nodes where we'll see keys at ++ * SPOS_MAX: ++ */ ++ do { ++ k = __bch2_btree_iter_peek_and_restart(trans, &iter, 0); ++ ret = bkey_err(k); ++ if (!k.k || ret) ++ break; ++ ++ --btree_nodes; ++ if (!btree_nodes) { ++ *end = BBPOS(btree, k.k->p); ++ bch2_trans_iter_exit(trans, &iter); ++ return 0; ++ } ++ } while (bch2_btree_iter_advance(&iter)); ++ bch2_trans_iter_exit(trans, &iter); ++ } ++ ++ *end = BBPOS_MAX; ++ return ret; ++} ++ ++static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, ++ struct bpos bucket_start, ++ struct bpos bucket_end) +{ -+ struct btree_trans trans; + struct btree_iter iter; + enum btree_id btree_id; + int ret = 0; + -+ bch2_trans_init(&trans, c, 0, 0); + for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { -+ bch2_trans_node_iter_init(&trans, &iter, btree_id, POS_MIN, 0, -+ 0, ++ unsigned depth = btree_type_has_ptrs(btree_id) ? 
0 : 1; ++ ++ bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, ++ depth, + BTREE_ITER_ALL_LEVELS| + BTREE_ITER_PREFETCH); + + do { -+ ret = commit_do(&trans, NULL, NULL, -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_NOFAIL, -+ check_extent_to_backpointers(&trans, &iter)); ++ ret = commit_do(trans, NULL, NULL, ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_NOFAIL, ++ check_extent_to_backpointers(trans, &iter, ++ bucket_start, bucket_end)); + if (ret) + break; + } while (!bch2_btree_iter_advance(&iter)); + -+ bch2_trans_iter_exit(&trans, &iter); ++ bch2_trans_iter_exit(trans, &iter); + + if (ret) + break; + -+ ret = commit_do(&trans, NULL, NULL, -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_NOFAIL, -+ check_btree_root_to_backpointers(&trans, btree_id)); ++ ret = commit_do(trans, NULL, NULL, ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_NOFAIL, ++ check_btree_root_to_backpointers(trans, btree_id, ++ bucket_start, bucket_end)); + if (ret) + break; + } ++ return ret; ++} ++ ++int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, ++ struct bpos start, struct bpos *end) ++{ ++ struct btree_iter alloc_iter; ++ struct btree_iter bp_iter; ++ struct bkey_s_c alloc_k, bp_k; ++ size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); ++ bool alloc_end = false, bp_end = false; ++ int ret = 0; ++ ++ bch2_trans_node_iter_init(trans, &alloc_iter, BTREE_ID_alloc, ++ start, 0, 1, 0); ++ bch2_trans_node_iter_init(trans, &bp_iter, BTREE_ID_backpointers, ++ bucket_pos_to_bp(trans->c, start, 0), 0, 1, 0); ++ while (1) { ++ alloc_k = !alloc_end ++ ? __bch2_btree_iter_peek_and_restart(trans, &alloc_iter, 0) ++ : bkey_s_c_null; ++ bp_k = !bp_end ++ ? 
__bch2_btree_iter_peek_and_restart(trans, &bp_iter, 0) ++ : bkey_s_c_null; ++ ++ ret = bkey_err(alloc_k) ?: bkey_err(bp_k); ++ if ((!alloc_k.k && !bp_k.k) || ret) { ++ *end = SPOS_MAX; ++ break; ++ } ++ ++ --btree_nodes; ++ if (!btree_nodes) { ++ *end = alloc_k.k->p; ++ break; ++ } ++ ++ if (bpos_cmp(alloc_iter.pos, SPOS_MAX) && ++ bpos_cmp(bucket_pos_to_bp(trans->c, alloc_iter.pos, 0), bp_iter.pos) < 0) { ++ if (!bch2_btree_iter_advance(&alloc_iter)) ++ alloc_end = true; ++ } else { ++ if (!bch2_btree_iter_advance(&bp_iter)) ++ bp_end = true; ++ } ++ } ++ bch2_trans_iter_exit(trans, &bp_iter); ++ bch2_trans_iter_exit(trans, &alloc_iter); ++ return ret; ++} ++ ++int bch2_check_extents_to_backpointers(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct bpos start = POS_MIN, end; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ while (1) { ++ ret = bch2_get_alloc_in_memory_pos(&trans, start, &end); ++ if (ret) ++ break; ++ ++ if (!bpos_cmp(start, POS_MIN) && bpos_cmp(end, SPOS_MAX)) ++ bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", ++ __func__, btree_nodes_fit_in_ram(c)); ++ ++ if (bpos_cmp(start, POS_MIN) || bpos_cmp(end, SPOS_MAX)) { ++ struct printbuf buf = PRINTBUF; ++ ++ prt_str(&buf, "check_extents_to_backpointers(): "); ++ bch2_bpos_to_text(&buf, start); ++ prt_str(&buf, "-"); ++ bch2_bpos_to_text(&buf, end); ++ ++ bch_verbose(c, "%s", buf.buf); ++ printbuf_exit(&buf); ++ } ++ ++ ret = bch2_check_extents_to_backpointers_pass(&trans, start, end); ++ if (ret || !bpos_cmp(end, SPOS_MAX)) ++ break; ++ ++ start = bpos_successor(end); ++ } + bch2_trans_exit(&trans); ++ + return ret; +} + +static int check_one_backpointer(struct btree_trans *trans, + struct bpos bucket, -+ u64 *bp_offset) ++ u64 *bp_offset, ++ struct bbpos start, ++ struct bbpos end) +{ + struct btree_iter iter; + struct bch_backpointer bp; ++ struct bbpos pos; + struct bkey_s_c k; + struct printbuf buf = PRINTBUF; + int ret; 
+ -+ ret = bch2_get_next_backpointer(trans, bucket, -1, -+ bp_offset, &bp); ++ ret = bch2_get_next_backpointer(trans, bucket, -1, bp_offset, &bp, 0); + if (ret || *bp_offset == U64_MAX) + return ret; + ++ pos = bp_to_bbpos(bp); ++ if (bbpos_cmp(pos, start) < 0 || ++ bbpos_cmp(pos, end) > 0) ++ return 0; ++ + k = bch2_backpointer_get_key(trans, &iter, bucket, *bp_offset, bp); + ret = bkey_err(k); + if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) @@ -6204,42 +6447,87 @@ index 000000000000..955f3ee96cc0 + return ret; +} + -+int bch2_check_backpointers_to_extents(struct bch_fs *c) ++static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, ++ struct bbpos start, ++ struct bbpos end) +{ -+ struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + -+ bch2_trans_init(&trans, c, 0, 0); -+ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, ++ for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + u64 bp_offset = 0; + -+ while (!(ret = commit_do(&trans, NULL, NULL, -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_NOFAIL, -+ check_one_backpointer(&trans, iter.pos, &bp_offset))) && ++ while (!(ret = commit_do(trans, NULL, NULL, ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_NOFAIL, ++ check_one_backpointer(trans, iter.pos, &bp_offset, start, end))) && + bp_offset < U64_MAX) + bp_offset++; + + if (ret) + break; + } -+ bch2_trans_iter_exit(&trans, &iter); -+ bch2_trans_exit(&trans); ++ bch2_trans_iter_exit(trans, &iter); + return ret < 0 ? 
ret : 0; +} ++ ++int bch2_check_backpointers_to_extents(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ while (1) { ++ ret = bch2_get_btree_in_memory_pos(&trans, ++ (1U << BTREE_ID_extents)| ++ (1U << BTREE_ID_reflink), ++ ~0, ++ start, &end); ++ if (ret) ++ break; ++ ++ if (!bbpos_cmp(start, BBPOS_MIN) && ++ bbpos_cmp(end, BBPOS_MAX)) ++ bch_verbose(c, "%s(): extents do not fit in ram, running in multiple passes with %zu nodes per pass", ++ __func__, btree_nodes_fit_in_ram(c)); ++ ++ if (bbpos_cmp(start, BBPOS_MIN) || ++ bbpos_cmp(end, BBPOS_MAX)) { ++ struct printbuf buf = PRINTBUF; ++ ++ prt_str(&buf, "check_backpointers_to_extents(): "); ++ bch2_bbpos_to_text(&buf, start); ++ prt_str(&buf, "-"); ++ bch2_bbpos_to_text(&buf, end); ++ ++ bch_verbose(c, "%s", buf.buf); ++ printbuf_exit(&buf); ++ } ++ ++ ret = bch2_check_backpointers_to_extents_pass(&trans, start, end); ++ if (ret || !bbpos_cmp(end, BBPOS_MAX)) ++ break; ++ ++ start = bbpos_successor(end); ++ } ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h new file mode 100644 -index 000000000000..fe42af296e9c +index 000000000000..48a48b75c0ac --- /dev/null +++ b/fs/bcachefs/backpointers.h -@@ -0,0 +1,38 @@ +@@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H +#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H + ++#include "buckets.h" +#include "super.h" + +int bch2_backpointer_invalid(const struct bch_fs *, struct bkey_s_c k, @@ -6248,22 +6536,41 @@ index 000000000000..fe42af296e9c +void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +void bch2_backpointer_swab(struct bkey_s); + -+#define bch2_bkey_ops_backpointer (struct bkey_ops) { \ ++#define bch2_bkey_ops_backpointer ((struct bkey_ops) { \ + .key_invalid = bch2_backpointer_invalid, 
\ + .val_to_text = bch2_backpointer_k_to_text, \ + .swab = bch2_backpointer_swab, \ -+} ++}) + -+void bch2_extent_ptr_to_bp(struct bch_fs *, enum btree_id, unsigned, -+ struct bkey_s_c, struct extent_ptr_decoded, -+ struct bpos *, struct bch_backpointer *); ++#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10 ++ ++static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c k, struct extent_ptr_decoded p, ++ struct bpos *bucket_pos, struct bch_backpointer *bp) ++{ ++ enum bch_data_type data_type = level ? BCH_DATA_btree : BCH_DATA_user; ++ s64 sectors = level ? btree_sectors(c) : k.k->size; ++ u32 bucket_offset; ++ ++ *bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset); ++ *bp = (struct bch_backpointer) { ++ .btree_id = btree_id, ++ .level = level, ++ .data_type = data_type, ++ .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + ++ p.crc.offset, ++ .bucket_len = ptr_disk_sectors(sectors, p), ++ .pos = k.k->p, ++ }; ++} + +int bch2_bucket_backpointer_del(struct btree_trans *, struct bkey_i_alloc_v4 *, + struct bch_backpointer, struct bkey_s_c); +int bch2_bucket_backpointer_add(struct btree_trans *, struct bkey_i_alloc_v4 *, + struct bch_backpointer, struct bkey_s_c); +int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int, -+ u64 *, struct bch_backpointer *); ++ u64 *, struct bch_backpointer *, unsigned); +struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *, + struct bpos, u64, struct bch_backpointer); +struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *, @@ -6274,12 +6581,66 @@ index 000000000000..fe42af296e9c +int bch2_check_backpointers_to_extents(struct bch_fs *); + +#endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */ +diff --git a/fs/bcachefs/bbpos.h b/fs/bcachefs/bbpos.h +new file mode 100644 +index 000000000000..1fbed1f8378d +--- /dev/null ++++ b/fs/bcachefs/bbpos.h +@@ -0,0 +1,48 @@ ++/* 
SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BBPOS_H ++#define _BCACHEFS_BBPOS_H ++ ++#include "bkey_methods.h" ++ ++struct bbpos { ++ enum btree_id btree; ++ struct bpos pos; ++}; ++ ++static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos) ++{ ++ return (struct bbpos) { btree, pos }; ++} ++ ++#define BBPOS_MIN BBPOS(0, POS_MIN) ++#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX) ++ ++static inline int bbpos_cmp(struct bbpos l, struct bbpos r) ++{ ++ return cmp_int(l.btree, r.btree) ?: bpos_cmp(l.pos, r.pos); ++} ++ ++static inline struct bbpos bbpos_successor(struct bbpos pos) ++{ ++ if (bpos_cmp(pos.pos, SPOS_MAX)) { ++ pos.pos = bpos_successor(pos.pos); ++ return pos; ++ } ++ ++ if (pos.btree != BTREE_ID_NR) { ++ pos.btree++; ++ pos.pos = POS_MIN; ++ return pos; ++ } ++ ++ BUG(); ++} ++ ++static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos) ++{ ++ prt_str(out, bch2_btree_ids[pos.btree]); ++ prt_char(out, ':'); ++ bch2_bpos_to_text(out, pos.pos); ++} ++ ++#endif /* _BCACHEFS_BBPOS_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h new file mode 100644 -index 000000000000..ccac2a3fcdf7 +index 000000000000..d90effeb06a7 --- /dev/null +++ b/fs/bcachefs/bcachefs.h -@@ -0,0 +1,1001 @@ +@@ -0,0 +1,1019 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_H +#define _BCACHEFS_H @@ -6389,7 +6750,7 @@ index 000000000000..ccac2a3fcdf7 + * + * BTREE NODES: + * -+ * Our unit of allocation is a bucket, and we we can't arbitrarily allocate and ++ * Our unit of allocation is a bucket, and we can't arbitrarily allocate and + * free smaller than a bucket - so, that's how big our btree nodes are. 
+ * + * (If buckets are really big we'll only use part of the bucket for a btree node @@ -6564,7 +6925,7 @@ index 000000000000..ccac2a3fcdf7 + "When reading btree nodes, read all replicas and " \ + "compare them") + -+/* Parameters that should only be compiled in in debug mode: */ ++/* Parameters that should only be compiled in debug mode: */ +#define BCH_DEBUG_PARAMS_DEBUG() \ + BCH_DEBUG_PARAM(expensive_debug_checks, \ + "Enables various runtime debugging checks that " \ @@ -6880,6 +7241,23 @@ index 000000000000..ccac2a3fcdf7 +#define BCACHEFS_ROOT_SUBVOL_INUM \ + ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) + ++#define BCH_BTREE_WRITE_TYPES() \ ++ x(initial, 0) \ ++ x(init_next_bset, 1) \ ++ x(cache_reclaim, 2) \ ++ x(journal_reclaim, 3) \ ++ x(interior, 4) ++ ++enum btree_write_type { ++#define x(t, n) BTREE_WRITE_##t, ++ BCH_BTREE_WRITE_TYPES() ++#undef x ++ BTREE_WRITE_TYPE_NR, ++}; ++ ++#define BTREE_WRITE_TYPE_MASK (roundup_pow_of_two(BTREE_WRITE_TYPE_NR) - 1) ++#define BTREE_WRITE_TYPE_BITS ilog2(BTREE_WRITE_TYPE_MASK) ++ +struct bch_fs { + struct closure cl; + @@ -6989,6 +7367,13 @@ index 000000000000..ccac2a3fcdf7 + struct workqueue_struct *btree_interior_update_worker; + struct work_struct btree_interior_update_work; + ++ /* btree_io.c: */ ++ spinlock_t btree_write_error_lock; ++ struct btree_write_stats { ++ atomic64_t nr; ++ atomic64_t bytes; ++ } btree_write_stats[BTREE_WRITE_TYPE_NR]; ++ + /* btree_iter.c: */ + struct mutex btree_trans_lock; + struct list_head btree_trans_list; @@ -7163,11 +7548,6 @@ index 000000000000..ccac2a3fcdf7 + struct bio_set dio_write_bioset; + struct bio_set dio_read_bioset; + -+ -+ atomic64_t btree_writes_nr; -+ atomic64_t btree_writes_sectors; -+ spinlock_t btree_write_error_lock; -+ + /* ERRORS */ + struct list_head fsck_errors; + struct mutex fsck_error_lock; @@ -7212,7 +7592,6 @@ index 000000000000..ccac2a3fcdf7 + + struct time_stats times[BCH_TIME_STAT_NR]; + -+ const char 
*btree_transaction_fns[BCH_TRANSACTIONS_NR]; + struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR]; +}; + @@ -7283,10 +7662,10 @@ index 000000000000..ccac2a3fcdf7 +#endif /* _BCACHEFS_H */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h new file mode 100644 -index 000000000000..9e10fc8301f0 +index 000000000000..5da9f3a4d47d --- /dev/null +++ b/fs/bcachefs/bcachefs_format.h -@@ -0,0 +1,2122 @@ +@@ -0,0 +1,2172 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FORMAT_H +#define _BCACHEFS_FORMAT_H @@ -7436,7 +7815,7 @@ index 000000000000..9e10fc8301f0 +#else +#error edit for your odd byteorder. +#endif -+} __attribute__((packed, aligned(4))); ++} __packed __aligned(4); + +#define KEY_INODE_MAX ((__u64)~0ULL) +#define KEY_OFFSET_MAX ((__u64)~0ULL) @@ -7470,7 +7849,7 @@ index 000000000000..9e10fc8301f0 + __u32 hi; + __u64 lo; +#endif -+} __attribute__((packed, aligned(4))); ++} __packed __aligned(4); + +struct bkey { + /* Size of combined key and value, in u64s */ @@ -7503,7 +7882,7 @@ index 000000000000..9e10fc8301f0 + + __u8 pad[1]; +#endif -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +struct bkey_packed { + __u64 _data[0]; @@ -7537,7 +7916,7 @@ index 000000000000..9e10fc8301f0 + * to the same size as struct bkey should hopefully be safest. + */ + __u8 pad[sizeof(struct bkey) - 3]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64)) +#define BKEY_U64s_MAX U8_MAX @@ -7625,7 +8004,7 @@ index 000000000000..9e10fc8301f0 + * number. 
+ * + * - WHITEOUT: for hash table btrees -+*/ ++ */ +#define BCH_BKEY_TYPES() \ + x(deleted, 0) \ + x(whiteout, 1) \ @@ -7655,7 +8034,8 @@ index 000000000000..9e10fc8301f0 + x(set, 25) \ + x(lru, 26) \ + x(alloc_v4, 27) \ -+ x(backpointer, 28) ++ x(backpointer, 28) \ ++ x(inode_v3, 29) + +enum bch_bkey_type { +#define x(name, nr) KEY_TYPE_##name = nr, @@ -7766,7 +8146,7 @@ index 000000000000..9e10fc8301f0 +struct bch_csum { + __le64 lo; + __le64 hi; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +#define BCH_EXTENT_ENTRY_TYPES() \ + x(ptr, 0) \ @@ -7803,7 +8183,7 @@ index 000000000000..9e10fc8301f0 + _compressed_size:7, + type:2; +#endif -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +#define CRC32_SIZE_MAX (1U << 7) +#define CRC32_NONCE_MAX 0 @@ -7829,7 +8209,7 @@ index 000000000000..9e10fc8301f0 + type:3; +#endif + __u64 csum_lo; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +#define CRC64_SIZE_MAX (1U << 9) +#define CRC64_NONCE_MAX ((1U << 10) - 1) @@ -7853,7 +8233,7 @@ index 000000000000..9e10fc8301f0 + type:4; +#endif + struct bch_csum csum; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +#define CRC128_SIZE_MAX (1U << 13) +#define CRC128_NONCE_MAX ((1U << 13) - 1) @@ -7879,7 +8259,7 @@ index 000000000000..9e10fc8301f0 + cached:1, + type:1; +#endif -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +struct bch_extent_stripe_ptr { +#if defined(__LITTLE_ENDIAN_BITFIELD) @@ -7931,7 +8311,7 @@ index 000000000000..9e10fc8301f0 + + __u64 _data[0]; + struct bch_extent_ptr start[]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +struct bch_btree_ptr_v2 { + struct bch_val v; @@ -7943,7 +8323,7 @@ index 000000000000..9e10fc8301f0 + struct bpos min_key; + __u64 _data[0]; + struct bch_extent_ptr start[]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, 
flags, 0, 1); + @@ -7952,7 +8332,7 @@ index 000000000000..9e10fc8301f0 + + __u64 _data[0]; + union bch_extent_entry start[]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +struct bch_reservation { + struct bch_val v; @@ -7960,7 +8340,7 @@ index 000000000000..9e10fc8301f0 + __le32 generation; + __u8 nr_replicas; + __u8 pad[3]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +/* Maximum size (in u64s) a single pointer could be: */ +#define BKEY_EXTENT_PTR_U64s_MAX\ @@ -7994,7 +8374,7 @@ index 000000000000..9e10fc8301f0 + __le32 bi_flags; + __le16 bi_mode; + __u8 fields[0]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +struct bch_inode_v2 { + struct bch_val v; @@ -8004,20 +8384,35 @@ index 000000000000..9e10fc8301f0 + __le64 bi_flags; + __le16 bi_mode; + __u8 fields[0]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); ++ ++struct bch_inode_v3 { ++ struct bch_val v; ++ ++ __le64 bi_journal_seq; ++ __le64 bi_hash_seed; ++ __le64 bi_flags; ++ __le64 bi_sectors; ++ __le64 bi_size; ++ __le64 bi_version; ++ __u8 fields[0]; ++} __packed __aligned(8); ++ ++#define INODEv3_FIELDS_START_INITIAL 6 ++#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(u64)) + +struct bch_inode_generation { + struct bch_val v; + + __le32 bi_generation; + __le32 pad; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +/* + * bi_subvol and bi_parent_subvol are only set for subvolume roots: + */ + -+#define BCH_INODE_FIELDS() \ ++#define BCH_INODE_FIELDS_v2() \ + x(bi_atime, 96) \ + x(bi_ctime, 96) \ + x(bi_mtime, 96) \ @@ -8044,6 +8439,31 @@ index 000000000000..9e10fc8301f0 + x(bi_subvol, 32) \ + x(bi_parent_subvol, 32) + ++#define BCH_INODE_FIELDS_v3() \ ++ x(bi_atime, 96) \ ++ x(bi_ctime, 96) \ ++ x(bi_mtime, 96) \ ++ x(bi_otime, 96) \ ++ x(bi_uid, 32) \ ++ x(bi_gid, 32) \ ++ x(bi_nlink, 32) \ ++ x(bi_generation, 32) \ ++ x(bi_dev, 32) \ ++ x(bi_data_checksum, 
8) \ ++ x(bi_compression, 8) \ ++ x(bi_project, 32) \ ++ x(bi_background_compression, 8) \ ++ x(bi_data_replicas, 8) \ ++ x(bi_promote_target, 16) \ ++ x(bi_foreground_target, 16) \ ++ x(bi_background_target, 16) \ ++ x(bi_erasure_code, 16) \ ++ x(bi_fields_set, 16) \ ++ x(bi_dir, 64) \ ++ x(bi_dir_offset, 64) \ ++ x(bi_subvol, 32) \ ++ x(bi_parent_subvol, 32) ++ +/* subset of BCH_INODE_FIELDS */ +#define BCH_INODE_OPTS() \ + x(data_checksum, 8) \ @@ -8069,16 +8489,16 @@ index 000000000000..9e10fc8301f0 + * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL + * flags) + */ -+ __BCH_INODE_SYNC = 0, -+ __BCH_INODE_IMMUTABLE = 1, -+ __BCH_INODE_APPEND = 2, -+ __BCH_INODE_NODUMP = 3, -+ __BCH_INODE_NOATIME = 4, ++ __BCH_INODE_SYNC = 0, ++ __BCH_INODE_IMMUTABLE = 1, ++ __BCH_INODE_APPEND = 2, ++ __BCH_INODE_NODUMP = 3, ++ __BCH_INODE_NOATIME = 4, + -+ __BCH_INODE_I_SIZE_DIRTY= 5, -+ __BCH_INODE_I_SECTORS_DIRTY= 6, -+ __BCH_INODE_UNLINKED = 7, -+ __BCH_INODE_BACKPTR_UNTRUSTED = 8, ++ __BCH_INODE_I_SIZE_DIRTY = 5, ++ __BCH_INODE_I_SECTORS_DIRTY = 6, ++ __BCH_INODE_UNLINKED = 7, ++ __BCH_INODE_BACKPTR_UNTRUSTED = 8, + + /* bits 20+ reserved for packed fields below: */ +}; @@ -8100,6 +8520,13 @@ index 000000000000..9e10fc8301f0 +LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24); +LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31); + ++LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24); ++LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31); ++ ++LE64_BITMASK(INODEv3_FIELDS_START, ++ struct bch_inode_v3, bi_flags, 31, 36); ++LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52); ++ +/* Dirents */ + +/* @@ -8132,7 +8559,7 @@ index 000000000000..9e10fc8301f0 + __u8 d_type; + + __u8 d_name[]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +#define DT_SUBVOL 16 +#define BCH_DT_MAX 17 @@ -8155,7 +8582,7 @@ index 000000000000..9e10fc8301f0 + __u8 
x_name_len; + __le16 x_val_len; + __u8 x_name[]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +/* Bucket/allocation information: */ + @@ -8164,7 +8591,7 @@ index 000000000000..9e10fc8301f0 + __u8 fields; + __u8 gen; + __u8 data[]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +#define BCH_ALLOC_FIELDS_V1() \ + x(read_time, 16) \ @@ -8189,7 +8616,7 @@ index 000000000000..9e10fc8301f0 + __u8 oldest_gen; + __u8 data_type; + __u8 data[]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +#define BCH_ALLOC_FIELDS_V2() \ + x(read_time, 64) \ @@ -8208,7 +8635,7 @@ index 000000000000..9e10fc8301f0 + __u8 oldest_gen; + __u8 data_type; + __u8 data[]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) +LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) @@ -8226,7 +8653,7 @@ index 000000000000..9e10fc8301f0 + __u64 io_time[2]; + __u32 stripe; + __u32 nr_external_backpointers; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +#define BCH_ALLOC_V4_U64s_V0 6 +#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(u64)) @@ -8246,7 +8673,7 @@ index 000000000000..9e10fc8301f0 + __u64 bucket_offset:40; + __u32 bucket_len; + struct bpos pos; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +/* Quotas: */ + @@ -8271,7 +8698,7 @@ index 000000000000..9e10fc8301f0 +struct bch_quota { + struct bch_val v; + struct bch_quota_counter c[Q_COUNTERS]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +/* Erasure coding */ + @@ -8287,7 +8714,7 @@ index 000000000000..9e10fc8301f0 + __u8 pad; + + struct bch_extent_ptr ptrs[]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +/* Reflink: */ + @@ -8304,14 +8731,14 @@ index 000000000000..9e10fc8301f0 + */ + __le32 front_pad; + __le32 back_pad; -+} __attribute__((packed, 
aligned(8))); ++} __packed __aligned(8); + +struct bch_reflink_v { + struct bch_val v; + __le64 refcount; + union bch_extent_entry start[0]; + __u64 _data[0]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +struct bch_indirect_inline_data { + struct bch_val v; @@ -8368,7 +8795,7 @@ index 000000000000..9e10fc8301f0 +struct bch_lru { + struct bch_val v; + __le64 idx; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +#define LRU_ID_STRIPES (1U << 16) + @@ -8567,19 +8994,19 @@ index 000000000000..9e10fc8301f0 + __u8 data_type; + __u8 nr_devs; + __u8 devs[]; -+} __attribute__((packed)); ++} __packed; + +struct bch_sb_field_replicas_v0 { + struct bch_sb_field field; + struct bch_replicas_entry_v0 entries[]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +struct bch_replicas_entry { + __u8 data_type; + __u8 nr_devs; + __u8 nr_required; + __u8 devs[]; -+} __attribute__((packed)); ++} __packed; + +#define replicas_entry_bytes(_i) \ + (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) @@ -8587,7 +9014,7 @@ index 000000000000..9e10fc8301f0 +struct bch_sb_field_replicas { + struct bch_sb_field field; + struct bch_replicas_entry entries[0]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +/* BCH_SB_FIELD_quota: */ + @@ -8604,7 +9031,7 @@ index 000000000000..9e10fc8301f0 +struct bch_sb_field_quota { + struct bch_sb_field field; + struct bch_sb_quota_type q[QTYP_NR]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +/* BCH_SB_FIELD_disk_groups: */ + @@ -8613,7 +9040,7 @@ index 000000000000..9e10fc8301f0 +struct bch_disk_group { + __u8 label[BCH_SB_LABEL_SIZE]; + __le64 flags[2]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) +LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) @@ -8622,7 +9049,7 @@ index 000000000000..9e10fc8301f0 +struct 
bch_sb_field_disk_groups { + struct bch_sb_field field; + struct bch_disk_group entries[0]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +/* BCH_SB_FIELD_counters */ + @@ -8783,7 +9210,8 @@ index 000000000000..9e10fc8301f0 + x(freespace, 19) \ + x(alloc_v4, 20) \ + x(new_data_types, 21) \ -+ x(backpointers, 22) ++ x(backpointers, 22) \ ++ x(inode_v3, 23) + +enum bcachefs_metadata_version { + bcachefs_metadata_version_min = 9, @@ -8805,7 +9233,7 @@ index 000000000000..9e10fc8301f0 + __u8 nr_superblocks; + __u8 pad[5]; + __le64 sb_offset[61]; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +#define BCH_SB_LAYOUT_SECTOR 7 + @@ -8856,7 +9284,7 @@ index 000000000000..9e10fc8301f0 + struct bch_sb_field start[0]; + __le64 _data[0]; + }; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +/* + * Flags: @@ -9139,6 +9567,7 @@ index 000000000000..9e10fc8301f0 +static inline __le64 __bch2_sb_magic(struct bch_sb *sb) +{ + __le64 ret; ++ + memcpy(&ret, &sb->uuid, sizeof(ret)); + return ret; +} @@ -9213,26 +9642,26 @@ index 000000000000..9e10fc8301f0 +struct jset_entry_usage { + struct jset_entry entry; + __le64 v; -+} __attribute__((packed)); ++} __packed; + +struct jset_entry_data_usage { + struct jset_entry entry; + __le64 v; + struct bch_replicas_entry r; -+} __attribute__((packed)); ++} __packed; + +struct jset_entry_clock { + struct jset_entry entry; + __u8 rw; + __u8 pad[7]; + __le64 time; -+} __attribute__((packed)); ++} __packed; + +struct jset_entry_dev_usage_type { + __le64 buckets; + __le64 sectors; + __le64 fragmented; -+} __attribute__((packed)); ++} __packed; + +struct jset_entry_dev_usage { + struct jset_entry entry; @@ -9243,7 +9672,7 @@ index 000000000000..9e10fc8301f0 + __le64 _buckets_unavailable; /* No longer used */ + + struct jset_entry_dev_usage_type d[]; -+} __attribute__((packed)); ++} __packed; + +static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u) +{ @@ 
-9254,7 +9683,7 @@ index 000000000000..9e10fc8301f0 +struct jset_entry_log { + struct jset_entry entry; + u8 d[]; -+} __attribute__((packed)); ++} __packed; + +/* + * On disk format for a journal entry: @@ -9289,7 +9718,7 @@ index 000000000000..9e10fc8301f0 + struct jset_entry start[0]; + __u64 _data[0]; + }; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); +LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); @@ -9352,7 +9781,7 @@ index 000000000000..9e10fc8301f0 + struct bkey_packed start[0]; + __u64 _data[0]; + }; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4); + @@ -9385,7 +9814,7 @@ index 000000000000..9e10fc8301f0 + + }; + }; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +LE64_BITMASK(BTREE_NODE_ID, struct btree_node, flags, 0, 4); +LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8); @@ -9406,12 +9835,12 @@ index 000000000000..9e10fc8301f0 + + }; + }; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +#endif /* _BCACHEFS_FORMAT_H */ diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h new file mode 100644 -index 000000000000..b2edabf58260 +index 000000000000..ad47a506a907 --- /dev/null +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -0,0 +1,368 @@ @@ -9625,7 +10054,7 @@ index 000000000000..b2edabf58260 + __u64 pad[8]; + }; + }; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +enum bch_data_event { + BCH_DATA_EVENT_PROGRESS = 0, @@ -9641,7 +10070,7 @@ index 000000000000..b2edabf58260 + + __u64 sectors_done; + __u64 sectors_total; -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + +struct bch_ioctl_data_event { + __u8 type; @@ -9650,12 +10079,12 @@ index 000000000000..b2edabf58260 + struct bch_ioctl_data_progress p; + __u64 pad2[15]; + }; -+} __attribute__((packed, aligned(8))); ++} __packed 
__aligned(8); + +struct bch_replicas_usage { + __u64 sectors; + struct bch_replicas_entry r; -+} __attribute__((packed)); ++} __packed; + +static inline struct bch_replicas_usage * +replicas_usage_next(struct bch_replicas_usage *u) @@ -9785,14 +10214,15 @@ index 000000000000..b2edabf58260 +#endif /* _BCACHEFS_IOCTL_H */ diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c new file mode 100644 -index 000000000000..d348175edad4 +index 000000000000..630df060fbe9 --- /dev/null +++ b/fs/bcachefs/bkey.c -@@ -0,0 +1,1203 @@ +@@ -0,0 +1,1098 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey.h" ++#include "bkey_cmp.h" +#include "bkey_methods.h" +#include "bset.h" +#include "util.h" @@ -9807,9 +10237,6 @@ index 000000000000..d348175edad4 + +const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT; + -+struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, -+ const struct bkey_packed *); -+ +void bch2_bkey_packed_to_binary_text(struct printbuf *out, + const struct bkey_format *f, + const struct bkey_packed *k) @@ -10554,50 +10981,6 @@ index 000000000000..d348175edad4 + +#ifdef CONFIG_X86_64 + -+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, -+ unsigned nr_key_bits) -+{ -+ long d0, d1, d2, d3; -+ int cmp; -+ -+ /* we shouldn't need asm for this, but gcc is being retarded: */ -+ -+ asm(".intel_syntax noprefix;" -+ "xor eax, eax;" -+ "xor edx, edx;" -+ "1:;" -+ "mov r8, [rdi];" -+ "mov r9, [rsi];" -+ "sub ecx, 64;" -+ "jl 2f;" -+ -+ "cmp r8, r9;" -+ "jnz 3f;" -+ -+ "lea rdi, [rdi - 8];" -+ "lea rsi, [rsi - 8];" -+ "jmp 1b;" -+ -+ "2:;" -+ "not ecx;" -+ "shr r8, 1;" -+ "shr r9, 1;" -+ "shr r8, cl;" -+ "shr r9, cl;" -+ "cmp r8, r9;" -+ -+ "3:\n" -+ "seta al;" -+ "setb dl;" -+ "sub eax, edx;" -+ ".att_syntax prefix;" -+ : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp) -+ : "0" (l), "1" (r), "3" (nr_key_bits) -+ : "r8", "r9", "cc", "memory"); -+ -+ return cmp; -+} -+ +#define I(_x) (*(out)++ = (_x)) 
+#define I1(i0) I(i0) +#define I2(i0, i1) (I1(i0), I(i1)) @@ -10828,40 +11211,6 @@ index 000000000000..d348175edad4 +} + +#else -+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, -+ unsigned nr_key_bits) -+{ -+ u64 l_v, r_v; -+ -+ if (!nr_key_bits) -+ return 0; -+ -+ /* for big endian, skip past header */ -+ nr_key_bits += high_bit_offset; -+ l_v = *l & (~0ULL >> high_bit_offset); -+ r_v = *r & (~0ULL >> high_bit_offset); -+ -+ while (1) { -+ if (nr_key_bits < 64) { -+ l_v >>= 64 - nr_key_bits; -+ r_v >>= 64 - nr_key_bits; -+ nr_key_bits = 0; -+ } else { -+ nr_key_bits -= 64; -+ } -+ -+ if (!nr_key_bits || l_v != r_v) -+ break; -+ -+ l = next_word(l); -+ r = next_word(r); -+ -+ l_v = *l; -+ r_v = *r; -+ } -+ -+ return cmp_int(l_v, r_v); -+} +#endif + +__pure @@ -10869,19 +11218,7 @@ index 000000000000..d348175edad4 + const struct bkey_packed *r, + const struct btree *b) +{ -+ const struct bkey_format *f = &b->format; -+ int ret; -+ -+ EBUG_ON(!bkey_packed(l) || !bkey_packed(r)); -+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); -+ -+ ret = __bkey_cmp_bits(high_word(f, l), -+ high_word(f, r), -+ b->nr_key_bits); -+ -+ EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l), -+ bkey_unpack_pos(b, r))); -+ return ret; ++ return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b); +} + +__pure __flatten @@ -10897,20 +11234,7 @@ index 000000000000..d348175edad4 + const struct bkey_packed *l, + const struct bkey_packed *r) +{ -+ struct bkey unpacked; -+ -+ if (likely(bkey_packed(l) && bkey_packed(r))) -+ return __bch2_bkey_cmp_packed_format_checked(l, r, b); -+ -+ if (bkey_packed(l)) { -+ __bkey_unpack_key_format_checked(b, &unpacked, l); -+ l = (void*) &unpacked; -+ } else if (bkey_packed(r)) { -+ __bkey_unpack_key_format_checked(b, &unpacked, r); -+ r = (void*) &unpacked; -+ } -+ -+ return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); ++ return bch2_bkey_cmp_packed_inlined(b, l, r); +} + +__pure __flatten @@ -10994,10 +11318,10 @@ index 
000000000000..d348175edad4 +#endif diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h new file mode 100644 -index 000000000000..df9fb859d1db +index 000000000000..19b59ffe0a98 --- /dev/null +++ b/fs/bcachefs/bkey.h -@@ -0,0 +1,571 @@ +@@ -0,0 +1,666 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_H +#define _BCACHEFS_BKEY_H @@ -11005,6 +11329,7 @@ index 000000000000..df9fb859d1db +#include +#include "bcachefs_format.h" + ++#include "btree_types.h" +#include "util.h" +#include "vstructs.h" + @@ -11134,8 +11459,9 @@ index 000000000000..df9fb859d1db +} + +/* -+ * we prefer to pass bpos by ref, but it's often enough terribly convenient to -+ * pass it by by val... as much as I hate c++, const ref would be nice here: ++ * The compiler generates better code when we pass bpos by ref, but it's often ++ * enough terribly convenient to pass it by val... as much as I hate c++, const ++ * ref would be nice here: + */ +__pure __flatten +static inline int bkey_cmp_left_packed_byval(const struct btree *b, @@ -11356,6 +11682,99 @@ index 000000000000..df9fb859d1db +bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *, + const struct bkey_format *); + ++typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *); ++ ++static inline void ++__bkey_unpack_key_format_checked(const struct btree *b, ++ struct bkey *dst, ++ const struct bkey_packed *src) ++{ ++ if (IS_ENABLED(HAVE_BCACHEFS_COMPILED_UNPACK)) { ++ compiled_unpack_fn unpack_fn = b->aux_data; ++ unpack_fn(dst, src); ++ ++ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && ++ bch2_expensive_debug_checks) { ++ struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); ++ ++ BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); ++ } ++ } else { ++ *dst = __bch2_bkey_unpack_key(&b->format, src); ++ } ++} ++ ++static inline struct bkey ++bkey_unpack_key_format_checked(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++ struct bkey dst; ++ ++ __bkey_unpack_key_format_checked(b, &dst, src); ++ 
return dst; ++} ++ ++static inline void __bkey_unpack_key(const struct btree *b, ++ struct bkey *dst, ++ const struct bkey_packed *src) ++{ ++ if (likely(bkey_packed(src))) ++ __bkey_unpack_key_format_checked(b, dst, src); ++ else ++ *dst = *packed_to_bkey_c(src); ++} ++ ++/** ++ * bkey_unpack_key -- unpack just the key, not the value ++ */ ++static inline struct bkey bkey_unpack_key(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++ return likely(bkey_packed(src)) ++ ? bkey_unpack_key_format_checked(b, src) ++ : *packed_to_bkey_c(src); ++} ++ ++static inline struct bpos ++bkey_unpack_pos_format_checked(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++#ifdef HAVE_BCACHEFS_COMPILED_UNPACK ++ return bkey_unpack_key_format_checked(b, src).p; ++#else ++ return __bkey_unpack_pos(&b->format, src); ++#endif ++} ++ ++static inline struct bpos bkey_unpack_pos(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++ return likely(bkey_packed(src)) ++ ? bkey_unpack_pos_format_checked(b, src) ++ : packed_to_bkey_c(src)->p; ++} ++ ++/* Disassembled bkeys */ ++ ++static inline struct bkey_s_c bkey_disassemble(struct btree *b, ++ const struct bkey_packed *k, ++ struct bkey *u) ++{ ++ __bkey_unpack_key(b, u, k); ++ ++ return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), }; ++} ++ ++/* non const version: */ ++static inline struct bkey_s __bkey_disassemble(struct btree *b, ++ struct bkey_packed *k, ++ struct bkey *u) ++{ ++ __bkey_unpack_key(b, u, k); ++ ++ return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; ++} ++ +static inline u64 bkey_field_max(const struct bkey_format *f, + enum bch_bkey_fields nr) +{ @@ -11571,15 +11990,16 @@ index 000000000000..df9fb859d1db +#endif /* _BCACHEFS_BKEY_H */ diff --git a/fs/bcachefs/bkey_buf.h b/fs/bcachefs/bkey_buf.h new file mode 100644 -index 000000000000..0d7c67a959af +index 000000000000..a30c4ae8eb36 --- /dev/null +++ b/fs/bcachefs/bkey_buf.h -@@ -0,0 +1,60 @@ +@@ -0,0 +1,61 @@ +/* 
SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_BUF_H +#define _BCACHEFS_BKEY_BUF_H + +#include "bcachefs.h" ++#include "bkey.h" + +struct bkey_buf { + struct bkey_i *k; @@ -11635,12 +12055,147 @@ index 000000000000..0d7c67a959af +} + +#endif /* _BCACHEFS_BKEY_BUF_H */ +diff --git a/fs/bcachefs/bkey_cmp.h b/fs/bcachefs/bkey_cmp.h +new file mode 100644 +index 000000000000..5f42a6e69360 +--- /dev/null ++++ b/fs/bcachefs/bkey_cmp.h +@@ -0,0 +1,129 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BKEY_CMP_H ++#define _BCACHEFS_BKEY_CMP_H ++ ++#include "bkey.h" ++ ++#ifdef CONFIG_X86_64 ++static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, ++ unsigned nr_key_bits) ++{ ++ long d0, d1, d2, d3; ++ int cmp; ++ ++ /* we shouldn't need asm for this, but gcc is being retarded: */ ++ ++ asm(".intel_syntax noprefix;" ++ "xor eax, eax;" ++ "xor edx, edx;" ++ "1:;" ++ "mov r8, [rdi];" ++ "mov r9, [rsi];" ++ "sub ecx, 64;" ++ "jl 2f;" ++ ++ "cmp r8, r9;" ++ "jnz 3f;" ++ ++ "lea rdi, [rdi - 8];" ++ "lea rsi, [rsi - 8];" ++ "jmp 1b;" ++ ++ "2:;" ++ "not ecx;" ++ "shr r8, 1;" ++ "shr r9, 1;" ++ "shr r8, cl;" ++ "shr r9, cl;" ++ "cmp r8, r9;" ++ ++ "3:\n" ++ "seta al;" ++ "setb dl;" ++ "sub eax, edx;" ++ ".att_syntax prefix;" ++ : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp) ++ : "0" (l), "1" (r), "3" (nr_key_bits) ++ : "r8", "r9", "cc", "memory"); ++ ++ return cmp; ++} ++#else ++static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, ++ unsigned nr_key_bits) ++{ ++ u64 l_v, r_v; ++ ++ if (!nr_key_bits) ++ return 0; ++ ++ /* for big endian, skip past header */ ++ nr_key_bits += high_bit_offset; ++ l_v = *l & (~0ULL >> high_bit_offset); ++ r_v = *r & (~0ULL >> high_bit_offset); ++ ++ while (1) { ++ if (nr_key_bits < 64) { ++ l_v >>= 64 - nr_key_bits; ++ r_v >>= 64 - nr_key_bits; ++ nr_key_bits = 0; ++ } else { ++ nr_key_bits -= 64; ++ } ++ ++ if (!nr_key_bits || l_v != r_v) ++ break; ++ ++ l = next_word(l); ++ r = next_word(r); 
++ ++ l_v = *l; ++ r_v = *r; ++ } ++ ++ return cmp_int(l_v, r_v); ++} ++#endif ++ ++static inline __pure __flatten ++int __bch2_bkey_cmp_packed_format_checked_inlined(const struct bkey_packed *l, ++ const struct bkey_packed *r, ++ const struct btree *b) ++{ ++ const struct bkey_format *f = &b->format; ++ int ret; ++ ++ EBUG_ON(!bkey_packed(l) || !bkey_packed(r)); ++ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); ++ ++ ret = __bkey_cmp_bits(high_word(f, l), ++ high_word(f, r), ++ b->nr_key_bits); ++ ++ EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l), ++ bkey_unpack_pos(b, r))); ++ return ret; ++} ++ ++static inline __pure __flatten ++int bch2_bkey_cmp_packed_inlined(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bkey_packed *r) ++{ ++ struct bkey unpacked; ++ ++ if (likely(bkey_packed(l) && bkey_packed(r))) ++ return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b); ++ ++ if (bkey_packed(l)) { ++ __bkey_unpack_key_format_checked(b, &unpacked, l); ++ l = (void *) &unpacked; ++ } else if (bkey_packed(r)) { ++ __bkey_unpack_key_format_checked(b, &unpacked, r); ++ r = (void *) &unpacked; ++ } ++ ++ return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); ++} ++ ++#endif /* _BCACHEFS_BKEY_CMP_H */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c new file mode 100644 -index 000000000000..e0cbac8811af +index 000000000000..6939d74d705e --- /dev/null +++ b/fs/bcachefs/bkey_methods.c -@@ -0,0 +1,503 @@ +@@ -0,0 +1,505 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -11672,13 +12227,13 @@ index 000000000000..e0cbac8811af + return 0; +} + -+#define bch2_bkey_ops_deleted (struct bkey_ops) { \ ++#define bch2_bkey_ops_deleted ((struct bkey_ops) { \ + .key_invalid = deleted_key_invalid, \ -+} ++}) + -+#define bch2_bkey_ops_whiteout (struct bkey_ops) { \ ++#define bch2_bkey_ops_whiteout ((struct bkey_ops) { \ + .key_invalid = deleted_key_invalid, \ -+} ++}) + +static int empty_val_key_invalid(const 
struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) @@ -11692,9 +12247,9 @@ index 000000000000..e0cbac8811af + return 0; +} + -+#define bch2_bkey_ops_error (struct bkey_ops) { \ ++#define bch2_bkey_ops_error ((struct bkey_ops) { \ + .key_invalid = empty_val_key_invalid, \ -+} ++}) + +static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) @@ -11708,13 +12263,13 @@ index 000000000000..e0cbac8811af + return 0; +} + -+#define bch2_bkey_ops_cookie (struct bkey_ops) { \ ++#define bch2_bkey_ops_cookie ((struct bkey_ops) { \ + .key_invalid = key_type_cookie_invalid, \ -+} ++}) + -+#define bch2_bkey_ops_hash_whiteout (struct bkey_ops) { \ ++#define bch2_bkey_ops_hash_whiteout ((struct bkey_ops) {\ + .key_invalid = empty_val_key_invalid, \ -+} ++}) + +static int key_type_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) @@ -11732,10 +12287,10 @@ index 000000000000..e0cbac8811af + datalen, min(datalen, 32U), d.v->data); +} + -+#define bch2_bkey_ops_inline_data (struct bkey_ops) { \ ++#define bch2_bkey_ops_inline_data ((struct bkey_ops) { \ + .key_invalid = key_type_inline_data_invalid, \ + .val_to_text = key_type_inline_data_to_text, \ -+} ++}) + +static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err) @@ -11755,10 +12310,10 @@ index 000000000000..e0cbac8811af + return true; +} + -+#define bch2_bkey_ops_set (struct bkey_ops) { \ ++#define bch2_bkey_ops_set ((struct bkey_ops) { \ + .key_invalid = key_type_set_invalid, \ + .key_merge = key_type_set_merge, \ -+} ++}) + +const struct bkey_ops bch2_bkey_ops[] = { +#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, @@ -11792,6 +12347,7 @@ index 000000000000..e0cbac8811af + (1U << KEY_TYPE_whiteout)| + (1U << KEY_TYPE_inode)| + (1U << KEY_TYPE_inode_v2)| ++ (1U << KEY_TYPE_inode_v3)| + (1U << KEY_TYPE_inode_generation), + [BKEY_TYPE_dirents] = + (1U << 
KEY_TYPE_deleted)| @@ -12082,6 +12638,7 @@ index 000000000000..e0cbac8811af + btree_id == BTREE_ID_inodes) { + if (!bkey_packed(k)) { + struct bkey_i *u = packed_to_bkey(k); ++ + swap(u->k.p.inode, u->k.p.offset); + } else if (f->bits_per_field[BKEY_FIELD_INODE] && + f->bits_per_field[BKEY_FIELD_OFFSET]) { @@ -12146,7 +12703,7 @@ index 000000000000..e0cbac8811af +} diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h new file mode 100644 -index 000000000000..db894b40d2ca +index 000000000000..4739b3c32cff --- /dev/null +++ b/fs/bcachefs/bkey_methods.h @@ -0,0 +1,175 @@ @@ -12170,7 +12727,7 @@ index 000000000000..db894b40d2ca + * + * When invalid, error string is returned via @err. @rw indicates whether key is + * being read or written; more aggressive checks can be enabled when rw == WRITE. -+*/ ++ */ +struct bkey_ops { + int (*key_invalid)(const struct bch_fs *c, struct bkey_s_c k, + int rw, struct printbuf *err); @@ -12327,13 +12884,14 @@ index 000000000000..db894b40d2ca +#endif /* _BCACHEFS_BKEY_METHODS_H */ diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c new file mode 100644 -index 000000000000..b1385a77da11 +index 000000000000..be0d4bc1afd3 --- /dev/null +++ b/fs/bcachefs/bkey_sort.c -@@ -0,0 +1,198 @@ +@@ -0,0 +1,199 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "bkey_buf.h" ++#include "bkey_cmp.h" +#include "bkey_sort.h" +#include "bset.h" +#include "extents.h" @@ -12488,7 +13046,7 @@ index 000000000000..b1385a77da11 + struct bkey_packed *l, + struct bkey_packed *r) +{ -+ return bch2_bkey_cmp_packed(b, l, r) ?: ++ return bch2_bkey_cmp_packed_inlined(b, l, r) ?: + (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: + (int) l->needs_whiteout - (int) r->needs_whiteout; +} @@ -12510,7 +13068,7 @@ index 000000000000..b1385a77da11 + continue; + + while ((next = sort_iter_peek(iter)) && -+ !bch2_bkey_cmp_packed(iter->b, in, next)) { ++ !bch2_bkey_cmp_packed_inlined(iter->b, in, next)) { + 
BUG_ON(in->needs_whiteout && + next->needs_whiteout); + needs_whiteout |= in->needs_whiteout; @@ -12581,10 +13139,10 @@ index 000000000000..79cf11d1b4e7 +#endif /* _BCACHEFS_BKEY_SORT_H */ diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c new file mode 100644 -index 000000000000..fa60ef84e4ef +index 000000000000..094235364470 --- /dev/null +++ b/fs/bcachefs/bset.c -@@ -0,0 +1,1598 @@ +@@ -0,0 +1,1601 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Code for working with individual keys, and sorted sets of keys with in a @@ -13552,7 +14110,7 @@ index 000000000000..fa60ef84e4ef + t->size -= j - l; + + for (j = l; j < t->size; j++) -+ rw_aux_tree(b, t)[j].offset += shift; ++ rw_aux_tree(b, t)[j].offset += shift; + + EBUG_ON(l < t->size && + rw_aux_tree(b, t)[l].offset == @@ -13853,7 +14411,7 @@ index 000000000000..fa60ef84e4ef + bch2_btree_node_iter_sort(iter, b); +} + -+noinline __flatten __attribute__((cold)) ++noinline __flatten __cold +static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, + struct btree *b, struct bpos *search) +{ @@ -14028,7 +14586,10 @@ index 000000000000..fa60ef84e4ef + EBUG_ON(iter->data->k > iter->data->end); + + if (unlikely(__btree_node_iter_set_end(iter, 0))) { -+ bch2_btree_node_iter_set_drop(iter, iter->data); ++ /* avoid an expensive memmove call: */ ++ iter->data[0] = iter->data[1]; ++ iter->data[1] = iter->data[2]; ++ iter->data[2] = (struct btree_node_iter_set) { 0, 0 }; + return; + } + @@ -14185,10 +14746,10 @@ index 000000000000..fa60ef84e4ef +} diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h new file mode 100644 -index 000000000000..0d46534c3dcd +index 000000000000..72e6376bce2a --- /dev/null +++ b/fs/bcachefs/bset.h -@@ -0,0 +1,615 @@ +@@ -0,0 +1,521 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BSET_H +#define _BCACHEFS_BSET_H @@ -14396,100 +14957,6 @@ index 000000000000..0d46534c3dcd + return btree_aux_data_bytes(b) / sizeof(u64); +} + -+typedef void (*compiled_unpack_fn)(struct bkey 
*, const struct bkey_packed *); -+ -+static inline void -+__bkey_unpack_key_format_checked(const struct btree *b, -+ struct bkey *dst, -+ const struct bkey_packed *src) -+{ -+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK -+ { -+ compiled_unpack_fn unpack_fn = b->aux_data; -+ unpack_fn(dst, src); -+ -+ if (bch2_expensive_debug_checks) { -+ struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); -+ -+ BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); -+ } -+ } -+#else -+ *dst = __bch2_bkey_unpack_key(&b->format, src); -+#endif -+} -+ -+static inline struct bkey -+bkey_unpack_key_format_checked(const struct btree *b, -+ const struct bkey_packed *src) -+{ -+ struct bkey dst; -+ -+ __bkey_unpack_key_format_checked(b, &dst, src); -+ return dst; -+} -+ -+static inline void __bkey_unpack_key(const struct btree *b, -+ struct bkey *dst, -+ const struct bkey_packed *src) -+{ -+ if (likely(bkey_packed(src))) -+ __bkey_unpack_key_format_checked(b, dst, src); -+ else -+ *dst = *packed_to_bkey_c(src); -+} -+ -+/** -+ * bkey_unpack_key -- unpack just the key, not the value -+ */ -+static inline struct bkey bkey_unpack_key(const struct btree *b, -+ const struct bkey_packed *src) -+{ -+ return likely(bkey_packed(src)) -+ ? bkey_unpack_key_format_checked(b, src) -+ : *packed_to_bkey_c(src); -+} -+ -+static inline struct bpos -+bkey_unpack_pos_format_checked(const struct btree *b, -+ const struct bkey_packed *src) -+{ -+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK -+ return bkey_unpack_key_format_checked(b, src).p; -+#else -+ return __bkey_unpack_pos(&b->format, src); -+#endif -+} -+ -+static inline struct bpos bkey_unpack_pos(const struct btree *b, -+ const struct bkey_packed *src) -+{ -+ return likely(bkey_packed(src)) -+ ? 
bkey_unpack_pos_format_checked(b, src) -+ : packed_to_bkey_c(src)->p; -+} -+ -+/* Disassembled bkeys */ -+ -+static inline struct bkey_s_c bkey_disassemble(struct btree *b, -+ const struct bkey_packed *k, -+ struct bkey *u) -+{ -+ __bkey_unpack_key(b, u, k); -+ -+ return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), }; -+} -+ -+/* non const version: */ -+static inline struct bkey_s __bkey_disassemble(struct btree *b, -+ struct bkey_packed *k, -+ struct bkey *u) -+{ -+ __bkey_unpack_key(b, u, k); -+ -+ return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; -+} -+ +#define for_each_bset(_b, _t) \ + for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) + @@ -14806,10 +15273,10 @@ index 000000000000..0d46534c3dcd +#endif /* _BCACHEFS_BSET_H */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c new file mode 100644 -index 000000000000..f84b50869de2 +index 000000000000..75e744792a92 --- /dev/null +++ b/fs/bcachefs/btree_cache.c -@@ -0,0 +1,1149 @@ +@@ -0,0 +1,1204 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -14826,6 +15293,12 @@ index 000000000000..f84b50869de2 +#include +#include + ++#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \ ++do { \ ++ if (shrinker_counter) \ ++ bc->not_freed_##counter++; \ ++} while (0) ++ +const char * const bch2_btree_node_flags[] = { +#define x(f) #f, + BTREE_FLAGS() @@ -14924,7 +15397,9 @@ index 000000000000..f84b50869de2 + +static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) +{ -+ struct btree *b = kzalloc(sizeof(struct btree), gfp); ++ struct btree *b; ++ ++ b = kzalloc(sizeof(struct btree), gfp); + if (!b) + return NULL; + @@ -14942,7 +15417,9 @@ index 000000000000..f84b50869de2 +struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) +{ + struct btree_cache *bc = &c->btree_cache; -+ struct btree *b = __btree_node_mem_alloc(c, GFP_KERNEL); ++ struct btree *b; ++ ++ b = __btree_node_mem_alloc(c, GFP_KERNEL); + if (!b) + return NULL; + @@ -14961,6 +15438,7 
@@ index 000000000000..f84b50869de2 +void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) +{ + int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); ++ + BUG_ON(ret); + + /* Cause future lookups for this node to fail: */ @@ -14987,7 +15465,7 @@ index 000000000000..f84b50869de2 + mutex_lock(&bc->lock); + ret = __bch2_btree_node_hash_insert(bc, b); + if (!ret) -+ list_add(&b->list, &bc->live); ++ list_add_tail(&b->list, &bc->live); + mutex_unlock(&bc->lock); + + return ret; @@ -15006,7 +15484,7 @@ index 000000000000..f84b50869de2 + * this version is for btree nodes that have already been freed (we're not + * reaping a real btree node) + */ -+static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) ++static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter) +{ + struct btree_cache *bc = &c->btree_cache; + int ret = 0; @@ -15016,38 +15494,64 @@ index 000000000000..f84b50869de2 + if (b->flags & ((1U << BTREE_NODE_dirty)| + (1U << BTREE_NODE_read_in_flight)| + (1U << BTREE_NODE_write_in_flight))) { -+ if (!flush) ++ if (!flush) { ++ if (btree_node_dirty(b)) ++ BTREE_CACHE_NOT_FREED_INCREMENT(dirty); ++ else if (btree_node_read_in_flight(b)) ++ BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); ++ else if (btree_node_write_in_flight(b)) ++ BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); + return -ENOMEM; ++ } + + /* XXX: waiting on IO with btree cache lock held */ + bch2_btree_node_wait_on_read(b); + bch2_btree_node_wait_on_write(b); + } + -+ if (!six_trylock_intent(&b->c.lock)) ++ if (!six_trylock_intent(&b->c.lock)) { ++ BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent); + return -ENOMEM; ++ } + -+ if (!six_trylock_write(&b->c.lock)) ++ if (!six_trylock_write(&b->c.lock)) { ++ BTREE_CACHE_NOT_FREED_INCREMENT(lock_write); + goto out_unlock_intent; ++ } + + /* recheck under lock */ + if (b->flags & ((1U << BTREE_NODE_read_in_flight)| + (1U << 
BTREE_NODE_write_in_flight))) { -+ if (!flush) ++ if (!flush) { ++ if (btree_node_read_in_flight(b)) ++ BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); ++ else if (btree_node_write_in_flight(b)) ++ BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); + goto out_unlock; ++ } + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + goto wait_on_io; + } + -+ if (btree_node_noevict(b) || -+ btree_node_write_blocked(b) || -+ btree_node_will_make_reachable(b)) ++ if (btree_node_noevict(b)) { ++ BTREE_CACHE_NOT_FREED_INCREMENT(noevict); + goto out_unlock; ++ } ++ if (btree_node_write_blocked(b)) { ++ BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked); ++ goto out_unlock; ++ } ++ if (btree_node_will_make_reachable(b)) { ++ BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable); ++ goto out_unlock; ++ } + + if (btree_node_dirty(b)) { -+ if (!flush) ++ if (!flush) { ++ BTREE_CACHE_NOT_FREED_INCREMENT(dirty); + goto out_unlock; ++ } + /* + * Using the underscore version because we don't want to compact + * bsets after the write, since this node is about to be evicted @@ -15055,9 +15559,11 @@ index 000000000000..f84b50869de2 + * the post write cleanup: + */ + if (bch2_verify_btree_ondisk) -+ bch2_btree_node_write(c, b, SIX_LOCK_intent, 0); ++ bch2_btree_node_write(c, b, SIX_LOCK_intent, ++ BTREE_WRITE_cache_reclaim); + else -+ __bch2_btree_node_write(c, b, 0); ++ __bch2_btree_node_write(c, b, ++ BTREE_WRITE_cache_reclaim); + + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); @@ -15075,14 +15581,14 @@ index 000000000000..f84b50869de2 + goto out; +} + -+static int btree_node_reclaim(struct bch_fs *c, struct btree *b) ++static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter) +{ -+ return __btree_node_reclaim(c, b, false); ++ return __btree_node_reclaim(c, b, false, shrinker_counter); +} + +static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) +{ -+ return __btree_node_reclaim(c, b, true); ++ return 
__btree_node_reclaim(c, b, true, false); +} + +static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, @@ -15131,11 +15637,12 @@ index 000000000000..f84b50869de2 + if (touched >= nr) + goto out; + -+ if (!btree_node_reclaim(c, b)) { ++ if (!btree_node_reclaim(c, b, true)) { + btree_node_data_free(c, b); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + freed++; ++ bc->freed++; + } + } +restart: @@ -15144,9 +15651,11 @@ index 000000000000..f84b50869de2 + + if (btree_node_accessed(b)) { + clear_btree_node_accessed(b); -+ } else if (!btree_node_reclaim(c, b)) { ++ bc->not_freed_access_bit++; ++ } else if (!btree_node_reclaim(c, b, true)) { + freed++; + btree_node_data_free(c, b); ++ bc->freed++; + + bch2_btree_node_hash_remove(bc, b); + six_unlock_write(&b->c.lock); @@ -15161,7 +15670,7 @@ index 000000000000..f84b50869de2 + six_trylock_read(&b->c.lock)) { + list_move(&bc->live, &b->list); + mutex_unlock(&bc->lock); -+ __bch2_btree_node_write(c, b, 0); ++ __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); + six_unlock_read(&b->c.lock); + if (touched >= nr) + goto out_nounlock; @@ -15202,7 +15711,7 @@ index 000000000000..f84b50869de2 + struct bch_fs *c = container_of(shrink, struct bch_fs, + btree_cache.shrink); + -+ bch2_btree_cache_to_text(out, c); ++ bch2_btree_cache_to_text(out, &c->btree_cache); +} + +void bch2_fs_btree_cache_exit(struct bch_fs *c) @@ -15360,7 +15869,7 @@ index 000000000000..f84b50869de2 + struct btree *b; + + list_for_each_entry_reverse(b, &bc->live, list) -+ if (!btree_node_reclaim(c, b)) ++ if (!btree_node_reclaim(c, b, false)) + return b; + + while (1) { @@ -15395,7 +15904,7 @@ index 000000000000..f84b50869de2 + * disk node. 
Check the freed list before allocating a new one: + */ + list_for_each_entry(b, freed, list) -+ if (!btree_node_reclaim(c, b)) { ++ if (!btree_node_reclaim(c, b, false)) { + list_del_init(&b->list); + goto got_node; + } @@ -15421,7 +15930,7 @@ index 000000000000..f84b50869de2 + * the list. Check if there's any freed nodes there: + */ + list_for_each_entry(b2, &bc->freeable, list) -+ if (!btree_node_reclaim(c, b2)) { ++ if (!btree_node_reclaim(c, b2, false)) { + swap(b->data, b2->data); + swap(b->aux_data, b2->aux_data); + btree_node_to_freedlist(bc, b2); @@ -15447,6 +15956,7 @@ index 000000000000..f84b50869de2 + b->flags = 0; + b->written = 0; + b->nsets = 0; ++ b->write_type = 0; + b->sib_u64s[0] = 0; + b->sib_u64s[1] = 0; + b->whiteout_u64s = 0; @@ -15642,7 +16152,7 @@ index 000000000000..f84b50869de2 + if (likely(c->opts.btree_node_mem_ptr_optimization && + b && + b->hash_val == btree_ptr_hash_val(k))) -+ goto lock_node; ++ goto lock_node; +retry: + b = btree_cache_find(bc, k); + if (unlikely(!b)) { @@ -15882,7 +16392,7 @@ index 000000000000..f84b50869de2 + + /* XXX we're called from btree_gc which will be holding other btree + * nodes locked -+ * */ ++ */ + __bch2_btree_node_wait_on_read(b); + __bch2_btree_node_wait_on_write(b); + @@ -15890,7 +16400,7 @@ index 000000000000..f84b50869de2 + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); + + if (btree_node_dirty(b)) { -+ __bch2_btree_node_write(c, b, 0); ++ __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + goto wait_on_io; @@ -15953,24 +16463,37 @@ index 000000000000..f84b50869de2 + stats.failed); +} + -+void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c) ++void bch2_btree_cache_to_text(struct printbuf *out, struct btree_cache *bc) +{ -+ prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used); -+ prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty)); -+ prt_printf(out, "cannibalize 
lock:\t%p\n", c->btree_cache.alloc_lock); ++ prt_printf(out, "nr nodes:\t\t%u\n", bc->used); ++ prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&bc->dirty)); ++ prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock); ++ ++ prt_printf(out, "freed:\t\t\t\t%u\n", bc->freed); ++ prt_printf(out, "not freed, dirty:\t\t%u\n", bc->not_freed_dirty); ++ prt_printf(out, "not freed, write in flight:\t%u\n", bc->not_freed_write_in_flight); ++ prt_printf(out, "not freed, read in flight:\t%u\n", bc->not_freed_read_in_flight); ++ prt_printf(out, "not freed, lock intent failed:\t%u\n", bc->not_freed_lock_intent); ++ prt_printf(out, "not freed, lock write failed:\t%u\n", bc->not_freed_lock_write); ++ prt_printf(out, "not freed, access bit:\t\t%u\n", bc->not_freed_access_bit); ++ prt_printf(out, "not freed, no evict failed:\t%u\n", bc->not_freed_noevict); ++ prt_printf(out, "not freed, write blocked:\t%u\n", bc->not_freed_write_blocked); ++ prt_printf(out, "not freed, will make reachable:\t%u\n", bc->not_freed_will_make_reachable); ++ +} diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h new file mode 100644 -index 000000000000..a4df3e866bb8 +index 000000000000..b623c7028273 --- /dev/null +++ b/fs/bcachefs/btree_cache.h -@@ -0,0 +1,105 @@ +@@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_CACHE_H +#define _BCACHEFS_BTREE_CACHE_H + +#include "bcachefs.h" +#include "btree_types.h" ++#include "bkey_methods.h" + +extern const char * const bch2_btree_node_flags[]; + @@ -16067,12 +16590,12 @@ index 000000000000..a4df3e866bb8 + +void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, + struct btree *); -+void bch2_btree_cache_to_text(struct printbuf *, struct bch_fs *); ++void bch2_btree_cache_to_text(struct printbuf *, struct btree_cache *); + +#endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c new file mode 100644 -index 000000000000..5b7f7cd3252a +index 000000000000..20e804ecb104 
--- /dev/null +++ b/fs/bcachefs/btree_gc.c @@ -0,0 +1,2106 @@ @@ -16277,7 +16800,7 @@ index 000000000000..5b7f7cd3252a + struct bkey_i_btree_ptr_v2 *new; + int ret; + -+ new = kmalloc(BKEY_BTREE_PTR_U64s_MAX * sizeof(u64), GFP_KERNEL); ++ new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); + if (!new) + return -ENOMEM; + @@ -16306,7 +16829,7 @@ index 000000000000..5b7f7cd3252a + if (ret) + return ret; + -+ new = kmalloc(BKEY_BTREE_PTR_U64s_MAX * sizeof(u64), GFP_KERNEL); ++ new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); + if (!new) + return -ENOMEM; + @@ -16396,7 +16919,7 @@ index 000000000000..5b7f7cd3252a + " node %s", + bch2_btree_ids[b->c.btree_id], b->c.level, + buf1.buf, buf2.buf)) -+ ret = set_node_min(c, cur, expected_start); ++ ret = set_node_min(c, cur, expected_start); + } +out: +fsck_err: @@ -18044,7 +18567,7 @@ index 000000000000..5b7f7cd3252a + } + + for (i = 0; i < BTREE_ID_NR; i++) -+ if ((1 << i) & BTREE_ID_HAS_PTRS) { ++ if (btree_type_has_ptrs(i)) { + struct btree_iter iter; + struct bkey_s_c k; + @@ -18057,10 +18580,10 @@ index 000000000000..5b7f7cd3252a + NULL, NULL, + BTREE_INSERT_NOFAIL, + gc_btree_gens_key(&trans, &iter, k)); -+ if (ret) { ++ if (ret && ret != -EROFS) + bch_err(c, "error recalculating oldest_gen: %s", bch2_err_str(ret)); ++ if (ret) + goto err; -+ } + } + + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, @@ -18070,10 +18593,10 @@ index 000000000000..5b7f7cd3252a + NULL, NULL, + BTREE_INSERT_NOFAIL, + bch2_alloc_write_oldest_gen(&trans, &iter, k)); -+ if (ret) { ++ if (ret && ret != -EROFS) + bch_err(c, "error writing oldest_gen: %s", bch2_err_str(ret)); ++ if (ret) + goto err; -+ } + + c->gc_gens_btree = 0; + c->gc_gens_pos = POS_MIN; @@ -18302,10 +18825,10 @@ index 000000000000..95d803b5743d +#endif /* _BCACHEFS_BTREE_GC_H */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c new file mode 100644 -index 000000000000..13ce29750d28 +index 
000000000000..cee3b500d45b --- /dev/null +++ b/fs/bcachefs/btree_io.c -@@ -0,0 +1,2154 @@ +@@ -0,0 +1,2203 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -18759,6 +19282,24 @@ index 000000000000..13ce29750d28 +} + +/* ++ * If we have MAX_BSETS (3) bsets, should we sort them all down to just one? ++ * ++ * The first bset is going to be of similar order to the size of the node, the ++ * last bset is bounded by btree_write_set_buffer(), which is set to keep the ++ * memmove on insert from being too expensive: the middle bset should, ideally, ++ * be the geometric mean of the first and the last. ++ * ++ * Returns true if the middle bset is greater than that geometric mean: ++ */ ++static inline bool should_compact_all(struct bch_fs *c, struct btree *b) ++{ ++ unsigned mid_u64s_bits = ++ (ilog2(btree_max_u64s(c)) + BTREE_WRITE_SET_U64s_BITS) / 2; ++ ++ return bset_u64s(&b->set[1]) > 1U << mid_u64s_bits; ++} ++ ++/* + * @bch_btree_init_next - initialize a new (unwritten) bset that can then be + * inserted into + * @@ -18775,19 +19316,14 @@ index 000000000000..13ce29750d28 + + EBUG_ON(!(b->c.lock.state.seq & 1)); + BUG_ON(bset_written(b, bset(b, &b->set[1]))); ++ BUG_ON(btree_node_just_written(b)); + + if (b->nsets == MAX_BSETS && -+ !btree_node_write_in_flight(b)) { -+ unsigned log_u64s[] = { -+ ilog2(bset_u64s(&b->set[0])), -+ ilog2(bset_u64s(&b->set[1])), -+ ilog2(bset_u64s(&b->set[2])), -+ }; -+ -+ if (log_u64s[1] >= (log_u64s[0] + log_u64s[2]) / 2) { -+ bch2_btree_node_write(c, b, SIX_LOCK_write, 0); -+ reinit_iter = true; -+ } ++ !btree_node_write_in_flight(b) && ++ should_compact_all(c, b)) { ++ bch2_btree_node_write(c, b, SIX_LOCK_write, ++ BTREE_WRITE_init_next_bset); ++ reinit_iter = true; + } + + if (b->nsets == MAX_BSETS && @@ -19524,6 +20060,7 @@ index 000000000000..13ce29750d28 + + if (rb->have_ioref) { + struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); ++ + bch2_latency_acct(ca, rb->start_time, READ); + } + @@ -19711,6 
+20248,7 @@ index 000000000000..13ce29750d28 + + if (rb->have_ioref) { + struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); ++ + bch2_latency_acct(ca, rb->start_time, READ); + } + @@ -19959,7 +20497,7 @@ index 000000000000..13ce29750d28 + } while ((v = cmpxchg(&b->flags, old, new)) != old); + + if (new & (1U << BTREE_NODE_write_in_flight)) -+ __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED); ++ __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|b->write_type); + else + wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); +} @@ -20108,6 +20646,7 @@ index 000000000000..13ce29750d28 + bool used_mempool; + unsigned long old, new; + bool validate_before_checksum = false; ++ enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK; + void *data; + int ret; + @@ -20154,6 +20693,12 @@ index 000000000000..13ce29750d28 + if (new & (1U << BTREE_NODE_need_write)) + return; +do_write: ++ if ((flags & BTREE_WRITE_ONLY_IF_NEED)) ++ type = b->write_type; ++ b->write_type = 0; ++ ++ BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0)); ++ + atomic_dec(&c->btree_cache.dirty); + + BUG_ON(btree_node_fake(b)); @@ -20221,6 +20766,8 @@ index 000000000000..13ce29750d28 + u64s = bch2_sort_keys(i->start, &sort_iter, false); + le16_add_cpu(&i->u64s, u64s); + ++ BUG_ON(!b->written && i->u64s != b->data->keys.u64s); ++ + set_needs_whiteout(i, false); + + /* do we have data to write? 
*/ @@ -20230,6 +20777,10 @@ index 000000000000..13ce29750d28 + bytes_to_write = vstruct_end(i) - data; + sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9; + ++ if (!b->written && ++ b->key.k.type == KEY_TYPE_btree_ptr_v2) ++ BUG_ON(btree_ptr_sectors_written(&b->key) != sectors_to_write); ++ + memset(data + bytes_to_write, 0, + (sectors_to_write << 9) - bytes_to_write); + @@ -20318,27 +20869,18 @@ index 000000000000..13ce29750d28 + + b->written += sectors_to_write; + -+ if (wbio->wbio.first_btree_write && -+ b->key.k.type == KEY_TYPE_btree_ptr_v2) -+ bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written = -+ cpu_to_le16(b->written); -+ + if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2) + bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written = + cpu_to_le16(b->written); + -+ atomic64_inc(&c->btree_writes_nr); -+ atomic64_add(sectors_to_write, &c->btree_writes_sectors); ++ atomic64_inc(&c->btree_write_stats[type].nr); ++ atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes); + + INIT_WORK(&wbio->work, btree_write_submit); + queue_work(c->io_complete_wq, &wbio->work); + return; +err: + set_btree_node_noevict(b); -+ if (!b->written && -+ b->key.k.type == KEY_TYPE_btree_ptr_v2) -+ bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written = -+ cpu_to_le16(sectors_to_write); + b->written += sectors_to_write; +nowrite: + btree_bounce_free(c, bytes, used_mempool, data); @@ -20460,12 +21002,42 @@ index 000000000000..13ce29750d28 +{ + return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); +} ++ ++const char * const bch2_btree_write_types[] = { ++#define x(t, n) [n] = #t, ++ BCH_BTREE_WRITE_TYPES() ++ NULL ++}; ++ ++void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ printbuf_tabstop_push(out, 20); ++ printbuf_tabstop_push(out, 10); ++ ++ prt_tab(out); ++ prt_str(out, "nr"); ++ prt_tab(out); ++ prt_str(out, "size"); ++ prt_newline(out); ++ ++ for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) { ++ u64 nr = 
atomic64_read(&c->btree_write_stats[i].nr); ++ u64 bytes = atomic64_read(&c->btree_write_stats[i].bytes); ++ ++ prt_printf(out, "%s:", bch2_btree_write_types[i]); ++ prt_tab(out); ++ prt_u64(out, nr); ++ prt_tab(out); ++ prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0); ++ prt_newline(out); ++ } ++} diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h new file mode 100644 -index 000000000000..8af853642123 +index 000000000000..4b1810ad7d91 --- /dev/null +++ b/fs/bcachefs/btree_io.h -@@ -0,0 +1,222 @@ +@@ -0,0 +1,228 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_IO_H +#define _BCACHEFS_BTREE_IO_H @@ -20607,8 +21179,12 @@ index 000000000000..8af853642123 + +bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); + -+#define BTREE_WRITE_ONLY_IF_NEED (1U << 0) -+#define BTREE_WRITE_ALREADY_STARTED (1U << 1) ++enum btree_write_flags { ++ __BTREE_WRITE_ONLY_IF_NEED = BTREE_WRITE_TYPE_BITS, ++ __BTREE_WRITE_ALREADY_STARTED, ++}; ++#define BTREE_WRITE_ONLY_IF_NEED (1U << __BTREE_WRITE_ONLY_IF_NEED ) ++#define BTREE_WRITE_ALREADY_STARTED (1U << __BTREE_WRITE_ALREADY_STARTED) + +void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned); +void bch2_btree_node_write(struct bch_fs *, struct btree *, @@ -20687,13 +21263,15 @@ index 000000000000..8af853642123 + bn->min_key = bpos_nosnap_successor(bn->min_key); +} + ++void bch2_btree_write_stats_to_text(struct printbuf *, struct bch_fs *); ++ +#endif /* _BCACHEFS_BTREE_IO_H */ diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c new file mode 100644 -index 000000000000..925ffb318445 +index 000000000000..72a3f400f82b --- /dev/null +++ b/fs/bcachefs/btree_iter.c -@@ -0,0 +1,3043 @@ +@@ -0,0 +1,3121 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -20718,6 +21296,8 @@ index 000000000000..925ffb318445 + +static void btree_trans_verify_sorted(struct btree_trans *); +inline void bch2_btree_path_check_sort(struct btree_trans *, struct 
btree_path *, int); ++static __always_inline void bch2_btree_path_check_sort_fast(struct btree_trans *, ++ struct btree_path *, int); + +static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *); +static inline void btree_path_list_add(struct btree_trans *, struct btree_path *, @@ -20908,6 +21488,7 @@ index 000000000000..925ffb318445 + + if (p) { + struct bkey uk = bkey_unpack_key(l->b, p); ++ + bch2_bkey_to_text(&buf2, &uk); + } else { + prt_printf(&buf2, "(none)"); @@ -20915,6 +21496,7 @@ index 000000000000..925ffb318445 + + if (k) { + struct bkey uk = bkey_unpack_key(l->b, k); ++ + bch2_bkey_to_text(&buf3, &uk); + } else { + prt_printf(&buf3, "(none)"); @@ -21468,7 +22050,7 @@ index 000000000000..925ffb318445 + + bch2_bkey_buf_init(&tmp); + -+ while (nr && !ret) { ++ while (nr-- && !ret) { + if (!bch2_btree_node_relock(trans, path, path->level)) + break; + @@ -21503,7 +22085,7 @@ index 000000000000..925ffb318445 + + bch2_bkey_buf_init(&tmp); + -+ while (nr && !ret) { ++ while (nr-- && !ret) { + if (!bch2_btree_node_relock(trans, path, path->level)) + break; + @@ -21700,14 +22282,9 @@ index 000000000000..925ffb318445 + return ret; +} + -+static inline bool btree_path_good_node(struct btree_trans *trans, -+ struct btree_path *path, -+ unsigned l, int check_pos) ++static inline bool btree_path_check_pos_in_node(struct btree_path *path, ++ unsigned l, int check_pos) +{ -+ if (!is_btree_node(path, l) || -+ !bch2_btree_node_relock(trans, path, l)) -+ return false; -+ + if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b)) + return false; + if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b)) @@ -21715,6 +22292,15 @@ index 000000000000..925ffb318445 + return true; +} + ++static inline bool btree_path_good_node(struct btree_trans *trans, ++ struct btree_path *path, ++ unsigned l, int check_pos) ++{ ++ return is_btree_node(path, l) && ++ bch2_btree_node_relock(trans, path, l) && ++ 
btree_path_check_pos_in_node(path, l, check_pos); ++} ++ +static void btree_path_set_level_down(struct btree_trans *trans, + struct btree_path *path, + unsigned new_level) @@ -21731,9 +22317,9 @@ index 000000000000..925ffb318445 + bch2_btree_path_verify(trans, path); +} + -+static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, -+ struct btree_path *path, -+ int check_pos) ++static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans, ++ struct btree_path *path, ++ int check_pos) +{ + unsigned i, l = path->level; +again: @@ -21754,6 +22340,16 @@ index 000000000000..925ffb318445 + return l; +} + ++static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, ++ struct btree_path *path, ++ int check_pos) ++{ ++ return likely(btree_node_locked(path, path->level) && ++ btree_path_check_pos_in_node(path, path->level, check_pos)) ++ ? path->level ++ : __btree_path_up_until_good_node(trans, path, check_pos); ++} ++ +/* + * This is the main state machine for walking down the btree - walks down to a + * specified depth @@ -21850,7 +22446,7 @@ index 000000000000..925ffb318445 + btree_path_traverse_one(trans, path, flags, _RET_IP_); +} + -+static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, ++static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, + struct btree_path *src) +{ + unsigned i, offset = offsetof(struct btree_path, pos); @@ -21859,12 +22455,12 @@ index 000000000000..925ffb318445 + (void *) src + offset, + sizeof(struct btree_path) - offset); + -+ for (i = 0; i < BTREE_MAX_DEPTH; i++) -+ if (btree_node_locked(dst, i)) -+ six_lock_increment(&dst->l[i].b->c.lock, -+ __btree_lock_want(dst, i)); ++ for (i = 0; i < BTREE_MAX_DEPTH; i++) { ++ unsigned t = btree_node_locked_type(dst, i); + -+ bch2_btree_path_check_sort(trans, dst, 0); ++ if (t != BTREE_NODE_UNLOCKED) ++ six_lock_increment(&dst->l[i].b->c.lock, t); ++ } +} + +static struct btree_path 
*btree_path_clone(struct btree_trans *trans, struct btree_path *src, @@ -21877,44 +22473,36 @@ index 000000000000..925ffb318445 + return new; +} + -+inline struct btree_path * __must_check -+bch2_btree_path_make_mut(struct btree_trans *trans, ++__flatten ++struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans, + struct btree_path *path, bool intent, + unsigned long ip) +{ -+ if (path->ref > 1 || path->preserve) { -+ __btree_path_put(path, intent); -+ path = btree_path_clone(trans, path, intent); -+ path->preserve = false; ++ __btree_path_put(path, intent); ++ path = btree_path_clone(trans, path, intent); ++ path->preserve = false; +#ifdef CONFIG_BCACHEFS_DEBUG -+ path->ip_allocated = ip; ++ path->ip_allocated = ip; +#endif -+ btree_trans_verify_sorted(trans); -+ } -+ -+ path->should_be_locked = false; ++ btree_trans_verify_sorted(trans); + return path; +} + +struct btree_path * __must_check -+bch2_btree_path_set_pos(struct btree_trans *trans, ++__bch2_btree_path_set_pos(struct btree_trans *trans, + struct btree_path *path, struct bpos new_pos, -+ bool intent, unsigned long ip) ++ bool intent, unsigned long ip, int cmp) +{ -+ int cmp = bpos_cmp(new_pos, path->pos); + unsigned l = path->level; + + EBUG_ON(trans->restarted); + EBUG_ON(!path->ref); + -+ if (!cmp) -+ return path; -+ + path = bch2_btree_path_make_mut(trans, path, intent, ip); + + path->pos = new_pos; + -+ bch2_btree_path_check_sort(trans, path, cmp); ++ bch2_btree_path_check_sort_fast(trans, path, cmp); + + if (unlikely(path->cached)) { + btree_node_unlock(trans, path, 0); @@ -21938,7 +22526,7 @@ index 000000000000..925ffb318445 + __btree_path_level_init(path, l); + } + -+ if (l != path->level) { ++ if (unlikely(l != path->level)) { + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + __bch2_btree_path_unlock(trans, path); + } @@ -22016,6 +22604,18 @@ index 000000000000..925ffb318445 + __bch2_path_free(trans, path); +} + ++static void bch2_path_put_nokeep(struct btree_trans *trans, 
struct btree_path *path, ++ bool intent) ++{ ++ EBUG_ON(trans->paths + path->idx != path); ++ EBUG_ON(!path->ref); ++ ++ if (!__btree_path_put(path, intent)) ++ return; ++ ++ __bch2_path_free(trans, path); ++} ++ +void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) +{ + struct btree_insert_entry *i; @@ -22224,15 +22824,17 @@ index 000000000000..925ffb318445 +inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u) +{ + ++ struct btree_path_level *l = path_l(path); ++ struct bkey_packed *_k; + struct bkey_s_c k; + ++ if (unlikely(!l->b)) ++ return bkey_s_c_null; ++ + EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); + EBUG_ON(!btree_node_locked(path, path->level)); + + if (!path->cached) { -+ struct btree_path_level *l = path_l(path); -+ struct bkey_packed *_k; -+ + _k = bch2_btree_node_iter_peek_all(&l->iter, l->b); + k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null; + @@ -22464,7 +23066,8 @@ index 000000000000..925ffb318445 + if (bpos_cmp(start_pos, iter->journal_pos) < 0) + iter->journal_idx = 0; + -+ k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id, 0, ++ k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id, ++ iter->path->level, + start_pos, end_pos, + &iter->journal_idx); + @@ -22486,7 +23089,7 @@ index 000000000000..925ffb318445 +{ + struct bkey_i *next_journal = + bch2_btree_journal_peek(trans, iter, iter->path->pos, -+ k.k ? k.k->p : iter->path->l[0].b->key.k.p); ++ k.k ? 
k.k->p : path_l(iter->path)->b->key.k.p); + + if (next_journal) { + iter->k = next_journal->k; @@ -22546,10 +23149,12 @@ index 000000000000..925ffb318445 + struct bkey_s_c k, k2; + int ret; + -+ EBUG_ON(iter->path->cached || iter->path->level); ++ EBUG_ON(iter->path->cached); + bch2_btree_iter_verify(iter); + + while (1) { ++ struct btree_path_level *l; ++ + iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); @@ -22562,9 +23167,18 @@ index 000000000000..925ffb318445 + goto out; + } + ++ l = path_l(iter->path); ++ ++ if (unlikely(!l->b)) { ++ /* No btree nodes at requested level: */ ++ bch2_btree_iter_set_pos(iter, SPOS_MAX); ++ k = bkey_s_c_null; ++ goto out; ++ } ++ + btree_path_set_should_be_locked(iter->path); + -+ k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k); ++ k = btree_path_level_peek_all(trans->c, l, &iter->k); + + if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && + k.k && @@ -22585,7 +23199,7 @@ index 000000000000..925ffb318445 + : NULL; + if (next_update && + bpos_cmp(next_update->k.p, -+ k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) { ++ k.k ? 
k.k->p : l->b->key.k.p) <= 0) { + iter->k = next_update->k; + k = bkey_i_to_s_c(next_update); + } @@ -22606,9 +23220,9 @@ index 000000000000..925ffb318445 + + if (likely(k.k)) { + break; -+ } else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) { ++ } else if (likely(bpos_cmp(l->b->key.k.p, SPOS_MAX))) { + /* Advance to next leaf node: */ -+ search_key = bpos_successor(iter->path->l[0].b->key.k.p); ++ search_key = bpos_successor(l->b->key.k.p); + } else { + /* End of btree: */ + bch2_btree_iter_set_pos(iter, SPOS_MAX); @@ -22637,8 +23251,8 @@ index 000000000000..925ffb318445 + EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); + + if (iter->update_path) { -+ bch2_path_put(trans, iter->update_path, -+ iter->flags & BTREE_ITER_INTENT); ++ bch2_path_put_nokeep(trans, iter->update_path, ++ iter->flags & BTREE_ITER_INTENT); + iter->update_path = NULL; + } + @@ -22669,8 +23283,8 @@ index 000000000000..925ffb318445 + + if (iter->update_path && + bkey_cmp(iter->update_path->pos, k.k->p)) { -+ bch2_path_put(trans, iter->update_path, -+ iter->flags & BTREE_ITER_INTENT); ++ bch2_path_put_nokeep(trans, iter->update_path, ++ iter->flags & BTREE_ITER_INTENT); + iter->update_path = NULL; + } + @@ -22918,7 +23532,7 @@ index 000000000000..925ffb318445 + * that candidate + */ + if (saved_path && bkey_cmp(k.k->p, saved_k.p)) { -+ bch2_path_put(trans, iter->path, ++ bch2_path_put_nokeep(trans, iter->path, + iter->flags & BTREE_ITER_INTENT); + iter->path = saved_path; + saved_path = NULL; @@ -22931,7 +23545,7 @@ index 000000000000..925ffb318445 + iter->snapshot, + k.k->p.snapshot)) { + if (saved_path) -+ bch2_path_put(trans, saved_path, ++ bch2_path_put_nokeep(trans, saved_path, + iter->flags & BTREE_ITER_INTENT); + saved_path = btree_path_clone(trans, iter->path, + iter->flags & BTREE_ITER_INTENT); @@ -22975,7 +23589,7 @@ index 000000000000..925ffb318445 + btree_path_set_should_be_locked(iter->path); +out_no_locked: + if (saved_path) -+ bch2_path_put(trans, saved_path, 
iter->flags & BTREE_ITER_INTENT); ++ bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT); + + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); @@ -23057,6 +23671,8 @@ index 000000000000..925ffb318445 + } + + k = bch2_btree_path_peek_slot(iter->path, &iter->k); ++ if (unlikely(!k.k)) ++ goto out_no_locked; + } else { + struct bpos next; + @@ -23187,6 +23803,35 @@ index 000000000000..925ffb318445 + btree_path_verify_sorted_ref(trans, r); +} + ++static inline struct btree_path *sib_btree_path(struct btree_trans *trans, ++ struct btree_path *path, int sib) ++{ ++ unsigned idx = (unsigned) path->sorted_idx + sib; ++ ++ EBUG_ON(sib != -1 && sib != 1); ++ ++ return idx < trans->nr_sorted ++ ? trans->paths + trans->sorted[idx] ++ : NULL; ++} ++ ++static __always_inline void bch2_btree_path_check_sort_fast(struct btree_trans *trans, ++ struct btree_path *path, ++ int cmp) ++{ ++ struct btree_path *n; ++ int cmp2; ++ ++ EBUG_ON(!cmp); ++ ++ while ((n = sib_btree_path(trans, path, cmp)) && ++ (cmp2 = btree_path_cmp(n, path)) && ++ cmp2 != cmp) ++ btree_path_swap(trans, n, path); ++ ++ btree_trans_verify_sorted(trans); ++} ++ +inline void bch2_btree_path_check_sort(struct btree_trans *trans, struct btree_path *path, + int cmp) +{ @@ -23263,7 +23908,7 @@ index 000000000000..925ffb318445 + bch2_path_put(trans, iter->path, + iter->flags & BTREE_ITER_INTENT); + if (iter->update_path) -+ bch2_path_put(trans, iter->update_path, ++ bch2_path_put_nokeep(trans, iter->update_path, + iter->flags & BTREE_ITER_INTENT); + if (iter->key_cache_path) + bch2_path_put(trans, iter->key_cache_path, @@ -23281,7 +23926,7 @@ index 000000000000..925ffb318445 + unsigned flags, + unsigned long ip) +{ -+ if (trans->restarted) ++ if (unlikely(trans->restarted)) + panic("bch2_trans_iter_init(): in transaction restart, %s by %pS\n", + bch2_err_str(trans->restarted), + (void *) trans->last_restarted_ip); @@ -23301,7 +23946,7 @@ index 000000000000..925ffb318445 + 
btree_type_has_snapshots(btree_id)) + flags |= BTREE_ITER_FILTER_SNAPSHOTS; + -+ if (!test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags)) ++ if (trans->journal_replay_not_finished) + flags |= BTREE_ITER_WITH_JOURNAL; + + iter->trans = trans; @@ -23454,7 +24099,7 @@ index 000000000000..925ffb318445 + + if (!trans->restarted && + (need_resched() || -+ ktime_get_ns() - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) { ++ local_clock() - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) { + bch2_trans_unlock(trans); + cond_resched(); + bch2_trans_relock(trans); @@ -23464,7 +24109,7 @@ index 000000000000..925ffb318445 + if (trans->restarted) + bch2_btree_path_traverse_all(trans); + -+ trans->last_begin_time = ktime_get_ns(); ++ trans->last_begin_time = local_clock(); + return trans->restart_count; +} + @@ -23485,7 +24130,7 @@ index 000000000000..925ffb318445 + BUG_ON(trans->used_mempool); + +#ifdef __KERNEL__ -+ p = this_cpu_xchg(c->btree_paths_bufs->path , NULL); ++ p = this_cpu_xchg(c->btree_paths_bufs->path, NULL); +#endif + if (!p) + p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS); @@ -23494,15 +24139,16 @@ index 000000000000..925ffb318445 + trans->updates = p; p += updates_bytes; +} + -+static inline unsigned bch2_trans_get_fn_idx(struct btree_trans *trans, struct bch_fs *c, -+ const char *fn) ++const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; ++ ++unsigned bch2_trans_get_fn_idx(const char *fn) +{ + unsigned i; + -+ for (i = 0; i < ARRAY_SIZE(c->btree_transaction_fns); i++) -+ if (!c->btree_transaction_fns[i] || -+ c->btree_transaction_fns[i] == fn) { -+ c->btree_transaction_fns[i] = fn; ++ for (i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++) ++ if (!bch2_btree_transaction_fns[i] || ++ bch2_btree_transaction_fns[i] == fn) { ++ bch2_btree_transaction_fns[i] = fn; + return i; + } + @@ -23510,7 +24156,7 @@ index 000000000000..925ffb318445 + return i; +} + -+void __bch2_trans_init(struct btree_trans *trans, 
struct bch_fs *c, const char *fn) ++void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_idx) + __acquires(&c->btree_trans_barrier) +{ + struct btree_transaction_stats *s; @@ -23520,16 +24166,19 @@ index 000000000000..925ffb318445 + + memset(trans, 0, sizeof(*trans)); + trans->c = c; -+ trans->fn = fn; -+ trans->last_begin_time = ktime_get_ns(); -+ trans->fn_idx = bch2_trans_get_fn_idx(trans, c, fn); ++ trans->fn = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns) ++ ? bch2_btree_transaction_fns[fn_idx] : NULL; ++ trans->last_begin_time = local_clock(); ++ trans->fn_idx = fn_idx; + trans->locking_wait.task = current; ++ trans->journal_replay_not_finished = ++ !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags); + closure_init_stack(&trans->ref); + + bch2_trans_alloc_paths(trans, c); + + s = btree_trans_stats(trans); -+ if (s) { ++ if (s && s->max_mem) { + unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem); + + trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL); @@ -23540,9 +24189,9 @@ index 000000000000..925ffb318445 + } else { + trans->mem_bytes = expected_mem_bytes; + } -+ -+ trans->nr_max_paths = s->nr_max_paths; + } ++ if (s) ++ trans->nr_max_paths = s->nr_max_paths; + + trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + @@ -23648,7 +24297,7 @@ index 000000000000..925ffb318445 + + rcu_read_lock(); + owner = READ_ONCE(b->lock.owner); -+ pid = owner ? owner->pid : 0;; ++ pid = owner ? 
owner->pid : 0; + rcu_read_unlock(); + + prt_tab(out); @@ -23710,6 +24359,13 @@ index 000000000000..925ffb318445 + +void bch2_fs_btree_iter_exit(struct bch_fs *c) +{ ++ struct btree_transaction_stats *s; ++ ++ for (s = c->btree_transaction_stats; ++ s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); ++ s++) ++ kfree(s->max_paths_text); ++ + if (c->btree_trans_barrier_initialized) + cleanup_srcu_struct(&c->btree_trans_barrier); + mempool_exit(&c->btree_trans_mem_pool); @@ -23739,10 +24395,10 @@ index 000000000000..925ffb318445 +} diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h new file mode 100644 -index 000000000000..910f6d7bc961 +index 000000000000..8c35d7d45d8e --- /dev/null +++ b/fs/bcachefs/btree_iter.h -@@ -0,0 +1,564 @@ +@@ -0,0 +1,599 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_ITER_H +#define _BCACHEFS_BTREE_ITER_H @@ -23876,12 +24532,36 @@ index 000000000000..910f6d7bc961 + _path = __trans_next_path_with_node((_trans), (_b), \ + (_path)->idx + 1)) + -+struct btree_path * __must_check -+bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, ++struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, + bool, unsigned long); ++ ++static inline struct btree_path * __must_check ++bch2_btree_path_make_mut(struct btree_trans *trans, ++ struct btree_path *path, bool intent, ++ unsigned long ip) ++{ ++ if (path->ref > 1 || path->preserve) ++ path = __bch2_btree_path_make_mut(trans, path, intent, ip); ++ path->should_be_locked = false; ++ return path; ++} ++ +struct btree_path * __must_check -+bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *, -+ struct bpos, bool, unsigned long); ++__bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *, ++ struct bpos, bool, unsigned long, int); ++ ++static inline struct btree_path * __must_check ++bch2_btree_path_set_pos(struct btree_trans *trans, ++ struct btree_path *path, struct bpos new_pos, 
++ bool intent, unsigned long ip) ++{ ++ int cmp = bpos_cmp(new_pos, path->pos); ++ ++ return cmp ++ ? __bch2_btree_path_set_pos(trans, path, new_pos, intent, ip, cmp) ++ : path; ++} ++ +int __must_check bch2_btree_path_traverse(struct btree_trans *, + struct btree_path *, unsigned); +struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, @@ -24296,10 +24976,21 @@ index 000000000000..910f6d7bc961 +void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); +void bch2_dump_trans_updates(struct btree_trans *); +void bch2_dump_trans_paths_updates(struct btree_trans *); -+void __bch2_trans_init(struct btree_trans *, struct bch_fs *, const char *); ++void __bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned); +void bch2_trans_exit(struct btree_trans *); + -+#define bch2_trans_init(_trans, _c, _nr_iters, _mem) __bch2_trans_init(_trans, _c, __func__) ++extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; ++unsigned bch2_trans_get_fn_idx(const char *); ++ ++#define bch2_trans_init(_trans, _c, _nr_iters, _mem) \ ++do { \ ++ static unsigned trans_fn_idx; \ ++ \ ++ if (unlikely(!trans_fn_idx)) \ ++ trans_fn_idx = bch2_trans_get_fn_idx(__func__); \ ++ \ ++ __bch2_trans_init(_trans, _c, trans_fn_idx); \ ++} while (0) + +void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *); + @@ -24309,10 +25000,11 @@ index 000000000000..910f6d7bc961 +#endif /* _BCACHEFS_BTREE_ITER_H */ diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c new file mode 100644 -index 000000000000..35e941949f49 +index 000000000000..66fb69801318 --- /dev/null +++ b/fs/bcachefs/btree_key_cache.c -@@ -0,0 +1,983 @@ +@@ -0,0 +1,1034 @@ ++// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_cache.h" @@ -24418,15 +25110,34 @@ index 000000000000..35e941949f49 + six_unlock_intent(&ck->c.lock); +} + ++#ifdef __KERNEL__ ++static void __bkey_cached_move_to_freelist_ordered(struct 
btree_key_cache *bc, ++ struct bkey_cached *ck) ++{ ++ struct bkey_cached *pos; ++ ++ list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) { ++ if (ULONG_CMP_GE(ck->btree_trans_barrier_seq, ++ pos->btree_trans_barrier_seq)) { ++ list_move(&ck->list, &pos->list); ++ return; ++ } ++ } ++ ++ list_move(&ck->list, &bc->freed_nonpcpu); ++} ++#endif ++ +static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, + struct bkey_cached *ck) +{ -+ struct btree_key_cache_freelist *f; -+ bool freed = false; -+ + BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); + + if (!ck->c.lock.readers) { ++#ifdef __KERNEL__ ++ struct btree_key_cache_freelist *f; ++ bool freed = false; ++ + preempt_disable(); + f = this_cpu_ptr(bc->pcpu_freed); + @@ -24444,13 +25155,18 @@ index 000000000000..35e941949f49 + while (f->nr > ARRAY_SIZE(f->objs) / 2) { + struct bkey_cached *ck2 = f->objs[--f->nr]; + -+ list_move_tail(&ck2->list, &bc->freed_nonpcpu); ++ __bkey_cached_move_to_freelist_ordered(bc, ck2); + } + preempt_enable(); + -+ list_move_tail(&ck->list, &bc->freed_nonpcpu); ++ __bkey_cached_move_to_freelist_ordered(bc, ck); + mutex_unlock(&bc->lock); + } ++#else ++ mutex_lock(&bc->lock); ++ list_move_tail(&ck->list, &bc->freed_nonpcpu); ++ mutex_unlock(&bc->lock); ++#endif + } else { + mutex_lock(&bc->lock); + list_move_tail(&ck->list, &bc->freed_pcpu); @@ -24485,10 +25201,12 @@ index 000000000000..35e941949f49 + struct bch_fs *c = trans->c; + struct btree_key_cache *bc = &c->btree_key_cache; + struct bkey_cached *ck = NULL; -+ struct btree_key_cache_freelist *f; + bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id); + + if (!pcpu_readers) { ++#ifdef __KERNEL__ ++ struct btree_key_cache_freelist *f; ++ + preempt_disable(); + f = this_cpu_ptr(bc->pcpu_freed); + if (f->nr) @@ -24511,6 +25229,14 @@ index 000000000000..35e941949f49 + preempt_enable(); + mutex_unlock(&bc->lock); + } ++#else ++ mutex_lock(&bc->lock); ++ if (!list_empty(&bc->freed_nonpcpu)) { ++ ck = 
list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); ++ list_del_init(&ck->list); ++ } ++ mutex_unlock(&bc->lock); ++#endif + } else { + mutex_lock(&bc->lock); + if (!list_empty(&bc->freed_pcpu)) { @@ -24543,6 +25269,7 @@ index 000000000000..35e941949f49 + return ck; + } + ++ /* GFP_NOFS because we're holding btree locks: */ + ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO); + if (likely(ck)) { + INIT_LIST_HEAD(&ck->list); @@ -24567,6 +25294,7 @@ index 000000000000..35e941949f49 + struct bkey_cached *ck; + unsigned i; + ++ mutex_lock(&c->lock); + rcu_read_lock(); + tbl = rht_dereference_rcu(c->table.tbl, &c->table); + for (i = 0; i < tbl->size; i++) @@ -24574,13 +25302,14 @@ index 000000000000..35e941949f49 + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && + bkey_cached_lock_for_evict(ck)) { + bkey_cached_evict(c, ck); -+ rcu_read_unlock(); -+ return ck; ++ goto out; + } + } ++ ck = NULL; ++out: + rcu_read_unlock(); -+ -+ return NULL; ++ mutex_unlock(&c->lock); ++ return ck; +} + +static struct bkey_cached * @@ -24592,7 +25321,7 @@ index 000000000000..35e941949f49 + bool was_new = true; + + ck = bkey_cached_alloc(trans, path); -+ if (unlikely(IS_ERR(ck))) ++ if (IS_ERR(ck)) + return ck; + + if (unlikely(!ck)) { @@ -24713,7 +25442,7 @@ index 000000000000..35e941949f49 + return ret; +} + -+noinline static int ++static noinline int +bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path, + unsigned flags) +{ @@ -24894,7 +25623,7 @@ index 000000000000..35e941949f49 + * Since journal reclaim depends on us making progress here, and the + * allocator/copygc depend on journal reclaim making progress, we need + * to be using alloc reserves: -+ * */ ++ */ + ret = bch2_btree_iter_traverse(&b_iter) ?: + bch2_trans_update(trans, &b_iter, ck->k, + BTREE_UPDATE_KEY_CACHE_RECLAIM| @@ -25074,12 +25803,7 @@ index 000000000000..35e941949f49 + unsigned start, flags; + int srcu_idx; + -+ /* Return -1 if we can't do anything 
right now */ -+ if (sc->gfp_mask & __GFP_FS) -+ mutex_lock(&bc->lock); -+ else if (!mutex_trylock(&bc->lock)) -+ return -1; -+ ++ mutex_lock(&bc->lock); + srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + flags = memalloc_nofs_save(); + @@ -25184,23 +25908,31 @@ index 000000000000..35e941949f49 + struct bkey_cached *ck, *n; + struct rhash_head *pos; + unsigned i; ++#ifdef __KERNEL__ + int cpu; ++#endif + + if (bc->shrink.list.next) + unregister_shrinker(&bc->shrink); + + mutex_lock(&bc->lock); + -+ rcu_read_lock(); -+ tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); -+ if (tbl) -+ for (i = 0; i < tbl->size; i++) -+ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { -+ bkey_cached_evict(bc, ck); -+ list_add(&ck->list, &bc->freed_nonpcpu); -+ } -+ rcu_read_unlock(); ++ /* ++ * The loop is needed to guard against racing with rehash: ++ */ ++ while (atomic_long_read(&bc->nr_keys)) { ++ rcu_read_lock(); ++ tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); ++ if (tbl) ++ for (i = 0; i < tbl->size; i++) ++ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { ++ bkey_cached_evict(bc, ck); ++ list_add(&ck->list, &bc->freed_nonpcpu); ++ } ++ rcu_read_unlock(); ++ } + ++#ifdef __KERNEL__ + for_each_possible_cpu(cpu) { + struct btree_key_cache_freelist *f = + per_cpu_ptr(bc->pcpu_freed, cpu); @@ -25210,6 +25942,7 @@ index 000000000000..35e941949f49 + list_add(&ck->list, &bc->freed_nonpcpu); + } + } ++#endif + + list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu); + @@ -25225,10 +25958,15 @@ index 000000000000..35e941949f49 + kmem_cache_free(bch2_key_cache, ck); + } + -+ BUG_ON(atomic_long_read(&bc->nr_dirty) && -+ !bch2_journal_error(&c->journal) && -+ test_bit(BCH_FS_WAS_RW, &c->flags)); -+ BUG_ON(atomic_long_read(&bc->nr_keys)); ++ if (atomic_long_read(&bc->nr_dirty) && ++ !bch2_journal_error(&c->journal) && ++ test_bit(BCH_FS_WAS_RW, &c->flags)) ++ panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n", ++ atomic_long_read(&bc->nr_dirty)); ++ ++ if 
(atomic_long_read(&bc->nr_keys)) ++ panic("btree key cache shutdown error: nr_keys nonzero (%li)\n", ++ atomic_long_read(&bc->nr_keys)); + + mutex_unlock(&bc->lock); + @@ -25258,9 +25996,11 @@ index 000000000000..35e941949f49 + struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); + int ret; + ++#ifdef __KERNEL__ + bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist); + if (!bc->pcpu_freed) + return -ENOMEM; ++#endif + + ret = rhashtable_init(&bc->table, &bch2_btree_key_cache_params); + if (ret) @@ -25268,7 +26008,7 @@ index 000000000000..35e941949f49 + + bc->table_init_done = true; + -+ bc->shrink.seeks = 1; ++ bc->shrink.seeks = 0; + bc->shrink.count_objects = bch2_btree_key_cache_count; + bc->shrink.scan_objects = bch2_btree_key_cache_scan; + bc->shrink.to_text = bch2_btree_key_cache_shrinker_to_text; @@ -25277,15 +26017,17 @@ index 000000000000..35e941949f49 + +void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) +{ -+ prt_printf(out, "nr_freed:\t%zu\n", atomic_long_read(&c->nr_freed)); -+ prt_printf(out, "nr_keys:\t%lu\n", atomic_long_read(&c->nr_keys)); -+ prt_printf(out, "nr_dirty:\t%lu\n", atomic_long_read(&c->nr_dirty)); ++ prt_printf(out, "nr_freed:\t%zu", atomic_long_read(&c->nr_freed)); ++ prt_newline(out); ++ prt_printf(out, "nr_keys:\t%lu", atomic_long_read(&c->nr_keys)); ++ prt_newline(out); ++ prt_printf(out, "nr_dirty:\t%lu", atomic_long_read(&c->nr_dirty)); ++ prt_newline(out); +} + +void bch2_btree_key_cache_exit(void) +{ -+ if (bch2_key_cache) -+ kmem_cache_destroy(bch2_key_cache); ++ kmem_cache_destroy(bch2_key_cache); +} + +int __init bch2_btree_key_cache_init(void) @@ -25298,10 +26040,11 @@ index 000000000000..35e941949f49 +} diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h new file mode 100644 -index 000000000000..670746e72dab +index 000000000000..eccea15fca79 --- /dev/null +++ b/fs/bcachefs/btree_key_cache.h -@@ -0,0 +1,47 @@ +@@ -0,0 +1,48 @@ ++/* 
SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_KEY_CACHE_H +#define _BCACHEFS_BTREE_KEY_CACHE_H + @@ -25351,10 +26094,10 @@ index 000000000000..670746e72dab +#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c new file mode 100644 -index 000000000000..f4340086c357 +index 000000000000..9d090437d8f6 --- /dev/null +++ b/fs/bcachefs/btree_locking.c -@@ -0,0 +1,676 @@ +@@ -0,0 +1,679 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -25451,108 +26194,13 @@ index 000000000000..f4340086c357 + prt_newline(out); +} + -+static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i) -+{ -+ int ret; -+ -+ if (i == g->g) { -+ trace_and_count(i->trans->c, trans_restart_would_deadlock, i->trans, _RET_IP_); -+ ret = btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock); -+ } else { -+ i->trans->lock_must_abort = true; -+ ret = 0; -+ } -+ -+ for (i = g->g + 1; i < g->g + g->nr; i++) -+ wake_up_process(i->trans->locking_wait.task); -+ return ret; -+} -+ -+static noinline int break_cycle(struct lock_graph *g) -+{ -+ struct trans_waiting_for_lock *i; -+ -+ for (i = g->g; i < g->g + g->nr; i++) { -+ if (i->trans->lock_may_not_fail || -+ i->trans->locking_wait.lock_want == SIX_LOCK_write) -+ continue; -+ -+ return abort_lock(g, i); -+ } -+ -+ for (i = g->g; i < g->g + g->nr; i++) { -+ if (i->trans->lock_may_not_fail || -+ !i->trans->in_traverse_all) -+ continue; -+ -+ return abort_lock(g, i); -+ } -+ -+ for (i = g->g; i < g->g + g->nr; i++) { -+ if (i->trans->lock_may_not_fail) -+ continue; -+ -+ return abort_lock(g, i); -+ } -+ -+ BUG(); -+} -+ -+static void lock_graph_pop(struct lock_graph *g) ++static void lock_graph_up(struct lock_graph *g) +{ + closure_put(&g->g[--g->nr].trans->ref); +} + -+static void lock_graph_pop_above(struct lock_graph *g, struct trans_waiting_for_lock *above, -+ struct printbuf *cycle) ++static void lock_graph_down(struct lock_graph 
*g, struct btree_trans *trans) +{ -+ if (g->nr > 1 && cycle) -+ print_chain(cycle, g); -+ -+ while (g->g + g->nr > above) -+ lock_graph_pop(g); -+} -+ -+static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, -+ struct printbuf *cycle) -+{ -+ struct btree_trans *orig_trans = g->g->trans; -+ struct trans_waiting_for_lock *i; -+ int ret = 0; -+ -+ for (i = g->g; i < g->g + g->nr; i++) { -+ if (i->trans->locking != i->node_want) { -+ lock_graph_pop_above(g, i - 1, cycle); -+ return 0; -+ } -+ -+ if (i->trans == trans) { -+ if (cycle) { -+ /* Only checking: */ -+ print_cycle(cycle, g); -+ ret = -1; -+ } else { -+ ret = break_cycle(g); -+ } -+ -+ if (ret) -+ goto deadlock; -+ /* -+ * If we didn't abort (instead telling another -+ * transaction to abort), keep checking: -+ */ -+ } -+ } -+ -+ if (g->nr == ARRAY_SIZE(g->g)) { -+ if (orig_trans->lock_may_not_fail) -+ return 0; -+ -+ trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_); -+ ret = btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit); -+ goto deadlock; -+ } -+ + closure_get(&trans->ref); + + g->g[g->nr++] = (struct trans_waiting_for_lock) { @@ -25560,25 +26208,124 @@ index 000000000000..f4340086c357 + .node_want = trans->locking, + .lock_want = trans->locking_wait.lock_want, + }; -+ -+ return 0; -+deadlock: -+ lock_graph_pop_above(g, g->g, cycle); -+ return ret; +} + -+static noinline void lock_graph_remove_non_waiters(struct lock_graph *g, -+ struct printbuf *cycle) ++static bool lock_graph_remove_non_waiters(struct lock_graph *g) +{ + struct trans_waiting_for_lock *i; + + for (i = g->g + 1; i < g->g + g->nr; i++) + if (i->trans->locking != i->node_want || + i->trans->locking_wait.start_time != i[-1].lock_start_time) { -+ lock_graph_pop_above(g, i - 1, cycle); -+ return; ++ while (g->g + g->nr > i) ++ lock_graph_up(g); ++ return true; + } -+ BUG(); ++ ++ return false; ++} ++ ++static int abort_lock(struct lock_graph *g, 
struct trans_waiting_for_lock *i) ++{ ++ if (i == g->g) { ++ trace_and_count(i->trans->c, trans_restart_would_deadlock, i->trans, _RET_IP_); ++ return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock); ++ } else { ++ i->trans->lock_must_abort = true; ++ wake_up_process(i->trans->locking_wait.task); ++ return 0; ++ } ++} ++ ++static int btree_trans_abort_preference(struct btree_trans *trans) ++{ ++ if (trans->lock_may_not_fail) ++ return 0; ++ if (trans->locking_wait.lock_want == SIX_LOCK_write) ++ return 1; ++ if (!trans->in_traverse_all) ++ return 2; ++ return 3; ++} ++ ++static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) ++{ ++ struct trans_waiting_for_lock *i, *abort = NULL; ++ unsigned best = 0, pref; ++ int ret; ++ ++ if (lock_graph_remove_non_waiters(g)) ++ return 0; ++ ++ /* Only checking, for debugfs: */ ++ if (cycle) { ++ print_cycle(cycle, g); ++ ret = -1; ++ goto out; ++ } ++ ++ for (i = g->g; i < g->g + g->nr; i++) { ++ pref = btree_trans_abort_preference(i->trans); ++ if (pref > best) { ++ abort = i; ++ best = pref; ++ } ++ } ++ ++ if (unlikely(!best)) { ++ struct bch_fs *c = g->g->trans->c; ++ struct printbuf buf = PRINTBUF; ++ ++ bch_err(c, "cycle of nofail locks"); ++ ++ for (i = g->g; i < g->g + g->nr; i++) { ++ struct btree_trans *trans = i->trans; ++ ++ bch2_btree_trans_to_text(&buf, trans); ++ ++ prt_printf(&buf, "backtrace:"); ++ prt_newline(&buf); ++ printbuf_indent_add(&buf, 2); ++ bch2_prt_backtrace(&buf, trans->locking_wait.task); ++ printbuf_indent_sub(&buf, 2); ++ prt_newline(&buf); ++ } ++ ++ bch2_print_string_as_lines(KERN_ERR, buf.buf); ++ printbuf_exit(&buf); ++ BUG(); ++ } ++ ++ ret = abort_lock(g, abort); ++out: ++ if (ret) ++ while (g->nr) ++ lock_graph_up(g); ++ return ret; ++} ++ ++static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, ++ struct printbuf *cycle) ++{ ++ struct btree_trans *orig_trans = g->g->trans; ++ struct trans_waiting_for_lock *i; ++ ++ 
for (i = g->g; i < g->g + g->nr; i++) ++ if (i->trans == trans) ++ return break_cycle(g, cycle); ++ ++ if (g->nr == ARRAY_SIZE(g->g)) { ++ if (orig_trans->lock_may_not_fail) ++ return 0; ++ ++ while (g->nr) ++ lock_graph_up(g); ++ trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_); ++ return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit); ++ } ++ ++ lock_graph_down(g, trans); ++ return 0; +} + +static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2) @@ -25600,8 +26347,7 @@ index 000000000000..f4340086c357 + } + + g.nr = 0; -+ ret = lock_graph_descend(&g, trans, cycle); -+ BUG_ON(ret); ++ lock_graph_down(&g, trans); +next: + if (!g.nr) + return 0; @@ -25628,8 +26374,8 @@ index 000000000000..f4340086c357 + + b = &READ_ONCE(path->l[top->level].b)->c; + -+ if (unlikely(IS_ERR_OR_NULL(b))) { -+ lock_graph_remove_non_waiters(&g, cycle); ++ if (IS_ERR_OR_NULL(b)) { ++ BUG_ON(!lock_graph_remove_non_waiters(&g)); + goto next; + } + @@ -25655,7 +26401,7 @@ index 000000000000..f4340086c357 + raw_spin_unlock(&b->lock.wait_lock); + + if (ret) -+ return ret < 0 ? 
ret : 0; ++ return ret; + goto next; + + } @@ -25665,7 +26411,7 @@ index 000000000000..f4340086c357 + + if (g.nr > 1 && cycle) + print_chain(cycle, &g); -+ lock_graph_pop(&g); ++ lock_graph_up(&g); + goto next; +} + @@ -25959,7 +26705,7 @@ index 000000000000..f4340086c357 + struct btree_path *path; + + if (unlikely(trans->restarted)) -+ return - ((int) trans->restarted); ++ return -((int) trans->restarted); + + trans_for_each_path(trans, path) + if (path->should_be_locked && @@ -26033,10 +26779,10 @@ index 000000000000..f4340086c357 +#endif diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h new file mode 100644 -index 000000000000..d91b42bf1de1 +index 000000000000..fb237c95ee13 --- /dev/null +++ b/fs/bcachefs/btree_locking.h -@@ -0,0 +1,418 @@ +@@ -0,0 +1,419 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_LOCKING_H +#define _BCACHEFS_BTREE_LOCKING_H @@ -26127,7 +26873,7 @@ index 000000000000..d91b42bf1de1 +{ + mark_btree_node_locked_noreset(path, level, type); +#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS -+ path->l[level].lock_taken_time = ktime_get_ns(); ++ path->l[level].lock_taken_time = local_clock(); +#endif +} + @@ -26159,7 +26905,7 @@ index 000000000000..d91b42bf1de1 + if (s) + __bch2_time_stats_update(&s->lock_hold_times, + path->l[level].lock_taken_time, -+ ktime_get_ns()); ++ local_clock()); +#endif +} + @@ -26299,7 +27045,7 @@ index 000000000000..d91b42bf1de1 + btree_node_lock_increment(trans, b, level, type) || + !(ret = btree_node_lock_nopath(trans, b, type))) { +#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS -+ path->l[b->level].lock_taken_time = ktime_get_ns(); ++ path->l[b->level].lock_taken_time = local_clock(); +#endif + } + @@ -26335,6 +27081,7 @@ index 000000000000..d91b42bf1de1 + struct btree_bkey_cached_common *b) +{ + int ret = __btree_node_lock_write(trans, path, b, true); ++ + BUG_ON(ret); +} + @@ -26457,10 +27204,10 @@ index 000000000000..d91b42bf1de1 +#endif /* _BCACHEFS_BTREE_LOCKING_H */ diff --git 
a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h new file mode 100644 -index 000000000000..af226eed818b +index 000000000000..d89489e4e4a5 --- /dev/null +++ b/fs/bcachefs/btree_types.h -@@ -0,0 +1,696 @@ +@@ -0,0 +1,708 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_TYPES_H +#define _BCACHEFS_BTREE_TYPES_H @@ -26469,7 +27216,7 @@ index 000000000000..af226eed818b +#include +#include + -+#include "bkey_methods.h" ++//#include "bkey_methods.h" +#include "buckets_types.h" +#include "darray.h" +#include "journal_types.h" @@ -26540,6 +27287,7 @@ index 000000000000..af226eed818b + u8 nsets; + u8 nr_key_bits; + u16 version_ondisk; ++ u8 write_type; + + struct bkey_format format; + @@ -26623,6 +27371,16 @@ index 000000000000..af226eed818b + /* Number of elements in live + freeable lists */ + unsigned used; + unsigned reserve; ++ unsigned freed; ++ unsigned not_freed_lock_intent; ++ unsigned not_freed_lock_write; ++ unsigned not_freed_dirty; ++ unsigned not_freed_read_in_flight; ++ unsigned not_freed_write_in_flight; ++ unsigned not_freed_noevict; ++ unsigned not_freed_write_blocked; ++ unsigned not_freed_will_make_reachable; ++ unsigned not_freed_access_bit; + atomic_t dirty; + struct shrinker shrink; + @@ -26779,7 +27537,7 @@ index 000000000000..af226eed818b +struct bkey_cached_key { + u32 btree_id; + struct bpos pos; -+} __attribute__((packed, aligned(4))); ++} __packed __aligned(4); + +#define BKEY_CACHED_ACCESSED 0 +#define BKEY_CACHED_DIRTY 1 @@ -26871,6 +27629,7 @@ index 000000000000..af226eed818b + bool in_traverse_all:1; + bool memory_allocation_failure:1; + bool is_initial_gc:1; ++ bool journal_replay_not_finished:1; + enum bch_errcode restarted:16; + u32 restart_count; + unsigned long last_restarted_ip; @@ -27159,7 +27918,7 @@ index 000000000000..af226eed818b +#endif /* _BCACHEFS_BTREE_TYPES_H */ diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h new file mode 100644 -index 000000000000..89941fb8caa0 +index 
000000000000..1c2e7b2b4ed5 --- /dev/null +++ b/fs/bcachefs/btree_update.h @@ -0,0 +1,158 @@ @@ -27173,8 +27932,8 @@ index 000000000000..89941fb8caa0 +struct bch_fs; +struct btree; + -+void bch2_btree_node_lock_for_insert(struct btree_trans *, struct btree_path *, -+ struct btree *); ++void bch2_btree_node_prep_for_write(struct btree_trans *, ++ struct btree_path *, struct btree *); +bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *, + struct btree *, struct btree_node_iter *, + struct bkey_i *); @@ -27323,10 +28082,10 @@ index 000000000000..89941fb8caa0 +#endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c new file mode 100644 -index 000000000000..578ba747826e +index 000000000000..dac2fa6b08ee --- /dev/null +++ b/fs/bcachefs/btree_update_interior.c -@@ -0,0 +1,2352 @@ +@@ -0,0 +1,2437 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -27352,9 +28111,9 @@ index 000000000000..578ba747826e +#include +#include + -+static void bch2_btree_insert_node(struct btree_update *, struct btree_trans *, -+ struct btree_path *, struct btree *, -+ struct keylist *, unsigned); ++static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, ++ struct btree_path *, struct btree *, ++ struct keylist *, unsigned); +static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); + +static struct btree_path *get_unlocked_mut_path(struct btree_trans *trans, @@ -27366,8 +28125,8 @@ index 000000000000..578ba747826e + + path = bch2_path_get(trans, btree_id, pos, level + 1, level, + BTREE_ITER_NOPRESERVE| -+ BTREE_ITER_INTENT, _THIS_IP_); -+ path = bch2_btree_path_make_mut(trans, path, true, _THIS_IP_); ++ BTREE_ITER_INTENT, _RET_IP_); ++ path = bch2_btree_path_make_mut(trans, path, true, _RET_IP_); + bch2_btree_path_downgrade(trans, path); + __bch2_btree_path_unlock(trans, path); + return path; @@ -27524,6 +28283,43 @@ index 
000000000000..578ba747826e + } +} + ++static void bch2_btree_node_free_never_used(struct btree_update *as, ++ struct btree_trans *trans, ++ struct btree *b) ++{ ++ struct bch_fs *c = as->c; ++ struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL]; ++ struct btree_path *path; ++ unsigned level = b->c.level; ++ ++ BUG_ON(!list_empty(&b->write_blocked)); ++ BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as)); ++ ++ b->will_make_reachable = 0; ++ closure_put(&as->cl); ++ ++ clear_btree_node_will_make_reachable(b); ++ clear_btree_node_accessed(b); ++ clear_btree_node_dirty_acct(c, b); ++ clear_btree_node_need_write(b); ++ ++ mutex_lock(&c->btree_cache.lock); ++ list_del_init(&b->list); ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ mutex_unlock(&c->btree_cache.lock); ++ ++ BUG_ON(p->nr >= ARRAY_SIZE(p->b)); ++ p->b[p->nr++] = b; ++ ++ six_unlock_intent(&b->c.lock); ++ ++ trans_for_each_path(trans, path) ++ if (path->l[level].b == b) { ++ btree_node_unlock(trans, path, level); ++ path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); ++ } ++} ++ +static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, + struct disk_reservation *res, + struct closure *cl, @@ -27538,6 +28334,7 @@ index 000000000000..578ba747826e + struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; + unsigned nr_reserve; + enum alloc_reserve alloc_reserve; ++ int ret; + + if (flags & BTREE_INSERT_USE_RESERVE) { + nr_reserve = 0; @@ -27560,7 +28357,7 @@ index 000000000000..578ba747826e + mutex_unlock(&c->btree_reserve_cache_lock); + +retry: -+ wp = bch2_alloc_sectors_start_trans(trans, ++ ret = bch2_alloc_sectors_start_trans(trans, + c->opts.metadata_target ?: + c->opts.foreground_target, + 0, @@ -27568,9 +28365,9 @@ index 000000000000..578ba747826e + &devs_have, + res->nr_replicas, + c->opts.metadata_replicas_required, -+ alloc_reserve, 0, cl); -+ if (IS_ERR(wp)) -+ return ERR_CAST(wp); ++ alloc_reserve, 0, cl, &wp); ++ if (unlikely(ret)) ++ 
return ERR_PTR(ret); + + if (wp->sectors_free < btree_sectors(c)) { + struct open_bucket *ob; @@ -27722,9 +28519,6 @@ index 000000000000..578ba747826e + btree_node_set_format(b, b->data->format); + bch2_btree_build_aux_trees(b); + -+ bch2_btree_update_add_new_node(as, b); -+ six_unlock_write(&b->c.lock); -+ + return b; +} + @@ -27960,7 +28754,7 @@ index 000000000000..578ba747826e + bch2_trans_unlock(&trans); + + bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, -+ "error %i in btree_update_nodes_written()", ret); ++ "%s(): error %s", __func__, bch2_err_str(ret)); +err: + if (as->b) { + struct btree_path *path; @@ -28188,6 +28982,14 @@ index 000000000000..578ba747826e + mutex_unlock(&c->btree_interior_update_lock); + + btree_update_add_key(as, &as->new_keys, b); ++ ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { ++ unsigned bytes = vstruct_end(&b->data->keys) - (void *) b->data; ++ unsigned sectors = round_up(bytes, block_bytes(c)) >> 9; ++ ++ bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written = ++ cpu_to_le16(sectors); ++ } +} + +/* @@ -28355,24 +29157,24 @@ index 000000000000..578ba747826e + nr_nodes[!!update_level] += 1 + split; + update_level++; + -+ if (!btree_path_node(path, update_level)) ++ ret = bch2_btree_path_upgrade(trans, path, update_level + 1); ++ if (ret) ++ return ERR_PTR(ret); ++ ++ if (!btree_path_node(path, update_level)) { ++ /* Allocating new root? 
*/ ++ nr_nodes[1] += split; ++ update_level = BTREE_MAX_DEPTH; ++ break; ++ } ++ ++ if (bch2_btree_node_insert_fits(c, path->l[update_level].b, ++ BKEY_BTREE_PTR_U64s_MAX * (1 + split))) + break; + -+ /* -+ * XXX: figure out how far we might need to split, -+ * instead of locking/reserving all the way to the root: -+ */ -+ split = update_level + 1 < BTREE_MAX_DEPTH; ++ split = true; + } + -+ /* Might have to allocate a new root: */ -+ if (update_level < BTREE_MAX_DEPTH) -+ nr_nodes[1] += 1; -+ -+ ret = bch2_btree_path_upgrade(trans, path, U8_MAX); -+ if (ret) -+ return ERR_PTR(ret); -+ + if (flags & BTREE_INSERT_GC_LOCK_HELD) + lockdep_assert_held(&c->gc_lock); + else if (!down_read_trylock(&c->gc_lock)) { @@ -28393,6 +29195,7 @@ index 000000000000..578ba747826e + as->mode = BTREE_INTERIOR_NO_UPDATE; + as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD); + as->btree_id = path->btree_id; ++ as->update_level = update_level; + INIT_LIST_HEAD(&as->list); + INIT_LIST_HEAD(&as->unwritten_list); + INIT_LIST_HEAD(&as->write_blocked_list); @@ -28464,7 +29267,8 @@ index 000000000000..578ba747826e + } + + if (ret) { -+ trace_and_count(c, btree_reserve_get_fail, trans->fn, _RET_IP_, nr_nodes[0] + nr_nodes[1]); ++ trace_and_count(c, btree_reserve_get_fail, trans->fn, ++ _RET_IP_, nr_nodes[0] + nr_nodes[1], ret); + goto err; + } + @@ -28520,7 +29324,6 @@ index 000000000000..578ba747826e + struct btree *old; + + trace_and_count(c, btree_node_set_root, c, b); -+ BUG_ON(!b->written); + + old = btree_node_root(c, b); + @@ -28594,6 +29397,7 @@ index 000000000000..578ba747826e + bch2_btree_bset_insert_key(trans, path, b, node_iter, insert); + set_btree_node_dirty_acct(c, b); + set_btree_node_need_write(b); ++ b->write_type = BTREE_WRITE_interior; + + printbuf_exit(&buf); +} @@ -28644,8 +29448,6 @@ index 000000000000..578ba747826e + SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data)); + n2->key.k.p = n1->key.k.p; + -+ bch2_btree_update_add_new_node(as, n2); -+ + set1 = 
btree_bset_first(n1); + set2 = btree_bset_first(n2); + @@ -28787,18 +29589,19 @@ index 000000000000..578ba747826e + btree_node_interior_verify(as->c, b); +} + -+static void btree_split(struct btree_update *as, struct btree_trans *trans, -+ struct btree_path *path, struct btree *b, -+ struct keylist *keys, unsigned flags) ++static int btree_split(struct btree_update *as, struct btree_trans *trans, ++ struct btree_path *path, struct btree *b, ++ struct keylist *keys, unsigned flags) +{ + struct bch_fs *c = as->c; + struct btree *parent = btree_node_parent(path, b); + struct btree *n1, *n2 = NULL, *n3 = NULL; + struct btree_path *path1 = NULL, *path2 = NULL; + u64 start_time = local_clock(); ++ int ret = 0; + + BUG_ON(!parent && (b != btree_node_root(c, b))); -+ BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level)); ++ BUG_ON(parent && !btree_node_intent_locked(path, b->c.level + 1)); + + bch2_btree_interior_update_will_free_node(as, b); + @@ -28814,6 +29617,9 @@ index 000000000000..578ba747826e + + bch2_btree_build_aux_trees(n2); + bch2_btree_build_aux_trees(n1); ++ ++ bch2_btree_update_add_new_node(as, n1); ++ bch2_btree_update_add_new_node(as, n2); + six_unlock_write(&n2->c.lock); + six_unlock_write(&n1->c.lock); + @@ -28827,11 +29633,6 @@ index 000000000000..578ba747826e + mark_btree_node_locked(trans, path2, n2->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(trans, path2, n2); + -+ bch2_btree_update_add_new_node(as, n1); -+ -+ bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); -+ bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0); -+ + /* + * Note that on recursive parent_keys == keys, so we + * can't start adding new keys to parent_keys before emptying it @@ -28844,6 +29645,9 @@ index 000000000000..578ba747826e + /* Depth increases, make a new root */ + n3 = __btree_root_alloc(as, trans, b->c.level + 1); + ++ bch2_btree_update_add_new_node(as, n3); ++ six_unlock_write(&n3->c.lock); ++ + path2->locks_want++; + 
BUG_ON(btree_node_locked(path2, n3->c.level)); + six_lock_increment(&n3->c.lock, SIX_LOCK_intent); @@ -28854,13 +29658,12 @@ index 000000000000..578ba747826e + n3->sib_u64s[1] = U16_MAX; + + btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); -+ -+ bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); + } + } else { + trace_and_count(c, btree_node_compact, c, b); + + bch2_btree_build_aux_trees(n1); ++ bch2_btree_update_add_new_node(as, n1); + six_unlock_write(&n1->c.lock); + + path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p); @@ -28868,10 +29671,6 @@ index 000000000000..578ba747826e + mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(trans, path1, n1); + -+ bch2_btree_update_add_new_node(as, n1); -+ -+ bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); -+ + if (parent) + bch2_keylist_add(&as->parent_keys, &n1->key); + } @@ -28880,7 +29679,9 @@ index 000000000000..578ba747826e + + if (parent) { + /* Split a non root node */ -+ bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); ++ ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); ++ if (ret) ++ goto err; + } else if (n3) { + bch2_btree_set_root(as, trans, path, n3); + } else { @@ -28888,11 +29689,16 @@ index 000000000000..578ba747826e + bch2_btree_set_root(as, trans, path, n1); + } + -+ bch2_btree_update_get_open_buckets(as, n1); -+ if (n2) -+ bch2_btree_update_get_open_buckets(as, n2); -+ if (n3) ++ if (n3) { + bch2_btree_update_get_open_buckets(as, n3); ++ bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); ++ } ++ if (n2) { ++ bch2_btree_update_get_open_buckets(as, n2); ++ bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0); ++ } ++ bch2_btree_update_get_open_buckets(as, n1); ++ bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); + + /* + * The old node must be freed (in memory) _before_ unlocking the new @@ -28913,7 +29719,7 @@ index 000000000000..578ba747826e + if (n2) + 
six_unlock_intent(&n2->c.lock); + six_unlock_intent(&n1->c.lock); -+ ++out: + if (path2) { + __bch2_btree_path_unlock(trans, path2); + bch2_path_put(trans, path2, true); @@ -28929,6 +29735,14 @@ index 000000000000..578ba747826e + ? BCH_TIME_btree_node_split + : BCH_TIME_btree_node_compact], + start_time); ++ return ret; ++err: ++ if (n3) ++ bch2_btree_node_free_never_used(as, trans, n3); ++ if (n2) ++ bch2_btree_node_free_never_used(as, trans, n2); ++ bch2_btree_node_free_never_used(as, trans, n1); ++ goto out; +} + +static void @@ -28963,22 +29777,30 @@ index 000000000000..578ba747826e + * If a split occurred, this function will return early. This can only happen + * for leaf nodes -- inserts into interior nodes have to be atomic. + */ -+static void bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans, -+ struct btree_path *path, struct btree *b, -+ struct keylist *keys, unsigned flags) ++static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans, ++ struct btree_path *path, struct btree *b, ++ struct keylist *keys, unsigned flags) +{ + struct bch_fs *c = as->c; + int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); + int old_live_u64s = b->nr.live_u64s; + int live_u64s_added, u64s_added; ++ int ret; + + lockdep_assert_held(&c->gc_lock); -+ BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level)); ++ BUG_ON(!btree_node_intent_locked(path, b->c.level)); + BUG_ON(!b->c.level); + BUG_ON(!as || as->b); + bch2_verify_keylist_sorted(keys); + -+ bch2_btree_node_lock_for_insert(trans, path, b); ++ if (!(local_clock() & 63)) ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); ++ ++ ret = bch2_btree_node_lock_write(trans, path, &b->c); ++ if (ret) ++ return ret; ++ ++ bch2_btree_node_prep_for_write(trans, path, b); + + if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { + bch2_btree_node_unlock_write(trans, path, b); @@ -29004,9 +29826,16 @@ index 
000000000000..578ba747826e + bch2_btree_node_unlock_write(trans, path, b); + + btree_node_interior_verify(c, b); -+ return; ++ return 0; +split: -+ btree_split(as, trans, path, b, keys, flags); ++ /* ++ * We could attempt to avoid the transaction restart, by calling ++ * bch2_btree_path_upgrade() and allocating more nodes: ++ */ ++ if (b->c.level >= as->update_level) ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); ++ ++ return btree_split(as, trans, path, b, keys, flags); +} + +int bch2_btree_split_leaf(struct btree_trans *trans, @@ -29023,10 +29852,15 @@ index 000000000000..578ba747826e + if (IS_ERR(as)) + return PTR_ERR(as); + -+ btree_split(as, trans, path, b, NULL, flags); ++ ret = btree_split(as, trans, path, b, NULL, flags); ++ if (ret) { ++ bch2_btree_update_free(as, trans); ++ return ret; ++ } ++ + bch2_btree_update_done(as, trans); + -+ for (l = path->level + 1; btree_path_node(path, l) && !ret; l++) ++ for (l = path->level + 1; btree_node_intent_locked(path, l) && !ret; l++) + ret = bch2_foreground_maybe_merge(trans, path, l, flags); + + return ret; @@ -29095,10 +29929,10 @@ index 000000000000..578ba747826e + bch2_bpos_to_text(&buf1, prev->data->max_key); + bch2_bpos_to_text(&buf2, next->data->min_key); + bch_err(c, -+ "btree topology error in btree merge:\n" ++ "%s(): btree topology error:\n" + " prev ends at %s\n" + " next starts at %s", -+ buf1.buf, buf2.buf); ++ __func__, buf1.buf, buf2.buf); + printbuf_exit(&buf1); + printbuf_exit(&buf2); + bch2_topology_error(c); @@ -29152,8 +29986,6 @@ index 000000000000..578ba747826e + btree_set_min(n, prev->data->min_key); + btree_set_max(n, next->data->max_key); + -+ bch2_btree_update_add_new_node(as, n); -+ + n->data->format = new_f; + btree_node_set_format(n, new_f); + @@ -29161,6 +29993,7 @@ index 000000000000..578ba747826e + bch2_btree_sort_into(c, n, next); + + bch2_btree_build_aux_trees(n); ++ bch2_btree_update_add_new_node(as, n); + six_unlock_write(&n->c.lock); + + new_path 
= get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p); @@ -29168,8 +30001,6 @@ index 000000000000..578ba747826e + mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(trans, new_path, n); + -+ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); -+ + bkey_init(&delete.k); + delete.k.p = prev->key.k.p; + bch2_keylist_add(&as->parent_keys, &delete); @@ -29177,11 +30008,14 @@ index 000000000000..578ba747826e + + bch2_trans_verify_paths(trans); + -+ bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); ++ ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); ++ if (ret) ++ goto err_free_update; + + bch2_trans_verify_paths(trans); + + bch2_btree_update_get_open_buckets(as, n); ++ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); + + bch2_btree_node_free_inmem(trans, path, b); + bch2_btree_node_free_inmem(trans, sib_path, m); @@ -29202,6 +30036,10 @@ index 000000000000..578ba747826e + bch2_path_put(trans, sib_path, true); + bch2_trans_verify_locks(trans); + return ret; ++err_free_update: ++ bch2_btree_node_free_never_used(as, trans, n); ++ bch2_btree_update_free(as, trans); ++ goto out; +} + +/** @@ -29230,9 +30068,9 @@ index 000000000000..578ba747826e + bch2_btree_interior_update_will_free_node(as, b); + + n = bch2_btree_node_alloc_replacement(as, trans, b); -+ bch2_btree_update_add_new_node(as, n); + + bch2_btree_build_aux_trees(n); ++ bch2_btree_update_add_new_node(as, n); + six_unlock_write(&n->c.lock); + + new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p); @@ -29242,17 +30080,18 @@ index 000000000000..578ba747826e + + trace_and_count(c, btree_node_rewrite, c, b); + -+ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); -+ + if (parent) { + bch2_keylist_add(&as->parent_keys, &n->key); -+ bch2_btree_insert_node(as, trans, iter->path, parent, -+ &as->parent_keys, flags); ++ ret = bch2_btree_insert_node(as, trans, iter->path, parent, ++ 
&as->parent_keys, flags); ++ if (ret) ++ goto err; + } else { + bch2_btree_set_root(as, trans, iter->path, n); + } + + bch2_btree_update_get_open_buckets(as, n); ++ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); + + bch2_btree_node_free_inmem(trans, iter->path, b); + @@ -29260,10 +30099,15 @@ index 000000000000..578ba747826e + six_unlock_intent(&n->c.lock); + + bch2_btree_update_done(as, trans); -+ bch2_path_put(trans, new_path, true); +out: ++ if (new_path) ++ bch2_path_put(trans, new_path, true); + bch2_btree_path_downgrade(trans, iter->path); + return ret; ++err: ++ bch2_btree_node_free_never_used(as, trans, n); ++ bch2_btree_update_free(as, trans); ++ goto out; +} + +struct async_btree_rewrite { @@ -29293,7 +30137,7 @@ index 000000000000..578ba747826e + goto out; + + ret = bch2_btree_node_rewrite(trans, &iter, b, 0); -+out : ++out: + bch2_trans_iter_exit(trans, &iter); + + return ret; @@ -29681,10 +30525,10 @@ index 000000000000..578ba747826e +} diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h new file mode 100644 -index 000000000000..7af810df8348 +index 000000000000..2e6d220c3bcd --- /dev/null +++ b/fs/bcachefs/btree_update_interior.h -@@ -0,0 +1,322 @@ +@@ -0,0 +1,324 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H +#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H @@ -29739,6 +30583,7 @@ index 000000000000..7af810df8348 + unsigned took_gc_lock:1; + + enum btree_id btree_id; ++ unsigned update_level; + + struct disk_reservation disk_res; + struct journal_preres journal_preres; @@ -29968,6 +30813,7 @@ index 000000000000..7af810df8348 + struct bkey_packed k; + + BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s); ++ EBUG_ON(btree_node_just_written(b)); + + if (!bkey_pack_pos(&k, pos, b)) { + struct bkey *u = (void *) &k; @@ -30009,10 +30855,10 @@ index 000000000000..7af810df8348 +#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */ diff --git a/fs/bcachefs/btree_update_leaf.c 
b/fs/bcachefs/btree_update_leaf.c new file mode 100644 -index 000000000000..08d7001f7217 +index 000000000000..b930b788410d --- /dev/null +++ b/fs/bcachefs/btree_update_leaf.c -@@ -0,0 +1,1745 @@ +@@ -0,0 +1,1760 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -30071,9 +30917,9 @@ index 000000000000..08d7001f7217 + insert_l(&i[0])->b == insert_l(&i[1])->b; +} + -+static inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, -+ struct btree_path *path, -+ struct btree *b) ++inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b) +{ + struct bch_fs *c = trans->c; + @@ -30092,14 +30938,6 @@ index 000000000000..08d7001f7217 + bch2_btree_init_next(trans, b); +} + -+void bch2_btree_node_lock_for_insert(struct btree_trans *trans, -+ struct btree_path *path, -+ struct btree *b) -+{ -+ bch2_btree_node_lock_write_nofail(trans, path, &b->c); -+ bch2_btree_node_prep_for_write(trans, path, b); -+} -+ +/* Inserting into a given leaf node (last stage of insert): */ + +/* Handle overwrites and do insert, for non extents: */ @@ -30204,6 +31042,8 @@ index 000000000000..08d7001f7217 + new |= 1 << BTREE_NODE_need_write; + } while ((v = cmpxchg(&b->flags, old, new)) != old); + ++ b->write_type = BTREE_WRITE_journal_reclaim; ++ + btree_node_write_if_need(c, b, SIX_LOCK_read); + six_unlock_read(&b->c.lock); + @@ -30312,7 +31152,7 @@ index 000000000000..08d7001f7217 + return 0; +} + -+static inline int bch2_trans_journal_res_get(struct btree_trans *trans, ++static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, + unsigned flags) +{ + struct bch_fs *c = trans->c; @@ -30362,7 +31202,7 @@ index 000000000000..08d7001f7217 +{ + struct bch_fs *c = trans->c; + struct bkey_cached *ck = (void *) path->l[0].b; -+ unsigned old_u64s = ck->u64s, new_u64s; ++ unsigned new_u64s; + struct bkey_i *new_k; + + EBUG_ON(path->level); @@ -30391,12 +31231,7 @@ index 
000000000000..08d7001f7217 + + ck->u64s = new_u64s; + ck->k = new_k; -+ /* -+ * Keys returned by peek() are no longer valid pointers, so we need a -+ * transaction restart: -+ */ -+ trace_and_count(c, trans_restart_key_cache_key_realloced, trans, _RET_IP_, path, old_u64s, new_u64s); -+ return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_key_cache_realloced); ++ return 0; +} + +/* Triggers: */ @@ -30749,33 +31584,34 @@ index 000000000000..08d7001f7217 + return ret; +} + ++static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i) ++{ ++ while (--i >= trans->updates) { ++ if (same_leaf_as_prev(trans, i)) ++ continue; ++ ++ bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b); ++ } ++ ++ trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); ++} ++ +static inline int trans_lock_write(struct btree_trans *trans) +{ + struct btree_insert_entry *i; -+ int ret; + + trans_for_each_update(trans, i) { + if (same_leaf_as_prev(trans, i)) + continue; + -+ ret = bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c); -+ if (ret) -+ goto fail; ++ if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) ++ return trans_lock_write_fail(trans, i); + + bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); + } + + return 0; -+fail: -+ while (--i >= trans->updates) { -+ if (same_leaf_as_prev(trans, i)) -+ continue; -+ -+ bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b); -+ } -+ -+ trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); -+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); +} + +static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) @@ -30786,6 +31622,33 @@ index 000000000000..08d7001f7217 + bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); +} + ++static 
noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, ++ struct btree_insert_entry *i, ++ struct printbuf *err) ++{ ++ struct bch_fs *c = trans->c; ++ int rw = (trans->flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE; ++ ++ printbuf_reset(err); ++ prt_printf(err, "invalid bkey on insert from %s -> %ps", ++ trans->fn, (void *) i->ip_allocated); ++ prt_newline(err); ++ printbuf_indent_add(err, 2); ++ ++ bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k)); ++ prt_newline(err); ++ ++ bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), ++ i->bkey_type, rw, err); ++ bch2_print_string_as_lines(KERN_ERR, err->buf); ++ ++ bch2_inconsistent_error(c); ++ bch2_dump_trans_updates(trans); ++ printbuf_exit(err); ++ ++ return -EINVAL; ++} ++ +/* + * Get journal reservation, take write locks, and attempt to do btree update(s): + */ @@ -30800,24 +31663,9 @@ index 000000000000..08d7001f7217 + int rw = (trans->flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE; + + trans_for_each_update(trans, i) { -+ if (bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), -+ i->bkey_type, rw, &buf)) { -+ printbuf_reset(&buf); -+ prt_printf(&buf, "invalid bkey on insert from %s -> %ps", -+ trans->fn, (void *) i->ip_allocated); -+ prt_newline(&buf); -+ printbuf_indent_add(&buf, 2); -+ -+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); -+ prt_newline(&buf); -+ -+ bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), -+ i->bkey_type, rw, &buf); -+ -+ bch2_trans_inconsistent(trans, "%s", buf.buf); -+ printbuf_exit(&buf); -+ return -EINVAL; -+ } ++ if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), ++ i->bkey_type, rw, &buf))) ++ return bch2_trans_commit_bkey_invalid(trans, i, &buf); + btree_insert_entry_checks(trans, i); + } + @@ -31411,11 +32259,41 @@ index 000000000000..08d7001f7217 +static int __must_check +bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *path, + struct bkey_i *k, enum btree_update_flags flags, ++ unsigned long ip); ++ ++static noinline int 
flush_new_cached_update(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree_insert_entry *i, ++ enum btree_update_flags flags, ++ unsigned long ip) ++{ ++ struct btree_path *btree_path; ++ int ret; ++ ++ i->key_cache_already_flushed = true; ++ i->flags |= BTREE_TRIGGER_NORUN; ++ ++ btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, ++ BTREE_ITER_INTENT, _THIS_IP_); ++ ++ ret = bch2_btree_path_traverse(trans, btree_path, 0); ++ if (ret) ++ goto err; ++ ++ btree_path_set_should_be_locked(btree_path); ++ ret = bch2_trans_update_by_path_trace(trans, btree_path, i->k, flags, ip); ++err: ++ bch2_path_put(trans, btree_path, true); ++ return ret; ++} ++ ++static int __must_check ++bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *path, ++ struct bkey_i *k, enum btree_update_flags flags, + unsigned long ip) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i, n; -+ int ret = 0; + + BUG_ON(!path->should_be_locked); + @@ -31484,27 +32362,10 @@ index 000000000000..08d7001f7217 + * the key cache - but the key has to exist in the btree for that to + * work: + */ -+ if (path->cached && -+ bkey_deleted(&i->old_k)) { -+ struct btree_path *btree_path; ++ if (unlikely(path->cached && bkey_deleted(&i->old_k))) ++ return flush_new_cached_update(trans, path, i, flags, ip); + -+ i->key_cache_already_flushed = true; -+ i->flags |= BTREE_TRIGGER_NORUN; -+ -+ btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, -+ BTREE_ITER_INTENT, _THIS_IP_); -+ -+ ret = bch2_btree_path_traverse(trans, btree_path, 0); -+ if (ret) -+ goto err; -+ -+ btree_path_set_should_be_locked(btree_path); -+ ret = bch2_trans_update_by_path_trace(trans, btree_path, k, flags, ip); -+err: -+ bch2_path_put(trans, btree_path, true); -+ } -+ -+ return ret; ++ return 0; +} + +static int __must_check @@ -31760,10 +32621,10 @@ index 000000000000..08d7001f7217 +} diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c new file mode 
100644 -index 000000000000..8af0dd022fda +index 000000000000..bf01837e1362 --- /dev/null +++ b/fs/bcachefs/buckets.c -@@ -0,0 +1,2113 @@ +@@ -0,0 +1,2117 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Code for manipulating bucket marks for garbage collection. @@ -31855,20 +32716,17 @@ index 000000000000..8af0dd022fda + : ca->usage[journal_seq & JOURNAL_BUF_MASK]); +} + -+struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) ++void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage) +{ + struct bch_fs *c = ca->fs; -+ struct bch_dev_usage ret; + unsigned seq, i, u64s = dev_usage_u64s(); + + do { + seq = read_seqcount_begin(&c->usage_lock); -+ memcpy(&ret, ca->usage_base, u64s * sizeof(u64)); ++ memcpy(usage, ca->usage_base, u64s * sizeof(u64)); + for (i = 0; i < ARRAY_SIZE(ca->usage); i++) -+ acc_u64s_percpu((u64 *) &ret, (u64 __percpu *) ca->usage[i], u64s); ++ acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage[i], u64s); + } while (read_seqcount_retry(&c->usage_lock, seq)); -+ -+ return ret; +} + +static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, @@ -32341,10 +33199,11 @@ index 000000000000..8af0dd022fda + if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && + old_a.cached_sectors) { + ret = update_cached_sectors(c, new, ca->dev_idx, -+ -old_a.cached_sectors, ++ -((s64) old_a.cached_sectors), + journal_seq, gc); + if (ret) { -+ bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors"); ++ bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors", ++ __func__); + return ret; + } + } @@ -32440,6 +33299,10 @@ index 000000000000..8af0dd022fda + if (bucket_data_type == BCH_DATA_cached) + bucket_data_type = BCH_DATA_user; + ++ if ((bucket_data_type == BCH_DATA_stripe && ptr_data_type == BCH_DATA_user) || ++ (bucket_data_type == BCH_DATA_user && ptr_data_type == BCH_DATA_stripe)) ++ bucket_data_type = ptr_data_type = BCH_DATA_stripe; ++ + if (gen_after(ptr->gen, 
b_gen)) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" @@ -32685,7 +33548,7 @@ index 000000000000..8af0dd022fda +{ + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; -+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; @@ -32724,7 +33587,8 @@ index 000000000000..8af0dd022fda + ret = update_cached_sectors(c, k, p.ptr.dev, + disk_sectors, journal_seq, true); + if (ret) { -+ bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors"); ++ bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors", ++ __func__); + return ret; + } + } @@ -32752,7 +33616,7 @@ index 000000000000..8af0dd022fda + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); -+ bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); ++ bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf); + printbuf_exit(&buf); + return ret; + } @@ -32877,10 +33741,10 @@ index 000000000000..8af0dd022fda + u64 journal_seq = trans->journal_res.seq; + + if (flags & BTREE_TRIGGER_INSERT) { -+ struct bch_inode_v2 *v = (struct bch_inode_v2 *) new.v; ++ struct bch_inode_v3 *v = (struct bch_inode_v3 *) new.v; + + BUG_ON(!journal_seq); -+ BUG_ON(new.k->type != KEY_TYPE_inode_v2); ++ BUG_ON(new.k->type != KEY_TYPE_inode_v3); + + v->bi_journal_seq = cpu_to_le64(journal_seq); + } @@ -32904,7 +33768,7 @@ index 000000000000..8af0dd022fda + unsigned flags) +{ + struct bch_fs *c = trans->c; -+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? 
old : new; + struct bch_fs_usage __percpu *fs_usage; + unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; + s64 sectors = (s64) k.k->size; @@ -32983,7 +33847,7 @@ index 000000000000..8af0dd022fda + unsigned flags) +{ + struct bch_fs *c = trans->c; -+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new; + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + struct reflink_gc *ref; + size_t l, r, m; @@ -33026,23 +33890,24 @@ index 000000000000..8af0dd022fda + struct btree_insert_entry *i; + struct printbuf buf = PRINTBUF; + -+ bch_err(c, "disk usage increased %lli more than %u sectors reserved", -+ should_not_have_added, disk_res_sectors); ++ prt_printf(&buf, ++ bch2_fmt(c, "disk usage increased %lli more than %u sectors reserved)"), ++ should_not_have_added, disk_res_sectors); + + trans_for_each_update(trans, i) { + struct bkey_s_c old = { &i->old_k, i->old_v }; + -+ pr_err("while inserting"); -+ printbuf_reset(&buf); ++ prt_str(&buf, "new "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); -+ pr_err(" %s", buf.buf); -+ pr_err("overlapping with"); -+ printbuf_reset(&buf); ++ prt_newline(&buf); ++ ++ prt_str(&buf, "old "); + bch2_bkey_val_to_text(&buf, c, old); -+ pr_err(" %s", buf.buf); ++ prt_newline(&buf); + } + + __WARN(); ++ bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); +} + @@ -33712,7 +34577,7 @@ index 000000000000..8af0dd022fda + +#define SECTORS_CACHE 1024 + -+int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, ++int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, + u64 sectors, int flags) +{ + struct bch_fs_pcpu *pcpu; @@ -33875,14 +34740,14 @@ index 000000000000..8af0dd022fda + return -ENOMEM; + } + -+ return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);; ++ return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets); +} diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h 
new file mode 100644 -index 000000000000..6881502d95f1 +index 000000000000..01c706b73cee --- /dev/null +++ b/fs/bcachefs/buckets.h -@@ -0,0 +1,300 @@ +@@ -0,0 +1,326 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Code for manipulating bucket marks for garbage collection. @@ -34024,7 +34889,15 @@ index 000000000000..6881502d95f1 + +/* Device usage: */ + -+struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); ++void bch2_dev_usage_read_fast(struct bch_dev *, struct bch_dev_usage *); ++static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) ++{ ++ struct bch_dev_usage ret; ++ ++ bch2_dev_usage_read_fast(ca, &ret); ++ return ret; ++} ++ +void bch2_dev_usage_init(struct bch_dev *); + +static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum alloc_reserve reserve) @@ -34125,8 +34998,6 @@ index 000000000000..6881502d95f1 +int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); + -+int bch2_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); -+ +int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); + +int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, @@ -34138,15 +35009,35 @@ index 000000000000..6881502d95f1 +static inline void bch2_disk_reservation_put(struct bch_fs *c, + struct disk_reservation *res) +{ -+ this_cpu_sub(*c->online_reserved, res->sectors); -+ res->sectors = 0; ++ if (res->sectors) { ++ this_cpu_sub(*c->online_reserved, res->sectors); ++ res->sectors = 0; ++ } +} + +#define BCH_DISK_RESERVATION_NOFAIL (1 << 0) + -+int bch2_disk_reservation_add(struct bch_fs *, -+ struct disk_reservation *, -+ u64, int); ++int __bch2_disk_reservation_add(struct bch_fs *, ++ struct disk_reservation *, ++ u64, int); ++ ++static inline int bch2_disk_reservation_add(struct 
bch_fs *c, struct disk_reservation *res, ++ u64 sectors, int flags) ++{ ++ u64 old, new; ++ ++ do { ++ old = this_cpu_read(c->pcpu->sectors_available); ++ if (sectors > old) ++ return __bch2_disk_reservation_add(c, res, sectors, flags); ++ ++ new = old - sectors; ++ } while (this_cpu_cmpxchg(c->pcpu->sectors_available, old, new) != old); ++ ++ this_cpu_add(*c->online_reserved, sectors); ++ res->sectors += sectors; ++ return 0; ++} + +static inline struct disk_reservation +bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas) @@ -35320,7 +36211,7 @@ index 000000000000..3a4890d39ff9 +#endif /* _BCACHEFS_CHARDEV_H */ diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c new file mode 100644 -index 000000000000..b5850a761b91 +index 000000000000..43d22fe8131b --- /dev/null +++ b/fs/bcachefs/checksum.c @@ -0,0 +1,712 @@ @@ -35457,7 +36348,7 @@ index 000000000000..b5850a761b91 + size_t orig_len = len; + int ret, i; + -+ sg = kmalloc_array(sizeof(*sg), pages, GFP_KERNEL); ++ sg = kmalloc_array(pages, sizeof(*sg), GFP_KERNEL); + if (!sg) + return -ENOMEM; + @@ -35642,7 +36533,7 @@ index 000000000000..b5850a761b91 + return __bch2_checksum_bio(c, type, nonce, bio, &iter); +} + -+int bch2_encrypt_bio(struct bch_fs *c, unsigned type, ++int __bch2_encrypt_bio(struct bch_fs *c, unsigned type, + struct nonce nonce, struct bio *bio) +{ + struct bio_vec bv; @@ -36038,10 +36929,10 @@ index 000000000000..b5850a761b91 +} diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h new file mode 100644 -index 000000000000..c86c3c05d620 +index 000000000000..f7ccef7a5520 --- /dev/null +++ b/fs/bcachefs/checksum.h -@@ -0,0 +1,204 @@ +@@ -0,0 +1,212 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_CHECKSUM_H +#define _BCACHEFS_CHECKSUM_H @@ -36105,8 +36996,16 @@ index 000000000000..c86c3c05d620 + struct bch_extent_crc_unpacked *, + unsigned, unsigned, unsigned); + -+int bch2_encrypt_bio(struct bch_fs *, unsigned, -+ struct nonce, struct bio *); ++int 
__bch2_encrypt_bio(struct bch_fs *, unsigned, ++ struct nonce, struct bio *); ++ ++static inline int bch2_encrypt_bio(struct bch_fs *c, unsigned type, ++ struct nonce nonce, struct bio *bio) ++{ ++ return bch2_csum_type_is_encryption(type) ++ ? __bch2_encrypt_bio(c, type, nonce, bio) ++ : 0; ++} + +int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, + struct bch_key *); @@ -36122,15 +37021,15 @@ index 000000000000..c86c3c05d620 +{ + switch (type) { + case BCH_CSUM_OPT_none: -+ return BCH_CSUM_none; ++ return BCH_CSUM_none; + case BCH_CSUM_OPT_crc32c: -+ return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero; ++ return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero; + case BCH_CSUM_OPT_crc64: -+ return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero; ++ return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero; + case BCH_CSUM_OPT_xxhash: -+ return BCH_CSUM_xxhash; ++ return BCH_CSUM_xxhash; + default: -+ BUG(); ++ BUG(); + } +} + @@ -36532,7 +37431,7 @@ index 000000000000..5fae0012d808 +#endif /* _BCACHEFS_CLOCK_TYPES_H */ diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c new file mode 100644 -index 000000000000..f692f35a6a98 +index 000000000000..2b7080b67eca --- /dev/null +++ b/fs/bcachefs/compress.c @@ -0,0 +1,639 @@ @@ -36915,7 +37814,7 @@ index 000000000000..f692f35a6a98 + + /* If it's only one block, don't bother trying to compress: */ + if (src->bi_iter.bi_size <= c->opts.block_size) -+ return 0; ++ return BCH_COMPRESSION_TYPE_incompressible; + + dst_data = bio_map_or_bounce(c, dst, WRITE); + src_data = bio_map_or_bounce(c, src, READ); @@ -37420,10 +38319,10 @@ index 000000000000..519ab9b96e67 +#endif /* _BCACHEFS_DARRAY_H */ diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c new file mode 100644 -index 000000000000..cb25efb68d3f +index 000000000000..d304c6cf77c6 --- /dev/null +++ b/fs/bcachefs/data_update.c -@@ -0,0 +1,373 @@ +@@ -0,0 +1,387 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ 
-37523,14 +38422,13 @@ index 000000000000..cb25efb68d3f + ptr->cached = true; +} + -+static int bch2_data_update_index_update(struct bch_write_op *op) ++int bch2_data_update_index_update(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct btree_trans trans; + struct btree_iter iter; + struct data_update *m = + container_of(op, struct data_update, op); -+ struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets); + struct keylist *keys = &op->insert_keys; + struct bkey_buf _new, _insert; + int ret = 0; @@ -37652,15 +38550,12 @@ index 000000000000..cb25efb68d3f + bch2_trans_update(&trans, &iter, insert, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_trans_commit(&trans, &op->res, -+ op_journal_seq(op), ++ &op->journal_seq, + BTREE_INSERT_NOFAIL| + m->data_opts.btree_insert_flags); + if (!ret) { + bch2_btree_iter_set_pos(&iter, next_pos); + -+ if (ec_ob) -+ bch2_ob_add_backpointer(c, ec_ob, &insert->k); -+ + this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size); + trace_move_extent_finish(&new->k); + } @@ -37700,8 +38595,7 @@ index 000000000000..cb25efb68d3f +} + +void bch2_data_update_read_done(struct data_update *m, -+ struct bch_extent_crc_unpacked crc, -+ struct closure *cl) ++ struct bch_extent_crc_unpacked crc) +{ + /* write bio must own pages: */ + BUG_ON(!m->op.wbio.bio.bi_vcnt); @@ -37709,7 +38603,7 @@ index 000000000000..cb25efb68d3f + m->op.crc = crc; + m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9; + -+ closure_call(&m->op.cl, bch2_write, NULL, cl); ++ closure_call(&m->op.cl, bch2_write, NULL, NULL); +} + +void bch2_data_update_exit(struct data_update *update) @@ -37742,24 +38636,25 @@ index 000000000000..cb25efb68d3f + bch2_write_op_init(&m->op, c, io_opts); + m->op.pos = bkey_start_pos(k.k); + m->op.version = k.k->version; -+ m->op.target = data_opts.target, ++ m->op.target = data_opts.target; + m->op.write_point = wp; + m->op.flags |= BCH_WRITE_PAGES_STABLE| + BCH_WRITE_PAGES_OWNED| + 
BCH_WRITE_DATA_ENCODED| + BCH_WRITE_FROM_INTERNAL| ++ BCH_WRITE_MOVE| + m->data_opts.write_flags; + m->op.compression_type = + bch2_compression_opt_to_type[io_opts.background_compression ?: + io_opts.compression]; + if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) + m->op.alloc_reserve = RESERVE_movinggc; -+ m->op.index_update_fn = bch2_data_update_index_update; + + i = 0; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ if (p.ptr.cached) -+ m->data_opts.rewrite_ptrs &= ~(1U << i); ++ if (((1U << i) & m->data_opts.rewrite_ptrs) && ++ p.ptr.cached) ++ BUG(); + + if (!((1U << i) & m->data_opts.rewrite_ptrs)) + bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); @@ -37795,14 +38690,32 @@ index 000000000000..cb25efb68d3f + + m->op.nr_replicas = m->op.nr_replicas_required = + hweight32(m->data_opts.rewrite_ptrs) + m->data_opts.extra_replicas; ++ ++ BUG_ON(!m->op.nr_replicas); + return 0; +} ++ ++void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ unsigned i = 0; ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ if ((opts->rewrite_ptrs & (1U << i)) && ptr->cached) { ++ opts->kill_ptrs |= 1U << i; ++ opts->rewrite_ptrs ^= 1U << i; ++ } ++ ++ i++; ++ } ++} diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h new file mode 100644 -index 000000000000..e64505453a55 +index 000000000000..5d8690795959 --- /dev/null +++ b/fs/bcachefs/data_update.h -@@ -0,0 +1,38 @@ +@@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _BCACHEFS_DATA_UPDATE_H @@ -37815,6 +38728,7 @@ index 000000000000..e64505453a55 + +struct data_update_opts { + unsigned rewrite_ptrs; ++ unsigned kill_ptrs; + u16 target; + u8 extra_replicas; + unsigned btree_insert_flags; @@ -37830,23 +38744,25 @@ index 000000000000..e64505453a55 + struct bch_write_op op; +}; + ++int bch2_data_update_index_update(struct bch_write_op *); ++ +void 
bch2_data_update_read_done(struct data_update *, -+ struct bch_extent_crc_unpacked, -+ struct closure *); ++ struct bch_extent_crc_unpacked); + +void bch2_data_update_exit(struct data_update *); +int bch2_data_update_init(struct bch_fs *, struct data_update *, + struct write_point_specifier, + struct bch_io_opts, struct data_update_opts, + enum btree_id, struct bkey_s_c); ++void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *); + +#endif /* _BCACHEFS_DATA_UPDATE_H */ diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c new file mode 100644 -index 000000000000..1d2a16155073 +index 000000000000..57602c8e6c34 --- /dev/null +++ b/fs/bcachefs/debug.c -@@ -0,0 +1,831 @@ +@@ -0,0 +1,811 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Assorted bcachefs debug code @@ -38326,7 +39242,7 @@ index 000000000000..1d2a16155073 + if (i->iter < tbl->size) { + rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash) + bch2_cached_btree_node_to_text(&i->buf, c, b); -+ i->iter++;; ++ i->iter++; + } else { + done = true; + } @@ -38350,26 +39266,6 @@ index 000000000000..1d2a16155073 + .read = bch2_cached_btree_nodes_read, +}; + -+static int prt_backtrace(struct printbuf *out, struct task_struct *task) -+{ -+ unsigned long entries[32]; -+ unsigned i, nr_entries; -+ int ret; -+ -+ ret = down_read_killable(&task->signal->exec_update_lock); -+ if (ret) -+ return ret; -+ -+ nr_entries = stack_trace_save_tsk(task, entries, ARRAY_SIZE(entries), 0); -+ for (i = 0; i < nr_entries; i++) { -+ prt_printf(out, "[<0>] %pB", (void *)entries[i]); -+ prt_newline(out); -+ } -+ -+ up_read(&task->signal->exec_update_lock); -+ return 0; -+} -+ +static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ @@ -38396,7 +39292,7 @@ index 000000000000..1d2a16155073 + prt_printf(&i->buf, "backtrace:"); + prt_newline(&i->buf); + printbuf_indent_add(&i->buf, 2); -+ prt_backtrace(&i->buf, trans->locking_wait.task); ++ 
bch2_prt_backtrace(&i->buf, trans->locking_wait.task); + printbuf_indent_sub(&i->buf, 2); + prt_newline(&i->buf); + @@ -38506,11 +39402,11 @@ index 000000000000..1d2a16155073 + if (!i->size) + break; + -+ if (i->iter == ARRAY_SIZE(c->btree_transaction_fns) || -+ !c->btree_transaction_fns[i->iter]) ++ if (i->iter == ARRAY_SIZE(bch2_btree_transaction_fns) || ++ !bch2_btree_transaction_fns[i->iter]) + break; + -+ prt_printf(&i->buf, "%s: ", c->btree_transaction_fns[i->iter]); ++ prt_printf(&i->buf, "%s: ", bch2_btree_transaction_fns[i->iter]); + prt_newline(&i->buf); + printbuf_indent_add(&i->buf, 2); + @@ -38716,7 +39612,7 @@ index 000000000000..0b86736e5e1b +#endif /* _BCACHEFS_DEBUG_H */ diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c new file mode 100644 -index 000000000000..4d942d224a08 +index 000000000000..288f46b55876 --- /dev/null +++ b/fs/bcachefs/dirent.c @@ -0,0 +1,565 @@ @@ -38825,7 +39721,7 @@ index 000000000000..4d942d224a08 + + if (bkey_val_u64s(k.k) > dirent_val_u64s(len)) { + prt_printf(err, "value too big (%zu > %u)", -+ bkey_val_u64s(k.k),dirent_val_u64s(len)); ++ bkey_val_u64s(k.k), dirent_val_u64s(len)); + return -EINVAL; + } + @@ -39287,7 +40183,7 @@ index 000000000000..4d942d224a08 +} diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h new file mode 100644 -index 000000000000..b1466932c768 +index 000000000000..1a2c9108f864 --- /dev/null +++ b/fs/bcachefs/dirent.h @@ -0,0 +1,67 @@ @@ -39302,10 +40198,10 @@ index 000000000000..b1466932c768 +int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + -+#define bch2_bkey_ops_dirent (struct bkey_ops) { \ ++#define bch2_bkey_ops_dirent ((struct bkey_ops) { \ + .key_invalid = bch2_dirent_invalid, \ + .val_to_text = bch2_dirent_to_text, \ -+} ++}) + +struct qstr; +struct file; @@ -39968,16 +40864,17 @@ index 000000000000..e4470c357a66 +#endif /* _BCACHEFS_DISK_GROUPS_H */ diff 
--git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c new file mode 100644 -index 000000000000..aa8301146382 +index 000000000000..dfe37965d516 --- /dev/null +++ b/fs/bcachefs/ec.c -@@ -0,0 +1,1673 @@ +@@ -0,0 +1,1680 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* erasure coding */ + +#include "bcachefs.h" +#include "alloc_foreground.h" ++#include "backpointers.h" +#include "bkey_buf.h" +#include "bset.h" +#include "btree_gc.h" @@ -40794,17 +41691,13 @@ index 000000000000..aa8301146382 +static int ec_stripe_update_extent(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, -+ struct ec_stripe_buf *s, -+ struct bpos end) ++ struct ec_stripe_buf *s) +{ + const struct bch_extent_ptr *ptr_c; + struct bch_extent_ptr *ptr, *ec_ptr = NULL; + struct bkey_i *n; + int ret, dev, block; + -+ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) -+ return 1; -+ + if (extent_has_stripe_ptr(k, s->key.k.p.offset)) + return 0; + @@ -40834,19 +41727,74 @@ index 000000000000..aa8301146382 + return bch2_trans_update(trans, iter, n, 0); +} + -+static int ec_stripe_update_extents(struct bch_fs *c, -+ struct ec_stripe_buf *s, -+ struct bkey *pos) ++static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s, ++ unsigned block) +{ ++ struct bch_fs *c = trans->c; ++ struct bch_extent_ptr bucket = s->key.v.ptrs[block]; ++ struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket); ++ struct bch_backpointer bp; + struct btree_iter iter; + struct bkey_s_c k; ++ u64 bp_offset = 0; ++ int ret = 0; ++retry: ++ while (1) { ++ bch2_trans_begin(trans); + -+ return bch2_trans_run(c, -+ for_each_btree_key_commit(&trans, iter, -+ BTREE_ID_extents, bkey_start_pos(pos), -+ BTREE_ITER_NOT_EXTENTS|BTREE_ITER_INTENT, k, -+ NULL, NULL, BTREE_INSERT_NOFAIL, -+ ec_stripe_update_extent(&trans, &iter, k, s, pos->p))); ++ ret = bch2_get_next_backpointer(trans, bucket_pos, bucket.gen, ++ &bp_offset, &bp, ++ BTREE_ITER_CACHED); ++ if (ret) ++ break; ++ if (bp_offset == U64_MAX) ++ break; ++ ++ 
if (bch2_fs_inconsistent_on(bp.level, c, "found btree node in erasure coded bucket!?")) { ++ ret = -EIO; ++ break; ++ } ++ ++ k = bch2_backpointer_get_key(trans, &iter, bucket_pos, bp_offset, bp); ++ ret = bkey_err(k); ++ if (ret) ++ break; ++ if (!k.k) ++ continue; ++ ++ ret = ec_stripe_update_extent(trans, &iter, k, s); ++ bch2_trans_iter_exit(trans, &iter); ++ if (ret) ++ break; ++ ++ bp_offset++; ++ } ++ ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ return ret; ++} ++ ++static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) ++{ ++ struct btree_trans trans; ++ struct bch_stripe *v = &s->key.v; ++ unsigned i, nr_data = v->nr_blocks - v->nr_redundant; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < nr_data; i++) { ++ ret = ec_stripe_update_bucket(&trans, s, i); ++ if (ret) ++ break; ++ } ++ ++ ++ bch2_trans_exit(&trans); ++ ++ return ret; +} + +/* @@ -40856,7 +41804,6 @@ index 000000000000..aa8301146382 +{ + struct bch_fs *c = s->c; + struct open_bucket *ob; -+ struct bkey_i *k; + struct stripe *m; + struct bch_stripe *v = &s->new_stripe.key.v; + unsigned i, nr_data = v->nr_blocks - v->nr_redundant; @@ -40916,14 +41863,10 @@ index 000000000000..aa8301146382 + goto err_put_writes; + } + -+ for_each_keylist_key(&s->keys, k) { -+ ret = ec_stripe_update_extents(c, &s->new_stripe, &k->k); -+ if (ret) { -+ bch_err(c, "error creating stripe: error updating pointers: %s", -+ bch2_err_str(ret)); -+ break; -+ } -+ } ++ ret = ec_stripe_update_extents(c, &s->new_stripe); ++ if (ret) ++ bch_err(c, "error creating stripe: error updating pointers: %s", ++ bch2_err_str(ret)); + + spin_lock(&c->ec_stripes_heap_lock); + m = genradix_ptr(&c->stripes, s->new_stripe.key.k.p.offset); @@ -40948,8 +41891,6 @@ index 000000000000..aa8301146382 + } + } + -+ bch2_keylist_free(&s->keys, s->inline_keys); -+ + ec_stripe_buf_exit(&s->existing_stripe); + ec_stripe_buf_exit(&s->new_stripe); + 
closure_debug_destroy(&s->iodone); @@ -41032,30 +41973,6 @@ index 000000000000..aa8301146382 + return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); +} + -+void bch2_ob_add_backpointer(struct bch_fs *c, struct open_bucket *ob, -+ struct bkey *k) -+{ -+ struct ec_stripe_new *ec = ob->ec; -+ -+ if (!ec) -+ return; -+ -+ mutex_lock(&ec->lock); -+ -+ if (bch2_keylist_realloc(&ec->keys, ec->inline_keys, -+ ARRAY_SIZE(ec->inline_keys), -+ BKEY_U64s)) { -+ BUG(); -+ } -+ -+ bkey_init(&ec->keys.top->k); -+ ec->keys.top->k.p = k->p; -+ ec->keys.top->k.size = k->size; -+ bch2_keylist_push(&ec->keys); -+ -+ mutex_unlock(&ec->lock); -+} -+ +static int unsigned_cmp(const void *_l, const void *_r) +{ + unsigned l = *((const unsigned *) _l); @@ -41148,8 +42065,6 @@ index 000000000000..aa8301146382 + BCH_BKEY_PTRS_MAX) - h->redundancy; + s->nr_parity = h->redundancy; + -+ bch2_keylist_init(&s->keys, s->inline_keys); -+ + ec_stripe_key_init(c, &s->new_stripe.key, s->nr_data, + s->nr_parity, h->blocksize); + @@ -41360,10 +42275,8 @@ index 000000000000..aa8301146382 + int ret; + + idx = get_existing_stripe(c, h); -+ if (idx < 0) { -+ bch_err(c, "failed to find an existing stripe"); ++ if (idx < 0) + return -BCH_ERR_ENOSPC_stripe_reuse; -+ } + + h->s->have_existing_stripe = true; + ret = get_stripe_key(c, idx, &h->s->existing_stripe); @@ -41401,21 +42314,9 @@ index 000000000000..aa8301146382 +static int __bch2_ec_stripe_head_reserve(struct bch_fs *c, + struct ec_stripe_head *h) +{ -+ int ret; -+ -+ ret = bch2_disk_reservation_get(c, &h->s->res, -+ h->blocksize, -+ h->s->nr_parity, 0); -+ -+ if (ret) { -+ /* -+ * This means we need to wait for copygc to -+ * empty out buckets from existing stripes: -+ */ -+ bch_err(c, "failed to reserve stripe"); -+ } -+ -+ return ret; ++ return bch2_disk_reservation_get(c, &h->s->res, ++ h->blocksize, ++ h->s->nr_parity, 0); +} + +struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, @@ -41457,8 +42358,10 @@ index 
000000000000..aa8301146382 + ret = __bch2_ec_stripe_head_reserve(c, h); + if (ret && needs_stripe_new) + ret = __bch2_ec_stripe_head_reuse(c, h); -+ if (ret) ++ if (ret) { ++ bch_err_ratelimited(c, "failed to get stripe: %s", bch2_err_str(ret)); + goto err; ++ } + + if (!h->s->allocated) { + ret = new_stripe_alloc_buckets(c, h, cl); @@ -41647,30 +42550,29 @@ index 000000000000..aa8301146382 +} diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h new file mode 100644 -index 000000000000..a4c13d61af10 +index 000000000000..aba1e82bc889 --- /dev/null +++ b/fs/bcachefs/ec.h -@@ -0,0 +1,230 @@ +@@ -0,0 +1,224 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EC_H +#define _BCACHEFS_EC_H + +#include "ec_types.h" +#include "buckets_types.h" -+#include "keylist_types.h" + +int bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c, + int rw, struct printbuf *); +void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + -+#define bch2_bkey_ops_stripe (struct bkey_ops) { \ ++#define bch2_bkey_ops_stripe ((struct bkey_ops) { \ + .key_invalid = bch2_stripe_invalid, \ + .val_to_text = bch2_stripe_to_text, \ + .swab = bch2_ptr_swab, \ + .trans_trigger = bch2_trans_mark_stripe, \ + .atomic_trigger = bch2_mark_stripe, \ -+} ++}) + +static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) +{ @@ -41819,9 +42721,6 @@ index 000000000000..a4c13d61af10 + open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX]; + struct disk_reservation res; + -+ struct keylist keys; -+ u64 inline_keys[BKEY_U64s * 8]; -+ + struct ec_stripe_buf new_stripe; + struct ec_stripe_buf existing_stripe; +}; @@ -41849,8 +42748,6 @@ index 000000000000..a4c13d61af10 +int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *); + +void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); -+void bch2_ob_add_backpointer(struct bch_fs *, struct open_bucket *, -+ struct bkey *); + +void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *); +void 
bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); @@ -41935,10 +42832,10 @@ index 000000000000..edd93da663c1 +#endif /* _BCACHEFS_EC_TYPES_H */ diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c new file mode 100644 -index 000000000000..cc9ce0be356e +index 000000000000..dc906fc9176f --- /dev/null +++ b/fs/bcachefs/errcode.c -@@ -0,0 +1,62 @@ +@@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -41964,6 +42861,7 @@ index 000000000000..cc9ce0be356e +const char *bch2_err_str(int err) +{ + const char *errstr; ++ + err = abs(err); + + BUG_ON(err >= BCH_ERR_MAX); @@ -42003,10 +42901,10 @@ index 000000000000..cc9ce0be356e +} diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h new file mode 100644 -index 000000000000..fc0bb5f8873a +index 000000000000..9f293040b253 --- /dev/null +++ b/fs/bcachefs/errcode.h -@@ -0,0 +1,96 @@ +@@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ERRCODE_H +#define _BCACHEFS_ERRCODE_H @@ -42051,6 +42949,7 @@ index 000000000000..fc0bb5f8873a + x(BCH_ERR_transaction_restart, transaction_restart_key_cache_raced) \ + x(BCH_ERR_transaction_restart, transaction_restart_key_cache_realloced)\ + x(BCH_ERR_transaction_restart, transaction_restart_journal_preres_get) \ ++ x(BCH_ERR_transaction_restart, transaction_restart_split_race) \ + x(BCH_ERR_transaction_restart, transaction_restart_nested) \ + x(0, no_btree_node) \ + x(BCH_ERR_no_btree_node, no_btree_node_relock) \ @@ -42105,10 +43004,10 @@ index 000000000000..fc0bb5f8873a +#endif /* _BCACHFES_ERRCODE_H */ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c new file mode 100644 -index 000000000000..762abdf2f283 +index 000000000000..2fb5102ee31d --- /dev/null +++ b/fs/bcachefs/error.c -@@ -0,0 +1,218 @@ +@@ -0,0 +1,221 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "error.h" @@ -42215,7 +43114,7 @@ index 000000000000..762abdf2f283 +{ + struct fsck_err_state *s = NULL; + va_list args; -+ bool 
print = true, suppressing = false; ++ bool print = true, suppressing = false, inconsistent = false; + struct printbuf buf = PRINTBUF, *out = &buf; + int ret = -BCH_ERR_fsck_ignore; + @@ -42247,7 +43146,7 @@ index 000000000000..762abdf2f283 + if (c->opts.errors != BCH_ON_ERROR_continue || + !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { + prt_str(out, ", shutting down"); -+ bch2_inconsistent_error(c); ++ inconsistent = true; + ret = -BCH_ERR_fsck_errors_not_fixed; + } else if (flags & FSCK_CAN_FIX) { + prt_str(out, ", fixing"); @@ -42300,6 +43199,9 @@ index 000000000000..762abdf2f283 + + printbuf_exit(&buf); + ++ if (inconsistent) ++ bch2_inconsistent_error(c); ++ + if (ret == -BCH_ERR_fsck_fix) { + set_bit(BCH_FS_ERRORS_FIXED, &c->flags); + } else { @@ -42759,7 +43661,7 @@ index 000000000000..6f5cf449361a +#endif /* _BCACHEFS_EXTENT_UPDATE_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c new file mode 100644 -index 000000000000..2ca13014b9c4 +index 000000000000..9e2a4ed48b42 --- /dev/null +++ b/fs/bcachefs/extents.c @@ -0,0 +1,1324 @@ @@ -43057,7 +43959,7 @@ index 000000000000..2ca13014b9c4 + if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <= + lp.crc.uncompressed_size) { + /* can use left extent's crc entry */ -+ } else if (lp.crc.live_size <= rp.crc.offset ) { ++ } else if (lp.crc.live_size <= rp.crc.offset) { + /* can use right extent's crc entry */ + } else { + /* check if checksums can be merged: */ @@ -43116,7 +44018,7 @@ index 000000000000..2ca13014b9c4 + if (crc_l.offset + crc_l.live_size + crc_r.live_size <= + crc_l.uncompressed_size) { + /* can use left extent's crc entry */ -+ } else if (crc_l.live_size <= crc_r.offset ) { ++ } else if (crc_l.live_size <= crc_r.offset) { + /* can use right extent's crc entry */ + crc_r.offset -= crc_l.live_size; + bch2_extent_crc_pack(entry_to_crc(en_l), crc_r, @@ -44089,10 +44991,10 @@ index 000000000000..2ca13014b9c4 +} diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h new file mode 100644 
-index 000000000000..3c17b81130bb +index 000000000000..224df17206cb --- /dev/null +++ b/fs/bcachefs/extents.h -@@ -0,0 +1,685 @@ +@@ -0,0 +1,689 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EXTENTS_H +#define _BCACHEFS_EXTENTS_H @@ -44293,6 +45195,7 @@ index 000000000000..3c17b81130bb + switch (k.k->type) { + case KEY_TYPE_btree_ptr: { + struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); ++ + return (struct bkey_ptrs_c) { + to_entry(&e.v->start[0]), + to_entry(extent_entry_last(e)) @@ -44300,6 +45203,7 @@ index 000000000000..3c17b81130bb + } + case KEY_TYPE_extent: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ + return (struct bkey_ptrs_c) { + e.v->start, + extent_entry_last(e) @@ -44307,6 +45211,7 @@ index 000000000000..3c17b81130bb + } + case KEY_TYPE_stripe: { + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); ++ + return (struct bkey_ptrs_c) { + to_entry(&s.v->ptrs[0]), + to_entry(&s.v->ptrs[s.v->nr_blocks]), @@ -44322,6 +45227,7 @@ index 000000000000..3c17b81130bb + } + case KEY_TYPE_btree_ptr_v2: { + struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k); ++ + return (struct bkey_ptrs_c) { + to_entry(&e.v->start[0]), + to_entry(extent_entry_last(e)) @@ -44437,7 +45343,7 @@ index 000000000000..3c17b81130bb + +#define extent_for_each_entry_from(_e, _entry, _start) \ + __bkey_extent_entry_for_each_from(_start, \ -+ extent_entry_last(_e),_entry) ++ extent_entry_last(_e), _entry) + +#define extent_for_each_entry(_e, _entry) \ + extent_for_each_entry_from(_e, _entry, (_e).v->start) @@ -44471,28 +45377,28 @@ index 000000000000..3c17b81130bb +void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, + int, struct bkey_s); + -+#define bch2_bkey_ops_btree_ptr (struct bkey_ops) { \ ++#define bch2_bkey_ops_btree_ptr ((struct bkey_ops) { \ + .key_invalid = bch2_btree_ptr_invalid, \ + .val_to_text = bch2_btree_ptr_to_text, \ + .swab = bch2_ptr_swab, \ + .trans_trigger = bch2_trans_mark_extent, \ + .atomic_trigger = 
bch2_mark_extent, \ -+} ++}) + -+#define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \ ++#define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) { \ + .key_invalid = bch2_btree_ptr_v2_invalid, \ + .val_to_text = bch2_btree_ptr_v2_to_text, \ + .swab = bch2_ptr_swab, \ + .compat = bch2_btree_ptr_v2_compat, \ + .trans_trigger = bch2_trans_mark_extent, \ + .atomic_trigger = bch2_mark_extent, \ -+} ++}) + +/* KEY_TYPE_extent: */ + +bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + -+#define bch2_bkey_ops_extent (struct bkey_ops) { \ ++#define bch2_bkey_ops_extent ((struct bkey_ops) { \ + .key_invalid = bch2_bkey_ptrs_invalid, \ + .val_to_text = bch2_bkey_ptrs_to_text, \ + .swab = bch2_ptr_swab, \ @@ -44500,7 +45406,7 @@ index 000000000000..3c17b81130bb + .key_merge = bch2_extent_merge, \ + .trans_trigger = bch2_trans_mark_extent, \ + .atomic_trigger = bch2_mark_extent, \ -+} ++}) + +/* KEY_TYPE_reservation: */ + @@ -44509,13 +45415,13 @@ index 000000000000..3c17b81130bb +void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + -+#define bch2_bkey_ops_reservation (struct bkey_ops) { \ ++#define bch2_bkey_ops_reservation ((struct bkey_ops) { \ + .key_invalid = bch2_reservation_invalid, \ + .val_to_text = bch2_reservation_to_text, \ + .key_merge = bch2_reservation_merge, \ + .trans_trigger = bch2_trans_mark_reservation, \ + .atomic_trigger = bch2_mark_reservation, \ -+} ++}) + +/* Extent checksum entries: */ + @@ -45113,7 +46019,7 @@ index 000000000000..05429c9631cd +#endif /* _EYTZINGER_H */ diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h new file mode 100644 -index 000000000000..cdb272708a4b +index 000000000000..66b945be10c2 --- /dev/null +++ b/fs/bcachefs/fifo.h @@ -0,0 +1,127 @@ @@ -45184,7 +46090,7 @@ index 000000000000..cdb272708a4b + (((p) - (fifo)->data))) + +#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & 
(fifo)->mask) -+#define fifo_idx_entry(fifo, i) (fifo)->data[((fifo)->front + (i)) & (fifo)->mask] ++#define fifo_idx_entry(fifo, i) ((fifo)->data[((fifo)->front + (i)) & (fifo)->mask]) + +#define fifo_push_back_ref(f) \ + (fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask]) @@ -45246,10 +46152,10 @@ index 000000000000..cdb272708a4b +#endif /* _BCACHEFS_FIFO_H */ diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c new file mode 100644 -index 000000000000..53ffc684223c +index 000000000000..1f2e1fc4f6b2 --- /dev/null +++ b/fs/bcachefs/fs-common.c -@@ -0,0 +1,496 @@ +@@ -0,0 +1,501 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -45464,6 +46370,11 @@ index 000000000000..53ffc684223c + if (ret) + goto err; + ++ if (bch2_reinherit_attrs(inode_u, dir_u)) { ++ ret = -EXDEV; ++ goto err; ++ } ++ + dir_u->bi_mtime = dir_u->bi_ctime = now; + + dir_hash = bch2_hash_info_init(c, dir_u); @@ -45734,11 +46645,11 @@ index 000000000000..53ffc684223c + ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?: + (src_dir.inum != dst_dir.inum + ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u) -+ : 0 ) ?: ++ : 0) ?: + bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?: + (dst_inum.inum + ? 
bch2_inode_write(trans, &dst_inode_iter, dst_inode_u) -+ : 0 ); ++ : 0); +err: + bch2_trans_iter_exit(trans, &dst_inode_iter); + bch2_trans_iter_exit(trans, &src_inode_iter); @@ -45797,10 +46708,10 @@ index 000000000000..dde237859514 +#endif /* _BCACHEFS_FS_COMMON_H */ diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c new file mode 100644 -index 000000000000..7d45f4863469 +index 000000000000..fd3c3ea3ce18 --- /dev/null +++ b/fs/bcachefs/fs-io.c -@@ -0,0 +1,3421 @@ +@@ -0,0 +1,3577 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + @@ -45868,7 +46779,6 @@ index 000000000000..7d45f4863469 +}; + +struct bch_writepage_io { -+ struct closure cl; + struct bch_inode_info *inode; + + /* must be last: */ @@ -45876,8 +46786,9 @@ index 000000000000..7d45f4863469 +}; + +struct dio_write { -+ struct completion done; + struct kiocb *req; ++ struct address_space *mapping; ++ struct bch_inode_info *inode; + struct mm_struct *mm; + unsigned loop:1, + sync:1, @@ -45901,7 +46812,7 @@ index 000000000000..7d45f4863469 +}; + +/* pagecache_block must be held */ -+static int write_invalidate_inode_pages_range(struct address_space *mapping, ++static noinline int write_invalidate_inode_pages_range(struct address_space *mapping, + loff_t start, loff_t end) +{ + int ret; @@ -45954,7 +46865,7 @@ index 000000000000..7d45f4863469 +static int bch2_quota_reservation_add(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res, -+ unsigned sectors, ++ u64 sectors, + bool check_enospc) +{ + int ret; @@ -46409,7 +47320,7 @@ index 000000000000..7d45f4863469 +static int bch2_page_reservation_get(struct bch_fs *c, + struct bch_inode_info *inode, struct page *page, + struct bch2_page_reservation *res, -+ unsigned offset, unsigned len, bool check_enospc) ++ unsigned offset, unsigned len) +{ + struct bch_page_state *s = bch2_page_state_create(page, 0); + unsigned i, disk_sectors = 0, quota_sectors = 0; @@ -46429,19 +47340,14 @@ index 000000000000..7d45f4863469 + } + + 
if (disk_sectors) { -+ ret = bch2_disk_reservation_add(c, &res->disk, -+ disk_sectors, -+ !check_enospc -+ ? BCH_DISK_RESERVATION_NOFAIL -+ : 0); ++ ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0); + if (unlikely(ret)) + return ret; + } + + if (quota_sectors) { + ret = bch2_quota_reservation_add(c, inode, &res->quota, -+ quota_sectors, -+ check_enospc); ++ quota_sectors, true); + if (unlikely(ret)) { + struct disk_reservation tmp = { + .sectors = disk_sectors @@ -46625,7 +47531,7 @@ index 000000000000..7d45f4863469 + } + } + -+ if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) { ++ if (bch2_page_reservation_get(c, inode, page, &res, 0, len)) { + unlock_page(page); + ret = VM_FAULT_SIGBUS; + goto out; @@ -47008,18 +47914,10 @@ index 000000000000..7d45f4863469 + }; +} + -+static void bch2_writepage_io_free(struct closure *cl) ++static void bch2_writepage_io_done(struct bch_write_op *op) +{ -+ struct bch_writepage_io *io = container_of(cl, -+ struct bch_writepage_io, cl); -+ -+ bio_put(&io->op.wbio.bio); -+} -+ -+static void bch2_writepage_io_done(struct closure *cl) -+{ -+ struct bch_writepage_io *io = container_of(cl, -+ struct bch_writepage_io, cl); ++ struct bch_writepage_io *io = ++ container_of(op, struct bch_writepage_io, op); + struct bch_fs *c = io->op.c; + struct bio *bio = &io->op.wbio.bio; + struct bvec_iter_all iter; @@ -47081,7 +47979,7 @@ index 000000000000..7d45f4863469 + end_page_writeback(bvec->bv_page); + } + -+ closure_return_with_destructor(&io->cl, bch2_writepage_io_free); ++ bio_put(&io->op.wbio.bio); +} + +static void bch2_writepage_do_io(struct bch_writepage_state *w) @@ -47089,8 +47987,7 @@ index 000000000000..7d45f4863469 + struct bch_writepage_io *io = w->io; + + w->io = NULL; -+ closure_call(&io->op.cl, bch2_write, NULL, &io->cl); -+ continue_at(&io->cl, bch2_writepage_io_done, NULL); ++ closure_call(&io->op.cl, bch2_write, NULL, NULL); +} + +/* @@ -47112,9 +48009,7 @@ index 000000000000..7d45f4863469 + 
&c->writepage_bioset), + struct bch_writepage_io, op.wbio.bio); + -+ closure_init(&w->io->cl, NULL); + w->io->inode = inode; -+ + op = &w->io->op; + bch2_write_op_init(op, c, w->opts); + op->target = w->opts.foreground_target; @@ -47123,6 +48018,7 @@ index 000000000000..7d45f4863469 + op->write_point = writepoint_hashed(inode->ei_last_dirtied); + op->subvol = inode->ei_subvol; + op->pos = POS(inode->v.i_ino, sector); ++ op->end_io = bch2_writepage_io_done; + op->wbio.bio.bi_iter.bi_sector = sector; + op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); +} @@ -47246,7 +48142,8 @@ index 000000000000..7d45f4863469 + + /* Check for writing past i_size: */ + WARN_ON_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > -+ round_up(i_size, block_bytes(c))); ++ round_up(i_size, block_bytes(c)) && ++ !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); + + w->io->op.res.sectors += reserved_sectors; + w->io->op.i_sectors_delta -= dirty_sectors; @@ -47330,11 +48227,10 @@ index 000000000000..7d45f4863469 + if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { + ret = bch2_page_state_set(c, inode_inum(inode), &page, 1); + if (ret) -+ goto out; ++ goto err; + } + -+ ret = bch2_page_reservation_get(c, inode, page, res, -+ offset, len, true); ++ ret = bch2_page_reservation_get(c, inode, page, res, offset, len); + if (ret) { + if (!PageUptodate(page)) { + /* @@ -47475,10 +48371,21 @@ index 000000000000..7d45f4863469 + goto out; + } + ++ /* ++ * XXX: per POSIX and fstests generic/275, on -ENOSPC we're ++ * supposed to write as much as we have disk space for. 
++ * ++ * On failure here we should still write out a partial page if ++ * we aren't completely out of disk space - we don't do that ++ * yet: ++ */ + ret = bch2_page_reservation_get(c, inode, page, &res, -+ pg_offset, pg_len, true); -+ if (ret) -+ goto out; ++ pg_offset, pg_len); ++ if (unlikely(ret)) { ++ if (!reserved) ++ goto out; ++ break; ++ } + + reserved += pg_len; + } @@ -47487,13 +48394,13 @@ index 000000000000..7d45f4863469 + for (i = 0; i < nr_pages; i++) + flush_dcache_page(pages[i]); + -+ while (copied < len) { ++ while (copied < reserved) { + struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; + unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1); -+ unsigned pg_len = min_t(unsigned, len - copied, ++ unsigned pg_len = min_t(unsigned, reserved - copied, + PAGE_SIZE - pg_offset); + unsigned pg_copied = copy_page_from_iter_atomic(page, -+ pg_offset, pg_len,iter); ++ pg_offset, pg_len, iter); + + if (!pg_copied) + break; @@ -47779,11 +48686,13 @@ index 000000000000..7d45f4863469 + if (iocb->ki_flags & IOCB_DIRECT) { + struct blk_plug plug; + -+ ret = filemap_write_and_wait_range(mapping, -+ iocb->ki_pos, -+ iocb->ki_pos + count - 1); -+ if (ret < 0) -+ goto out; ++ if (unlikely(mapping->nrpages)) { ++ ret = filemap_write_and_wait_range(mapping, ++ iocb->ki_pos, ++ iocb->ki_pos + count - 1); ++ if (ret < 0) ++ goto out; ++ } + + file_accessed(file); + @@ -47848,31 +48757,107 @@ index 000000000000..7d45f4863469 + return err ? 
false : ret; +} + ++static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio) ++{ ++ struct bch_fs *c = dio->op.c; ++ struct bch_inode_info *inode = dio->inode; ++ struct bio *bio = &dio->op.wbio.bio; ++ ++ return bch2_check_range_allocated(c, inode_inum(inode), ++ dio->op.pos.offset, bio_sectors(bio), ++ dio->op.opts.data_replicas, ++ dio->op.opts.compression != 0); ++} ++ +static void bch2_dio_write_loop_async(struct bch_write_op *); + -+static long bch2_dio_write_loop(struct dio_write *dio) ++static noinline int bch2_dio_write_copy_iov(struct dio_write *dio) +{ -+ bool kthread = (current->flags & PF_KTHREAD) != 0; ++ struct iovec *iov = dio->inline_vecs; ++ ++ if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { ++ iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov), ++ GFP_KERNEL); ++ if (unlikely(!iov)) ++ return -ENOMEM; ++ ++ dio->free_iov = true; ++ } ++ ++ memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov)); ++ dio->iter.iov = iov; ++ return 0; ++} ++ ++static __always_inline long bch2_dio_write_done(struct dio_write *dio) ++{ ++ struct bch_fs *c = dio->op.c; + struct kiocb *req = dio->req; -+ struct address_space *mapping = req->ki_filp->f_mapping; -+ struct bch_inode_info *inode = file_bch_inode(req->ki_filp); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_inode_info *inode = dio->inode; ++ bool sync = dio->sync; ++ long ret = dio->op.error ?: ((long) dio->written << 9); ++ ++ bch2_pagecache_block_put(&inode->ei_pagecache_lock); ++ bch2_quota_reservation_put(c, inode, &dio->quota_res); ++ ++ if (dio->free_iov) ++ kfree(dio->iter.iov); ++ bio_put(&dio->op.wbio.bio); ++ ++ /* inode->i_dio_count is our ref on inode and thus bch_fs */ ++ inode_dio_end(&inode->v); ++ ++ if (ret < 0) ++ ret = bch2_err_class(ret); ++ ++ if (!sync) { ++ req->ki_complete(req, ret); ++ ret = -EIOCBQUEUED; ++ } ++ return ret; ++} ++ ++static __always_inline void bch2_dio_write_end(struct dio_write *dio) ++{ ++ struct bch_fs *c = 
dio->op.c; ++ struct kiocb *req = dio->req; ++ struct bch_inode_info *inode = dio->inode; + struct bio *bio = &dio->op.wbio.bio; + struct bvec_iter_all iter; + struct bio_vec *bv; ++ ++ i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta); ++ req->ki_pos += (u64) dio->op.written << 9; ++ dio->written += dio->op.written; ++ ++ spin_lock(&inode->v.i_lock); ++ if (req->ki_pos > inode->v.i_size) ++ i_size_write(&inode->v, req->ki_pos); ++ spin_unlock(&inode->v.i_lock); ++ ++ if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF))) ++ bio_for_each_segment_all(bv, bio, iter) ++ put_page(bv->bv_page); ++ ++ if (unlikely(dio->op.error)) ++ set_bit(EI_INODE_ERROR, &inode->ei_flags); ++} ++ ++static long bch2_dio_write_loop(struct dio_write *dio) ++{ ++ struct bch_fs *c = dio->op.c; ++ struct kiocb *req = dio->req; ++ struct address_space *mapping = dio->mapping; ++ struct bch_inode_info *inode = dio->inode; ++ struct bio *bio = &dio->op.wbio.bio; + unsigned unaligned, iter_count; + bool sync = dio->sync, dropped_locks; + long ret; + -+ if (dio->loop) -+ goto loop; -+ + while (1) { + iter_count = dio->iter.count; + -+ if (kthread && dio->mm) -+ kthread_use_mm(dio->mm); -+ BUG_ON(current->faults_disabled_mapping); ++ EBUG_ON(current->faults_disabled_mapping); + current->faults_disabled_mapping = mapping; + + ret = bio_iov_iter_get_pages(bio, &dio->iter); @@ -47880,8 +48865,6 @@ index 000000000000..7d45f4863469 + dropped_locks = fdm_dropped_locks(); + + current->faults_disabled_mapping = NULL; -+ if (kthread && dio->mm) -+ kthread_unuse_mm(dio->mm); + + /* + * If the fault handler returned an error but also signalled @@ -47919,13 +48902,17 @@ index 000000000000..7d45f4863469 + } + + bch2_write_op_init(&dio->op, c, io_opts(c, &inode->ei_inode)); -+ dio->op.end_io = bch2_dio_write_loop_async; ++ dio->op.end_io = sync ++ ? 
NULL ++ : bch2_dio_write_loop_async; + dio->op.target = dio->op.opts.foreground_target; + dio->op.write_point = writepoint_hashed((unsigned long) current); + dio->op.nr_replicas = dio->op.opts.data_replicas; + dio->op.subvol = inode->ei_subvol; + dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); + ++ if (sync) ++ dio->op.flags |= BCH_WRITE_SYNC; + if ((req->ki_flags & IOCB_DSYNC) && + !c->opts.journal_flush_disabled) + dio->op.flags |= BCH_WRITE_FLUSH; @@ -47934,98 +48921,64 @@ index 000000000000..7d45f4863469 + ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), + dio->op.opts.data_replicas, 0); + if (unlikely(ret) && -+ !bch2_check_range_allocated(c, inode_inum(inode), -+ dio->op.pos.offset, bio_sectors(bio), -+ dio->op.opts.data_replicas, -+ dio->op.opts.compression != 0)) ++ !bch2_dio_write_check_allocated(dio)) + goto err; + + task_io_account_write(bio->bi_iter.bi_size); + -+ if (!dio->sync && !dio->loop && dio->iter.count) { -+ struct iovec *iov = dio->inline_vecs; ++ if (unlikely(dio->iter.count) && ++ !dio->sync && ++ !dio->loop && ++ bch2_dio_write_copy_iov(dio)) ++ dio->sync = sync = true; + -+ if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { -+ iov = kmalloc(dio->iter.nr_segs * sizeof(*iov), -+ GFP_KERNEL); -+ if (unlikely(!iov)) { -+ dio->sync = sync = true; -+ goto do_io; -+ } -+ -+ dio->free_iov = true; -+ } -+ -+ memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov)); -+ dio->iter.iov = iov; -+ } -+do_io: + dio->loop = true; + closure_call(&dio->op.cl, bch2_write, NULL, NULL); + -+ if (sync) -+ wait_for_completion(&dio->done); -+ else ++ if (!sync) + return -EIOCBQUEUED; -+loop: -+ i_sectors_acct(c, inode, &dio->quota_res, -+ dio->op.i_sectors_delta); -+ req->ki_pos += (u64) dio->op.written << 9; -+ dio->written += dio->op.written; + -+ spin_lock(&inode->v.i_lock); -+ if (req->ki_pos > inode->v.i_size) -+ i_size_write(&inode->v, req->ki_pos); -+ spin_unlock(&inode->v.i_lock); ++ bch2_dio_write_end(dio); + -+ if 
(likely(!bio_flagged(bio, BIO_NO_PAGE_REF))) -+ bio_for_each_segment_all(bv, bio, iter) -+ put_page(bv->bv_page); -+ bio->bi_vcnt = 0; -+ -+ if (dio->op.error) { -+ set_bit(EI_INODE_ERROR, &inode->ei_flags); -+ break; -+ } -+ -+ if (!dio->iter.count) ++ if (likely(!dio->iter.count) || dio->op.error) + break; + + bio_reset(bio, NULL, REQ_OP_WRITE); -+ reinit_completion(&dio->done); + } -+ -+ ret = dio->op.error ?: ((long) dio->written << 9); ++out: ++ return bch2_dio_write_done(dio); +err: -+ bch2_pagecache_block_put(&inode->ei_pagecache_lock); -+ bch2_quota_reservation_put(c, inode, &dio->quota_res); ++ dio->op.error = ret; + -+ if (dio->free_iov) -+ kfree(dio->iter.iov); ++ if (!bio_flagged(bio, BIO_NO_PAGE_REF)) { ++ struct bvec_iter_all iter; ++ struct bio_vec *bv; + -+ if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF))) + bio_for_each_segment_all(bv, bio, iter) + put_page(bv->bv_page); -+ bio_put(bio); -+ -+ /* inode->i_dio_count is our ref on inode and thus bch_fs */ -+ inode_dio_end(&inode->v); -+ -+ if (!sync) { -+ req->ki_complete(req, ret); -+ ret = -EIOCBQUEUED; + } -+ return ret; ++ goto out; +} + +static void bch2_dio_write_loop_async(struct bch_write_op *op) +{ + struct dio_write *dio = container_of(op, struct dio_write, op); ++ struct mm_struct *mm = dio->mm; + -+ if (dio->sync) -+ complete(&dio->done); -+ else -+ bch2_dio_write_loop(dio); ++ bch2_dio_write_end(dio); ++ ++ if (likely(!dio->iter.count) || dio->op.error) { ++ bch2_dio_write_done(dio); ++ return; ++ } ++ ++ bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE); ++ ++ if (mm) ++ kthread_use_mm(mm); ++ bch2_dio_write_loop(dio); ++ if (mm) ++ kthread_unuse_mm(mm); +} + +static noinline @@ -48077,8 +49030,9 @@ index 000000000000..7d45f4863469 + GFP_KERNEL, + &c->dio_write_bioset); + dio = container_of(bio, struct dio_write, op.wbio.bio); -+ init_completion(&dio->done); + dio->req = req; ++ dio->mapping = mapping; ++ dio->inode = inode; + dio->mm = current->mm; + dio->loop = false; + dio->sync = 
is_sync_kiocb(req) || extending; @@ -48086,17 +49040,20 @@ index 000000000000..7d45f4863469 + dio->quota_res.sectors = 0; + dio->written = 0; + dio->iter = *iter; ++ dio->op.c = c; + + ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, + iter->count >> 9, true); + if (unlikely(ret)) + goto err_put_bio; + -+ ret = write_invalidate_inode_pages_range(mapping, -+ req->ki_pos, -+ req->ki_pos + iter->count - 1); -+ if (unlikely(ret)) -+ goto err_put_bio; ++ if (unlikely(mapping->nrpages)) { ++ ret = write_invalidate_inode_pages_range(mapping, ++ req->ki_pos, ++ req->ki_pos + iter->count - 1); ++ if (unlikely(ret)) ++ goto err_put_bio; ++ } + + ret = bch2_dio_write_loop(dio); +err: @@ -48519,7 +49476,7 @@ index 000000000000..7d45f4863469 + + truncate_pagecache_range(&inode->v, offset, end - 1); + -+ if (block_start < block_end ) { ++ if (block_start < block_end) { + s64 i_sectors_delta = 0; + + ret = bch2_fpunch(c, inode_inum(inode), @@ -48902,6 +49859,10 @@ index 000000000000..7d45f4863469 + inode_dio_wait(&inode->v); + bch2_pagecache_block_get(&inode->ei_pagecache_lock); + ++ ret = file_modified(file); ++ if (ret) ++ goto err; ++ + if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) + ret = bchfs_fallocate(inode, mode, offset, len); + else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) @@ -48912,8 +49873,7 @@ index 000000000000..7d45f4863469 + ret = bchfs_fcollapse_finsert(inode, offset, len, false); + else + ret = -EOPNOTSUPP; -+ -+ ++err: + bch2_pagecache_block_put(&inode->ei_pagecache_lock); + inode_unlock(&inode->v); + percpu_ref_put(&c->writes); @@ -48921,6 +49881,55 @@ index 000000000000..7d45f4863469 + return bch2_err_class(ret); +} + ++static int quota_reserve_range(struct bch_inode_info *inode, ++ struct quota_res *res, ++ u64 start, u64 end) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ u32 snapshot; ++ u64 sectors = end - start; ++ u64 pos = start; 
++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, ++ SPOS(inode->v.i_ino, pos, snapshot), 0); ++ ++ while (!(ret = btree_trans_too_many_iters(&trans)) && ++ (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k && ++ !(ret = bkey_err(k))) { ++ if (bkey_extent_is_allocation(k.k)) { ++ u64 s = min(end, k.k->p.offset) - ++ max(start, bkey_start_offset(k.k)); ++ BUG_ON(s > sectors); ++ sectors -= s; ++ } ++ bch2_btree_iter_advance(&iter); ++ } ++ pos = iter.pos.offset; ++ bch2_trans_iter_exit(&trans, &iter); ++err: ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ return ret; ++ ++ return bch2_quota_reservation_add(c, inode, res, sectors, true); ++} ++ +loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, + struct file *file_dst, loff_t pos_dst, + loff_t len, unsigned remap_flags) @@ -48928,6 +49937,7 @@ index 000000000000..7d45f4863469 + struct bch_inode_info *src = file_bch_inode(file_src); + struct bch_inode_info *dst = file_bch_inode(file_dst); + struct bch_fs *c = src->v.i_sb->s_fs_info; ++ struct quota_res quota_res = { 0 }; + s64 i_sectors_delta = 0; + u64 aligned_len; + loff_t ret = 0; @@ -48948,8 +49958,6 @@ index 000000000000..7d45f4863469 + + bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); + -+ file_update_time(file_dst); -+ + inode_dio_wait(&src->v); + inode_dio_wait(&dst->v); + @@ -48966,6 +49974,13 @@ index 000000000000..7d45f4863469 + if (ret) + goto err; + ++ ret = quota_reserve_range(dst, "a_res, pos_dst >> 9, ++ (pos_dst + aligned_len) >> 9); ++ if (ret) ++ goto err; ++ ++ file_update_time(file_dst); ++ + mark_pagecache_unallocated(src, pos_src >> 9, + (pos_src + aligned_len) >> 9); + @@ -48982,8 +49997,7 @@ index 
000000000000..7d45f4863469 + */ + ret = min((u64) ret << 9, (u64) len); + -+ /* XXX get a quota reservation */ -+ i_sectors_acct(c, dst, NULL, i_sectors_delta); ++ i_sectors_acct(c, dst, "a_res, i_sectors_delta); + + spin_lock(&dst->v.i_lock); + if (pos_dst + ret > dst->v.i_size) @@ -48994,6 +50008,7 @@ index 000000000000..7d45f4863469 + IS_SYNC(file_inode(file_dst))) + ret = bch2_flush_inode(c, inode_inum(dst)); +err: ++ bch2_quota_reservation_put(c, dst, "a_res); + bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); + + return bch2_err_class(ret); @@ -49001,6 +50016,62 @@ index 000000000000..7d45f4863469 + +/* fseek: */ + ++static int page_data_offset(struct page *page, unsigned offset) ++{ ++ struct bch_page_state *s = bch2_page_state(page); ++ unsigned i; ++ ++ if (s) ++ for (i = offset >> 9; i < PAGE_SECTORS; i++) ++ if (s->s[i].state >= SECTOR_DIRTY) ++ return i << 9; ++ ++ return -1; ++} ++ ++static loff_t bch2_seek_pagecache_data(struct inode *vinode, ++ loff_t start_offset, ++ loff_t end_offset) ++{ ++ struct folio_batch fbatch; ++ pgoff_t start_index = start_offset >> PAGE_SHIFT; ++ pgoff_t end_index = end_offset >> PAGE_SHIFT; ++ pgoff_t index = start_index; ++ unsigned i; ++ loff_t ret; ++ int offset; ++ ++ folio_batch_init(&fbatch); ++ ++ while (filemap_get_folios(vinode->i_mapping, ++ &index, end_index, &fbatch)) { ++ for (i = 0; i < folio_batch_count(&fbatch); i++) { ++ struct folio *folio = fbatch.folios[i]; ++ ++ folio_lock(folio); ++ ++ offset = page_data_offset(&folio->page, ++ folio->index == start_index ++ ? 
start_offset & (PAGE_SIZE - 1) ++ : 0); ++ if (offset >= 0) { ++ ret = clamp(((loff_t) folio->index << PAGE_SHIFT) + ++ offset, ++ start_offset, end_offset); ++ folio_unlock(folio); ++ folio_batch_release(&fbatch); ++ return ret; ++ } ++ ++ folio_unlock(folio); ++ } ++ folio_batch_release(&fbatch); ++ cond_resched(); ++ } ++ ++ return end_offset; ++} ++ +static loff_t bch2_seek_data(struct file *file, u64 offset) +{ + struct bch_inode_info *inode = file_bch_inode(file); @@ -49044,13 +50115,9 @@ index 000000000000..7d45f4863469 + if (ret) + return ret; + -+ if (next_data > offset) { -+ loff_t pagecache_next_data = -+ mapping_seek_hole_data(inode->v.i_mapping, offset, -+ next_data, SEEK_DATA); -+ if (pagecache_next_data >= 0) -+ next_data = min_t(u64, next_data, pagecache_next_data); -+ } ++ if (next_data > offset) ++ next_data = bch2_seek_pagecache_data(&inode->v, ++ offset, next_data); + + if (next_data >= isize) + return -ENXIO; @@ -49284,10 +50351,10 @@ index 000000000000..a8835298613a +#endif /* _BCACHEFS_FS_IO_H */ diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c new file mode 100644 -index 000000000000..bab0707bc2f4 +index 000000000000..2bb680827b44 --- /dev/null +++ b/fs/bcachefs/fs-ioctl.c -@@ -0,0 +1,539 @@ +@@ -0,0 +1,555 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + @@ -49316,6 +50383,9 @@ index 000000000000..bab0707bc2f4 + unsigned flags; + + unsigned projid; ++ ++ bool set_projinherit; ++ bool projinherit; +}; + +static int bch2_inode_flags_set(struct bch_inode_info *inode, @@ -49340,6 +50410,11 @@ index 000000000000..bab0707bc2f4 + (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags) + return -EINVAL; + ++ if (s->set_projinherit) { ++ bi->bi_fields_set &= ~(1 << Inode_opt_project); ++ bi->bi_fields_set |= ((int) s->projinherit << Inode_opt_project); ++ } ++ + bi->bi_flags &= ~s->mask; + bi->bi_flags |= newflags; + @@ -49397,6 +50472,10 @@ index 000000000000..bab0707bc2f4 + struct fsxattr fa = { 0 }; + + 
fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags); ++ ++ if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project)) ++ fa.fsx_xflags |= FS_XFLAG_PROJINHERIT; ++ + fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ]; + + return copy_to_user(arg, &fa, sizeof(fa)); @@ -49428,6 +50507,10 @@ index 000000000000..bab0707bc2f4 + if (copy_from_user(&fa, arg, sizeof(fa))) + return -EFAULT; + ++ s.set_projinherit = true; ++ s.projinherit = (fa.fsx_xflags & FS_XFLAG_PROJINHERIT) != 0; ++ fa.fsx_xflags &= ~FS_XFLAG_PROJINHERIT; ++ + s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags); + if (fa.fsx_xflags) + return -EOPNOTSUPP; @@ -49916,10 +50999,10 @@ index 000000000000..f201980ef2c3 +#endif /* _BCACHEFS_FS_IOCTL_H */ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c new file mode 100644 -index 000000000000..57e6e21896e1 +index 000000000000..186faa54b590 --- /dev/null +++ b/fs/bcachefs/fs.c -@@ -0,0 +1,1942 @@ +@@ -0,0 +1,1941 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + @@ -50341,7 +51424,7 @@ index 000000000000..57e6e21896e1 + (subvol_inum) { 0 }, 0); + + if (IS_ERR(inode)) -+ return PTR_ERR(inode); ++ return bch2_err_class(PTR_ERR(inode)); + + d_instantiate(dentry, &inode->v); + return 0; @@ -50450,8 +51533,8 @@ index 000000000000..57e6e21896e1 + + inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0, + (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); -+ if (unlikely(IS_ERR(inode))) -+ return PTR_ERR(inode); ++ if (IS_ERR(inode)) ++ return bch2_err_class(PTR_ERR(inode)); + + inode_lock(&inode->v); + ret = page_symlink(&inode->v, symname, strlen(symname) + 1); @@ -50760,7 +51843,7 @@ index 000000000000..57e6e21896e1 + (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); + + if (IS_ERR(inode)) -+ return PTR_ERR(inode); ++ return bch2_err_class(PTR_ERR(inode)); + + d_mark_tmpfile(dentry, &inode->v); + d_instantiate(dentry, &inode->v); @@ -51768,7 +52851,7 @@ index 000000000000..57e6e21896e1 + sb->s_time_min = div_s64(S64_MIN, 
c->sb.time_units_per_sec) + 1; + sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); + c->vfs_sb = sb; -+ strlcpy(sb->s_id, c->name, sizeof(sb->s_id)); ++ strscpy(sb->s_id, c->name, sizeof(sb->s_id)); + + ret = super_setup_bdi(sb); + if (ret) @@ -51839,8 +52922,7 @@ index 000000000000..57e6e21896e1 +void bch2_vfs_exit(void) +{ + unregister_filesystem(&bcache_fs_type); -+ if (bch2_inode_cache) -+ kmem_cache_destroy(bch2_inode_cache); ++ kmem_cache_destroy(bch2_inode_cache); +} + +int __init bch2_vfs_init(void) @@ -52078,7 +53160,7 @@ index 000000000000..9f4b57e30e2a +#endif /* _BCACHEFS_FS_H */ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c new file mode 100644 -index 000000000000..12f2ef4417cb +index 000000000000..f4f0e0cec85d --- /dev/null +++ b/fs/bcachefs/fsck.c @@ -0,0 +1,2395 @@ @@ -52405,7 +53487,7 @@ index 000000000000..12f2ef4417cb + bch2_trans_iter_exit(trans, &iter); +err: + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ bch_err(c, "error from __remove_dirent(): %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + return ret; +} + @@ -52590,7 +53672,7 @@ index 000000000000..12f2ef4417cb + break; + + if (i->equiv == n.equiv) { -+ bch_err(c, "adding duplicate snapshot in snapshots_seen_add()"); ++ bch_err(c, "%s(): adding duplicate snapshot", __func__); + return -EINVAL; + } + } @@ -52932,8 +54014,7 @@ index 000000000000..12f2ef4417cb + printbuf_exit(&buf); + return ret; +bad_hash: -+ if (fsck_err(c, "hash table key at wrong offset: btree %s inode %llu offset %llu, " -+ "hashed to %llu\n%s", ++ if (fsck_err(c, "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s", + bch2_btree_ids[desc.btree_id], hash_k.k->p.inode, hash_k.k->p.offset, hash, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { @@ -53084,7 +54165,7 @@ index 000000000000..12f2ef4417cb +err: +fsck_err: + if (ret) -+ bch_err(c, "error from check_inode(): %s", 
bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + return ret; +} + @@ -53110,7 +54191,7 @@ index 000000000000..12f2ef4417cb + bch2_trans_exit(&trans); + snapshots_seen_exit(&s); + if (ret) -+ bch_err(c, "error from check_inodes(): %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + return ret; +} + @@ -53243,7 +54324,7 @@ index 000000000000..12f2ef4417cb + } +fsck_err: + if (ret) -+ bch_err(c, "error from check_i_sectors(): %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + if (!ret && trans_was_restarted(trans, restart_count)) + ret = -BCH_ERR_transaction_restart_nested; + return ret; @@ -53379,7 +54460,7 @@ index 000000000000..12f2ef4417cb + printbuf_exit(&buf); + + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ bch_err(c, "error from check_extent(): %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + return ret; +} + @@ -53421,7 +54502,7 @@ index 000000000000..12f2ef4417cb + snapshots_seen_exit(&s); + + if (ret) -+ bch_err(c, "error from check_extents(): %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + return ret; +} + @@ -53460,7 +54541,7 @@ index 000000000000..12f2ef4417cb + } +fsck_err: + if (ret) -+ bch_err(c, "error from check_subdir_count(): %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + if (!ret && trans_was_restarted(trans, restart_count)) + ret = -BCH_ERR_transaction_restart_nested; + return ret; @@ -53581,7 +54662,7 @@ index 000000000000..12f2ef4417cb + printbuf_exit(&buf); + + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ bch_err(c, "error from check_target(): %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + return ret; +} + @@ -53751,7 +54832,7 @@ index 000000000000..12f2ef4417cb + printbuf_exit(&buf); + + if (ret && !bch2_err_matches(ret, 
BCH_ERR_transaction_restart)) -+ bch_err(c, "error from check_dirent(): %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + return ret; +} + @@ -53790,7 +54871,7 @@ index 000000000000..12f2ef4417cb + inode_walker_exit(&target); + + if (ret) -+ bch_err(c, "error from check_dirents(): %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + return ret; +} + @@ -53826,7 +54907,7 @@ index 000000000000..12f2ef4417cb + ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); +fsck_err: + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ bch_err(c, "error from check_xattr(): %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + return ret; +} + @@ -53858,7 +54939,7 @@ index 000000000000..12f2ef4417cb + bch2_trans_exit(&trans); + + if (ret) -+ bch_err(c, "error from check_xattrs(): %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + return ret; +} + @@ -54128,7 +55209,8 @@ index 000000000000..12f2ef4417cb +{ + if (t->nr == t->size) { + size_t new_size = max_t(size_t, 128UL, t->size * 2); -+ void *d = kvmalloc(new_size * sizeof(t->d[0]), GFP_KERNEL); ++ void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL); ++ + if (!d) { + bch_err(c, "fsck: error allocating memory for nlink_table, size %zu", + new_size); @@ -54493,10 +55575,10 @@ index 000000000000..264f2706b12d +#endif /* _BCACHEFS_FSCK_H */ diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c new file mode 100644 -index 000000000000..1f2782fc5a2d +index 000000000000..1a0d2608c058 --- /dev/null +++ b/fs/bcachefs/inode.c -@@ -0,0 +1,771 @@ +@@ -0,0 +1,892 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -54559,11 +55641,10 @@ index 000000000000..1f2782fc5a2d + return bytes; +} + -+void bch2_inode_pack(struct bch_fs *c, -+ struct bkey_inode_buf *packed, -+ const struct bch_inode_unpacked *inode) ++static inline void 
bch2_inode_pack_inlined(struct bkey_inode_buf *packed, ++ const struct bch_inode_unpacked *inode) +{ -+ struct bkey_i_inode_v2 *k = &packed->inode; ++ struct bkey_i_inode_v3 *k = &packed->inode; + u8 *out = k->v.fields; + u8 *end = (void *) &packed[1]; + u8 *last_nonzero_field = out; @@ -54571,13 +55652,17 @@ index 000000000000..1f2782fc5a2d + unsigned bytes; + int ret; + -+ bkey_inode_v2_init(&packed->inode.k_i); ++ bkey_inode_v3_init(&packed->inode.k_i); + packed->inode.k.p.offset = inode->bi_inum; + packed->inode.v.bi_journal_seq = cpu_to_le64(inode->bi_journal_seq); + packed->inode.v.bi_hash_seed = inode->bi_hash_seed; + packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags); -+ packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags); -+ packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); ++ packed->inode.v.bi_sectors = cpu_to_le64(inode->bi_sectors); ++ packed->inode.v.bi_size = cpu_to_le64(inode->bi_size); ++ packed->inode.v.bi_version = cpu_to_le64(inode->bi_version); ++ SET_INODEv3_MODE(&packed->inode.v, inode->bi_mode); ++ SET_INODEv3_FIELDS_START(&packed->inode.v, INODEv3_FIELDS_START_CUR); ++ + +#define x(_name, _bits) \ + nr_fields++; \ @@ -54598,7 +55683,7 @@ index 000000000000..1f2782fc5a2d + *out++ = 0; \ + } + -+ BCH_INODE_FIELDS() ++ BCH_INODE_FIELDS_v3() +#undef x + BUG_ON(out > end); + @@ -54609,7 +55694,7 @@ index 000000000000..1f2782fc5a2d + set_bkey_val_bytes(&packed->inode.k, bytes); + memset_u64s_tail(&packed->inode.v, 0, bytes); + -+ SET_INODEv2_NR_FIELDS(&k->v, nr_fields); ++ SET_INODEv3_NR_FIELDS(&k->v, nr_fields); + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + struct bch_inode_unpacked unpacked; @@ -54619,16 +55704,25 @@ index 000000000000..1f2782fc5a2d + BUG_ON(ret); + BUG_ON(unpacked.bi_inum != inode->bi_inum); + BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); ++ BUG_ON(unpacked.bi_sectors != inode->bi_sectors); ++ BUG_ON(unpacked.bi_size != inode->bi_size); ++ BUG_ON(unpacked.bi_version != inode->bi_version); + 
BUG_ON(unpacked.bi_mode != inode->bi_mode); + +#define x(_name, _bits) if (unpacked._name != inode->_name) \ + panic("unpacked %llu should be %llu", \ + (u64) unpacked._name, (u64) inode->_name); -+ BCH_INODE_FIELDS() ++ BCH_INODE_FIELDS_v3() +#undef x + } +} + ++void bch2_inode_pack(struct bkey_inode_buf *packed, ++ const struct bch_inode_unpacked *inode) ++{ ++ bch2_inode_pack_inlined(packed, inode); ++} ++ +static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, + struct bch_inode_unpacked *unpacked) +{ @@ -54656,7 +55750,7 @@ index 000000000000..1f2782fc5a2d + unpacked->_name = field[1]; \ + in += ret; + -+ BCH_INODE_FIELDS() ++ BCH_INODE_FIELDS_v2() +#undef x + + /* XXX: signal if there were more fields than expected? */ @@ -54695,15 +55789,66 @@ index 000000000000..1f2782fc5a2d + return -1; \ + fieldnr++; + -+ BCH_INODE_FIELDS() ++ BCH_INODE_FIELDS_v2() +#undef x + + /* XXX: signal if there were more fields than expected? */ + return 0; +} + -+int bch2_inode_unpack(struct bkey_s_c k, -+ struct bch_inode_unpacked *unpacked) ++static int bch2_inode_unpack_v3(struct bkey_s_c k, ++ struct bch_inode_unpacked *unpacked) ++{ ++ struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); ++ const u8 *in = inode.v->fields; ++ const u8 *end = bkey_val_end(inode); ++ unsigned nr_fields = INODEv3_NR_FIELDS(inode.v); ++ unsigned fieldnr = 0; ++ int ret; ++ u64 v[2]; ++ ++ unpacked->bi_inum = inode.k->p.offset; ++ unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); ++ unpacked->bi_hash_seed = inode.v->bi_hash_seed; ++ unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); ++ unpacked->bi_sectors = le64_to_cpu(inode.v->bi_sectors); ++ unpacked->bi_size = le64_to_cpu(inode.v->bi_size); ++ unpacked->bi_version = le64_to_cpu(inode.v->bi_version); ++ unpacked->bi_mode = INODEv3_MODE(inode.v); ++ ++#define x(_name, _bits) \ ++ if (fieldnr < nr_fields) { \ ++ ret = bch2_varint_decode_fast(in, end, &v[0]); \ ++ if (ret < 0) \ ++ return ret; \ ++ in += ret; \ 
++ \ ++ if (_bits > 64) { \ ++ ret = bch2_varint_decode_fast(in, end, &v[1]); \ ++ if (ret < 0) \ ++ return ret; \ ++ in += ret; \ ++ } else { \ ++ v[1] = 0; \ ++ } \ ++ } else { \ ++ v[0] = v[1] = 0; \ ++ } \ ++ \ ++ unpacked->_name = v[0]; \ ++ if (v[1] || v[0] != unpacked->_name) \ ++ return -1; \ ++ fieldnr++; ++ ++ BCH_INODE_FIELDS_v3() ++#undef x ++ ++ /* XXX: signal if there were more fields than expected? */ ++ return 0; ++} ++ ++static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, ++ struct bch_inode_unpacked *unpacked) +{ + switch (k.k->type) { + case KEY_TYPE_inode: { @@ -54742,6 +55887,14 @@ index 000000000000..1f2782fc5a2d + } +} + ++int bch2_inode_unpack(struct bkey_s_c k, ++ struct bch_inode_unpacked *unpacked) ++{ ++ if (likely(k.k->type == KEY_TYPE_inode_v3)) ++ return bch2_inode_unpack_v3(k, unpacked); ++ return bch2_inode_unpack_slowpath(k, unpacked); ++} ++ +int bch2_inode_peek(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode, @@ -54787,11 +55940,29 @@ index 000000000000..1f2782fc5a2d + if (IS_ERR(inode_p)) + return PTR_ERR(inode_p); + -+ bch2_inode_pack(trans->c, inode_p, inode); ++ bch2_inode_pack_inlined(inode_p, inode); + inode_p->inode.k.p.snapshot = iter->snapshot; + return bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); +} + ++struct bkey_s_c bch2_inode_to_v3(struct btree_trans *trans, struct bkey_s_c k) ++{ ++ struct bch_inode_unpacked u; ++ struct bkey_inode_buf *inode_p; ++ int ret; ++ ++ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); ++ if (IS_ERR(inode_p)) ++ return bkey_s_c_err(PTR_ERR(inode_p)); ++ ++ ret = bch2_inode_unpack(k, &u); ++ if (ret) ++ return bkey_s_c_err(ret); ++ ++ bch2_inode_pack(inode_p, &u); ++ return bkey_i_to_s_c(&inode_p->inode.k_i); ++} ++ +static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err) +{ + struct bch_inode_unpacked unpacked; @@ -54806,7 +55977,7 @@ index 000000000000..1f2782fc5a2d + return -EINVAL; + } + -+ if 
(bch2_inode_unpack(k, &unpacked)){ ++ if (bch2_inode_unpack(k, &unpacked)) { + prt_printf(err, "invalid variable length fields"); + return -EINVAL; + } @@ -54877,15 +56048,48 @@ index 000000000000..1f2782fc5a2d + return __bch2_inode_invalid(k, err); +} + -+static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) ++int bch2_inode_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ int rw, struct printbuf *err) +{ -+ prt_printf(out, "mode %o flags %x journal_seq %llu", ++ struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); ++ ++ if (bkey_val_bytes(k.k) < sizeof(*inode.v)) { ++ prt_printf(err, "incorrect value size (%zu < %zu)", ++ bkey_val_bytes(k.k), sizeof(*inode.v)); ++ return -EINVAL; ++ } ++ ++ if (INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL || ++ INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k)) { ++ prt_printf(err, "invalid fields_start (got %llu, min %u max %zu)", ++ INODEv3_FIELDS_START(inode.v), ++ INODEv3_FIELDS_START_INITIAL, ++ bkey_val_u64s(inode.k)); ++ return -EINVAL; ++ } ++ ++ if (INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { ++ prt_printf(err, "invalid str hash type (%llu >= %u)", ++ INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR); ++ return -EINVAL; ++ } ++ ++ return __bch2_inode_invalid(k, err); ++} ++ ++static void __bch2_inode_unpacked_to_text(struct printbuf *out, ++ struct bch_inode_unpacked *inode) ++{ ++ prt_printf(out, "mode %o flags %x journal_seq %llu bi_size %llu bi_sectors %llu bi_version %llu", + inode->bi_mode, inode->bi_flags, -+ inode->bi_journal_seq); ++ inode->bi_journal_seq, ++ inode->bi_size, ++ inode->bi_sectors, ++ inode->bi_version); + +#define x(_name, _bits) \ + prt_printf(out, " "#_name " %llu", (u64) inode->_name); -+ BCH_INODE_FIELDS() ++ BCH_INODE_FIELDS_v3() +#undef x +} + @@ -54895,8 +56099,7 @@ index 000000000000..1f2782fc5a2d + __bch2_inode_unpacked_to_text(out, inode); +} + -+void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, -+ 
struct bkey_s_c k) ++void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + struct bch_inode_unpacked inode; + @@ -55270,50 +56473,60 @@ index 000000000000..1f2782fc5a2d +} diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h new file mode 100644 -index 000000000000..2ac2fc10513b +index 000000000000..a9742bb63809 --- /dev/null +++ b/fs/bcachefs/inode.h -@@ -0,0 +1,189 @@ +@@ -0,0 +1,202 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_INODE_H +#define _BCACHEFS_INODE_H + ++#include "bkey.h" +#include "opts.h" + +extern const char * const bch2_inode_opts[]; + +int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); ++int bch2_inode_v3_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + -+#define bch2_bkey_ops_inode (struct bkey_ops) { \ ++#define bch2_bkey_ops_inode ((struct bkey_ops) { \ + .key_invalid = bch2_inode_invalid, \ + .val_to_text = bch2_inode_to_text, \ + .trans_trigger = bch2_trans_mark_inode, \ + .atomic_trigger = bch2_mark_inode, \ -+} ++}) + -+#define bch2_bkey_ops_inode_v2 (struct bkey_ops) { \ ++#define bch2_bkey_ops_inode_v2 ((struct bkey_ops) { \ + .key_invalid = bch2_inode_v2_invalid, \ + .val_to_text = bch2_inode_to_text, \ + .trans_trigger = bch2_trans_mark_inode, \ + .atomic_trigger = bch2_mark_inode, \ -+} ++}) ++ ++#define bch2_bkey_ops_inode_v3 ((struct bkey_ops) { \ ++ .key_invalid = bch2_inode_v3_invalid, \ ++ .val_to_text = bch2_inode_to_text, \ ++ .trans_trigger = bch2_trans_mark_inode, \ ++ .atomic_trigger = bch2_mark_inode, \ ++}) + +static inline bool bkey_is_inode(const struct bkey *k) +{ + return k->type == KEY_TYPE_inode || -+ k->type == KEY_TYPE_inode_v2; ++ k->type == KEY_TYPE_inode_v2 || ++ k->type == KEY_TYPE_inode_v3; +} + +int 
bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c, + int, struct printbuf *); +void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + -+#define bch2_bkey_ops_inode_generation (struct bkey_ops) { \ ++#define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \ + .key_invalid = bch2_inode_generation_invalid, \ + .val_to_text = bch2_inode_generation_to_text, \ -+} ++}) + +#if 0 +typedef struct { @@ -55327,25 +56540,28 @@ index 000000000000..2ac2fc10513b + u64 bi_inum; + u64 bi_journal_seq; + __le64 bi_hash_seed; ++ u64 bi_size; ++ u64 bi_sectors; ++ u64 bi_version; + u32 bi_flags; + u16 bi_mode; + +#define x(_name, _bits) u##_bits _name; -+ BCH_INODE_FIELDS() ++ BCH_INODE_FIELDS_v3() +#undef x +}; + +struct bkey_inode_buf { -+ struct bkey_i_inode_v2 inode; ++ struct bkey_i_inode_v3 inode; + +#define x(_name, _bits) + 8 + _bits / 8 -+ u8 _pad[0 + BCH_INODE_FIELDS()]; ++ u8 _pad[0 + BCH_INODE_FIELDS_v3()]; +#undef x -+} __attribute__((packed, aligned(8))); ++} __packed __aligned(8); + -+void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *, -+ const struct bch_inode_unpacked *); ++void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *); +int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *); ++struct bkey_s_c bch2_inode_to_v3(struct btree_trans *, struct bkey_s_c); + +void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); + @@ -55465,10 +56681,10 @@ index 000000000000..2ac2fc10513b +#endif /* _BCACHEFS_INODE_H */ diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c new file mode 100644 -index 000000000000..e047ef28f127 +index 000000000000..6348bc2d12c0 --- /dev/null +++ b/fs/bcachefs/io.c -@@ -0,0 +1,2436 @@ +@@ -0,0 +1,2469 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Some low level IO code, and hacks for various block layer limitations @@ -55487,6 +56703,7 @@ index 000000000000..e047ef28f127 +#include "checksum.h" +#include "compress.h" 
+#include "clock.h" ++#include "data_update.h" +#include "debug.h" +#include "disk_groups.h" +#include "ec.h" @@ -55713,8 +56930,10 @@ index 000000000000..e047ef28f127 + s64 *i_sectors_delta_total, + bool check_enospc) +{ -+ struct btree_iter inode_iter; -+ struct bch_inode_unpacked inode_u; ++ struct btree_iter inode_iter = { NULL }; ++ struct bkey_s_c inode_k; ++ struct bkey_s_c_inode_v3 inode; ++ struct bkey_i_inode_v3 *new_inode; + struct bpos next_pos; + bool usage_increasing; + s64 i_sectors_delta = 0, disk_sectors_delta = 0; @@ -55754,32 +56973,62 @@ index 000000000000..e047ef28f127 + return ret; + } + -+ ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum, -+ BTREE_ITER_INTENT); -+ if (ret) -+ return ret; ++ bch2_trans_iter_init(trans, &inode_iter, BTREE_ID_inodes, ++ SPOS(0, inum.inum, iter->snapshot), ++ BTREE_ITER_INTENT|BTREE_ITER_CACHED); ++ inode_k = bch2_btree_iter_peek_slot(&inode_iter); ++ ret = bkey_err(inode_k); ++ if (unlikely(ret)) ++ goto err; + -+ if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && -+ new_i_size > inode_u.bi_size) -+ inode_u.bi_size = new_i_size; ++ ret = bkey_is_inode(inode_k.k) ? 
0 : -ENOENT; ++ if (unlikely(ret)) ++ goto err; + -+ inode_u.bi_sectors += i_sectors_delta; ++ if (unlikely(inode_k.k->type != KEY_TYPE_inode_v3)) { ++ inode_k = bch2_inode_to_v3(trans, inode_k); ++ ret = bkey_err(inode_k); ++ if (unlikely(ret)) ++ goto err; ++ } + -+ ret = bch2_trans_update(trans, iter, k, 0) ?: -+ bch2_inode_write(trans, &inode_iter, &inode_u) ?: ++ inode = bkey_s_c_to_inode_v3(inode_k); ++ ++ new_inode = bch2_trans_kmalloc(trans, bkey_bytes(inode_k.k)); ++ ret = PTR_ERR_OR_ZERO(new_inode); ++ if (unlikely(ret)) ++ goto err; ++ ++ bkey_reassemble(&new_inode->k_i, inode.s_c); ++ ++ if (!(le64_to_cpu(inode.v->bi_flags) & BCH_INODE_I_SIZE_DIRTY) && ++ new_i_size > le64_to_cpu(inode.v->bi_size)) ++ new_inode->v.bi_size = cpu_to_le64(new_i_size); ++ ++ le64_add_cpu(&new_inode->v.bi_sectors, i_sectors_delta); ++ ++ new_inode->k.p.snapshot = iter->snapshot; ++ ++ /* ++ * Note: ++ * We always have to do an inode updated - even when i_size/i_sectors ++ * aren't changing - for fsync to work properly; fsync relies on ++ * inode->bi_journal_seq which is updated by the trigger code: ++ */ ++ ret = bch2_trans_update(trans, &inode_iter, &new_inode->k_i, 0) ?: ++ bch2_trans_update(trans, iter, k, 0) ?: + bch2_trans_commit(trans, disk_res, journal_seq, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL); -+ bch2_trans_iter_exit(trans, &inode_iter); -+ -+ if (ret) -+ return ret; ++ if (unlikely(ret)) ++ goto err; + + if (i_sectors_delta_total) + *i_sectors_delta_total += i_sectors_delta; + bch2_btree_iter_set_pos(iter, next_pos); -+ -+ return 0; ++err: ++ bch2_trans_iter_exit(trans, &inode_iter); ++ return ret; +} + +/* @@ -55862,11 +57111,10 @@ index 000000000000..e047ef28f127 + return ret; +} + -+int bch2_write_index_default(struct bch_write_op *op) ++static int bch2_write_index_default(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct bkey_buf sk; -+ struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets); + struct keylist *keys = 
&op->insert_keys; + struct bkey_i *k = bch2_keylist_front(keys); + struct btree_trans trans; @@ -55900,7 +57148,7 @@ index 000000000000..e047ef28f127 + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + ret = bch2_extent_update(&trans, inum, &iter, sk.k, -+ &op->res, op_journal_seq(op), ++ &op->res, &op->journal_seq, + op->new_i_size, &op->i_sectors_delta, + op->flags & BCH_WRITE_CHECK_ENOSPC); + bch2_trans_iter_exit(&trans, &iter); @@ -55910,9 +57158,6 @@ index 000000000000..e047ef28f127 + if (ret) + break; + -+ if (ec_ob) -+ bch2_ob_add_backpointer(c, ec_ob, &sk.k->k); -+ + if (bkey_cmp(iter.pos, k->k.p) >= 0) + bch2_keylist_pop_front(&op->insert_keys); + else @@ -55981,9 +57226,9 @@ index 000000000000..e047ef28f127 + } +} + -+static void __bch2_write(struct closure *); ++static void __bch2_write(struct bch_write_op *); + -+static void bch2_write_done(struct closure *cl) ++static void __bch2_write_done(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_fs *c = op->c; @@ -55997,12 +57242,23 @@ index 000000000000..e047ef28f127 + + bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); + -+ if (op->end_io) { -+ EBUG_ON(cl->parent); -+ closure_debug_destroy(cl); ++ closure_debug_destroy(cl); ++ if (op->end_io) + op->end_io(op); ++} ++ ++static __always_inline void bch2_write_done(struct bch_write_op *op) ++{ ++ if (likely(!(op->flags & BCH_WRITE_FLUSH) || op->error)) { ++ __bch2_write_done(&op->cl); ++ } else if (!(op->flags & BCH_WRITE_SYNC)) { ++ bch2_journal_flush_seq_async(&op->c->journal, ++ op->journal_seq, ++ &op->cl); ++ continue_at(&op->cl, __bch2_write_done, index_update_wq(op)); + } else { -+ closure_return(cl); ++ bch2_journal_flush_seq(&op->c->journal, op->journal_seq); ++ __bch2_write_done(&op->cl); + } +} + @@ -56041,7 +57297,7 @@ index 000000000000..e047ef28f127 + struct keylist *keys = &op->insert_keys; + struct bkey_i *k; + unsigned dev; -+ int ret; ++ int ret = 0; + + if 
(unlikely(op->flags & BCH_WRITE_IO_ERROR)) { + ret = bch2_write_drop_io_error_ptrs(op); @@ -56064,7 +57320,10 @@ index 000000000000..e047ef28f127 + + if (!bch2_keylist_empty(keys)) { + u64 sectors_start = keylist_sectors(keys); -+ int ret = op->index_update_fn(op); ++ ++ ret = !(op->flags & BCH_WRITE_MOVE) ++ ? bch2_write_index_default(op) ++ : bch2_data_update_index_update(op); + + BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); + BUG_ON(keylist_sectors(keys) && !ret); @@ -56074,7 +57333,7 @@ index 000000000000..e047ef28f127 + if (ret) { + bch_err_inum_ratelimited(c, op->pos.inode, + "write error while doing btree update: %s", bch2_err_str(ret)); -+ op->error = ret; ++ goto err; + } + } +out: @@ -56087,25 +57346,45 @@ index 000000000000..e047ef28f127 +err: + keys->top = keys->keys; + op->error = ret; ++ op->flags |= BCH_WRITE_DONE; + goto out; +} + +static void bch2_write_index(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); -+ struct bch_fs *c = op->c; ++ struct write_point *wp = op->wp; ++ struct workqueue_struct *wq = index_update_wq(op); + -+ __bch2_write_index(op); ++ barrier(); ++ op->btree_update_ready = true; ++ queue_work(wq, &wp->index_update_work); ++} + -+ if (!(op->flags & BCH_WRITE_DONE)) { -+ continue_at(cl, __bch2_write, index_update_wq(op)); -+ } else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) { -+ bch2_journal_flush_seq_async(&c->journal, -+ *op_journal_seq(op), -+ cl); -+ continue_at(cl, bch2_write_done, index_update_wq(op)); -+ } else { -+ continue_at_nobarrier(cl, bch2_write_done, NULL); ++void bch2_write_point_do_index_updates(struct work_struct *work) ++{ ++ struct write_point *wp = ++ container_of(work, struct write_point, index_update_work); ++ struct bch_write_op *op; ++ ++ while (1) { ++ spin_lock(&wp->writes_lock); ++ op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list); ++ if (op && !op->btree_update_ready) ++ op = NULL; ++ if (op) ++ 
list_del(&op->wp_list); ++ spin_unlock(&wp->writes_lock); ++ ++ if (!op) ++ break; ++ ++ __bch2_write_index(op); ++ ++ if (!(op->flags & BCH_WRITE_DONE)) ++ __bch2_write(op); ++ else ++ bch2_write_done(op); + } +} + @@ -56138,12 +57417,12 @@ index 000000000000..e047ef28f127 + if (wbio->put_bio) + bio_put(bio); + -+ if (parent) ++ if (parent) { + bio_endio(&parent->bio); -+ else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT)) -+ closure_put(cl); -+ else -+ continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op)); ++ return; ++ } ++ ++ closure_put(cl); +} + +static void init_append_extent(struct bch_write_op *op, @@ -56401,8 +57680,7 @@ index 000000000000..e047ef28f127 + saved_iter = dst->bi_iter; + + do { -+ struct bch_extent_crc_unpacked crc = -+ (struct bch_extent_crc_unpacked) { 0 }; ++ struct bch_extent_crc_unpacked crc = { 0 }; + struct bversion version = op->version; + size_t dst_len, src_len; + @@ -56454,6 +57732,8 @@ index 000000000000..e047ef28f127 + !crc_is_compressed(crc) && + bch2_csum_type_is_encryption(op->crc.csum_type) == + bch2_csum_type_is_encryption(op->csum_type)) { ++ u8 compression_type = crc.compression_type; ++ u16 nonce = crc.nonce; + /* + * Note: when we're using rechecksum(), we need to be + * checksumming @src because it has all the data our @@ -56472,6 +57752,13 @@ index 000000000000..e047ef28f127 + bio_sectors(src) - (src_len >> 9), + op->csum_type)) + goto csum_err; ++ /* ++ * rchecksum_bio sets compression_type on crc from op->crc, ++ * this isn't always correct as sometimes we're changing ++ * an extent from uncompressed to incompressible. 
++ */ ++ crc.compression_type = compression_type; ++ crc.nonce = nonce; + } else { + if ((op->flags & BCH_WRITE_DATA_ENCODED) && + bch2_rechecksum_bio(c, src, version, op->crc, @@ -56542,19 +57829,18 @@ index 000000000000..e047ef28f127 + return ret; +} + -+static void __bch2_write(struct closure *cl) ++static void __bch2_write(struct bch_write_op *op) +{ -+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_fs *c = op->c; -+ struct write_point *wp; ++ struct write_point *wp = NULL; + struct bio *bio = NULL; -+ bool skip_put = true; + unsigned nofs_flags; + int ret; + + nofs_flags = memalloc_nofs_save(); +again: + memset(&op->failed, 0, sizeof(op->failed)); ++ op->btree_update_ready = false; + + do { + struct bkey_i *key_to_write; @@ -56564,76 +57850,60 @@ index 000000000000..e047ef28f127 + /* +1 for possible cache device: */ + if (op->open_buckets.nr + op->nr_replicas + 1 > + ARRAY_SIZE(op->open_buckets.v)) -+ goto flush_io; ++ break; + + if (bch2_keylist_realloc(&op->insert_keys, + op->inline_keys, + ARRAY_SIZE(op->inline_keys), + BKEY_EXTENT_U64s_MAX)) -+ goto flush_io; ++ break; + + /* + * The copygc thread is now global, which means it's no longer + * freeing up space on specific disks, which means that + * allocations for specific disks may hang arbitrarily long: + */ -+ wp = bch2_alloc_sectors_start(c, -+ op->target, -+ op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), -+ op->write_point, -+ &op->devs_have, -+ op->nr_replicas, -+ op->nr_replicas_required, -+ op->alloc_reserve, -+ op->flags, -+ (op->flags & (BCH_WRITE_ALLOC_NOWAIT| -+ BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? 
NULL : cl); -+ EBUG_ON(!wp); -+ -+ if (unlikely(IS_ERR(wp))) { -+ if (unlikely(PTR_ERR(wp) != -EAGAIN)) { -+ ret = PTR_ERR(wp); -+ goto err; ++ ret = bch2_trans_do(c, NULL, NULL, 0, ++ bch2_alloc_sectors_start_trans(&trans, ++ op->target, ++ op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), ++ op->write_point, ++ &op->devs_have, ++ op->nr_replicas, ++ op->nr_replicas_required, ++ op->alloc_reserve, ++ op->flags, ++ (op->flags & (BCH_WRITE_ALLOC_NOWAIT| ++ BCH_WRITE_ONLY_SPECIFIED_DEVS)) ++ ? NULL : &op->cl, &wp)); ++ if (unlikely(ret)) { ++ if (unlikely(ret != -EAGAIN)) { ++ op->error = ret; ++ op->flags |= BCH_WRITE_DONE; + } + -+ goto flush_io; ++ break; + } + -+ /* -+ * It's possible for the allocator to fail, put us on the -+ * freelist waitlist, and then succeed in one of various retry -+ * paths: if that happens, we need to disable the skip_put -+ * optimization because otherwise there won't necessarily be a -+ * barrier before we free the bch_write_op: -+ */ -+ if (atomic_read(&cl->remaining) & CLOSURE_WAITING) -+ skip_put = false; -+ + bch2_open_bucket_get(c, wp, &op->open_buckets); + ret = bch2_write_extent(op, wp, &bio); ++ + bch2_alloc_sectors_done(c, wp); + -+ if (ret < 0) -+ goto err; -+ -+ if (ret) { -+ skip_put = false; -+ } else { -+ /* -+ * for the skip_put optimization this has to be set -+ * before we submit the bio: -+ */ ++ if (ret < 0) { ++ op->error = ret; + op->flags |= BCH_WRITE_DONE; ++ break; + } + ++ if (!ret) ++ op->flags |= BCH_WRITE_DONE; ++ + bio->bi_end_io = bch2_write_endio; + bio->bi_private = &op->cl; + bio->bi_opf |= REQ_OP_WRITE; + -+ if (!skip_put) -+ closure_get(bio->bi_private); -+ else -+ op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT; ++ closure_get(bio->bi_private); + + key_to_write = (void *) (op->insert_keys.keys_p + + key_to_write_offset); @@ -56642,48 +57912,34 @@ index 000000000000..e047ef28f127 + key_to_write); + } while (ret); + -+ if (!skip_put) -+ continue_at(cl, bch2_write_index, index_update_wq(op)); -+out: -+ 
memalloc_nofs_restore(nofs_flags); -+ return; -+err: -+ op->error = ret; -+ op->flags |= BCH_WRITE_DONE; -+ -+ continue_at(cl, bch2_write_index, index_update_wq(op)); -+ goto out; -+flush_io: + /* -+ * If the write can't all be submitted at once, we generally want to -+ * block synchronously as that signals backpressure to the caller. ++ * Sync or no? + * -+ * However, if we're running out of a workqueue, we can't block here -+ * because we'll be blocking other work items from completing: ++ * If we're running asynchronously, wne may still want to block ++ * synchronously here if we weren't able to submit all of the IO at ++ * once, as that signals backpressure to the caller. + */ -+ if (current->flags & PF_WQ_WORKER) { -+ continue_at(cl, bch2_write_index, index_update_wq(op)); -+ goto out; -+ } -+ -+ closure_sync(cl); -+ -+ if (!bch2_keylist_empty(&op->insert_keys)) { ++ if ((op->flags & BCH_WRITE_SYNC) || !(op->flags & BCH_WRITE_DONE)) { ++ closure_sync(&op->cl); + __bch2_write_index(op); + -+ if (op->error) { -+ op->flags |= BCH_WRITE_DONE; -+ continue_at_nobarrier(cl, bch2_write_done, NULL); -+ goto out; -+ } ++ if (!(op->flags & BCH_WRITE_DONE)) ++ goto again; ++ bch2_write_done(op); ++ } else { ++ spin_lock(&wp->writes_lock); ++ op->wp = wp; ++ list_add_tail(&op->wp_list, &wp->writes); ++ spin_unlock(&wp->writes_lock); ++ ++ continue_at(&op->cl, bch2_write_index, NULL); + } + -+ goto again; ++ memalloc_nofs_restore(nofs_flags); +} + +static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) +{ -+ struct closure *cl = &op->cl; + struct bio *bio = &op->wbio.bio; + struct bvec_iter iter; + struct bkey_i_inline_data *id; @@ -56720,10 +57976,9 @@ index 000000000000..e047ef28f127 + op->flags |= BCH_WRITE_WROTE_DATA_INLINE; + op->flags |= BCH_WRITE_DONE; + -+ continue_at_nobarrier(cl, bch2_write_index, NULL); -+ return; ++ __bch2_write_index(op); +err: -+ bch2_write_done(&op->cl); ++ bch2_write_done(op); +} + +/** @@ -56749,6 +58004,7 @@ index 
000000000000..e047ef28f127 + struct bch_fs *c = op->c; + unsigned data_len; + ++ EBUG_ON(op->cl.parent); + BUG_ON(!op->nr_replicas); + BUG_ON(!op->write_point.v); + BUG_ON(!bkey_cmp(op->pos, POS_MAX)); @@ -56782,24 +58038,19 @@ index 000000000000..e047ef28f127 + return; + } + -+ continue_at_nobarrier(cl, __bch2_write, NULL); ++ __bch2_write(op); + return; +err: + bch2_disk_reservation_put(c, &op->res); + -+ if (op->end_io) { -+ EBUG_ON(cl->parent); -+ closure_debug_destroy(cl); ++ closure_debug_destroy(&op->cl); ++ if (op->end_io) + op->end_io(op); -+ } else { -+ closure_return(cl); -+ } +} + +/* Cache promotion on read */ + +struct promote_op { -+ struct closure cl; + struct rcu_head rcu; + u64 start_time; + @@ -56853,10 +58104,10 @@ index 000000000000..e047ef28f127 + kfree_rcu(op, rcu); +} + -+static void promote_done(struct closure *cl) ++static void promote_done(struct bch_write_op *wop) +{ + struct promote_op *op = -+ container_of(cl, struct promote_op, cl); ++ container_of(wop, struct promote_op, write.op); + struct bch_fs *c = op->write.op.c; + + bch2_time_stats_update(&c->times[BCH_TIME_data_promote], @@ -56868,7 +58119,6 @@ index 000000000000..e047ef28f127 + +static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) +{ -+ struct closure *cl = &op->cl; + struct bio *bio = &op->write.op.wbio.bio; + + trace_and_count(op->write.op.c, read_promote, &rbio->bio); @@ -56881,9 +58131,7 @@ index 000000000000..e047ef28f127 + sizeof(struct bio_vec) * rbio->bio.bi_vcnt); + swap(bio->bi_vcnt, rbio->bio.bi_vcnt); + -+ closure_init(cl, NULL); -+ bch2_data_update_read_done(&op->write, rbio->pick.crc, cl); -+ closure_return_with_destructor(cl, promote_done); ++ bch2_data_update_read_done(&op->write, rbio->pick.crc); +} + +static struct promote_op *__promote_alloc(struct bch_fs *c, @@ -56948,6 +58196,7 @@ index 000000000000..e047ef28f127 + }, + btree_id, k); + BUG_ON(ret); ++ op->write.op.end_io = promote_done; + + return op; +err: @@ -57907,10 +59156,10 @@ 
index 000000000000..e047ef28f127 +} diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h new file mode 100644 -index 000000000000..3ae31758a01e +index 000000000000..9e6862f474d8 --- /dev/null +++ b/fs/bcachefs/io.h -@@ -0,0 +1,190 @@ +@@ -0,0 +1,183 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_IO_H +#define _BCACHEFS_IO_H @@ -57948,20 +59197,14 @@ index 000000000000..3ae31758a01e + BCH_WRITE_WROTE_DATA_INLINE = (1 << 7), + BCH_WRITE_FROM_INTERNAL = (1 << 8), + BCH_WRITE_CHECK_ENOSPC = (1 << 9), ++ BCH_WRITE_SYNC = (1 << 10), ++ BCH_WRITE_MOVE = (1 << 11), + + /* Internal: */ -+ BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 10), -+ BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 11), + BCH_WRITE_DONE = (1 << 12), + BCH_WRITE_IO_ERROR = (1 << 13), +}; + -+static inline u64 *op_journal_seq(struct bch_write_op *op) -+{ -+ return (op->flags & BCH_WRITE_JOURNAL_SEQ_PTR) -+ ? op->journal_seq_p : &op->journal_seq; -+} -+ +static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) +{ + return op->alloc_reserve == RESERVE_movinggc @@ -57979,8 +59222,6 @@ index 000000000000..3ae31758a01e + subvol_inum, u64, s64 *); +int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *); + -+int bch2_write_index_default(struct bch_write_op *); -+ +static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, + struct bch_io_opts opts) +{ @@ -58007,11 +59248,12 @@ index 000000000000..3ae31758a01e + op->journal_seq = 0; + op->new_i_size = U64_MAX; + op->i_sectors_delta = 0; -+ op->index_update_fn = bch2_write_index_default; +} + +void bch2_write(struct closure *); + ++void bch2_write_point_do_index_updates(struct work_struct *); ++ +static inline struct bch_write_bio *wbio_init(struct bio *bio) +{ + struct bch_write_bio *wbio = to_wbio(bio); @@ -58103,10 +59345,10 @@ index 000000000000..3ae31758a01e +#endif /* _BCACHEFS_IO_H */ diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h new file mode 100644 -index 000000000000..78bff13d36f2 +index 
000000000000..ca65f2c52c1c --- /dev/null +++ b/fs/bcachefs/io_types.h -@@ -0,0 +1,161 @@ +@@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_IO_TYPES_H +#define _BCACHEFS_IO_TYPES_H @@ -58226,6 +59468,7 @@ index 000000000000..78bff13d36f2 + unsigned nr_replicas_required:4; + unsigned alloc_reserve:3; + unsigned incompressible:1; ++ unsigned btree_update_ready:1; + + struct bch_devs_list devs_have; + u16 target; @@ -58241,23 +59484,17 @@ index 000000000000..78bff13d36f2 + + struct write_point_specifier write_point; + ++ struct write_point *wp; ++ struct list_head wp_list; ++ + struct disk_reservation res; + + struct open_buckets open_buckets; + -+ /* -+ * If caller wants to flush but hasn't passed us a journal_seq ptr, we -+ * still need to stash the journal_seq somewhere: -+ */ -+ union { -+ u64 *journal_seq_p; -+ u64 journal_seq; -+ }; ++ u64 journal_seq; + u64 new_i_size; + s64 i_sectors_delta; + -+ int (*index_update_fn)(struct bch_write_op *); -+ + struct bch_devs_mask failed; + + struct keylist insert_keys; @@ -58270,7 +59507,7 @@ index 000000000000..78bff13d36f2 +#endif /* _BCACHEFS_IO_TYPES_H */ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c new file mode 100644 -index 000000000000..ab594623341f +index 000000000000..95c29229d3fe --- /dev/null +++ b/fs/bcachefs/journal.c @@ -0,0 +1,1436 @@ @@ -59015,7 +60252,7 @@ index 000000000000..ab594623341f + return ret; + + entry = container_of(journal_res_entry(j, &res), -+ struct jset_entry_log, entry);; ++ struct jset_entry_log, entry); + memset(entry, 0, u64s * sizeof(u64)); + entry->entry.type = BCH_JSET_ENTRY_log; + entry->entry.u64s = u64s - 1; @@ -59072,10 +60309,10 @@ index 000000000000..ab594623341f + bch2_journal_block(&c->journal); + } + -+ bu = kzalloc(nr_want * sizeof(*bu), GFP_KERNEL); -+ ob = kzalloc(nr_want * sizeof(*ob), GFP_KERNEL); -+ new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); -+ new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); ++ bu = 
kcalloc(nr_want, sizeof(*bu), GFP_KERNEL); ++ ob = kcalloc(nr_want, sizeof(*ob), GFP_KERNEL); ++ new_buckets = kcalloc(nr, sizeof(u64), GFP_KERNEL); ++ new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL); + if (!bu || !ob || !new_buckets || !new_bucket_seq) { + ret = -ENOMEM; + goto err_unblock; @@ -59541,7 +60778,7 @@ index 000000000000..ab594623341f + rcu_read_lock(); + s = READ_ONCE(j->reservations); + -+ prt_printf(out, "dirty journal entries:\t%llu/%llu\n",fifo_used(&j->pin), j->pin.size); ++ prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size); + prt_printf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j)); + prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk); + prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); @@ -59712,10 +60949,10 @@ index 000000000000..ab594623341f +} diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h new file mode 100644 -index 000000000000..d3caa7ea7ce9 +index 000000000000..51d29a01b7b2 --- /dev/null +++ b/fs/bcachefs/journal.h -@@ -0,0 +1,521 @@ +@@ -0,0 +1,540 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_H +#define _BCACHEFS_JOURNAL_H @@ -59747,8 +60984,8 @@ index 000000000000..d3caa7ea7ce9 + * + * Synchronous updates are specified by passing a closure (@flush_cl) to + * bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter -+ * down to the journalling code. That closure will will wait on the journal -+ * write to complete (via closure_wait()). ++ * down to the journalling code. That closure will wait on the journal write to ++ * complete (via closure_wait()). 
+ * + * If the index update wasn't synchronous, the journal entry will be + * written out after 10 ms have elapsed, by default (the delay_ms field @@ -59828,6 +61065,7 @@ index 000000000000..d3caa7ea7ce9 + */ + +#include ++#include + +#include "journal_types.h" + @@ -60022,15 +61260,26 @@ index 000000000000..d3caa7ea7ce9 +{ + union journal_res_state old, new; + u64 v = atomic64_read(&j->reservations.counter); ++ unsigned u64s, offset; + + do { + old.v = new.v = v; + + /* ++ * Round up the end of the journal reservation to the next ++ * cacheline boundary: ++ */ ++ u64s = res->u64s; ++ offset = sizeof(struct jset) / sizeof(u64) + ++ new.cur_entry_offset + u64s; ++ u64s += ((offset - 1) & ((SMP_CACHE_BYTES / sizeof(u64)) - 1)) + 1; ++ ++ ++ /* + * Check if there is still room in the current journal + * entry: + */ -+ if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) ++ if (new.cur_entry_offset + u64s > j->cur_entry_u64s) + return 0; + + EBUG_ON(!journal_state_count(new, new.idx)); @@ -60038,7 +61287,7 @@ index 000000000000..d3caa7ea7ce9 + if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark) + return 0; + -+ new.cur_entry_offset += res->u64s; ++ new.cur_entry_offset += u64s; + journal_state_inc(&new); + + /* @@ -60055,8 +61304,15 @@ index 000000000000..d3caa7ea7ce9 + + res->ref = true; + res->idx = old.idx; ++ res->u64s = u64s; + res->offset = old.cur_entry_offset; + res->seq = le64_to_cpu(j->buf[old.idx].data->seq); ++ ++ offset = res->offset; ++ while (offset < res->offset + res->u64s) { ++ prefetchw(vstruct_idx(j->buf[res->idx].data, offset)); ++ offset += SMP_CACHE_BYTES / sizeof(u64); ++ } + return 1; +} + @@ -60239,10 +61495,10 @@ index 000000000000..d3caa7ea7ce9 +#endif /* _BCACHEFS_JOURNAL_H */ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c new file mode 100644 -index 000000000000..253a6ae20159 +index 000000000000..c4922c640653 --- /dev/null +++ b/fs/bcachefs/journal_io.c -@@ -0,0 +1,1759 @@ +@@ -0,0 +1,1807 @@ +// 
SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc_background.h" @@ -60262,6 +61518,23 @@ index 000000000000..253a6ae20159 + +#include + ++static struct nonce journal_nonce(const struct jset *jset) ++{ ++ return (struct nonce) {{ ++ [0] = 0, ++ [1] = ((__le32 *) &jset->seq)[0], ++ [2] = ((__le32 *) &jset->seq)[1], ++ [3] = BCH_NONCE_JOURNAL, ++ }}; ++} ++ ++static bool jset_csum_good(struct bch_fs *c, struct jset *j) ++{ ++ return bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)) && ++ !bch2_crc_cmp(j->csum, ++ csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j)); ++} ++ +static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) +{ + return (seq - c->journal_entries_base_seq) & (~0U >> 1); @@ -60304,8 +61577,7 @@ index 000000000000..253a6ae20159 + */ +static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, + struct journal_ptr entry_ptr, -+ struct journal_list *jlist, struct jset *j, -+ bool bad) ++ struct journal_list *jlist, struct jset *j) +{ + struct genradix_iter iter; + struct journal_replay **_i, *i, *dup; @@ -60356,38 +61628,53 @@ index 000000000000..253a6ae20159 + */ + dup = *_i; + if (dup) { -+ if (dup->bad) { -+ /* we'll replace @dup: */ -+ } else if (bad) { -+ i = dup; -+ goto found; -+ } else { -+ fsck_err_on(bytes != vstruct_bytes(&dup->j) || -+ memcmp(j, &dup->j, bytes), c, -+ "found duplicate but non identical journal entries (seq %llu)", -+ le64_to_cpu(j->seq)); ++ if (bytes == vstruct_bytes(&dup->j) && ++ !memcmp(j, &dup->j, bytes)) { + i = dup; + goto found; + } -+ } + ++ if (!entry_ptr.csum_good) { ++ i = dup; ++ goto found; ++ } ++ ++ if (!dup->csum_good) ++ goto replace; ++ ++ fsck_err(c, "found duplicate but non identical journal entries (seq %llu)", ++ le64_to_cpu(j->seq)); ++ i = dup; ++ goto found; ++ } ++replace: + i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); + if (!i) + return -ENOMEM; + -+ i->nr_ptrs = 0; -+ i->bad = bad; ++ i->nr_ptrs = 0; ++ i->csum_good = 
entry_ptr.csum_good; + i->ignore = false; + memcpy(&i->j, j, bytes); ++ i->ptrs[i->nr_ptrs++] = entry_ptr; + + if (dup) { -+ i->nr_ptrs = dup->nr_ptrs; -+ memcpy(i->ptrs, dup->ptrs, sizeof(dup->ptrs)); ++ if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) { ++ bch_err(c, "found too many copies of journal entry %llu", ++ le64_to_cpu(i->j.seq)); ++ dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1; ++ } ++ ++ /* The first ptr should represent the jset we kept: */ ++ memcpy(i->ptrs + i->nr_ptrs, ++ dup->ptrs, ++ sizeof(dup->ptrs[0]) * dup->nr_ptrs); ++ i->nr_ptrs += dup->nr_ptrs; + __journal_replay_free(c, dup); + } + -+ + *_i = i; ++ return 0; +found: + for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) { + if (ptr->dev == ca->dev_idx) { @@ -60409,16 +61696,6 @@ index 000000000000..253a6ae20159 + return ret; +} + -+static struct nonce journal_nonce(const struct jset *jset) -+{ -+ return (struct nonce) {{ -+ [0] = 0, -+ [1] = ((__le32 *) &jset->seq)[0], -+ [2] = ((__le32 *) &jset->seq)[1], -+ [3] = BCH_NONCE_JOURNAL, -+ }}; -+} -+ +/* this fills in a range with empty jset_entries: */ +static void journal_entry_null_range(void *start, void *end) +{ @@ -60960,12 +62237,8 @@ index 000000000000..253a6ae20159 +static int jset_validate(struct bch_fs *c, + struct bch_dev *ca, + struct jset *jset, u64 sector, -+ unsigned bucket_sectors_left, -+ unsigned sectors_read, + int write) +{ -+ size_t bytes = vstruct_bytes(jset); -+ struct bch_csum csum; + unsigned version; + int ret = 0; + @@ -60982,21 +62255,7 @@ index 000000000000..253a6ae20159 + sector, le64_to_cpu(jset->seq), + version)) { + /* don't try to continue: */ -+ return EINVAL; -+ } -+ -+ if (bytes > (sectors_read << 9) && -+ sectors_read < bucket_sectors_left) -+ return JOURNAL_ENTRY_REREAD; -+ -+ if (journal_entry_err_on(bytes > bucket_sectors_left << 9, -+ c, jset, NULL, -+ "%s sector %llu seq %llu: journal entry too big (%zu bytes)", -+ ca ? 
ca->name : c->name, -+ sector, le64_to_cpu(jset->seq), bytes)) { -+ ret = JOURNAL_ENTRY_BAD; -+ le32_add_cpu(&jset->u64s, -+ -((bytes - (bucket_sectors_left << 9)) / 8)); ++ return -EINVAL; + } + + if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), @@ -61004,28 +62263,9 @@ index 000000000000..253a6ae20159 + "%s sector %llu seq %llu: journal entry with unknown csum type %llu", + ca ? ca->name : c->name, + sector, le64_to_cpu(jset->seq), -+ JSET_CSUM_TYPE(jset))) { -+ ret = JOURNAL_ENTRY_BAD; -+ goto csum_done; -+ } -+ -+ if (write) -+ goto csum_done; -+ -+ csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); -+ if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), -+ c, jset, NULL, -+ "%s sector %llu seq %llu: journal checksum bad", -+ ca ? ca->name : c->name, -+ sector, le64_to_cpu(jset->seq))) ++ JSET_CSUM_TYPE(jset))) + ret = JOURNAL_ENTRY_BAD; + -+ ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), -+ jset->encrypted_start, -+ vstruct_end(jset) - (void *) jset->encrypted_start); -+ bch2_fs_fatal_err_on(ret, c, -+ "error decrypting journal entry: %i", ret); -+csum_done: + /* last_seq is ignored when JSET_NO_FLUSH is true */ + if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && + le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), @@ -61036,16 +62276,52 @@ index 000000000000..253a6ae20159 + jset->last_seq = jset->seq; + return JOURNAL_ENTRY_BAD; + } ++ ++ ret = jset_validate_entries(c, jset, write); +fsck_err: + return ret; +} + -+static int jset_validate_for_write(struct bch_fs *c, struct jset *jset) ++static int jset_validate_early(struct bch_fs *c, ++ struct bch_dev *ca, ++ struct jset *jset, u64 sector, ++ unsigned bucket_sectors_left, ++ unsigned sectors_read) +{ -+ unsigned sectors = vstruct_sectors(jset, c->block_bits); ++ size_t bytes = vstruct_bytes(jset); ++ unsigned version; ++ int write = READ; ++ int ret = 0; + -+ return jset_validate(c, NULL, jset, 0, sectors, sectors, WRITE) ?: -+ 
jset_validate_entries(c, jset, WRITE); ++ if (le64_to_cpu(jset->magic) != jset_magic(c)) ++ return JOURNAL_ENTRY_NONE; ++ ++ version = le32_to_cpu(jset->version); ++ if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD && ++ version < bcachefs_metadata_version_min) || ++ version >= bcachefs_metadata_version_max, ++ c, jset, NULL, ++ "%s sector %llu seq %llu: unknown journal entry version %u", ++ ca ? ca->name : c->name, ++ sector, le64_to_cpu(jset->seq), ++ version)) { ++ /* don't try to continue: */ ++ return -EINVAL; ++ } ++ ++ if (bytes > (sectors_read << 9) && ++ sectors_read < bucket_sectors_left) ++ return JOURNAL_ENTRY_REREAD; ++ ++ if (journal_entry_err_on(bytes > bucket_sectors_left << 9, ++ c, jset, NULL, ++ "%s sector %llu seq %llu: journal entry too big (%zu bytes)", ++ ca ? ca->name : c->name, ++ sector, le64_to_cpu(jset->seq), bytes)) ++ le32_add_cpu(&jset->u64s, ++ -((bytes - (bucket_sectors_left << 9)) / 8)); ++fsck_err: ++ return ret; +} + +struct journal_read_buf { @@ -61084,7 +62360,7 @@ index 000000000000..253a6ae20159 + unsigned sectors, sectors_read = 0; + u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), + end = offset + ca->mi.bucket_size; -+ bool saw_bad = false; ++ bool saw_bad = false, csum_good; + int ret = 0; + + pr_debug("reading %u", bucket); @@ -61123,9 +62399,8 @@ index 000000000000..253a6ae20159 + j = buf->data; + } + -+ ret = jset_validate(c, ca, j, offset, -+ end - offset, sectors_read, -+ READ); ++ ret = jset_validate_early(c, ca, j, offset, ++ end - offset, sectors_read); + switch (ret) { + case 0: + sectors = vstruct_sectors(j, c->block_bits); @@ -61141,17 +62416,13 @@ index 000000000000..253a6ae20159 + case JOURNAL_ENTRY_NONE: + if (!saw_bad) + return 0; -+ sectors = block_sectors(c); -+ goto next_block; -+ case JOURNAL_ENTRY_BAD: -+ saw_bad = true; + /* + * On checksum error we don't really trust the size + * field of the journal entry we read, so try reading + * again at next block boundary: + */ + sectors = 
block_sectors(c); -+ break; ++ goto next_block; + default: + return ret; + } @@ -61167,14 +62438,25 @@ index 000000000000..253a6ae20159 + + ja->bucket_seq[bucket] = le64_to_cpu(j->seq); + ++ csum_good = jset_csum_good(c, j); ++ if (!csum_good) ++ saw_bad = true; ++ ++ ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), ++ j->encrypted_start, ++ vstruct_end(j) - (void *) j->encrypted_start); ++ bch2_fs_fatal_err_on(ret, c, ++ "error decrypting journal entry: %i", ret); ++ + mutex_lock(&jlist->lock); + ret = journal_entry_add(c, ca, (struct journal_ptr) { ++ .csum_good = csum_good, + .dev = ca->dev_idx, + .bucket = bucket, + .bucket_offset = offset - + bucket_to_sector(ca, ja->buckets[bucket]), + .sector = offset, -+ }, jlist, j, ret != 0); ++ }, jlist, j); + mutex_unlock(&jlist->lock); + + switch (ret) { @@ -61373,6 +62655,14 @@ index 000000000000..253a6ae20159 + *start_seq = le64_to_cpu(i->j.seq) + 1; + + if (!JSET_NO_FLUSH(&i->j)) { ++ int write = READ; ++ if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), ++ c, &i->j, NULL, ++ "invalid journal entry: last_seq > seq (%llu > %llu)", ++ le64_to_cpu(i->j.last_seq), ++ le64_to_cpu(i->j.seq))) ++ i->j.last_seq = i->j.seq; ++ + last_seq = le64_to_cpu(i->j.last_seq); + *blacklist_seq = le64_to_cpu(i->j.seq) + 1; + break; @@ -61476,7 +62766,21 @@ index 000000000000..253a6ae20159 + if (!i || i->ignore) + continue; + -+ ret = jset_validate_entries(c, &i->j, READ); ++ for (ptr = 0; ptr < i->nr_ptrs; ptr++) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); ++ ++ if (!i->ptrs[ptr].csum_good) ++ printk(KERN_ERR "bcachefs (%s) sector %llu: invalid journal checksum, seq %llu%s\n", ++ ca->name, i->ptrs[ptr].sector, ++ le64_to_cpu(i->j.seq), ++ i->csum_good ? 
" (had good copy on another device)" : ""); ++ } ++ ++ ret = jset_validate(c, ++ bch_dev_bkey_exists(c, i->ptrs[0].dev), ++ &i->j, ++ i->ptrs[0].sector, ++ READ); + if (ret) + goto err; + @@ -61912,7 +63216,7 @@ index 000000000000..253a6ae20159 + validate_before_checksum = true; + + if (validate_before_checksum && -+ jset_validate_for_write(c, jset)) ++ jset_validate(c, NULL, jset, 0, WRITE)) + goto err; + + ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), @@ -61926,7 +63230,7 @@ index 000000000000..253a6ae20159 + journal_nonce(jset), jset); + + if (!validate_before_checksum && -+ jset_validate_for_write(c, jset)) ++ jset_validate(c, NULL, jset, 0, WRITE)) + goto err; + + sectors = vstruct_sectors(jset, c->block_bits); @@ -62004,7 +63308,7 @@ index 000000000000..253a6ae20159 +} diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h new file mode 100644 -index 000000000000..1a91f2c0a26c +index 000000000000..2f8bbf06b289 --- /dev/null +++ b/fs/bcachefs/journal_io.h @@ -0,0 +1,59 @@ @@ -62018,6 +63322,7 @@ index 000000000000..1a91f2c0a26c + */ +struct journal_replay { + struct journal_ptr { ++ bool csum_good; + u8 dev; + u32 bucket; + u32 bucket_offset; @@ -62025,8 +63330,7 @@ index 000000000000..1a91f2c0a26c + } ptrs[BCH_REPLICAS_MAX]; + unsigned nr_ptrs; + -+ /* checksum error, but we may want to try using it anyways: */ -+ bool bad; ++ bool csum_good; + bool ignore; + /* must be last: */ + struct jset j; @@ -62069,7 +63373,7 @@ index 000000000000..1a91f2c0a26c +#endif /* _BCACHEFS_JOURNAL_IO_H */ diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c new file mode 100644 -index 000000000000..e69595bd1359 +index 000000000000..e873ce2a3f03 --- /dev/null +++ b/fs/bcachefs/journal_reclaim.c @@ -0,0 +1,853 @@ @@ -62307,7 +63611,7 @@ index 000000000000..e69595bd1359 + if ((j->space[journal_space_clean_ondisk].next_entry < + j->space[journal_space_clean_ondisk].total) && + (clean - clean_ondisk <= total / 8) && -+ 
(clean_ondisk * 2 > clean )) ++ (clean_ondisk * 2 > clean)) + set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); + else + clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); @@ -62438,7 +63742,7 @@ index 000000000000..e69595bd1359 + list_del_init(&pin->list); + + /* -+ * Unpinning a journal entry make make journal_next_bucket() succeed, if ++ * Unpinning a journal entry may make journal_next_bucket() succeed if + * writing a new last_seq will now make another bucket available: + */ + if (atomic_dec_and_test(&pin_list->count) && @@ -63020,7 +64324,7 @@ index 000000000000..0fd1af120db5 +#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c new file mode 100644 -index 000000000000..cfdbd92d2164 +index 000000000000..c19db0425dd7 --- /dev/null +++ b/fs/bcachefs/journal_sb.c @@ -0,0 +1,220 @@ @@ -63057,7 +64361,7 @@ index 000000000000..cfdbd92d2164 + if (!nr) + return 0; + -+ b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); ++ b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL); + if (!b) + return -ENOMEM; + @@ -63140,7 +64444,7 @@ index 000000000000..cfdbd92d2164 + if (!nr) + return 0; + -+ b = kmalloc_array(sizeof(*b), nr, GFP_KERNEL); ++ b = kmalloc_array(nr, sizeof(*b), GFP_KERNEL); + if (!b) + return -ENOMEM; + @@ -63978,13 +65282,14 @@ index 000000000000..a6cdb885ad41 +#endif /* _BCACHEFS_JOURNAL_TYPES_H */ diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c new file mode 100644 -index 000000000000..cda77835b9ea +index 000000000000..5e85055b0f93 --- /dev/null +++ b/fs/bcachefs/keylist.c -@@ -0,0 +1,67 @@ +@@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" ++#include "bkey.h" +#include "keylist.h" + +int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s, @@ -64051,10 +65356,10 @@ index 000000000000..cda77835b9ea +#endif diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h new file mode 100644 -index 000000000000..195799bb20bc +index 000000000000..635efb7e8228 --- /dev/null +++ 
b/fs/bcachefs/keylist.h -@@ -0,0 +1,76 @@ +@@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_KEYLIST_H +#define _BCACHEFS_KEYLIST_H @@ -64074,7 +65379,6 @@ index 000000000000..195799bb20bc +{ + if (l->keys_p != inline_keys) + kfree(l->keys_p); -+ bch2_keylist_init(l, inline_keys); +} + +static inline void bch2_keylist_push(struct keylist *l) @@ -64367,7 +65671,7 @@ index 000000000000..53e607d72274 +} diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h new file mode 100644 -index 000000000000..3decb7b1dde2 +index 000000000000..925c29b49b86 --- /dev/null +++ b/fs/bcachefs/lru.h @@ -0,0 +1,19 @@ @@ -64378,10 +65682,10 @@ index 000000000000..3decb7b1dde2 +int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + -+#define bch2_bkey_ops_lru (struct bkey_ops) { \ ++#define bch2_bkey_ops_lru ((struct bkey_ops) { \ + .key_invalid = bch2_lru_invalid, \ + .val_to_text = bch2_lru_to_text, \ -+} ++}) + +int bch2_lru_delete(struct btree_trans *, u64, u64, u64, struct bkey_s_c); +int bch2_lru_set(struct btree_trans *, u64, u64, u64 *); @@ -64597,10 +65901,10 @@ index 000000000000..027efaa0d575 +#endif /* _BCACHEFS_MIGRATE_H */ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c new file mode 100644 -index 000000000000..e85c3143051c +index 000000000000..5c3e378a8698 --- /dev/null +++ b/fs/bcachefs/move.c -@@ -0,0 +1,954 @@ +@@ -0,0 +1,1011 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -64656,9 +65960,8 @@ index 000000000000..e85c3143051c + struct bio_vec bi_inline_vecs[0]; +}; + -+static void move_free(struct closure *cl) ++static void move_free(struct moving_io *io) +{ -+ struct moving_io *io = container_of(cl, struct moving_io, cl); + struct moving_context *ctxt = io->write.ctxt; + struct bch_fs *c = ctxt->c; + @@ -64668,31 +65971,30 @@ index 000000000000..e85c3143051c + kfree(io); +} + -+static void move_write_done(struct 
closure *cl) ++static void move_write_done(struct bch_write_op *op) +{ -+ struct moving_io *io = container_of(cl, struct moving_io, cl); ++ struct moving_io *io = container_of(op, struct moving_io, write.op); + struct moving_context *ctxt = io->write.ctxt; + + if (io->write.op.error) + ctxt->write_error = true; + + atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); -+ closure_return_with_destructor(cl, move_free); ++ move_free(io); ++ closure_put(&ctxt->cl); +} + -+static void move_write(struct closure *cl) ++static void move_write(struct moving_io *io) +{ -+ struct moving_io *io = container_of(cl, struct moving_io, cl); -+ + if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) { -+ closure_return_with_destructor(cl, move_free); ++ move_free(io); + return; + } + ++ closure_get(&io->write.ctxt->cl); + atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); + -+ bch2_data_update_read_done(&io->write, io->rbio.pick.crc, cl); -+ continue_at(cl, move_write_done, NULL); ++ bch2_data_update_read_done(&io->write, io->rbio.pick.crc); +} + +static inline struct moving_io *next_pending_write(struct moving_context *ctxt) @@ -64724,7 +66026,7 @@ index 000000000000..e85c3143051c + + while ((io = next_pending_write(ctxt))) { + list_del(&io->list); -+ closure_call(&io->cl, move_write, NULL, &ctxt->cl); ++ move_write(io); + } +} + @@ -64794,7 +66096,52 @@ index 000000000000..e85c3143051c + scnprintf(stats->name, sizeof(stats->name), "%s", name); +} + ++static int bch2_extent_drop_ptrs(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ struct data_update_opts data_opts) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_i *n; ++ int ret; ++ ++ n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ return ret; ++ ++ bkey_reassemble(n, k); ++ ++ while (data_opts.kill_ptrs) { ++ unsigned i = 0, drop = __fls(data_opts.kill_ptrs); ++ struct bch_extent_ptr *ptr; ++ ++ bch2_bkey_drop_ptrs(bkey_i_to_s(n), 
ptr, i++ == drop); ++ data_opts.kill_ptrs ^= 1U << drop; ++ } ++ ++ /* ++ * If the new extent no longer has any pointers, bch2_extent_normalize() ++ * will do the appropriate thing with it (turning it into a ++ * KEY_TYPE_error key, or just a discard if it was a cached extent) ++ */ ++ bch2_extent_normalize(c, bkey_i_to_s(n)); ++ ++ /* ++ * Since we're not inserting through an extent iterator ++ * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), ++ * we aren't using the extent overwrite path to delete, we're ++ * just using the normal key deletion path: ++ */ ++ if (bkey_deleted(&n->k)) ++ n->k.size = 0; ++ ++ return bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: ++ bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); ++} ++ +static int bch2_move_extent(struct btree_trans *trans, ++ struct btree_iter *iter, + struct moving_context *ctxt, + struct bch_io_opts io_opts, + enum btree_id btree_id, @@ -64809,6 +66156,15 @@ index 000000000000..e85c3143051c + unsigned sectors = k.k->size, pages; + int ret = -ENOMEM; + ++ bch2_data_update_opts_normalize(k, &data_opts); ++ ++ if (!data_opts.rewrite_ptrs && ++ !data_opts.extra_replicas) { ++ if (data_opts.kill_ptrs) ++ return bch2_extent_drop_ptrs(trans, iter, k, data_opts); ++ return 0; ++ } ++ + if (!percpu_ref_tryget_live(&c->writes)) + return -EROFS; + @@ -64851,6 +66207,7 @@ index 000000000000..e85c3143051c + goto err_free_pages; + + io->write.ctxt = ctxt; ++ io->write.op.end_io = move_write_done; + + atomic64_inc(&ctxt->stats->keys_moved); + atomic64_add(k.k->size, &ctxt->stats->sectors_moved); @@ -65046,11 +66403,11 @@ index 000000000000..e85c3143051c + /* + * The iterator gets unlocked by __bch2_read_extent - need to + * save a copy of @k elsewhere: -+ */ ++ */ + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + -+ ret2 = bch2_move_extent(&trans, ctxt, io_opts, ++ ret2 = bch2_move_extent(&trans, &iter, ctxt, io_opts, + btree_id, k, data_opts); + if (ret2) { 
+ if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) @@ -65147,7 +66504,7 @@ index 000000000000..e85c3143051c + prt_str(&buf, "failed to evacuate bucket "); + bch2_bkey_val_to_text(&buf, c, k); + -+ bch2_trans_inconsistent(trans, "%s", buf.buf); ++ bch_err(c, "%s", buf.buf); + printbuf_exit(&buf); + } + } @@ -65177,7 +66534,8 @@ index 000000000000..e85c3143051c + bch2_trans_begin(&trans); + + ret = bch2_get_next_backpointer(&trans, bucket, gen, -+ &bp_offset, &bp); ++ &bp_offset, &bp, ++ BTREE_ITER_CACHED); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) @@ -65202,11 +66560,12 @@ index 000000000000..e85c3143051c + + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); -+ bch2_trans_iter_exit(&trans, &iter); + + ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum); -+ if (ret) ++ if (ret) { ++ bch2_trans_iter_exit(&trans, &iter); + continue; ++ } + + data_opts = _data_opts; + data_opts.target = io_opts.background_target; @@ -65218,8 +66577,10 @@ index 000000000000..e85c3143051c + i++; + } + -+ ret = bch2_move_extent(&trans, ctxt, io_opts, ++ ret = bch2_move_extent(&trans, &iter, ctxt, io_opts, + bp.btree_id, k, data_opts); ++ bch2_trans_iter_exit(&trans, &iter); ++ + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret == -ENOMEM) { @@ -65413,7 +66774,7 @@ index 000000000000..e85c3143051c + i++; + } + -+ return data_opts->rewrite_ptrs != 0;; ++ return data_opts->rewrite_ptrs != 0; +} + +static bool rereplicate_btree_pred(struct bch_fs *c, void *arg, @@ -65655,7 +67016,7 @@ index 000000000000..9df6d18137a5 +#endif /* _BCACHEFS_MOVE_TYPES_H */ diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c new file mode 100644 -index 000000000000..35958c6bb4a6 +index 000000000000..044eca879afc --- /dev/null +++ b/fs/bcachefs/movinggc.c @@ -0,0 +1,285 @@ @@ -65823,7 +67184,7 @@ index 000000000000..35958c6bb4a6 + + bch2_moving_ctxt_exit(&ctxt); + -+ if (ret < 0) ++ if (ret < 0 && ret != -EROFS) + 
bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret)); + + trace_and_count(c, copygc, c, atomic64_read(&move_stats.sectors_moved), 0, 0, 0); @@ -67061,10 +68422,10 @@ index 000000000000..5b8586ecb374 +#endif /* _BCACHEFS_OPTS_H */ diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c new file mode 100644 -index 000000000000..c12d715fb758 +index 000000000000..db8172736527 --- /dev/null +++ b/fs/bcachefs/quota.c -@@ -0,0 +1,823 @@ +@@ -0,0 +1,978 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "btree_update.h" @@ -67162,6 +68523,113 @@ index 000000000000..c12d715fb758 +#include +#include + ++static void qc_info_to_text(struct printbuf *out, struct qc_info *i) ++{ ++ printbuf_tabstops_reset(out); ++ printbuf_tabstop_push(out, 20); ++ ++ prt_str(out, "i_fieldmask"); ++ prt_tab(out); ++ prt_printf(out, "%x", i->i_fieldmask); ++ prt_newline(out); ++ ++ prt_str(out, "i_flags"); ++ prt_tab(out); ++ prt_printf(out, "%u", i->i_flags); ++ prt_newline(out); ++ ++ prt_str(out, "i_spc_timelimit"); ++ prt_tab(out); ++ prt_printf(out, "%u", i->i_spc_timelimit); ++ prt_newline(out); ++ ++ prt_str(out, "i_ino_timelimit"); ++ prt_tab(out); ++ prt_printf(out, "%u", i->i_ino_timelimit); ++ prt_newline(out); ++ ++ prt_str(out, "i_rt_spc_timelimit"); ++ prt_tab(out); ++ prt_printf(out, "%u", i->i_rt_spc_timelimit); ++ prt_newline(out); ++ ++ prt_str(out, "i_spc_warnlimit"); ++ prt_tab(out); ++ prt_printf(out, "%u", i->i_spc_warnlimit); ++ prt_newline(out); ++ ++ prt_str(out, "i_ino_warnlimit"); ++ prt_tab(out); ++ prt_printf(out, "%u", i->i_ino_warnlimit); ++ prt_newline(out); ++ ++ prt_str(out, "i_rt_spc_warnlimit"); ++ prt_tab(out); ++ prt_printf(out, "%u", i->i_rt_spc_warnlimit); ++ prt_newline(out); ++} ++ ++static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q) ++{ ++ printbuf_tabstops_reset(out); ++ printbuf_tabstop_push(out, 20); ++ ++ prt_str(out, "d_fieldmask"); ++ prt_tab(out); ++ prt_printf(out, "%x", 
q->d_fieldmask); ++ prt_newline(out); ++ ++ prt_str(out, "d_spc_hardlimit"); ++ prt_tab(out); ++ prt_printf(out, "%llu", q->d_spc_hardlimit); ++ prt_newline(out); ++ ++ prt_str(out, "d_spc_softlimit"); ++ prt_tab(out); ++ prt_printf(out, "%llu", q->d_spc_softlimit); ++ prt_newline(out); ++ ++ prt_str(out, "d_ino_hardlimit"); ++ prt_tab(out); ++ prt_printf(out, "%llu", q->d_ino_hardlimit); ++ prt_newline(out); ++ ++ prt_str(out, "d_ino_softlimit"); ++ prt_tab(out); ++ prt_printf(out, "%llu", q->d_ino_softlimit); ++ prt_newline(out); ++ ++ prt_str(out, "d_space"); ++ prt_tab(out); ++ prt_printf(out, "%llu", q->d_space); ++ prt_newline(out); ++ ++ prt_str(out, "d_ino_count"); ++ prt_tab(out); ++ prt_printf(out, "%llu", q->d_ino_count); ++ prt_newline(out); ++ ++ prt_str(out, "d_ino_timer"); ++ prt_tab(out); ++ prt_printf(out, "%llu", q->d_ino_timer); ++ prt_newline(out); ++ ++ prt_str(out, "d_spc_timer"); ++ prt_tab(out); ++ prt_printf(out, "%llu", q->d_spc_timer); ++ prt_newline(out); ++ ++ prt_str(out, "d_ino_warns"); ++ prt_tab(out); ++ prt_printf(out, "%i", q->d_ino_warns); ++ prt_newline(out); ++ ++ prt_str(out, "d_spc_warns"); ++ prt_tab(out); ++ prt_printf(out, "%i", q->d_spc_warns); ++ prt_newline(out); ++} ++ +static inline unsigned __next_qtype(unsigned i, unsigned qtypes) +{ + qtypes >>= i; @@ -67292,34 +68760,20 @@ index 000000000000..c12d715fb758 + if (qc->hardlimit && + qc->hardlimit < n && + !ignore_hardlimit(q)) { -+ if (mode == KEY_TYPE_QUOTA_PREALLOC) -+ return -EDQUOT; -+ + prepare_warning(qc, qtype, counter, msgs, HARDWARN); ++ return -EDQUOT; + } + + if (qc->softlimit && -+ qc->softlimit < n && -+ qc->timer && -+ ktime_get_real_seconds() >= qc->timer && -+ !ignore_hardlimit(q)) { -+ if (mode == KEY_TYPE_QUOTA_PREALLOC) ++ qc->softlimit < n) { ++ if (qc->timer == 0) { ++ qc->timer = ktime_get_real_seconds() + q->limits[counter].timelimit; ++ prepare_warning(qc, qtype, counter, msgs, SOFTWARN); ++ } else if (ktime_get_real_seconds() >= qc->timer && 
++ !ignore_hardlimit(q)) { ++ prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN); + return -EDQUOT; -+ -+ prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN); -+ } -+ -+ if (qc->softlimit && -+ qc->softlimit < n && -+ qc->timer == 0) { -+ if (mode == KEY_TYPE_QUOTA_PREALLOC) -+ return -EDQUOT; -+ -+ prepare_warning(qc, qtype, counter, msgs, SOFTWARN); -+ -+ /* XXX is this the right one? */ -+ qc->timer = ktime_get_real_seconds() + -+ q->limits[counter].warnlimit; ++ } + } + + return 0; @@ -67429,7 +68883,8 @@ index 000000000000..c12d715fb758 + return ret; +} + -+static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k) ++static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k, ++ struct qc_dqblk *qdq) +{ + struct bkey_s_c_quota dq; + struct bch_memquota_type *q; @@ -67458,6 +68913,15 @@ index 000000000000..c12d715fb758 + mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit); + } + ++ if (qdq && qdq->d_fieldmask & QC_SPC_TIMER) ++ mq->c[Q_SPC].timer = cpu_to_le64(qdq->d_spc_timer); ++ if (qdq && qdq->d_fieldmask & QC_SPC_WARNS) ++ mq->c[Q_SPC].warns = cpu_to_le64(qdq->d_spc_warns); ++ if (qdq && qdq->d_fieldmask & QC_INO_TIMER) ++ mq->c[Q_INO].timer = cpu_to_le64(qdq->d_ino_timer); ++ if (qdq && qdq->d_fieldmask & QC_INO_WARNS) ++ mq->c[Q_INO].warns = cpu_to_le64(qdq->d_ino_warns); ++ + mutex_unlock(&q->lock); + } + @@ -67480,6 +68944,26 @@ index 000000000000..c12d715fb758 + mutex_init(&c->quotas[i].lock); +} + ++static struct bch_sb_field_quota *bch2_sb_get_or_create_quota(struct bch_sb_handle *sb) ++{ ++ struct bch_sb_field_quota *sb_quota = bch2_sb_get_quota(sb->sb); ++ ++ if (sb_quota) ++ return sb_quota; ++ ++ sb_quota = bch2_sb_resize_quota(sb, sizeof(*sb_quota) / sizeof(u64)); ++ if (sb_quota) { ++ unsigned qtype, qc; ++ ++ for (qtype = 0; qtype < QTYP_NR; qtype++) ++ for (qc = 0; qc < Q_COUNTERS; qc++) ++ sb_quota->q[qtype].c[qc].timelimit = ++ cpu_to_le32(7 * 24 * 60 * 60); ++ } ++ ++ return sb_quota; ++} ++ +static void 
bch2_sb_quota_read(struct bch_fs *c) +{ + struct bch_sb_field_quota *sb_quota; @@ -67538,12 +69022,19 @@ index 000000000000..c12d715fb758 + +int bch2_fs_quota_read(struct bch_fs *c) +{ ++ struct bch_sb_field_quota *sb_quota; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + mutex_lock(&c->sb_lock); ++ sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); ++ if (!sb_quota) { ++ mutex_unlock(&c->sb_lock); ++ return -BCH_ERR_ENOSPC_sb_quota; ++ } ++ + bch2_sb_quota_read(c); + mutex_unlock(&c->sb_lock); + @@ -67551,7 +69042,7 @@ index 000000000000..c12d715fb758 + + ret = for_each_btree_key2(&trans, iter, BTREE_ID_quotas, + POS_MIN, BTREE_ITER_PREFETCH, k, -+ __bch2_quota_set(c, k)) ?: ++ __bch2_quota_set(c, k, NULL)) ?: + for_each_btree_key2(&trans, iter, BTREE_ID_inodes, + POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + bch2_fs_quota_read_inode(&trans, &iter, k)); @@ -67567,6 +69058,8 @@ index 000000000000..c12d715fb758 +static int bch2_quota_enable(struct super_block *sb, unsigned uflags) +{ + struct bch_fs *c = sb->s_fs_info; ++ struct bch_sb_field_quota *sb_quota; ++ int ret = 0; + + if (sb->s_flags & SB_RDONLY) + return -EROFS; @@ -67586,6 +69079,12 @@ index 000000000000..c12d715fb758 + return -EINVAL; + + mutex_lock(&c->sb_lock); ++ sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); ++ if (!sb_quota) { ++ ret = -BCH_ERR_ENOSPC_sb_quota; ++ goto unlock; ++ } ++ + if (uflags & FS_QUOTA_UDQ_ENFD) + SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true); + @@ -67596,9 +69095,10 @@ index 000000000000..c12d715fb758 + SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true); + + bch2_write_super(c); ++unlock: + mutex_unlock(&c->sb_lock); + -+ return 0; ++ return bch2_err_class(ret); +} + +static int bch2_quota_disable(struct super_block *sb, unsigned uflags) @@ -67710,6 +69210,15 @@ index 000000000000..c12d715fb758 + struct bch_fs *c = sb->s_fs_info; + struct bch_sb_field_quota *sb_quota; + struct bch_memquota_type *q; ++ int ret = 0; ++ ++ if (0) 
{ ++ struct printbuf buf = PRINTBUF; ++ ++ qc_info_to_text(&buf, info); ++ pr_info("setting:\n%s", buf.buf); ++ printbuf_exit(&buf); ++ } + + if (sb->s_flags & SB_RDONLY) + return -EROFS; @@ -67727,12 +69236,10 @@ index 000000000000..c12d715fb758 + q = &c->quotas[type]; + + mutex_lock(&c->sb_lock); -+ sb_quota = bch2_sb_get_quota(c->disk_sb.sb); ++ sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); + if (!sb_quota) { -+ sb_quota = bch2_sb_resize_quota(&c->disk_sb, -+ sizeof(*sb_quota) / sizeof(u64)); -+ if (!sb_quota) -+ return -BCH_ERR_ENOSPC_sb_quota; ++ ret = -BCH_ERR_ENOSPC_sb_quota; ++ goto unlock; + } + + if (info->i_fieldmask & QC_SPC_TIMER) @@ -67754,9 +69261,10 @@ index 000000000000..c12d715fb758 + bch2_sb_quota_read(c); + + bch2_write_super(c); ++unlock: + mutex_unlock(&c->sb_lock); + -+ return 0; ++ return bch2_err_class(ret); +} + +/* Get/set individual quotas: */ @@ -67861,6 +69369,14 @@ index 000000000000..c12d715fb758 + struct bkey_i_quota new_quota; + int ret; + ++ if (0) { ++ struct printbuf buf = PRINTBUF; ++ ++ qc_dqblk_to_text(&buf, qdq); ++ pr_info("setting:\n%s", buf.buf); ++ printbuf_exit(&buf); ++ } ++ + if (sb->s_flags & SB_RDONLY) + return -EROFS; + @@ -67869,7 +69385,7 @@ index 000000000000..c12d715fb758 + + ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_set_quota_trans(&trans, &new_quota, qdq)) ?: -+ __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i)); ++ __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq); + + return ret; +} @@ -67890,7 +69406,7 @@ index 000000000000..c12d715fb758 +#endif /* CONFIG_BCACHEFS_QUOTA */ diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h new file mode 100644 -index 000000000000..8c67ae1da7c7 +index 000000000000..59bed1148201 --- /dev/null +++ b/fs/bcachefs/quota.h @@ -0,0 +1,71 @@ @@ -67906,10 +69422,10 @@ index 000000000000..8c67ae1da7c7 +int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct 
bkey_s_c); + -+#define bch2_bkey_ops_quota (struct bkey_ops) { \ ++#define bch2_bkey_ops_quota ((struct bkey_ops) { \ + .key_invalid = bch2_quota_invalid, \ + .val_to_text = bch2_quota_to_text, \ -+} ++}) + +static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u) +{ @@ -68450,10 +69966,10 @@ index 000000000000..7462a92e9598 +#endif /* _BCACHEFS_REBALANCE_TYPES_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c new file mode 100644 -index 000000000000..18f6ec5cc7d0 +index 000000000000..2bb078749b9a --- /dev/null +++ b/fs/bcachefs/recovery.c -@@ -0,0 +1,1587 @@ +@@ -0,0 +1,1606 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -68681,7 +70197,7 @@ index 000000000000..18f6ec5cc7d0 + .size = max_t(size_t, keys->size, 8) * 2, + }; + -+ new_keys.d = kvmalloc(sizeof(new_keys.d[0]) * new_keys.size, GFP_KERNEL); ++ new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL); + if (!new_keys.d) { + bch_err(c, "%s: error allocating new key array (size %zu)", + __func__, new_keys.size); @@ -68958,7 +70474,7 @@ index 000000000000..18f6ec5cc7d0 + + keys->size = roundup_pow_of_two(nr_keys); + -+ keys->d = kvmalloc(sizeof(keys->d[0]) * keys->size, GFP_KERNEL); ++ keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); + if (!keys->d) + return -ENOMEM; + @@ -69548,6 +71064,9 @@ index 000000000000..18f6ec5cc7d0 + c->opts.version_upgrade = true; + c->opts.fsck = true; + c->opts.fix_errors = FSCK_OPT_YES; ++ } else if (c->sb.version < bcachefs_metadata_version_inode_v3) { ++ bch_info(c, "version prior to inode_v3, upgrade required"); ++ c->opts.version_upgrade = true; + } + } + @@ -69704,6 +71223,20 @@ index 000000000000..18f6ec5cc7d0 + goto err; + bch_verbose(c, "done checking need_discard and freespace btrees"); + ++ if (c->sb.version < bcachefs_metadata_version_snapshot_2) { ++ err = "error creating root snapshot node"; ++ ret = bch2_fs_initialize_subvolumes(c); ++ if (ret) ++ goto err; ++ } ++ ++ 
bch_verbose(c, "reading snapshots table"); ++ err = "error reading snapshots table"; ++ ret = bch2_fs_snapshots_start(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "reading snapshots done"); ++ + set_bit(BCH_FS_MAY_GO_RW, &c->flags); + + bch_info(c, "starting journal replay, %zu keys", c->journal_keys.nr); @@ -69752,7 +71285,6 @@ index 000000000000..18f6ec5cc7d0 + bch_verbose(c, "done checking alloc to lru refs"); + set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); + } else { -+ set_bit(BCH_FS_MAY_GO_RW, &c->flags); + set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); + set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); + set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags); @@ -69762,6 +71294,22 @@ index 000000000000..18f6ec5cc7d0 + if (c->opts.norecovery) + goto out; + ++ if (c->sb.version < bcachefs_metadata_version_snapshot_2) { ++ err = "error creating root snapshot node"; ++ ret = bch2_fs_initialize_subvolumes(c); ++ if (ret) ++ goto err; ++ } ++ ++ bch_verbose(c, "reading snapshots table"); ++ err = "error reading snapshots table"; ++ ret = bch2_fs_snapshots_start(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "reading snapshots done"); ++ ++ set_bit(BCH_FS_MAY_GO_RW, &c->flags); ++ + bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr); + err = "journal replay failed"; + ret = bch2_journal_replay(c); @@ -69777,22 +71325,6 @@ index 000000000000..18f6ec5cc7d0 + goto err; + + if (c->sb.version < bcachefs_metadata_version_snapshot_2) { -+ bch2_fs_lazy_rw(c); -+ -+ err = "error creating root snapshot node"; -+ ret = bch2_fs_initialize_subvolumes(c); -+ if (ret) -+ goto err; -+ } -+ -+ bch_verbose(c, "reading snapshots table"); -+ err = "error reading snapshots table"; -+ ret = bch2_fs_snapshots_start(c); -+ if (ret) -+ goto err; -+ bch_verbose(c, "reading snapshots done"); -+ -+ if (c->sb.version < bcachefs_metadata_version_snapshot_2) { + /* set bi_subvol on root inode */ + err = "error upgrade root inode for subvolumes"; + ret = bch2_trans_do(c, 
NULL, NULL, BTREE_INSERT_LAZY_RW, @@ -69876,7 +71408,8 @@ index 000000000000..18f6ec5cc7d0 + set_bit(BCH_FS_FSCK_DONE, &c->flags); + bch2_flush_fsck_errs(c); + -+ if (!c->opts.keep_journal) { ++ if (!c->opts.keep_journal && ++ test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) { + bch2_journal_keys_free(&c->journal_keys); + bch2_journal_entries_free(c); + } @@ -69914,7 +71447,7 @@ index 000000000000..18f6ec5cc7d0 + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); + -+ if (c->sb.version < bcachefs_metadata_version_backpointers) ++ if (c->sb.version < bcachefs_metadata_version_inode_v3) + c->opts.version_upgrade = true; + + if (c->opts.version_upgrade) { @@ -69925,6 +71458,9 @@ index 000000000000..18f6ec5cc7d0 + mutex_unlock(&c->sb_lock); + + set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); ++ set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); ++ set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags); ++ set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); + set_bit(BCH_FS_MAY_GO_RW, &c->flags); + set_bit(BCH_FS_FSCK_DONE, &c->flags); + @@ -69989,11 +71525,10 @@ index 000000000000..18f6ec5cc7d0 + goto err; + bch_verbose(c, "reading snapshots done"); + -+ bch2_inode_init(c, &root_inode, 0, 0, -+ S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); ++ bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 0, NULL); + root_inode.bi_inum = BCACHEFS_ROOT_INO; + root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; -+ bch2_inode_pack(c, &packed_inode, &root_inode); ++ bch2_inode_pack(&packed_inode, &root_inode); + packed_inode.inode.k.p.snapshot = U32_MAX; + + err = "error creating root directory"; @@ -70535,7 +72070,7 @@ index 000000000000..d5c14bb2992d +} diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h new file mode 100644 -index 000000000000..f9848dc3eebb +index 000000000000..ce0012aa99c6 --- /dev/null +++ b/fs/bcachefs/reflink.h @@ -0,0 +1,76 @@ @@ -70549,13 
+72084,13 @@ index 000000000000..f9848dc3eebb + struct bkey_s_c); +bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + -+#define bch2_bkey_ops_reflink_p (struct bkey_ops) { \ ++#define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \ + .key_invalid = bch2_reflink_p_invalid, \ + .val_to_text = bch2_reflink_p_to_text, \ + .key_merge = bch2_reflink_p_merge, \ + .trans_trigger = bch2_trans_mark_reflink_p, \ + .atomic_trigger = bch2_mark_reflink_p, \ -+} ++}) + +int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c, + int, struct printbuf *); @@ -70564,13 +72099,13 @@ index 000000000000..f9848dc3eebb +int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_i *, unsigned); + -+#define bch2_bkey_ops_reflink_v (struct bkey_ops) { \ ++#define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \ + .key_invalid = bch2_reflink_v_invalid, \ + .val_to_text = bch2_reflink_v_to_text, \ + .swab = bch2_ptr_swab, \ + .trans_trigger = bch2_trans_mark_reflink_v, \ + .atomic_trigger = bch2_mark_extent, \ -+} ++}) + +int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c, + int, struct printbuf *); @@ -70581,11 +72116,11 @@ index 000000000000..f9848dc3eebb + struct bkey_s_c, struct bkey_i *, + unsigned); + -+#define bch2_bkey_ops_indirect_inline_data (struct bkey_ops) { \ ++#define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \ + .key_invalid = bch2_indirect_inline_data_invalid, \ + .val_to_text = bch2_indirect_inline_data_to_text, \ + .trans_trigger = bch2_trans_mark_indirect_inline_data, \ -+} ++}) + +static inline const __le64 *bkey_refcount_c(struct bkey_s_c k) +{ @@ -71694,14 +73229,15 @@ index 000000000000..fcf73d723035 +} diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h new file mode 100644 -index 000000000000..87820b2e1ad3 +index 000000000000..cc34b3809206 --- /dev/null +++ b/fs/bcachefs/replicas.h -@@ -0,0 +1,106 @@ +@@ -0,0 +1,107 @@ +/* 
SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REPLICAS_H +#define _BCACHEFS_REPLICAS_H + ++#include "bkey.h" +#include "eytzinger.h" +#include "replicas_types.h" + @@ -71806,10 +73342,11 @@ index 000000000000..87820b2e1ad3 +#endif /* _BCACHEFS_REPLICAS_H */ diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h new file mode 100644 -index 000000000000..0535b1d3760e +index 000000000000..f12a35b3dbcf --- /dev/null +++ b/fs/bcachefs/replicas_types.h -@@ -0,0 +1,10 @@ +@@ -0,0 +1,11 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REPLICAS_TYPES_H +#define _BCACHEFS_REPLICAS_TYPES_H + @@ -71822,7 +73359,7 @@ index 000000000000..0535b1d3760e +#endif /* _BCACHEFS_REPLICAS_TYPES_H */ diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c new file mode 100644 -index 000000000000..c062edb3fbc2 +index 000000000000..dc1a27cc31cd --- /dev/null +++ b/fs/bcachefs/siphash.c @@ -0,0 +1,173 @@ @@ -71988,7 +73525,7 @@ index 000000000000..c062edb3fbc2 + + r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]); + memset(ctx, 0, sizeof(*ctx)); -+ return (r); ++ return r; +} + +u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len) @@ -72470,10 +74007,10 @@ index 000000000000..6178ae620ff1 +#endif /* _BCACHEFS_STR_HASH_H */ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c new file mode 100644 -index 000000000000..8c98bacca290 +index 000000000000..1133783477e1 --- /dev/null +++ b/fs/bcachefs/subvolume.c -@@ -0,0 +1,1110 @@ +@@ -0,0 +1,1111 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -72634,6 +74171,7 @@ index 000000000000..8c98bacca290 + + for (i = 0; i < 2; i++) { + int ret = snapshot_live(trans, child[i]); ++ + if (ret < 0) + return ret; + @@ -73586,7 +75124,7 @@ index 000000000000..8c98bacca290 +} diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h new file mode 100644 -index 000000000000..02a636644988 +index 000000000000..c694c1c24483 --- /dev/null +++ 
b/fs/bcachefs/subvolume.h @@ -0,0 +1,137 @@ @@ -73601,10 +75139,10 @@ index 000000000000..02a636644988 +int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, + int rw, struct printbuf *); + -+#define bch2_bkey_ops_snapshot (struct bkey_ops) { \ ++#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ + .key_invalid = bch2_snapshot_invalid, \ + .val_to_text = bch2_snapshot_to_text, \ -+} ++}) + +int bch2_mark_snapshot(struct btree_trans *, struct bkey_s_c, + struct bkey_s_c, unsigned); @@ -73701,10 +75239,10 @@ index 000000000000..02a636644988 + int rw, struct printbuf *); +void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + -+#define bch2_bkey_ops_subvolume (struct bkey_ops) { \ ++#define bch2_bkey_ops_subvolume ((struct bkey_ops) { \ + .key_invalid = bch2_subvolume_invalid, \ + .val_to_text = bch2_subvolume_to_text, \ -+} ++}) + +int bch2_subvolume_get(struct btree_trans *, unsigned, + bool, int, struct bch_subvolume *); @@ -73744,10 +75282,10 @@ index 000000000000..f7562b5d51df +#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c new file mode 100644 -index 000000000000..cbc5979a5181 +index 000000000000..60c1f03c05af --- /dev/null +++ b/fs/bcachefs/super-io.c -@@ -0,0 +1,1603 @@ +@@ -0,0 +1,1601 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -73850,8 +75388,7 @@ index 000000000000..cbc5979a5181 + +void bch2_free_super(struct bch_sb_handle *sb) +{ -+ if (sb->bio) -+ kfree(sb->bio); ++ kfree(sb->bio); + if (!IS_ERR_OR_NULL(sb->bdev)) + blkdev_put(sb->bdev, sb->mode); + @@ -73899,8 +75436,7 @@ index 000000000000..cbc5979a5181 + + bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 0); + -+ if (sb->bio) -+ kfree(sb->bio); ++ kfree(sb->bio); + sb->bio = bio; + } + @@ -75485,10 +77021,10 @@ index 000000000000..14a25f6fe29a +#endif /* _BCACHEFS_SUPER_IO_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c new file mode 100644 -index 
000000000000..a824e16079d5 +index 000000000000..5be4c40afa47 --- /dev/null +++ b/fs/bcachefs/super.c -@@ -0,0 +1,1964 @@ +@@ -0,0 +1,1961 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcachefs setup/teardown code, and some metadata io - read a superblock and @@ -75818,26 +77354,12 @@ index 000000000000..a824e16079d5 +{ + int ret; + -+ ret = bch2_gc_thread_start(c); -+ if (ret) { -+ bch_err(c, "error starting gc thread"); -+ return ret; -+ } -+ -+ ret = bch2_copygc_start(c); -+ if (ret) { -+ bch_err(c, "error starting copygc thread"); -+ return ret; -+ } -+ + ret = bch2_rebalance_start(c); + if (ret) { + bch_err(c, "error starting rebalance thread"); + return ret; + } + -+ schedule_work(&c->ec_stripe_delete_work); -+ + return 0; +} + @@ -75876,6 +77398,20 @@ index 000000000000..a824e16079d5 + bch2_dev_allocator_add(c, ca); + bch2_recalc_capacity(c); + ++ ret = bch2_gc_thread_start(c); ++ if (ret) { ++ bch_err(c, "error starting gc thread"); ++ return ret; ++ } ++ ++ ret = bch2_copygc_start(c); ++ if (ret) { ++ bch_err(c, "error starting copygc thread"); ++ return ret; ++ } ++ ++ schedule_work(&c->ec_stripe_delete_work); ++ + bch2_do_discards(c); + bch2_do_invalidates(c); + @@ -75954,8 +77490,8 @@ index 000000000000..a824e16079d5 + kfree(c->unused_inode_hints); + free_heap(&c->copygc_heap); + -+ if (c->io_complete_wq ) -+ destroy_workqueue(c->io_complete_wq ); ++ if (c->io_complete_wq) ++ destroy_workqueue(c->io_complete_wq); + if (c->copygc_wq) + destroy_workqueue(c->copygc_wq); + if (c->btree_io_complete_wq) @@ -76202,7 +77738,7 @@ index 000000000000..a824e16079d5 + goto err; + + pr_uuid(&name, c->sb.user_uuid.b); -+ strlcpy(c->name, name.buf, sizeof(c->name)); ++ strscpy(c->name, name.buf, sizeof(c->name)); + printbuf_exit(&name); + + ret = name.allocation_failure ? 
-ENOMEM : 0; @@ -76386,6 +77922,12 @@ index 000000000000..a824e16079d5 + bch2_dev_allocator_add(c, ca); + bch2_recalc_capacity(c); + ++ for (i = 0; i < BCH_TRANSACTIONS_NR; i++) { ++ mutex_lock(&c->btree_transaction_stats[i].lock); ++ bch2_time_stats_init(&c->btree_transaction_stats[i].lock_hold_times); ++ mutex_unlock(&c->btree_transaction_stats[i].lock); ++ } ++ + ret = BCH_SB_INITIALIZED(c->disk_sb.sb) + ? bch2_fs_recovery(c) + : bch2_fs_initialize(c); @@ -76817,18 +78359,10 @@ index 000000000000..a824e16079d5 +static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) +{ + /* -+ * Device going read only means the copygc reserve get smaller, so we -+ * don't want that happening while copygc is in progress: -+ */ -+ bch2_copygc_stop(c); -+ -+ /* + * The allocator thread itself allocates btree nodes, so stop it first: + */ + bch2_dev_allocator_remove(c, ca); + bch2_dev_journal_stop(&c->journal, ca); -+ -+ bch2_copygc_start(c); +} + +static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) @@ -77277,9 +78811,8 @@ index 000000000000..a824e16079d5 + } + + ret = bch2_trans_mark_dev_sb(c, ca); -+ if (ret) { ++ if (ret) + goto err; -+ } + + mutex_lock(&c->sb_lock); + mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; @@ -77782,10 +79315,10 @@ index 000000000000..89419fc7930d +#endif /* _BCACHEFS_SUPER_TYPES_H */ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c new file mode 100644 -index 000000000000..f1b0f001255a +index 000000000000..647d018b5ec9 --- /dev/null +++ b/fs/bcachefs/sysfs.c -@@ -0,0 +1,954 @@ +@@ -0,0 +1,963 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcache sysfs interfaces @@ -77878,9 +79411,9 @@ index 000000000000..f1b0f001255a + static struct attribute sysfs_##_name = \ + { .name = #_name, .mode = _mode } + -+#define write_attribute(n) __sysfs_attribute(n, S_IWUSR) -+#define read_attribute(n) __sysfs_attribute(n, S_IRUGO) -+#define rw_attribute(n) __sysfs_attribute(n, S_IRUGO|S_IWUSR) ++#define 
write_attribute(n) __sysfs_attribute(n, 0200) ++#define read_attribute(n) __sysfs_attribute(n, 0444) ++#define rw_attribute(n) __sysfs_attribute(n, 0644) + +#define sysfs_printf(file, fmt, ...) \ +do { \ @@ -77963,7 +79496,7 @@ index 000000000000..f1b0f001255a +read_attribute(bucket_size); +read_attribute(first_bucket); +read_attribute(nbuckets); -+read_attribute(durability); ++rw_attribute(durability); +read_attribute(iodone); + +read_attribute(io_latency_read); @@ -77972,7 +79505,7 @@ index 000000000000..f1b0f001255a +read_attribute(io_latency_stats_write); +read_attribute(congested); + -+read_attribute(btree_avg_write_size); ++read_attribute(btree_write_stats); + +read_attribute(btree_cache_size); +read_attribute(compression_stats); @@ -78016,13 +79549,13 @@ index 000000000000..f1b0f001255a + +#define x(_name) \ + static struct attribute sysfs_time_stat_##_name = \ -+ { .name = #_name, .mode = S_IRUGO }; ++ { .name = #_name, .mode = 0444 }; + BCH_TIME_STATS() +#undef x + +static struct attribute sysfs_state_rw = { + .name = "state", -+ .mode = S_IRUGO ++ .mode = 0444, +}; + +static size_t bch2_btree_cache_size(struct bch_fs *c) @@ -78038,14 +79571,6 @@ index 000000000000..f1b0f001255a + return ret; +} + -+static size_t bch2_btree_avg_write_size(struct bch_fs *c) -+{ -+ u64 nr = atomic64_read(&c->btree_writes_nr); -+ u64 sectors = atomic64_read(&c->btree_writes_sectors); -+ -+ return nr ? 
div64_u64(sectors, nr) : 0; -+} -+ +static long data_progress_to_text(struct printbuf *out, struct bch_fs *c) +{ + long ret = 0; @@ -78086,7 +79611,7 @@ index 000000000000..f1b0f001255a + bch2_trans_init(&trans, c, 0, 0); + + for (id = 0; id < BTREE_ID_NR; id++) { -+ if (!((1U << id) & BTREE_ID_HAS_PTRS)) ++ if (!btree_type_has_ptrs(id)) + continue; + + for_each_btree_key(&trans, iter, id, POS_MIN, @@ -78184,7 +79709,9 @@ index 000000000000..f1b0f001255a + sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); + + sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); -+ sysfs_hprint(btree_avg_write_size, bch2_btree_avg_write_size(c)); ++ ++ if (attr == &sysfs_btree_write_stats) ++ bch2_btree_write_stats_to_text(out, c); + + sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); + @@ -78213,7 +79740,7 @@ index 000000000000..f1b0f001255a + bch2_btree_updates_to_text(out, c); + + if (attr == &sysfs_btree_cache) -+ bch2_btree_cache_to_text(out, c); ++ bch2_btree_cache_to_text(out, &c->btree_cache); + + if (attr == &sysfs_btree_key_cache) + bch2_btree_key_cache_to_text(out, &c->btree_key_cache); @@ -78342,7 +79869,7 @@ index 000000000000..f1b0f001255a +struct attribute *bch2_fs_files[] = { + &sysfs_minor, + &sysfs_btree_cache_size, -+ &sysfs_btree_avg_write_size, ++ &sysfs_btree_write_stats, + + &sysfs_promote_whole_extents, + @@ -78401,12 +79928,14 @@ index 000000000000..f1b0f001255a +SHOW(bch2_fs_internal) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, internal); ++ + return bch2_fs_to_text(out, &c->kobj, attr); +} + +STORE(bch2_fs_internal) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, internal); ++ + return bch2_fs_store(&c->kobj, attr, buf, size); +} +SYSFS_OPS(bch2_fs_internal); @@ -78695,6 +80224,19 @@ index 000000000000..f1b0f001255a + mutex_unlock(&c->sb_lock); + } + ++ if (attr == &sysfs_durability) { ++ u64 v = strtoul_or_return(buf); ++ ++ mutex_lock(&c->sb_lock); ++ mi = 
&bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; ++ ++ if (v != BCH_MEMBER_DURABILITY(mi)) { ++ SET_BCH_MEMBER_DURABILITY(mi, v + 1); ++ bch2_write_super(c); ++ } ++ mutex_unlock(&c->sb_lock); ++ } ++ + if (attr == &sysfs_label) { + char *tmp; + int ret; @@ -78796,10 +80338,10 @@ index 000000000000..222cd5062702 +#endif /* _BCACHEFS_SYSFS_H_ */ diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c new file mode 100644 -index 000000000000..d05886181118 +index 000000000000..43f974eb9b7e --- /dev/null +++ b/fs/bcachefs/tests.c -@@ -0,0 +1,976 @@ +@@ -0,0 +1,973 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifdef CONFIG_BCACHEFS_TESTS + @@ -78848,7 +80390,7 @@ index 000000000000..d05886181118 + bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(&trans, &iter, &k.k_i, 0)); + if (ret) { -+ bch_err(c, "update error in test_delete: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): update error in: %s", __func__, bch2_err_str(ret)); + goto err; + } + @@ -78857,7 +80399,7 @@ index 000000000000..d05886181118 + bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(&trans, &iter, 0)); + if (ret) { -+ bch_err(c, "delete error (first) in test_delete: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): delete error (first): %s", __func__, bch2_err_str(ret)); + goto err; + } + @@ -78866,7 +80408,7 @@ index 000000000000..d05886181118 + bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(&trans, &iter, 0)); + if (ret) { -+ bch_err(c, "delete error (second) in test_delete: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): delete error (second): %s", __func__, bch2_err_str(ret)); + goto err; + } +err: @@ -78894,7 +80436,7 @@ index 000000000000..d05886181118 + bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(&trans, &iter, &k.k_i, 0)); + if (ret) { -+ bch_err(c, "update error in test_delete_written: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): update error: %s", __func__, bch2_err_str(ret)); + goto err; + } + @@ -78905,7 +80447,7 @@ index 000000000000..d05886181118 + 
bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(&trans, &iter, 0)); + if (ret) { -+ bch_err(c, "delete error in test_delete_written: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): delete error: %s", __func__, bch2_err_str(ret)); + goto err; + } +err: @@ -78938,7 +80480,7 @@ index 000000000000..d05886181118 + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, + NULL, NULL, 0); + if (ret) { -+ bch_err(c, "insert error in test_iterate: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): insert error: %s", __func__, bch2_err_str(ret)); + goto err; + } + } @@ -79004,7 +80546,7 @@ index 000000000000..d05886181118 + ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, + NULL, NULL, 0); + if (ret) { -+ bch_err(c, "insert error in test_iterate_extents: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): insert error: %s", __func__, bch2_err_str(ret)); + goto err; + } + } @@ -79071,7 +80613,7 @@ index 000000000000..d05886181118 + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, + NULL, NULL, 0); + if (ret) { -+ bch_err(c, "insert error in test_iterate_slots: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): insert error: %s", __func__, bch2_err_str(ret)); + goto err; + } + } @@ -79144,7 +80686,7 @@ index 000000000000..d05886181118 + ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, + NULL, NULL, 0); + if (ret) { -+ bch_err(c, "insert error in test_iterate_slots_extents: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): insert error: %s", __func__, bch2_err_str(ret)); + goto err; + } + } @@ -79258,7 +80800,7 @@ index 000000000000..d05886181118 + ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, + NULL, NULL, 0); + if (ret) -+ bch_err(c, "insert error in insert_test_extent: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): insert error: %s", __func__, bch2_err_str(ret)); + return ret; +} + @@ -79357,7 +80899,7 @@ index 000000000000..d05886181118 + + ret = test_snapshot_filter(c, snapids[0], snapids[1]); + if (ret) { -+ bch_err(c, "err from test_snapshot_filter: %s", 
bch2_err_str(ret)); ++ bch_err(c, "%s(): err from test_snapshot_filter: %s", __func__, bch2_err_str(ret)); + return ret; + } + @@ -79369,11 +80911,8 @@ index 000000000000..d05886181118 +static u64 test_rand(void) +{ + u64 v; -+#if 0 -+ v = prandom_u32(); -+#else ++ + prandom_bytes(&v, sizeof(v)); -+#endif + return v; +} + @@ -79394,7 +80933,7 @@ index 000000000000..d05886181118 + ret = commit_do(&trans, NULL, NULL, 0, + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i)); + if (ret) { -+ bch_err(c, "error in rand_insert: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + break; + } + } @@ -79430,7 +80969,7 @@ index 000000000000..d05886181118 + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i)); + if (ret) { -+ bch_err(c, "error in rand_insert_multi: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + break; + } + } @@ -79457,7 +80996,7 @@ index 000000000000..d05886181118 + lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + ret = bkey_err(k); + if (ret) { -+ bch_err(c, "error in rand_lookup: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + break; + } + } @@ -79480,7 +81019,7 @@ index 000000000000..d05886181118 + k = bch2_btree_iter_peek(iter); + ret = bkey_err(k); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ bch_err(trans->c, "lookup error in rand_mixed: %s", bch2_err_str(ret)); ++ bch_err(trans->c, "%s(): lookup error: %s", __func__, bch2_err_str(ret)); + if (ret) + return ret; + @@ -79510,7 +81049,7 @@ index 000000000000..d05886181118 + ret = commit_do(&trans, NULL, NULL, 0, + rand_mixed_trans(&trans, &iter, &cookie, i, rand)); + if (ret) { -+ bch_err(c, "update error in rand_mixed: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): update error: %s", __func__, bch2_err_str(ret)); + break; + } + } @@ -79556,7 +81095,7 @@ index 
000000000000..d05886181118 + ret = commit_do(&trans, NULL, NULL, 0, + __do_delete(&trans, pos)); + if (ret) { -+ bch_err(c, "error in rand_delete: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + break; + } + } @@ -79588,7 +81127,7 @@ index 000000000000..d05886181118 + bch2_trans_update(&trans, &iter, &insert.k_i, 0); + })); + if (ret) -+ bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + + bch2_trans_exit(&trans); + return ret; @@ -79607,7 +81146,7 @@ index 000000000000..d05886181118 + SPOS(0, 0, U32_MAX), 0, k, + 0); + if (ret) -+ bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + + bch2_trans_exit(&trans); + return ret; @@ -79633,7 +81172,7 @@ index 000000000000..d05886181118 + bch2_trans_update(&trans, &iter, &u.k_i, 0); + })); + if (ret) -+ bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + + bch2_trans_exit(&trans); + return ret; @@ -79647,7 +81186,7 @@ index 000000000000..d05886181118 + SPOS(0, 0, U32_MAX), SPOS_MAX, + 0, NULL); + if (ret) -+ bch_err(c, "error in seq_delete: %s", bch2_err_str(ret)); ++ bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + return ret; +} + @@ -79819,10 +81358,10 @@ index 000000000000..70573981b87d +#include diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c new file mode 100644 -index 000000000000..81befc433aeb +index 000000000000..62fa662019ad --- /dev/null +++ b/fs/bcachefs/util.c -@@ -0,0 +1,993 @@ +@@ -0,0 +1,1104 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * random utiility code, for bcache but in theory not specific to bcache @@ -79847,6 +81386,7 @@ index 000000000000..81befc433aeb +#include +#include +#include ++#include + +#include "eytzinger.h" +#include "util.h" @@ -80121,6 +81661,26 @@ index 000000000000..81befc433aeb + console_unlock(); +} + 
++int bch2_prt_backtrace(struct printbuf *out, struct task_struct *task) ++{ ++ unsigned long entries[32]; ++ unsigned i, nr_entries; ++ int ret; ++ ++ ret = down_read_killable(&task->signal->exec_update_lock); ++ if (ret) ++ return ret; ++ ++ nr_entries = stack_trace_save_tsk(task, entries, ARRAY_SIZE(entries), 0); ++ for (i = 0; i < nr_entries; i++) { ++ prt_printf(out, "[<0>] %pB", (void *)entries[i]); ++ prt_newline(out); ++ } ++ ++ up_read(&task->signal->exec_update_lock); ++ return 0; ++} ++ +/* time stats: */ + +static void bch2_time_stats_update_one(struct time_stats *stats, @@ -80128,38 +81688,44 @@ index 000000000000..81befc433aeb +{ + u64 duration, freq; + -+ duration = time_after64(end, start) -+ ? end - start : 0; -+ freq = time_after64(end, stats->last_event) -+ ? end - stats->last_event : 0; ++ if (time_after64(end, start)) { ++ duration = end - start; ++ stats->duration_stats = mean_and_variance_update(stats->duration_stats, ++ duration); ++ stats->duration_stats_weighted = mean_and_variance_weighted_update( ++ stats->duration_stats_weighted, ++ duration); ++ stats->max_duration = max(stats->max_duration, duration); ++ stats->min_duration = min(stats->min_duration, duration); ++ bch2_quantiles_update(&stats->quantiles, duration); ++ } + -+ stats->count++; -+ -+ stats->average_duration = stats->average_duration -+ ? ewma_add(stats->average_duration, duration, 6) -+ : duration; -+ -+ stats->average_frequency = stats->average_frequency -+ ? 
ewma_add(stats->average_frequency, freq, 6) -+ : freq; -+ -+ stats->max_duration = max(stats->max_duration, duration); -+ -+ stats->last_event = end; -+ -+ bch2_quantiles_update(&stats->quantiles, duration); ++ if (time_after64(end, stats->last_event)) { ++ freq = end - stats->last_event; ++ stats->freq_stats = mean_and_variance_update(stats->freq_stats, freq); ++ stats->freq_stats_weighted = mean_and_variance_weighted_update( ++ stats->freq_stats_weighted, ++ freq); ++ stats->max_freq = max(stats->max_freq, freq); ++ stats->min_freq = min(stats->min_freq, freq); ++ stats->last_event = end; ++ } +} + +void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end) +{ + unsigned long flags; + ++ WARN_RATELIMIT(!stats->min_duration || !stats->min_freq, ++ "time_stats: min_duration = %llu, min_freq = %llu", ++ stats->min_duration, stats->min_freq); ++ + if (!stats->buffer) { + spin_lock_irqsave(&stats->lock, flags); + bch2_time_stats_update_one(stats, start, end); + -+ if (stats->average_frequency < 32 && -+ stats->count > 1024) ++ if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 && ++ stats->duration_stats.n > 1024) + stats->buffer = + alloc_percpu_gfp(struct time_stat_buffer, + GFP_ATOMIC); @@ -80194,12 +81760,15 @@ index 000000000000..81befc433aeb + +static const struct time_unit { + const char *name; -+ u32 nsecs; ++ u64 nsecs; +} time_units[] = { -+ { "ns", 1 }, -+ { "us", NSEC_PER_USEC }, -+ { "ms", NSEC_PER_MSEC }, -+ { "sec", NSEC_PER_SEC }, ++ { "ns", 1 }, ++ { "us", NSEC_PER_USEC }, ++ { "ms", NSEC_PER_MSEC }, ++ { "s", NSEC_PER_SEC }, ++ { "m", NSEC_PER_SEC * 60}, ++ { "h", NSEC_PER_SEC * 3600}, ++ { "eon", U64_MAX }, +}; + +static const struct time_unit *pick_time_units(u64 ns) @@ -80219,38 +81788,117 @@ index 000000000000..81befc433aeb +{ + const struct time_unit *u = pick_time_units(ns); + -+ prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); ++ prt_printf(out, "%llu ", div64_u64(ns, u->nsecs)); ++ 
prt_tab_rjust(out); ++ prt_printf(out, "%s", u->name); ++} ++ ++#define TABSTOP_SIZE 12 ++ ++static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns) ++{ ++ prt_str(out, name); ++ prt_tab(out); ++ pr_time_units(out, ns); ++ prt_newline(out); +} + +void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats) +{ + const struct time_unit *u; -+ u64 freq = READ_ONCE(stats->average_frequency); -+ u64 q, last_q = 0; ++ s64 f_mean = 0, d_mean = 0; ++ u64 q, last_q = 0, f_stddev = 0, d_stddev = 0; + int i; ++ /* ++ * avoid divide by zero ++ */ ++ if (stats->freq_stats.n) { ++ f_mean = mean_and_variance_get_mean(stats->freq_stats); ++ f_stddev = mean_and_variance_get_stddev(stats->freq_stats); ++ d_mean = mean_and_variance_get_mean(stats->duration_stats); ++ d_stddev = mean_and_variance_get_stddev(stats->duration_stats); ++ } + -+ prt_printf(out, "count:\t\t%llu", -+ stats->count); -+ prt_newline(out); -+ prt_printf(out, "rate:\t\t%llu/sec", -+ freq ? div64_u64(NSEC_PER_SEC, freq) : 0); ++ printbuf_tabstop_push(out, out->indent + TABSTOP_SIZE); ++ prt_printf(out, "count:"); ++ prt_tab(out); ++ prt_printf(out, "%llu ", ++ stats->duration_stats.n); ++ printbuf_tabstop_pop(out); + prt_newline(out); + -+ prt_printf(out, "frequency:\t"); -+ pr_time_units(out, freq); ++ printbuf_tabstops_reset(out); + -+ prt_newline(out); -+ prt_printf(out, "avg duration:\t"); -+ pr_time_units(out, stats->average_duration); ++ printbuf_tabstop_push(out, out->indent + 20); ++ printbuf_tabstop_push(out, TABSTOP_SIZE + 2); ++ printbuf_tabstop_push(out, 0); ++ printbuf_tabstop_push(out, TABSTOP_SIZE + 2); + ++ prt_tab(out); ++ prt_printf(out, "since mount"); ++ prt_tab_rjust(out); ++ prt_tab(out); ++ prt_printf(out, "recent"); ++ prt_tab_rjust(out); + prt_newline(out); -+ prt_printf(out, "max duration:\t"); -+ pr_time_units(out, stats->max_duration); ++ ++ printbuf_tabstops_reset(out); ++ printbuf_tabstop_push(out, out->indent + 20); ++ 
printbuf_tabstop_push(out, TABSTOP_SIZE); ++ printbuf_tabstop_push(out, 2); ++ printbuf_tabstop_push(out, TABSTOP_SIZE); ++ ++ prt_printf(out, "duration of events"); ++ prt_newline(out); ++ printbuf_indent_add(out, 2); ++ ++ pr_name_and_units(out, "min:", stats->min_duration); ++ pr_name_and_units(out, "max:", stats->max_duration); ++ ++ prt_printf(out, "mean:"); ++ prt_tab(out); ++ pr_time_units(out, d_mean); ++ prt_tab(out); ++ pr_time_units(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted)); ++ prt_newline(out); ++ ++ prt_printf(out, "stddev:"); ++ prt_tab(out); ++ pr_time_units(out, d_stddev); ++ prt_tab(out); ++ pr_time_units(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted)); ++ ++ printbuf_indent_sub(out, 2); ++ prt_newline(out); ++ ++ prt_printf(out, "time between events"); ++ prt_newline(out); ++ printbuf_indent_add(out, 2); ++ ++ pr_name_and_units(out, "min:", stats->min_freq); ++ pr_name_and_units(out, "max:", stats->max_freq); ++ ++ prt_printf(out, "mean:"); ++ prt_tab(out); ++ pr_time_units(out, f_mean); ++ prt_tab(out); ++ pr_time_units(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted)); ++ prt_newline(out); ++ ++ prt_printf(out, "stddev:"); ++ prt_tab(out); ++ pr_time_units(out, f_stddev); ++ prt_tab(out); ++ pr_time_units(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted)); ++ ++ printbuf_indent_sub(out, 2); ++ prt_newline(out); ++ ++ printbuf_tabstops_reset(out); + + i = eytzinger0_first(NR_QUANTILES); + u = pick_time_units(stats->quantiles.entries[i].m); + -+ prt_newline(out); + prt_printf(out, "quantiles (%s):\t", u->name); + eytzinger0_for_each(i, NR_QUANTILES) { + bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; @@ -80272,6 +81920,10 @@ index 000000000000..81befc433aeb +void bch2_time_stats_init(struct time_stats *stats) +{ + memset(stats, 0, sizeof(*stats)); ++ stats->duration_stats_weighted.w = 8; ++ stats->freq_stats_weighted.w = 8; ++ 
stats->min_duration = U64_MAX; ++ stats->min_freq = U64_MAX; + spin_lock_init(&stats->lock); +} + @@ -80498,8 +82150,6 @@ index 000000000000..81befc433aeb + } +} + -+#include "eytzinger.h" -+ +static int alignment_ok(const void *base, size_t align) +{ + return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || @@ -80818,10 +82468,10 @@ index 000000000000..81befc433aeb +} diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h new file mode 100644 -index 000000000000..aa8b416a919a +index 000000000000..846e6024a80b --- /dev/null +++ b/fs/bcachefs/util.h -@@ -0,0 +1,787 @@ +@@ -0,0 +1,793 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_UTIL_H +#define _BCACHEFS_UTIL_H @@ -80842,6 +82492,7 @@ index 000000000000..aa8b416a919a +#include +#include +#include ++#include + +struct closure; + @@ -81180,6 +82831,7 @@ index 000000000000..aa8b416a919a +void bch2_prt_u64_binary(struct printbuf *, u64, unsigned); + +void bch2_print_string_as_lines(const char *prefix, const char *lines); ++int bch2_prt_backtrace(struct printbuf *, struct task_struct *); + +#define NR_QUANTILES 15 +#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) @@ -81203,14 +82855,18 @@ index 000000000000..aa8b416a919a + +struct time_stats { + spinlock_t lock; -+ u64 count; + /* all fields are in nanoseconds */ -+ u64 average_duration; -+ u64 average_frequency; + u64 max_duration; ++ u64 min_duration; ++ u64 max_freq; ++ u64 min_freq; + u64 last_event; + struct quantiles quantiles; + ++ struct mean_and_variance duration_stats; ++ struct mean_and_variance_weighted duration_stats_weighted; ++ struct mean_and_variance freq_stats; ++ struct mean_and_variance_weighted freq_stats_weighted; + struct time_stat_buffer __percpu *buffer; +}; + @@ -81824,10 +83480,10 @@ index 000000000000..53a694d71967 +#endif /* _VSTRUCTS_H */ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c new file mode 100644 -index 000000000000..6a5be6c9e1ca +index 000000000000..4fc1c3afab69 --- /dev/null +++ 
b/fs/bcachefs/xattr.c -@@ -0,0 +1,650 @@ +@@ -0,0 +1,654 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -82201,8 +83857,10 @@ index 000000000000..6a5be6c9e1ca +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ int ret; + -+ return bch2_xattr_get(c, inode, name, buffer, size, handler->flags); ++ ret = bch2_xattr_get(c, inode, name, buffer, size, handler->flags); ++ return bch2_err_class(ret); +} + +static int bch2_xattr_set_handler(const struct xattr_handler *handler, @@ -82214,11 +83872,13 @@ index 000000000000..6a5be6c9e1ca + struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); ++ int ret; + -+ return bch2_trans_do(c, NULL, NULL, 0, ++ ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_xattr_set(&trans, inode_inum(inode), &hash, + name, value, size, + handler->flags, flags)); ++ return bch2_err_class(ret); +} + +static const struct xattr_handler bch_xattr_user_handler = { @@ -82480,7 +84140,7 @@ index 000000000000..6a5be6c9e1ca +} diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h new file mode 100644 -index 000000000000..66d7a1e30350 +index 000000000000..03f1b73fc926 --- /dev/null +++ b/fs/bcachefs/xattr.h @@ -0,0 +1,50 @@ @@ -82495,10 +84155,10 @@ index 000000000000..66d7a1e30350 +int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + -+#define bch2_bkey_ops_xattr (struct bkey_ops) { \ ++#define bch2_bkey_ops_xattr ((struct bkey_ops) { \ + .key_invalid = bch2_xattr_invalid, \ + .val_to_text = bch2_xattr_to_text, \ -+} ++}) + +static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len) +{ @@ -82617,7 +84277,7 @@ index bb0c4d0038db..d77832eb0785 100644 } EXPORT_SYMBOL(d_tmpfile); diff --git a/fs/inode.c b/fs/inode.c -index ba1de23c13c1..cb7969ab3633 
100644 +index b608528efd3a..56756dc56346 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -56,8 +56,23 @@ @@ -82646,7 +84306,7 @@ index ba1de23c13c1..cb7969ab3633 100644 /* * Empty aops. Can be used for the cases where the user does not -@@ -417,7 +432,7 @@ EXPORT_SYMBOL(address_space_init_once); +@@ -416,7 +431,7 @@ EXPORT_SYMBOL(address_space_init_once); void inode_init_once(struct inode *inode) { memset(inode, 0, sizeof(*inode)); @@ -82655,7 +84315,7 @@ index ba1de23c13c1..cb7969ab3633 100644 INIT_LIST_HEAD(&inode->i_devices); INIT_LIST_HEAD(&inode->i_io_list); INIT_LIST_HEAD(&inode->i_wb_list); -@@ -506,14 +521,15 @@ static inline void inode_sb_list_del(struct inode *inode) +@@ -505,14 +520,15 @@ static inline void inode_sb_list_del(struct inode *inode) } } @@ -82678,7 +84338,7 @@ index ba1de23c13c1..cb7969ab3633 100644 } /** -@@ -526,13 +542,13 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval) +@@ -525,13 +541,13 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval) */ void __insert_inode_hash(struct inode *inode, unsigned long hashval) { @@ -82696,7 +84356,7 @@ index ba1de23c13c1..cb7969ab3633 100644 } EXPORT_SYMBOL(__insert_inode_hash); -@@ -544,11 +560,44 @@ EXPORT_SYMBOL(__insert_inode_hash); +@@ -543,11 +559,44 @@ EXPORT_SYMBOL(__insert_inode_hash); */ void __remove_inode_hash(struct inode *inode) { @@ -82746,7 +84406,7 @@ index ba1de23c13c1..cb7969ab3633 100644 } EXPORT_SYMBOL(__remove_inode_hash); -@@ -898,26 +947,28 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) +@@ -897,26 +946,28 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) return freed; } @@ -82779,7 +84439,7 @@ index ba1de23c13c1..cb7969ab3633 100644 goto repeat; } if (unlikely(inode->i_state & I_CREATING)) { -@@ -936,19 +987,20 @@ static struct inode *find_inode(struct super_block *sb, +@@ -935,19 +986,20 @@ static struct inode *find_inode(struct super_block *sb, * iget_locked for details. 
*/ static struct inode *find_inode_fast(struct super_block *sb, @@ -82803,7 +84463,7 @@ index ba1de23c13c1..cb7969ab3633 100644 goto repeat; } if (unlikely(inode->i_state & I_CREATING)) { -@@ -1156,25 +1208,25 @@ EXPORT_SYMBOL(unlock_two_nondirectories); +@@ -1155,25 +1207,25 @@ EXPORT_SYMBOL(unlock_two_nondirectories); * return it locked, hashed, and with the I_NEW flag set. The file system gets * to fill it in before unlocking it via unlock_new_inode(). * @@ -82835,7 +84495,7 @@ index ba1de23c13c1..cb7969ab3633 100644 if (IS_ERR(old)) return NULL; wait_on_inode(old); -@@ -1196,7 +1248,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, +@@ -1195,7 +1247,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, */ spin_lock(&inode->i_lock); inode->i_state |= I_NEW; @@ -82844,7 +84504,7 @@ index ba1de23c13c1..cb7969ab3633 100644 spin_unlock(&inode->i_lock); /* -@@ -1206,7 +1258,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, +@@ -1205,7 +1257,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, if (list_empty(&inode->i_sb_list)) inode_sb_list_add(inode); unlock: @@ -82853,7 +84513,7 @@ index ba1de23c13c1..cb7969ab3633 100644 return inode; } -@@ -1267,12 +1319,12 @@ EXPORT_SYMBOL(iget5_locked); +@@ -1266,12 +1318,12 @@ EXPORT_SYMBOL(iget5_locked); */ struct inode *iget_locked(struct super_block *sb, unsigned long ino) { @@ -82870,7 +84530,7 @@ index ba1de23c13c1..cb7969ab3633 100644 if (inode) { if (IS_ERR(inode)) return NULL; -@@ -1288,17 +1340,17 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) +@@ -1287,17 +1339,17 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) if (inode) { struct inode *old; @@ -82892,7 +84552,7 @@ index ba1de23c13c1..cb7969ab3633 100644 /* Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents -@@ -1311,7 +1363,7 @@ struct inode *iget_locked(struct super_block 
*sb, unsigned long ino) +@@ -1310,7 +1362,7 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) * us. Use the old inode instead of the one we just * allocated. */ @@ -82901,7 +84561,7 @@ index ba1de23c13c1..cb7969ab3633 100644 destroy_inode(inode); if (IS_ERR(old)) return NULL; -@@ -1335,10 +1387,11 @@ EXPORT_SYMBOL(iget_locked); +@@ -1334,10 +1386,11 @@ EXPORT_SYMBOL(iget_locked); */ static int test_inode_iunique(struct super_block *sb, unsigned long ino) { @@ -82915,7 +84575,7 @@ index ba1de23c13c1..cb7969ab3633 100644 if (inode->i_ino == ino && inode->i_sb == sb) return 0; } -@@ -1422,12 +1475,12 @@ EXPORT_SYMBOL(igrab); +@@ -1421,12 +1474,12 @@ EXPORT_SYMBOL(igrab); struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { @@ -82932,7 +84592,7 @@ index ba1de23c13c1..cb7969ab3633 100644 return IS_ERR(inode) ? NULL : inode; } -@@ -1477,12 +1530,12 @@ EXPORT_SYMBOL(ilookup5); +@@ -1476,12 +1529,12 @@ EXPORT_SYMBOL(ilookup5); */ struct inode *ilookup(struct super_block *sb, unsigned long ino) { @@ -82949,7 +84609,7 @@ index ba1de23c13c1..cb7969ab3633 100644 if (inode) { if (IS_ERR(inode)) -@@ -1526,12 +1579,13 @@ struct inode *find_inode_nowait(struct super_block *sb, +@@ -1525,12 +1578,13 @@ struct inode *find_inode_nowait(struct super_block *sb, void *), void *data) { @@ -82966,7 +84626,7 @@ index ba1de23c13c1..cb7969ab3633 100644 if (inode->i_sb != sb) continue; mval = match(inode, hashval, data); -@@ -1542,7 +1596,7 @@ struct inode *find_inode_nowait(struct super_block *sb, +@@ -1541,7 +1595,7 @@ struct inode *find_inode_nowait(struct super_block *sb, goto out; } out: @@ -82975,7 +84635,7 @@ index ba1de23c13c1..cb7969ab3633 100644 return ret_inode; } EXPORT_SYMBOL(find_inode_nowait); -@@ -1571,13 +1625,14 @@ EXPORT_SYMBOL(find_inode_nowait); +@@ -1570,13 +1624,14 @@ EXPORT_SYMBOL(find_inode_nowait); struct inode *find_inode_rcu(struct super_block *sb, unsigned long 
hashval, int (*test)(struct inode *, void *), void *data) { @@ -82992,7 +84652,7 @@ index ba1de23c13c1..cb7969ab3633 100644 if (inode->i_sb == sb && !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) && test(inode, data)) -@@ -1609,13 +1664,14 @@ EXPORT_SYMBOL(find_inode_rcu); +@@ -1608,13 +1663,14 @@ EXPORT_SYMBOL(find_inode_rcu); struct inode *find_inode_by_ino_rcu(struct super_block *sb, unsigned long ino) { @@ -83009,7 +84669,7 @@ index ba1de23c13c1..cb7969ab3633 100644 if (inode->i_ino == ino && inode->i_sb == sb && !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE))) -@@ -1629,39 +1685,42 @@ int insert_inode_locked(struct inode *inode) +@@ -1628,39 +1684,42 @@ int insert_inode_locked(struct inode *inode) { struct super_block *sb = inode->i_sb; ino_t ino = inode->i_ino; @@ -83065,7 +84725,7 @@ index ba1de23c13c1..cb7969ab3633 100644 wait_on_inode(old); if (unlikely(!inode_unhashed(old))) { iput(old); -@@ -2217,17 +2276,18 @@ EXPORT_SYMBOL(inode_needs_sync); +@@ -2216,17 +2275,18 @@ EXPORT_SYMBOL(inode_needs_sync); * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list * will DTRT. 
*/ @@ -83087,7 +84747,7 @@ index ba1de23c13c1..cb7969ab3633 100644 } static __initdata unsigned long ihash_entries; -@@ -2253,7 +2313,7 @@ void __init inode_init_early(void) +@@ -2252,7 +2312,7 @@ void __init inode_init_early(void) inode_hashtable = alloc_large_system_hash("Inode-cache", @@ -83096,7 +84756,7 @@ index ba1de23c13c1..cb7969ab3633 100644 ihash_entries, 14, HASH_EARLY | HASH_ZERO, -@@ -2279,7 +2339,7 @@ void __init inode_init(void) +@@ -2278,7 +2338,7 @@ void __init inode_init(void) inode_hashtable = alloc_large_system_hash("Inode-cache", @@ -83106,7 +84766,7 @@ index ba1de23c13c1..cb7969ab3633 100644 14, HASH_ZERO, diff --git a/include/linux/bio.h b/include/linux/bio.h -index ca22b06700a9..5692e54eb446 100644 +index 2c5806997bbf..85801ddacfb9 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -480,7 +480,12 @@ extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, @@ -83298,7 +84958,7 @@ index fe848901fcc3..5a3cc0e1da9b 100644 * 128 bit child FID (struct lu_fid) * 128 bit parent FID (struct lu_fid) diff --git a/include/linux/fs.h b/include/linux/fs.h -index 9eced4cc286e..612ac13ace17 100644 +index 56a4b4b02477..70d7b30a35c0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -645,7 +645,8 @@ struct inode { @@ -83329,7 +84989,7 @@ index 9eced4cc286e..612ac13ace17 100644 } /* -@@ -3112,7 +3113,7 @@ static inline void insert_inode_hash(struct inode *inode) +@@ -3113,7 +3114,7 @@ static inline void insert_inode_hash(struct inode *inode) extern void __remove_inode_hash(struct inode *); static inline void remove_inode_hash(struct inode *inode) { @@ -83563,6 +85223,182 @@ index d22430840b53..506e769b4a95 100644 #ifdef CONFIG_LOCK_STAT unsigned long contention_point[LOCKSTAT_POINTS]; +diff --git a/include/linux/mean_and_variance.h b/include/linux/mean_and_variance.h +new file mode 100644 +index 000000000000..3d62abe75976 +--- /dev/null ++++ b/include/linux/mean_and_variance.h +@@ -0,0 +1,170 @@ ++/* 
SPDX-License-Identifier: GPL-2.0 */ ++#ifndef MEAN_AND_VARIANCE_H_ ++#define MEAN_AND_VARIANCE_H_ ++ ++#include ++#include ++#include ++#include ++ ++#define SQRT_U64_MAX 4294967295ULL ++ ++ ++#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__) ++ ++typedef unsigned __int128 u128; ++ ++static inline u128 u64_to_u128(u64 a) ++{ ++ return (u128)a; ++} ++ ++static inline u64 u128_to_u64(u128 a) ++{ ++ return (u64)a; ++} ++ ++static inline u64 u128_shr64_to_u64(u128 a) ++{ ++ return (u64)(a >> 64); ++} ++ ++static inline u128 u128_add(u128 a, u128 b) ++{ ++ return a + b; ++} ++ ++static inline u128 u128_sub(u128 a, u128 b) ++{ ++ return a - b; ++} ++ ++static inline u128 u128_shl(u128 i, s8 shift) ++{ ++ return i << shift; ++} ++ ++static inline u128 u128_shl64_add(u64 a, u64 b) ++{ ++ return ((u128)a << 64) + b; ++} ++ ++static inline u128 u128_square(u64 i) ++{ ++ return i*i; ++} ++ ++#else ++ ++typedef struct { ++ u64 hi, lo; ++} u128; ++ ++static inline u128 u64_to_u128(u64 a) ++{ ++ return (u128){ .lo = a }; ++} ++ ++static inline u64 u128_to_u64(u128 a) ++{ ++ return a.lo; ++} ++ ++static inline u64 u128_shr64_to_u64(u128 a) ++{ ++ return a.hi; ++} ++ ++static inline u128 u128_add(u128 a, u128 b) ++{ ++ u128 c; ++ ++ c.lo = a.lo + b.lo; ++ c.hi = a.hi + b.hi + (c.lo < a.lo); ++ return c; ++} ++ ++static inline u128 u128_sub(u128 a, u128 b) ++{ ++ u128 c; ++ ++ c.lo = a.lo - b.lo; ++ c.hi = a.hi - b.hi - (c.lo > a.lo); ++ return c; ++} ++ ++static inline u128 u128_shl(u128 i, s8 shift) ++{ ++ u128 r; ++ ++ r.lo = i.lo << shift; ++ if (shift < 64) ++ r.hi = (i.hi << shift) | (i.lo >> (64 - shift)); ++ else { ++ r.hi = i.lo << (shift - 64); ++ r.lo = 0; ++ } ++ return r; ++} ++ ++static inline u128 u128_shl64_add(u64 a, u64 b) ++{ ++ return u128_add(u128_shl(u64_to_u128(a), 64), u64_to_u128(b)); ++} ++ ++static inline u128 u128_square(u64 i) ++{ ++ u128 r; ++ u64 h = i >> 32, l = i & (u64)U32_MAX; ++ ++ r = u128_shl(u64_to_u128(h*h), 64); ++ r = 
u128_add(r, u128_shl(u64_to_u128(h*l), 32)); ++ r = u128_add(r, u128_shl(u64_to_u128(l*h), 32)); ++ r = u128_add(r, u64_to_u128(l*l)); ++ return r; ++} ++ ++#endif ++ ++static inline u128 u128_div(u128 n, u64 d) ++{ ++ u128 r; ++ u64 rem; ++ u64 hi = u128_shr64_to_u64(n); ++ u64 lo = u128_to_u64(n); ++ u64 h = hi & ((u64)U32_MAX << 32); ++ u64 l = (hi & (u64)U32_MAX) << 32; ++ ++ r = u128_shl(u64_to_u128(div64_u64_rem(h, d, &rem)), 64); ++ r = u128_add(r, u128_shl(u64_to_u128(div64_u64_rem(l + (rem << 32), d, &rem)), 32)); ++ r = u128_add(r, u64_to_u128(div64_u64_rem(lo + (rem << 32), d, &rem))); ++ return r; ++} ++ ++struct mean_and_variance { ++ s64 n; ++ s64 sum; ++ u128 sum_squares; ++}; ++ ++/* expontentially weighted variant */ ++struct mean_and_variance_weighted { ++ bool init; ++ u8 w; ++ s64 mean; ++ u64 variance; ++}; ++ ++inline s64 fast_divpow2(s64 n, u8 d); ++ ++struct mean_and_variance mean_and_variance_update(struct mean_and_variance s1, s64 v1); ++ s64 mean_and_variance_get_mean(struct mean_and_variance s); ++ u64 mean_and_variance_get_variance(struct mean_and_variance s1); ++ u32 mean_and_variance_get_stddev(struct mean_and_variance s); ++ ++struct mean_and_variance_weighted mean_and_variance_weighted_update(struct mean_and_variance_weighted s1, s64 v1); ++ s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s); ++ u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s); ++ u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s); ++ ++#endif // MEAN_AND_VAIRANCE_H_ diff --git a/include/linux/pretty-printers.h b/include/linux/pretty-printers.h new file mode 100644 index 000000000000..f39d8edfba02 @@ -83892,7 +85728,7 @@ index 000000000000..24e62e56d18c + +#endif /* _LINUX_PRINTBUF_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h -index e7b2f8a5c711..4def6fae37b8 100644 +index 8d82d6d32670..24ffd47480ac 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h 
@@ -859,6 +859,7 @@ struct task_struct { @@ -84406,10 +86242,10 @@ index 4d72258d42fd..52e0f1d283b9 100644 unsigned int flags, const char *only) { diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h -index 8401dec93c15..fa734bc7af1d 100644 +index 20749bd9db71..93409ba238f8 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h -@@ -496,7 +496,7 @@ struct dynevent_cmd; +@@ -497,7 +497,7 @@ struct dynevent_cmd; typedef int (*dynevent_create_fn_t)(struct dynevent_cmd *cmd); struct dynevent_cmd { @@ -84504,10 +86340,10 @@ index 096d48aa3437..8d11e2e4ddc8 100644 extern void *__vmalloc(unsigned long size, gfp_t gfp_mask) __alloc_size(1); diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h new file mode 100644 -index 000000000000..d3d9e965e702 +index 000000000000..6bc361ae7d7a --- /dev/null +++ b/include/trace/events/bcachefs.h -@@ -0,0 +1,1101 @@ +@@ -0,0 +1,1105 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bcachefs @@ -84854,25 +86690,29 @@ index 000000000000..d3d9e965e702 +TRACE_EVENT(btree_reserve_get_fail, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, -+ size_t required), -+ TP_ARGS(trans_fn, caller_ip, required), ++ size_t required, ++ int ret), ++ TP_ARGS(trans_fn, caller_ip, required, ret), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(size_t, required ) ++ __array(char, ret, 32 ) + ), + + TP_fast_assign( -+ strlcpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); ++ strscpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->required = required; ++ strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret)); + ), + -+ TP_printk("%s %pS required %zu", ++ TP_printk("%s %pS required %zu ret %s", + __entry->trans_fn, + (void *) __entry->caller_ip, -+ __entry->required) ++ __entry->required, ++ __entry->ret) +); + 
+DEFINE_EVENT(btree_node, btree_node_compact, @@ -84921,7 +86761,7 @@ index 000000000000..d3d9e965e702 + TP_fast_assign( + struct btree *b = btree_path_node(path, level); + -+ strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); ++ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = path->btree_id; + __entry->level = path->level; @@ -84972,7 +86812,7 @@ index 000000000000..d3d9e965e702 + TP_fast_assign( + struct six_lock_count c; + -+ strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); ++ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = path->btree_id; + __entry->level = level; @@ -85034,7 +86874,7 @@ index 000000000000..d3d9e965e702 + + TP_fast_assign( + __entry->dev = ca->dev; -+ strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); ++ strscpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); + __entry->user = user; + __entry->bucket = bucket; + ), @@ -85078,7 +86918,7 @@ index 000000000000..d3d9e965e702 + + TP_fast_assign( + __entry->dev = ca->dev; -+ strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); ++ strscpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); + __entry->free = free; + __entry->avail = avail; + __entry->copygc_wait_amount = copygc_wait_amount; @@ -85088,7 +86928,7 @@ index 000000000000..d3d9e965e702 + __entry->need_journal_commit = need_journal_commit; + __entry->nouse = nouse; + __entry->nonblocking = nonblocking; -+ strlcpy(__entry->err, err, sizeof(__entry->err)); ++ strscpy(__entry->err, err, sizeof(__entry->err)); + ), + + TP_printk("%d,%d reserve %s free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nonblocking %u err %s", @@ -85126,7 +86966,7 @@ index 000000000000..d3d9e965e702 + __entry->open = open; + __entry->need_journal_commit = need_journal_commit; + 
__entry->discarded = discarded; -+ strlcpy(__entry->err, err, sizeof(__entry->err)); ++ strscpy(__entry->err, err, sizeof(__entry->err)); + ), + + TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu err %s", @@ -85276,7 +87116,7 @@ index 000000000000..d3d9e965e702 + ), + + TP_fast_assign( -+ strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); ++ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + ), + @@ -85321,7 +87161,7 @@ index 000000000000..d3d9e965e702 + ), + + TP_fast_assign( -+ strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); ++ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->flags = flags; + ), @@ -85381,7 +87221,7 @@ index 000000000000..d3d9e965e702 + ), + + TP_fast_assign( -+ strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); ++ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = path->btree_id; + TRACE_BPOS_assign(pos, path->pos) @@ -85428,7 +87268,7 @@ index 000000000000..d3d9e965e702 + ), + + TP_fast_assign( -+ strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); ++ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = path->btree_id; + __entry->old_locks_want = old_locks_want; @@ -85537,7 +87377,7 @@ index 000000000000..d3d9e965e702 + ), + + TP_fast_assign( -+ strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); ++ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + ), + + TP_printk("%s", __entry->trans_fn) @@ -85556,7 +87396,7 @@ index 000000000000..d3d9e965e702 + ), + + TP_fast_assign( -+ strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); ++ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->bytes = bytes; + ), @@ 
-85585,7 +87425,7 @@ index 000000000000..d3d9e965e702 + ), + + TP_fast_assign( -+ strlcpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); ++ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + + __entry->btree_id = path->btree_id; @@ -85717,10 +87557,10 @@ index 64a13eb56078..0e83dfd9c20b 100644 +#endif diff --git a/kernel/locking/six.c b/kernel/locking/six.c new file mode 100644 -index 000000000000..b11660af245b +index 000000000000..39a9bd6ecd78 --- /dev/null +++ b/kernel/locking/six.c -@@ -0,0 +1,748 @@ +@@ -0,0 +1,757 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include @@ -85871,6 +87711,14 @@ index 000000000000..b11660af245b + atomic64_add(__SIX_VAL(write_locking, 1), + &lock->state.counter); + smp_mb__after_atomic(); ++ } else if (!(lock->state.waiters & (1 << SIX_LOCK_write))) { ++ atomic64_add(__SIX_VAL(waiters, 1 << SIX_LOCK_write), ++ &lock->state.counter); ++ /* ++ * pairs with barrier after unlock and before checking ++ * for readers in unlock path ++ */ ++ smp_mb__after_atomic(); + } + + ret = !pcpu_read_count(lock); @@ -85885,9 +87733,6 @@ index 000000000000..b11660af245b + if (ret || try) + v -= __SIX_VAL(write_locking, 1); + -+ if (!ret && !try && !(lock->state.waiters & (1 << SIX_LOCK_write))) -+ v += __SIX_VAL(waiters, 1 << SIX_LOCK_write); -+ + if (try && !ret) { + old.v = atomic64_add_return(v, &lock->state.counter); + if (old.waiters & (1 << SIX_LOCK_read)) @@ -86060,7 +87905,11 @@ index 000000000000..b11660af245b + return true; +} + -+#ifdef CONFIG_LOCK_SPIN_ON_OWNER ++/* ++ * We don't see stable performance with SIX_LOCK_SPIN_ON_OWNER enabled, so it's ++ * off for now: ++ */ ++#ifdef SIX_LOCK_SPIN_ON_OWNER + +static inline bool six_optimistic_spin(struct six_lock *lock, + struct six_lock_waiter *wait) @@ -86505,10 +88354,10 @@ index 9ed5ce989415..4f65824879ab 100644 /** * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array diff --git a/kernel/trace/trace.c 
b/kernel/trace/trace.c -index d3005279165d..b27fe1e45138 100644 +index cc65887b31bd..e62ecff562a4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c -@@ -1673,15 +1673,15 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) +@@ -1679,15 +1679,15 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) { int len; @@ -86528,7 +88377,7 @@ index d3005279165d..b27fe1e45138 100644 return cnt; } -@@ -3728,11 +3728,7 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str, +@@ -3743,11 +3743,7 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str, static const char *show_buffer(struct trace_seq *s) { @@ -86541,7 +88390,7 @@ index d3005279165d..b27fe1e45138 100644 } static DEFINE_STATIC_KEY_FALSE(trace_no_verify); -@@ -6759,12 +6755,12 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, +@@ -6782,12 +6778,12 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, trace_access_lock(iter->cpu_file); while (trace_find_next_entry_inc(iter) != NULL) { enum print_line_t ret; @@ -86556,7 +88405,7 @@ index d3005279165d..b27fe1e45138 100644 break; } if (ret != TRACE_TYPE_NO_CONSUME) -@@ -6786,7 +6782,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, +@@ -6809,7 +6805,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, /* Now copy what we have to the user */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); @@ -86565,7 +88414,7 @@ index d3005279165d..b27fe1e45138 100644 trace_seq_init(&iter->seq); /* -@@ -6812,16 +6808,15 @@ static size_t +@@ -6835,16 +6831,15 @@ static size_t tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) { size_t count; @@ -86584,7 +88433,7 @@ index d3005279165d..b27fe1e45138 100644 break; } -@@ -6831,14 +6826,14 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) +@@ -6854,14 +6849,14 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) * anyway to be safe. 
*/ if (ret == TRACE_TYPE_PARTIAL_LINE) { @@ -86602,7 +88451,7 @@ index d3005279165d..b27fe1e45138 100644 break; } -@@ -9826,20 +9821,8 @@ static struct notifier_block trace_die_notifier = { +@@ -9894,20 +9889,8 @@ static struct notifier_block trace_die_notifier = { void trace_printk_seq(struct trace_seq *s) { @@ -86728,7 +88577,7 @@ index 4b1057ab9d96..9d5137df1a15 100644 kfree(filter->filter_string); filter->filter_string = buf; diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c -index 5e8c07aef071..914b4e5e32a5 100644 +index e310052dc83c..214b33bd7be0 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -5,13 +5,14 @@ @@ -86749,7 +88598,7 @@ index 5e8c07aef071..914b4e5e32a5 100644 #include /* for gfp flag names */ -@@ -611,7 +612,7 @@ static struct synth_field *parse_synth_field(int argc, char **argv, +@@ -622,7 +623,7 @@ static struct synth_field *parse_synth_field(int argc, char **argv, const char *prefix = NULL, *field_type = argv[0], *field_name, *array; struct synth_field *field; int len, ret = -ENOMEM; @@ -86758,7 +88607,7 @@ index 5e8c07aef071..914b4e5e32a5 100644 ssize_t size; if (!strcmp(field_type, "unsigned")) { -@@ -654,28 +655,16 @@ static struct synth_field *parse_synth_field(int argc, char **argv, +@@ -665,28 +666,16 @@ static struct synth_field *parse_synth_field(int argc, char **argv, goto free; } @@ -86793,7 +88642,7 @@ index 5e8c07aef071..914b4e5e32a5 100644 size = synth_field_size(field->type); if (size < 0) { -@@ -687,23 +676,15 @@ static struct synth_field *parse_synth_field(int argc, char **argv, +@@ -698,23 +687,15 @@ static struct synth_field *parse_synth_field(int argc, char **argv, goto free; } else if (size == 0) { if (synth_field_is_string(field->type)) { @@ -86822,7 +88671,7 @@ index 5e8c07aef071..914b4e5e32a5 100644 field->is_dynamic = true; size = sizeof(u64); -@@ -1514,7 +1495,7 @@ static int synth_event_run_command(struct dynevent_cmd *cmd) +@@ -1525,7 +1506,7 
@@ static int synth_event_run_command(struct dynevent_cmd *cmd) struct synth_event *se; int ret; @@ -86849,10 +88698,10 @@ index 203204cadf92..9f270fdde99b 100644 trace_seq_puts(s, " */\n"); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c -index 23f7f0ec4f4c..2917d74bc41e 100644 +index 5a75b039e586..52174bffce95 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c -@@ -919,7 +919,7 @@ static int create_or_delete_trace_kprobe(const char *raw_command) +@@ -920,7 +920,7 @@ static int create_or_delete_trace_kprobe(const char *raw_command) static int trace_kprobe_run_command(struct dynevent_cmd *cmd) { @@ -87155,10 +89004,10 @@ index dc1ab2ed1dc6..e2938759019f 100644 bool depends on !NO_IOMEM diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug -index d3e5f36bb01e..0e9a76ada0f7 100644 +index cb131fad117c..91a71fe5706e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug -@@ -1656,6 +1656,15 @@ config DEBUG_CREDENTIALS +@@ -1662,6 +1662,15 @@ config DEBUG_CREDENTIALS source "kernel/rcu/Kconfig.debug" @@ -87174,6 +89023,22 @@ index d3e5f36bb01e..0e9a76ada0f7 100644 config DEBUG_WQ_FORCE_RR_CPU bool "Force round-robin CPU selection for unbound work items" depends on DEBUG_KERNEL +@@ -2049,6 +2058,15 @@ config CPUMASK_KUNIT_TEST + + If unsure, say N. + ++config MEAN_AND_VARIANCE_UNIT_TEST ++ tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS ++ depends on KUNIT ++ select MEAN_AND_VARIANCE ++ default KUNIT_ALL_TESTS ++ help ++ This option enables the kunit tests for mean_and_variance module. ++ If unsure, say N. 
++ + config TEST_LIST_SORT + tristate "Linked list sorting test" if !KUNIT_ALL_TESTS + depends on KUNIT diff --git a/lib/Makefile b/lib/Makefile index ffabc30a27d4..9d9d51a116d3 100644 --- a/lib/Makefile @@ -87741,6 +89606,372 @@ index 06833d404398..9556f15ad295 100644 const u8 *ptr = buf; int i, linelen, remaining = len; unsigned char linebuf[32 * 3 + 2 + 32 + 1]; +diff --git a/lib/math/Kconfig b/lib/math/Kconfig +index 0634b428d0cb..7530ae9a3584 100644 +--- a/lib/math/Kconfig ++++ b/lib/math/Kconfig +@@ -15,3 +15,6 @@ config PRIME_NUMBERS + + config RATIONAL + tristate ++ ++config MEAN_AND_VARIANCE ++ tristate +diff --git a/lib/math/Makefile b/lib/math/Makefile +index bfac26ddfc22..2ef1487e01c2 100644 +--- a/lib/math/Makefile ++++ b/lib/math/Makefile +@@ -4,6 +4,8 @@ obj-y += div64.o gcd.o lcm.o int_pow.o int_sqrt.o reciprocal_div.o + obj-$(CONFIG_CORDIC) += cordic.o + obj-$(CONFIG_PRIME_NUMBERS) += prime_numbers.o + obj-$(CONFIG_RATIONAL) += rational.o ++obj-$(CONFIG_MEAN_AND_VARIANCE) += mean_and_variance.o + + obj-$(CONFIG_TEST_DIV64) += test_div64.o + obj-$(CONFIG_RATIONAL_KUNIT_TEST) += rational-test.o ++obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o +diff --git a/lib/math/mean_and_variance.c b/lib/math/mean_and_variance.c +new file mode 100644 +index 000000000000..643e3113500b +--- /dev/null ++++ b/lib/math/mean_and_variance.c +@@ -0,0 +1,178 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Functions for incremental mean and variance. ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 as published by ++ * the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ * more details. ++ * ++ * Copyright © 2022 Daniel B. 
Hill ++ * ++ * Author: Daniel B. Hill ++ * ++ * Description: ++ * ++ * This is includes some incremental algorithms for mean and variance calculation ++ * ++ * Derived from the paper: https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf ++ * ++ * Create a struct and if it's the weighted variant set the w field (weight = 2^k). ++ * ++ * Use mean_and_variance[_weighted]_update() on the struct to update it's state. ++ * ++ * Use the mean_and_variance[_weighted]_get_* functions to calculate the mean and variance, some computation ++ * is deferred to these functions for performance reasons. ++ * ++ * see lib/math/mean_and_variance_test.c for examples of usage. ++ * ++ * DO NOT access the mean and variance fields of the weighted variants directly. ++ * DO NOT change the weight after calling update. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++/** ++ * fast_divpow2() - fast approximation for n / (1 << d) ++ * @n: numerator ++ * @d: the power of 2 denominator. ++ * ++ * note: this rounds towards 0. ++ */ ++inline s64 fast_divpow2(s64 n, u8 d) ++{ ++ return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d; ++} ++ ++/** ++ * mean_and_variance_update() - update a mean_and_variance struct @s1 with a new sample @v1 ++ * and return it. ++ * @s1: the mean_and_variance to update. ++ * @v1: the new sample. ++ * ++ * see linked pdf equation 12. 
++ */ ++struct mean_and_variance mean_and_variance_update(struct mean_and_variance s1, s64 v1) ++{ ++ struct mean_and_variance s2; ++ u64 v2 = abs(v1); ++ ++ s2.n = s1.n + 1; ++ s2.sum = s1.sum + v1; ++ s2.sum_squares = u128_add(s1.sum_squares, u128_square(v2)); ++ return s2; ++} ++EXPORT_SYMBOL_GPL(mean_and_variance_update); ++ ++/** ++ * mean_and_variance_get_mean() - get mean from @s ++ */ ++s64 mean_and_variance_get_mean(struct mean_and_variance s) ++{ ++ return div64_u64(s.sum, s.n); ++} ++EXPORT_SYMBOL_GPL(mean_and_variance_get_mean); ++ ++/** ++ * mean_and_variance_get_variance() - get variance from @s1 ++ * ++ * see linked pdf equation 12. ++ */ ++u64 mean_and_variance_get_variance(struct mean_and_variance s1) ++{ ++ u128 s2 = u128_div(s1.sum_squares, s1.n); ++ u64 s3 = abs(mean_and_variance_get_mean(s1)); ++ ++ return u128_to_u64(u128_sub(s2, u128_square(s3))); ++} ++EXPORT_SYMBOL_GPL(mean_and_variance_get_variance); ++ ++/** ++ * mean_and_variance_get_stddev() - get standard deviation from @s ++ */ ++u32 mean_and_variance_get_stddev(struct mean_and_variance s) ++{ ++ return int_sqrt64(mean_and_variance_get_variance(s)); ++} ++EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev); ++ ++/** ++ * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update() ++ * @s1: .. ++ * @s2: .. ++ * ++ * see linked pdf: function derived from equations 140-143 where alpha = 2^w. ++ * values are stored bitshifted for performance and added precision. ++ */ ++struct mean_and_variance_weighted mean_and_variance_weighted_update(struct mean_and_variance_weighted s1, ++ s64 x) ++{ ++ struct mean_and_variance_weighted s2; ++ // previous weighted variance. ++ u64 var_w0 = s1.variance; ++ u8 w = s2.w = s1.w; ++ // new value weighted. ++ s64 x_w = x << w; ++ s64 diff_w = x_w - s1.mean; ++ s64 diff = fast_divpow2(diff_w, w); ++ // new mean weighted. 
++ s64 u_w1 = s1.mean + diff; ++ ++ BUG_ON(w % 2 != 0); ++ ++ if (!s1.init) { ++ s2.mean = x_w; ++ s2.variance = 0; ++ } else { ++ s2.mean = u_w1; ++ s2.variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w; ++ } ++ s2.init = true; ++ ++ return s2; ++} ++EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update); ++ ++/** ++ * mean_and_variance_weighted_get_mean() - get mean from @s ++ */ ++s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s) ++{ ++ return fast_divpow2(s.mean, s.w); ++} ++EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean); ++ ++/** ++ * mean_and_variance_weighted_get_variance() -- get variance from @s ++ */ ++u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s) ++{ ++ // always positive don't need fast divpow2 ++ return s.variance >> s.w; ++} ++EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance); ++ ++/** ++ * mean_and_variance_weighted_get_stddev() - get standard deviation from @s ++ */ ++u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s) ++{ ++ return int_sqrt64(mean_and_variance_weighted_get_variance(s)); ++} ++EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev); ++ ++MODULE_AUTHOR("Daniel B. 
Hill"); ++MODULE_LICENSE("GPL"); +diff --git a/lib/math/mean_and_variance_test.c b/lib/math/mean_and_variance_test.c +new file mode 100644 +index 000000000000..4180e6baac96 +--- /dev/null ++++ b/lib/math/mean_and_variance_test.c +@@ -0,0 +1,152 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include ++#include ++ ++#define MAX_SQR (SQRT_U64_MAX*SQRT_U64_MAX) ++ ++static void mean_and_variance_basic_test(struct kunit *test) ++{ ++ struct mean_and_variance s = {}; ++ ++ s = mean_and_variance_update(s, 2); ++ s = mean_and_variance_update(s, 2); ++ ++ KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 2); ++ KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 0); ++ KUNIT_EXPECT_EQ(test, s.n, 2); ++ ++ s = mean_and_variance_update(s, 4); ++ s = mean_and_variance_update(s, 4); ++ ++ KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 3); ++ KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 1); ++ KUNIT_EXPECT_EQ(test, s.n, 4); ++} ++ ++/* ++ * Test values computed using a spreadsheet from the psuedocode at the bottom: ++ * https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf ++ */ ++ ++static void mean_and_variance_weighted_test(struct kunit *test) ++{ ++ struct mean_and_variance_weighted s = {}; ++ ++ s.w = 2; ++ ++ s = mean_and_variance_weighted_update(s, 10); ++ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 10); ++ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0); ++ ++ s = mean_and_variance_weighted_update(s, 20); ++ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 12); ++ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18); ++ ++ s = mean_and_variance_weighted_update(s, 30); ++ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 16); ++ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72); ++ ++ s = (struct mean_and_variance_weighted){}; ++ s.w = 2; ++ ++ s = mean_and_variance_weighted_update(s, -10); ++ KUNIT_EXPECT_EQ(test, 
mean_and_variance_weighted_get_mean(s), -10); ++ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0); ++ ++ s = mean_and_variance_weighted_update(s, -20); ++ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -12); ++ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18); ++ ++ s = mean_and_variance_weighted_update(s, -30); ++ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -16); ++ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72); ++ ++} ++ ++static void mean_and_variance_weighted_advanced_test(struct kunit *test) ++{ ++ struct mean_and_variance_weighted s = {}; ++ s64 i; ++ ++ s.w = 8; ++ for (i = 10; i <= 100; i += 10) ++ s = mean_and_variance_weighted_update(s, i); ++ ++ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 11); ++ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107); ++ ++ s = (struct mean_and_variance_weighted){}; ++ ++ s.w = 8; ++ for (i = -10; i >= -100; i -= 10) ++ s = mean_and_variance_weighted_update(s, i); ++ ++ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -11); ++ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107); ++ ++} ++ ++static void mean_and_variance_fast_divpow2(struct kunit *test) ++{ ++ s64 i; ++ u8 d; ++ ++ for (i = 0; i < 100; i++) { ++ d = 0; ++ KUNIT_EXPECT_EQ(test, fast_divpow2(i, d), div_u64(i, 1LLU << d)); ++ KUNIT_EXPECT_EQ(test, abs(fast_divpow2(-i, d)), div_u64(i, 1LLU << d)); ++ for (d = 1; d < 32; d++) { ++ KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(i, d)), ++ div_u64(i, 1 << d), "%lld %u", i, d); ++ KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(-i, d)), ++ div_u64(i, 1 << d), "%lld %u", -i, d); ++ } ++ } ++} ++ ++static void mean_and_variance_u128_basic_test(struct kunit *test) ++{ ++ u128 a = u128_shl64_add(0, U64_MAX); ++ u128 a1 = u128_shl64_add(0, 1); ++ u128 b = u128_shl64_add(1, 0); ++ u128 c = u128_shl64_add(0, 1LLU << 63); ++ u128 c2 = u128_shl64_add(U64_MAX, U64_MAX); 
++ ++ KUNIT_EXPECT_EQ(test, u128_shr64_to_u64(u128_add(a, a1)), 1); ++ KUNIT_EXPECT_EQ(test, u128_to_u64(u128_add(a, a1)), 0); ++ KUNIT_EXPECT_EQ(test, u128_shr64_to_u64(u128_add(a1, a)), 1); ++ KUNIT_EXPECT_EQ(test, u128_to_u64(u128_add(a1, a)), 0); ++ ++ KUNIT_EXPECT_EQ(test, u128_to_u64(u128_sub(b, a1)), U64_MAX); ++ KUNIT_EXPECT_EQ(test, u128_shr64_to_u64(u128_sub(b, a1)), 0); ++ ++ KUNIT_EXPECT_EQ(test, u128_shr64_to_u64(u128_shl(c, 1)), 1); ++ KUNIT_EXPECT_EQ(test, u128_to_u64(u128_shl(c, 1)), 0); ++ ++ KUNIT_EXPECT_EQ(test, u128_shr64_to_u64(u128_square(U64_MAX)), U64_MAX - 1); ++ KUNIT_EXPECT_EQ(test, u128_to_u64(u128_square(U64_MAX)), 1); ++ ++ KUNIT_EXPECT_EQ(test, u128_to_u64(u128_div(b, 2)), 1LLU << 63); ++ ++ KUNIT_EXPECT_EQ(test, u128_shr64_to_u64(u128_div(c2, 2)), U64_MAX >> 1); ++ KUNIT_EXPECT_EQ(test, u128_to_u64(u128_div(c2, 2)), U64_MAX); ++ ++ KUNIT_EXPECT_EQ(test, u128_shr64_to_u64(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U32_MAX >> 1); ++ KUNIT_EXPECT_EQ(test, u128_to_u64(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U64_MAX << 31); ++} ++ ++static struct kunit_case mean_and_variance_test_cases[] = { ++ KUNIT_CASE(mean_and_variance_fast_divpow2), ++ KUNIT_CASE(mean_and_variance_u128_basic_test), ++ KUNIT_CASE(mean_and_variance_basic_test), ++ KUNIT_CASE(mean_and_variance_weighted_test), ++ KUNIT_CASE(mean_and_variance_weighted_advanced_test), ++ {} ++}; ++ ++static struct kunit_suite mean_and_variance_test_suite = { ++.name = "mean and variance tests", ++.test_cases = mean_and_variance_test_cases ++}; ++ ++kunit_test_suite(mean_and_variance_test_suite); diff --git a/lib/pretty-printers.c b/lib/pretty-printers.c new file mode 100644 index 000000000000..addbac95e065 @@ -89057,7 +91288,7 @@ index 4bd15a593fbd..7130ed9f56d7 100644 kfree(alloced_buffer); } diff --git a/lib/vsprintf.c b/lib/vsprintf.c -index 3c1853a9d1c0..5e78781bbca8 100644 +index 3c1853a9d1c0..51314ee7801c 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -44,6 
+44,7 @@ @@ -89076,7 +91307,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 #include "kstrtox.h" /* Disable pointer hashing if requested */ -@@ -367,41 +369,51 @@ char *put_dec(char *buf, unsigned long long n) +@@ -367,41 +369,52 @@ char *put_dec(char *buf, unsigned long long n) #endif @@ -89136,6 +91367,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 +{ + prt_u64_minwidth(out, num, 0); +} ++EXPORT_SYMBOL_GPL(prt_u64); + +/* + * Convert passed number to decimal string. @@ -89154,7 +91386,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 } #define SIGN 1 /* unsigned/signed, must be 1 */ -@@ -435,7 +447,8 @@ enum format_type { +@@ -435,7 +448,8 @@ enum format_type { FORMAT_TYPE_UINT, FORMAT_TYPE_INT, FORMAT_TYPE_SIZE_T, @@ -89164,7 +91396,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 }; struct printf_spec { -@@ -451,128 +464,103 @@ static_assert(sizeof(struct printf_spec) == 8); +@@ -451,128 +465,103 @@ static_assert(sizeof(struct printf_spec) == 8); #define PRECISION_MAX ((1 << 15) - 1) static noinline_for_stack @@ -89343,7 +91575,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 { struct printf_spec spec; -@@ -582,25 +570,28 @@ char *special_hex_number(char *buf, char *end, unsigned long long num, int size) +@@ -582,25 +571,28 @@ char *special_hex_number(char *buf, char *end, unsigned long long num, int size) spec.base = 16; spec.precision = -1; @@ -89388,7 +91620,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 } /* -@@ -612,67 +603,68 @@ static void move_right(char *buf, char *end, unsigned len, unsigned spaces) +@@ -612,67 +604,68 @@ static void move_right(char *buf, char *end, unsigned len, unsigned spaces) * Returns: new buffer position after padding. */ static noinline_for_stack @@ -89497,7 +91729,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 { /* * Hard limit to avoid a completely insane messages. 
It actually -@@ -682,7 +674,7 @@ static char *error_string(char *buf, char *end, const char *s, +@@ -682,7 +675,7 @@ static char *error_string(char *buf, char *end, const char *s, if (spec.precision == -1) spec.precision = 2 * sizeof(void *); @@ -89506,7 +91738,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 } /* -@@ -701,14 +693,15 @@ static const char *check_pointer_msg(const void *ptr) +@@ -701,14 +694,15 @@ static const char *check_pointer_msg(const void *ptr) return NULL; } @@ -89524,7 +91756,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 return -EFAULT; } -@@ -716,18 +709,50 @@ static int check_pointer(char **buf, char *end, const void *ptr, +@@ -716,18 +710,50 @@ static int check_pointer(char **buf, char *end, const void *ptr, } static noinline_for_stack @@ -89583,7 +91815,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 { spec.base = 16; spec.flags |= SMALL; -@@ -736,7 +761,7 @@ static char *pointer_string(char *buf, char *end, +@@ -736,7 +762,7 @@ static char *pointer_string(char *buf, char *end, spec.flags |= ZEROPAD; } @@ -89592,7 +91824,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 } /* Make pointers available for printing early in the boot sequence. */ -@@ -801,8 +826,9 @@ int ptr_to_hashval(const void *ptr, unsigned long *hashval_out) +@@ -801,8 +827,9 @@ int ptr_to_hashval(const void *ptr, unsigned long *hashval_out) return __ptr_to_hashval(ptr, hashval_out); } @@ -89604,7 +91836,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 { const char *str = sizeof(ptr) == 8 ? "(____ptrval____)" : "(ptrval)"; unsigned long hashval; -@@ -813,47 +839,49 @@ static char *ptr_to_id(char *buf, char *end, const void *ptr, +@@ -813,47 +840,49 @@ static char *ptr_to_id(char *buf, char *end, const void *ptr, * as they are not actual addresses. 
*/ if (IS_ERR_OR_NULL(ptr)) @@ -89665,7 +91897,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 case 1: { const struct cred *cred; -@@ -864,7 +892,7 @@ char *restricted_pointer(char *buf, char *end, const void *ptr, +@@ -864,7 +893,7 @@ char *restricted_pointer(char *buf, char *end, const void *ptr, if (in_irq() || in_serving_softirq() || in_nmi()) { if (spec.field_width == -1) spec.field_width = 2 * sizeof(ptr); @@ -89674,7 +91906,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 } /* -@@ -890,17 +918,16 @@ char *restricted_pointer(char *buf, char *end, const void *ptr, +@@ -890,17 +919,16 @@ char *restricted_pointer(char *buf, char *end, const void *ptr, break; } @@ -89697,7 +91929,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 switch (fmt[1]) { case '2': case '3': case '4': -@@ -912,9 +939,9 @@ char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_sp +@@ -912,9 +940,9 @@ char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_sp rcu_read_lock(); for (i = 0; i < depth; i++, d = p) { @@ -89709,7 +91941,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 } p = READ_ONCE(d->d_parent); -@@ -926,58 +953,46 @@ char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_sp +@@ -926,58 +954,46 @@ char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_sp break; } } @@ -89787,7 +92019,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 { unsigned long value; #ifdef CONFIG_KALLSYMS -@@ -1000,17 +1015,12 @@ char *symbol_string(char *buf, char *end, void *ptr, +@@ -1000,17 +1016,12 @@ char *symbol_string(char *buf, char *end, void *ptr, else sprint_symbol_no_offset(sym, value); @@ -89807,7 +92039,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 static const struct printf_spec default_flag_spec = { .base = 16, .precision = -1, -@@ -1022,23 +1032,9 @@ static const struct printf_spec default_dec_spec = { +@@ -1022,23 +1033,9 @@ static const struct printf_spec default_dec_spec = { .precision = -1, }; @@ -89833,7 +92065,7 
@@ index 3c1853a9d1c0..5e78781bbca8 100644 { #ifndef IO_RSRC_PRINTK_SIZE #define IO_RSRC_PRINTK_SIZE 6 -@@ -1077,80 +1073,79 @@ char *resource_string(char *buf, char *end, struct resource *res, +@@ -1077,80 +1074,79 @@ char *resource_string(char *buf, char *end, struct resource *res, #define FLAG_BUF_SIZE (2 * sizeof(res->flags)) #define DECODED_BUF_SIZE sizeof("[mem - 64bit pref window disabled]") #define RAW_BUF_SIZE sizeof("[mem - flags 0x]") @@ -89950,7 +92182,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 switch (fmt[1]) { case 'C': -@@ -1167,41 +1162,21 @@ char *hex_string(char *buf, char *end, u8 *addr, struct printf_spec spec, +@@ -1167,41 +1163,21 @@ char *hex_string(char *buf, char *end, u8 *addr, struct printf_spec spec, break; } @@ -89998,7 +92230,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 chunksz = nr_bits & (CHUNKSZ - 1); if (chunksz == 0) -@@ -1217,63 +1192,53 @@ char *bitmap_string(char *buf, char *end, unsigned long *bitmap, +@@ -1217,63 +1193,53 @@ char *bitmap_string(char *buf, char *end, unsigned long *bitmap, bit = i % BITS_PER_LONG; val = (bitmap[word] >> bit) & chunkmask; @@ -90080,7 +92312,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 switch (fmt[1]) { case 'F': -@@ -1291,25 +1256,23 @@ char *mac_address_string(char *buf, char *end, u8 *addr, +@@ -1291,25 +1257,23 @@ char *mac_address_string(char *buf, char *end, u8 *addr, for (i = 0; i < 6; i++) { if (reversed) @@ -90115,7 +92347,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 switch (fmt[2]) { case 'h': -@@ -1333,28 +1296,15 @@ char *ip4_string(char *p, const u8 *addr, const char *fmt) +@@ -1333,28 +1297,15 @@ char *ip4_string(char *p, const u8 *addr, const char *fmt) break; } for (i = 0; i < 4; i++) { @@ -90148,7 +92380,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 { int i, j, range; unsigned char zerolength[8]; -@@ -1398,14 +1348,14 @@ char *ip6_compressed_string(char *p, const char *addr) +@@ -1398,14 +1349,14 @@ char *ip6_compressed_string(char *p, const char *addr) for (i = 0; i < range; 
i++) { if (i == colonpos) { if (needcolon || i == 0) @@ -90166,7 +92398,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 needcolon = false; } /* hex u16 without leading 0s */ -@@ -1414,81 +1364,56 @@ char *ip6_compressed_string(char *p, const char *addr) +@@ -1414,81 +1365,56 @@ char *ip6_compressed_string(char *p, const char *addr) lo = word & 0xff; if (hi) { if (hi > 0x0f) @@ -90266,7 +92498,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 fmt++; while (isalpha(*++fmt)) { -@@ -1508,44 +1433,36 @@ char *ip6_addr_string_sa(char *buf, char *end, const struct sockaddr_in6 *sa, +@@ -1508,44 +1434,36 @@ char *ip6_addr_string_sa(char *buf, char *end, const struct sockaddr_in6 *sa, } } @@ -90324,7 +92556,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 const u8 *addr = (const u8 *) &sa->sin_addr.s_addr; char fmt4[3] = { fmt[0], '4', 0 }; -@@ -1564,30 +1481,27 @@ char *ip4_addr_string_sa(char *buf, char *end, const struct sockaddr_in *sa, +@@ -1564,30 +1482,27 @@ char *ip4_addr_string_sa(char *buf, char *end, const struct sockaddr_in *sa, } } @@ -90364,7 +92596,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 case 'S': { const union { struct sockaddr raw; -@@ -1597,21 +1511,21 @@ char *ip_addr_string(char *buf, char *end, const void *ptr, +@@ -1597,21 +1512,21 @@ char *ip_addr_string(char *buf, char *end, const void *ptr, switch (sa->raw.sa_family) { case AF_INET: @@ -90392,7 +92624,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 { bool found = true; int count = 1; -@@ -1619,10 +1533,10 @@ char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec, +@@ -1619,10 +1534,10 @@ char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec, int len; if (spec.field_width == 0) @@ -90406,7 +92638,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 do { switch (fmt[count++]) { -@@ -1657,44 +1571,32 @@ char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec, +@@ -1657,44 +1572,32 @@ char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec 
spec, flags = ESCAPE_ANY_NP; len = spec.field_width < 0 ? 1 : spec.field_width; @@ -90461,7 +92693,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 switch (*(++fmt)) { case 'L': -@@ -1710,60 +1612,54 @@ char *uuid_string(char *buf, char *end, const u8 *addr, +@@ -1710,60 +1613,54 @@ char *uuid_string(char *buf, char *end, const u8 *addr, for (i = 0; i < 16; i++) { if (uc) @@ -90537,7 +92769,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 orig = get_unaligned(fourcc); val = orig & ~BIT(31); -@@ -1772,31 +1668,27 @@ char *fourcc_string(char *buf, char *end, const u32 *fourcc, +@@ -1772,31 +1669,27 @@ char *fourcc_string(char *buf, char *end, const u32 *fourcc, unsigned char c = val >> (i * 8); /* Print non-control ASCII characters as-is, dot otherwise */ @@ -90580,7 +92812,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 switch (fmt[1]) { case 'd': -@@ -1810,55 +1702,44 @@ char *address_val(char *buf, char *end, const void *addr, +@@ -1810,55 +1703,44 @@ char *address_val(char *buf, char *end, const void *addr, break; } @@ -90654,7 +92886,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 switch (fmt[count]) { case 'd': -@@ -1886,21 +1767,16 @@ char *rtc_str(char *buf, char *end, const struct rtc_time *tm, +@@ -1886,21 +1768,16 @@ char *rtc_str(char *buf, char *end, const struct rtc_time *tm, } while (found); if (have_d) @@ -90682,7 +92914,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 { struct rtc_time rtc_time; struct tm tm; -@@ -1918,47 +1794,47 @@ char *time64_str(char *buf, char *end, const time64_t time, +@@ -1918,47 +1795,47 @@ char *time64_str(char *buf, char *end, const time64_t time, rtc_time.tm_isdst = 0; @@ -90745,7 +92977,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 { unsigned long mask; -@@ -1967,20 +1843,15 @@ char *format_flags(char *buf, char *end, unsigned long flags, +@@ -1967,20 +1844,15 @@ char *format_flags(char *buf, char *end, unsigned long flags, if ((flags & mask) != mask) continue; @@ -90770,7 +93002,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 } struct 
page_flags_fields { -@@ -2005,20 +1876,18 @@ static const struct page_flags_fields pff[] = { +@@ -2005,20 +1877,18 @@ static const struct page_flags_fields pff[] = { }; static @@ -90795,7 +93027,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 append = true; } -@@ -2029,41 +1898,31 @@ char *format_page_flags(char *buf, char *end, unsigned long flags) +@@ -2029,41 +1899,31 @@ char *format_page_flags(char *buf, char *end, unsigned long flags) continue; /* Format: Flag Name + '=' (equals sign) + Number + '|' (separator) */ @@ -90848,7 +93080,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 case 'v': flags = *(unsigned long *)flags_ptr; names = vmaflag_names; -@@ -2073,15 +1932,15 @@ char *flags_string(char *buf, char *end, void *flags_ptr, +@@ -2073,15 +1933,15 @@ char *flags_string(char *buf, char *end, void *flags_ptr, names = gfpflag_names; break; default: @@ -90868,7 +93100,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 { int depth; -@@ -2090,39 +1949,30 @@ char *fwnode_full_name_string(struct fwnode_handle *fwnode, char *buf, +@@ -2090,39 +1950,30 @@ char *fwnode_full_name_string(struct fwnode_handle *fwnode, char *buf, struct fwnode_handle *__fwnode = fwnode_get_nth_parent(fwnode, depth); @@ -90916,7 +93148,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 /* simple case without anything any more format specifiers */ fmt++; -@@ -2130,55 +1980,48 @@ char *device_node_string(char *buf, char *end, struct device_node *dn, +@@ -2130,55 +1981,48 @@ char *device_node_string(char *buf, char *end, struct device_node *dn, fmt = "f"; for (pass = false; strspn(fmt,"fnpPFcC"); fmt++, pass = true) { @@ -90995,7 +93227,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 has_mult = true; } -@@ -2187,38 +2030,30 @@ char *device_node_string(char *buf, char *end, struct device_node *dn, +@@ -2187,38 +2031,30 @@ char *device_node_string(char *buf, char *end, struct device_node *dn, break; } } @@ -91042,7 +93274,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 } int __init no_hash_pointers_enable(char *str) 
-@@ -2374,33 +2209,40 @@ early_param("no_hash_pointers", no_hash_pointers_enable); +@@ -2374,33 +2210,40 @@ early_param("no_hash_pointers", no_hash_pointers_enable); * rendering it useful as a unique identifier. */ static noinline_for_stack @@ -91091,7 +93323,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 case 'I': /* Formatted IP supported * 4: 1.2.3.4 * 6: 0001:0203:...:0708 -@@ -2410,57 +2252,69 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, +@@ -2410,57 +2253,69 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, * 4: 001.002.003.004 * 6: 000102...0f */ @@ -91183,7 +93415,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 } } -@@ -2599,8 +2453,14 @@ int format_decode(const char *fmt, struct printf_spec *spec) +@@ -2599,8 +2454,14 @@ int format_decode(const char *fmt, struct printf_spec *spec) return ++fmt - start; case 'p': @@ -91200,7 +93432,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 case '%': spec->type = FORMAT_TYPE_PERCENT_CHAR; -@@ -2681,53 +2541,89 @@ set_precision(struct printf_spec *spec, int prec) +@@ -2681,53 +2542,89 @@ set_precision(struct printf_spec *spec, int prec) } } @@ -91324,7 +93556,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 while (*fmt) { const char *old_fmt = fmt; -@@ -2736,16 +2632,9 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) +@@ -2736,16 +2633,9 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) fmt += read; switch (spec.type) { @@ -91343,7 +93575,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 case FORMAT_TYPE_WIDTH: set_field_width(&spec, va_arg(args, int)); -@@ -2755,44 +2644,60 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) +@@ -2755,44 +2645,60 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) set_precision(&spec, va_arg(args, int)); break; @@ -91429,7 +93661,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 break; case FORMAT_TYPE_INVALID: -@@ -2845,21 +2750,70 @@ int vsnprintf(char *buf, size_t size, const char 
*fmt, va_list args) +@@ -2845,21 +2751,70 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) num = va_arg(args, unsigned int); } @@ -91510,7 +93742,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 } EXPORT_SYMBOL(vsnprintf); -@@ -2997,53 +2951,46 @@ EXPORT_SYMBOL(sprintf); +@@ -2997,53 +2952,46 @@ EXPORT_SYMBOL(sprintf); * bstr_printf() - Binary data to text string */ @@ -91581,7 +93813,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 value; \ }) -@@ -3074,16 +3021,12 @@ int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args) +@@ -3074,16 +3022,12 @@ int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args) case FORMAT_TYPE_STR: { const char *save_str = va_arg(args, char *); const char *err_msg; @@ -91599,7 +93831,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 break; } -@@ -3103,12 +3046,7 @@ int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args) +@@ -3103,12 +3047,7 @@ int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args) save_arg(void *); break; } @@ -91613,7 +93845,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 } /* skip all alphanumeric pointer suffixes */ while (isalnum(*fmt)) -@@ -3146,15 +3084,15 @@ int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args) +@@ -3146,15 +3085,15 @@ int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args) } out: @@ -91633,7 +93865,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 * @fmt: The format string to use * @bin_buf: Binary arguments for the format string * -@@ -3164,26 +3102,14 @@ EXPORT_SYMBOL_GPL(vbin_printf); +@@ -3164,26 +3103,14 @@ EXPORT_SYMBOL_GPL(vbin_printf); * * The format follows C99 vsnprintf, but has some extensions: * see vsnprintf comment for details. 
@@ -91663,7 +93895,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 #define get_arg(type) \ ({ \ -@@ -3200,12 +3126,6 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) +@@ -3200,12 +3127,6 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) value; \ }) @@ -91676,7 +93908,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 while (*fmt) { const char *old_fmt = fmt; int read = format_decode(fmt, &spec); -@@ -3213,16 +3133,9 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) +@@ -3213,16 +3134,9 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) fmt += read; switch (spec.type) { @@ -91695,7 +93927,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 case FORMAT_TYPE_WIDTH: set_field_width(&spec, get_arg(int)); -@@ -3232,38 +3145,24 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) +@@ -3232,38 +3146,24 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) set_precision(&spec, get_arg(int)); break; @@ -91742,7 +93974,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 /* Non function dereferences were already done */ switch (*fmt) { case 'S': -@@ -3279,17 +3178,12 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) +@@ -3279,17 +3179,12 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) break; } /* Pointer dereference was already processed */ @@ -91764,7 +93996,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 while (isalnum(*fmt)) fmt++; -@@ -3297,9 +3191,7 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) +@@ -3297,9 +3192,7 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) } case FORMAT_TYPE_PERCENT_CHAR: @@ -91775,7 +94007,7 @@ index 3c1853a9d1c0..5e78781bbca8 100644 break; case FORMAT_TYPE_INVALID: -@@ -3342,23 +3234,87 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) +@@ -3342,23 
+3235,87 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) num = get_arg(int); } @@ -91886,6 +94118,18 @@ index 9a564f836403..5a649b4aebe9 100644 # Give 'page_alloc' its own module-parameter namespace page-alloc-y := page_alloc.o +diff --git a/mm/filemap.c b/mm/filemap.c +index 15800334147b..3ad6eba0ab82 100644 +--- a/mm/filemap.c ++++ b/mm/filemap.c +@@ -2910,6 +2910,7 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, + return end; + return start; + } ++EXPORT_SYMBOL(mapping_seek_hole_data); + + #ifdef CONFIG_MMU + #define MMAP_LOTSAMISS (100) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b69979c9ced5..54897e4ac4ef 100644 --- a/mm/memcontrol.c @@ -92467,5 +94711,5 @@ index 01ceb98c15a0..6219f5bbf20e 100644 static DEVICE_ATTR_RO(flags); -- -2.38.0 +2.38.1.385.g3b08839926